diff --git a/_quarto.yml b/_quarto.yml index e9326b4f..a5f1907b 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -739,8 +739,11 @@ quartodoc: - calendar.holiday.get_holiday_features - calendar.holiday.create_holiday_adjacency_df - calendar.holiday.get_holiday_adjacency_features + - calendar.holiday.create_day_type_df + - calendar.holiday.get_day_type_features - calendar.features.get_calendar_features - calendar.features.get_day_night_features + - calendar.features.get_ephemeris_features # ── Weather ─────────────────────────────────────────────────────────────── - title: "Weather" diff --git a/docs/reference/calendar.features.get_ephemeris_features.qmd b/docs/reference/calendar.features.get_ephemeris_features.qmd new file mode 100644 index 00000000..f1b6aec6 --- /dev/null +++ b/docs/reference/calendar.features.get_ephemeris_features.qmd @@ -0,0 +1,71 @@ +# calendar.features.get_ephemeris_features { #spotforecast2_safe.calendar.features.get_ephemeris_features } + +```python +calendar.features.get_ephemeris_features( + start, + cov_end, + location, + freq='h', + timezone='UTC', +) +``` + +Create continuous solar-geometry features from the ephemeris. + +Unlike `get_day_night_features` (which rounds sunrise/sunset to whole +hours and emits a binary daylight flag), this builder exposes the +*continuous* solar geometry the hour-of-day RBFs only encode implicitly: +the per-hour solar elevation, the exact daylight duration, and the signed +time relative to sunrise and sunset. These linearise lighting-load timing +and the midday PV offset, are purely deterministic from the date and the +fixed coordinates, add no dependency, and leak nothing for any forecast +hour (Xie & Hong 2018, ``xieh18a``; López 2020, ``lope20a``). + +The returned DataFrame contains four ``float64`` columns: + +- ``solar_elevation`` — solar elevation angle in degrees (negative at + night, peaking at solar noon). +- ``daylight_duration_h`` — exact sunset−sunrise span for the date, hours. 
+- ``hours_since_sunrise`` — signed hours since that date's sunrise + (negative before sunrise). +- ``hours_to_sunset`` — signed hours until that date's sunset + (negative after sunset). + +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|----------|-----------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------|------------| +| start | [Union](`typing.Union`)\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\] | Start of the time range. String values are parsed with ``utc=True``. | _required_ | +| cov_end | [Union](`typing.Union`)\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\] | Inclusive end of the time range. String values are parsed with ``utc=True``. | _required_ | +| location | [LocationInfo](`astral.LocationInfo`) | `LocationInfo` describing the geographic location. | _required_ | +| freq | [str](`str`) | Pandas-compatible frequency string for the output index. Defaults to ``"h"`` (hourly). | `'h'` | +| timezone | [str](`str`) | Timezone label applied to the generated index. Defaults to ``"UTC"``. | `'UTC'` | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|------------------------------------------------|---------------------------------------------------------------------| +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | pd.DataFrame: Columns ``solar_elevation``, ``daylight_duration_h``, | +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | ``hours_since_sunrise``, ``hours_to_sunset``; tz-aware | +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | `DatetimeIndex` with the requested ``freq``. 
| + +## Examples {.doc-section .doc-section-examples} + +```{python} +import pandas as pd +from astral import LocationInfo +from spotforecast2_safe.calendar import get_ephemeris_features + +start = pd.Timestamp("2024-06-21", tz="UTC") +cov_end = pd.Timestamp("2024-06-21 23:00", tz="UTC") +location = LocationInfo(latitude=51.5136, longitude=7.4653, timezone="UTC") + +feats = get_ephemeris_features(start, cov_end, location) +print("columns:", feats.columns.tolist()) +print("shape:", feats.shape) +# Summer solstice: long day and a high midday sun in Dortmund. +print("max elevation:", round(feats["solar_elevation"].max(), 1)) +assert feats.shape == (24, 4) +assert feats["solar_elevation"].max() > 50.0 +assert feats["daylight_duration_h"].iloc[0] > 14.0 +``` \ No newline at end of file diff --git a/docs/reference/calendar.holiday.create_day_type_df.qmd b/docs/reference/calendar.holiday.create_day_type_df.qmd new file mode 100644 index 00000000..697a7913 --- /dev/null +++ b/docs/reference/calendar.holiday.create_day_type_df.qmd @@ -0,0 +1,63 @@ +# calendar.holiday.create_day_type_df { #spotforecast2_safe.calendar.holiday.create_day_type_df } + +```python +calendar.holiday.create_day_type_df( + start, + end, + tz='UTC', + freq='h', + country_code='DE', + state='NW', +) +``` + +Create a day-type refinement of the public-holiday column. + +Returns two integer columns derived purely from the weekday and the +public-holiday calendar (pure calendar arithmetic — known years ahead, +leakage-free): + +- ``is_workday``: ``1`` when the day is Monday–Friday **and** not a public + holiday, else ``0``. +- ``day_type``: an integer class with public-holiday precedence — + ``0`` working day, ``1`` Saturday (non-holiday), ``2`` Sunday + (non-holiday), ``3`` public holiday (any weekday). A public holiday that + falls on a weekend is still classed as ``3``. + +These remove some of the worst single-day errors a plain holiday flag +leaves behind (Ziel 2018, ``ziel18a``). 
+ +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|--------------|----------------------------------------------------------------|------------------------------------------------------|------------| +| start | [str](`str`) \| [pd](`pandas`).[Timestamp](`pandas.Timestamp`) | Start date/datetime. | _required_ | +| end | [str](`str`) \| [pd](`pandas`).[Timestamp](`pandas.Timestamp`) | End date/datetime. | _required_ | +| tz | [str](`str`) | Timezone to use if not inferred from start/end. | `'UTC'` | +| freq | [str](`str`) | Frequency of the resulting DataFrame. | `'h'` | +| country_code | [str](`str`) | Country code for holidays (e.g. ``"DE"``, ``"US"``). | `'DE'` | +| state | [str](`str`) | State code for holidays (e.g. ``"NW"``, ``"CA"``). | `'NW'` | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|------------------------------------------------|----------------------------------------------------------------------| +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | pd.DataFrame: Index covering ``[start, end]`` at *freq* with integer | +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | columns ``is_workday`` and ``day_type``; no NaNs. | + +## Examples {.doc-section .doc-section-examples} + +```{python} +import pandas as pd +from spotforecast2_safe.calendar import create_day_type_df + +# 2024-01-01 Mon = New Year (holiday), 02 Tue = workday, +# 06 Sat, 07 Sun. 
+df = create_day_type_df("2024-01-01", "2024-01-07", freq="D") +print(df["is_workday"].tolist()) +print(df["day_type"].tolist()) +assert df.loc["2024-01-01", "day_type"] == 3 # holiday +assert df.loc["2024-01-02", "is_workday"] == 1 +assert df.loc["2024-01-06", "day_type"] == 1 # Saturday +assert df.loc["2024-01-07", "day_type"] == 2 # Sunday +``` \ No newline at end of file diff --git a/docs/reference/calendar.holiday.get_day_type_features.qmd b/docs/reference/calendar.holiday.get_day_type_features.qmd new file mode 100644 index 00000000..1990e26b --- /dev/null +++ b/docs/reference/calendar.holiday.get_day_type_features.qmd @@ -0,0 +1,71 @@ +# calendar.holiday.get_day_type_features { #spotforecast2_safe.calendar.holiday.get_day_type_features } + +```python +calendar.holiday.get_day_type_features( + data, + start, + cov_end, + forecast_horizon, + tz='UTC', + freq='h', + country_code='DE', + state='NW', +) +``` + +Build day-type indicators and align them to a regular time grid. + +Generates ``is_workday`` and ``day_type`` via `create_day_type_df()`, +validates temporal coverage with `curate_holidays()`, and reindexes onto +the full ``[start, cov_end]`` grid. The day-type frame is generated over +that same range, so the reindex is not expected to fill any cell. If a +cell is filled, both columns get ``0``; that pair is inconsistent +(``is_workday=0`` alongside ``day_type=0``, the working-day class), so +``fill_value`` only guards the degenerate empty-overlap case. + +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|------------------|-----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|------------| +| data | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | Reference time series DataFrame used for temporal coverage validation inside `curate_holidays()`. 
| _required_ | +| start | [Union](`typing.Union`)\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\] | Start timestamp. String values are parsed with ``utc=True``. | _required_ | +| cov_end | [Union](`typing.Union`)\[[str](`str`), [pd](`pandas`).[Timestamp](`pandas.Timestamp`)\] | Inclusive end timestamp (should cover the full forecast horizon). String values are parsed with ``utc=True``. | _required_ | +| forecast_horizon | [int](`int`) | Number of forecast steps ahead; passed to `curate_holidays()`. | _required_ | +| tz | [str](`str`) | Timezone applied to the generated index. Defaults to ``"UTC"``. | `'UTC'` | +| freq | [str](`str`) | Pandas-compatible frequency string. Defaults to ``"h"``. | `'h'` | +| country_code | [str](`str`) | ISO 3166-1 alpha-2 country code. Defaults to ``"DE"``. | `'DE'` | +| state | [str](`str`) | Sub-national state/region code. Defaults to ``"NW"``. | `'NW'` | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|------------------------------------------------|-------------------------------------------------------------------------| +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | pd.DataFrame: Integer columns ``is_workday`` and ``day_type``; tz-aware | +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | `DatetimeIndex` with the requested *freq*. 
| + +## Examples {.doc-section .doc-section-examples} + +```{python} +import pandas as pd +from spotforecast2_safe.calendar import get_day_type_features + +forecast_horizon = 24 +n_data = 48 +data = pd.DataFrame( + {"load": range(n_data)}, + index=pd.date_range("2024-01-01", periods=n_data, freq="h", tz="UTC"), +) +start = data.index[0] +cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + +feats = get_day_type_features( + data=data, start=start, cov_end=cov_end, + forecast_horizon=forecast_horizon, +) +print("columns:", feats.columns.tolist()) +print("shape:", feats.shape) +# 2024-01-01 is New Year (holiday) → day_type 3, not a workday. +assert feats.loc["2024-01-01 00:00:00+00:00", "day_type"] == 3 +assert feats.loc["2024-01-01 00:00:00+00:00", "is_workday"] == 0 +assert feats.shape == (n_data + forecast_horizon, 2) +``` \ No newline at end of file diff --git a/docs/reference/configurator.config_multi.ConfigMulti.qmd b/docs/reference/configurator.config_multi.ConfigMulti.qmd index 55abb55d..94c7e9f1 100644 --- a/docs/reference/configurator.config_multi.ConfigMulti.qmd +++ b/docs/reference/configurator.config_multi.ConfigMulti.qmd @@ -33,6 +33,8 @@ configurator.config_multi.ConfigMulti( include_apparent_temperature=False, degree_hours_base_heating=15.0, degree_hours_base_cooling=22.0, + include_ephemeris_features=False, + include_day_type_features=False, poly_features_degree=1, max_poly_features=10, poly_mi_n_jobs=-1, diff --git a/docs/reference/index.qmd b/docs/reference/index.qmd index 4b994e00..f30fcd49 100644 --- a/docs/reference/index.qmd +++ b/docs/reference/index.qmd @@ -233,8 +233,11 @@ construction. | [calendar.holiday.get_holiday_features](calendar.holiday.get_holiday_features.qmd#spotforecast2_safe.calendar.holiday.get_holiday_features) | Build public-holiday indicators and align them to a regular time grid. 
| | [calendar.holiday.create_holiday_adjacency_df](calendar.holiday.create_holiday_adjacency_df.qmd#spotforecast2_safe.calendar.holiday.create_holiday_adjacency_df) | Create a DataFrame with binary adjacency indicators for public holidays. | | [calendar.holiday.get_holiday_adjacency_features](calendar.holiday.get_holiday_adjacency_features.qmd#spotforecast2_safe.calendar.holiday.get_holiday_adjacency_features) | Build holiday-adjacency indicators and align them to a regular time grid. | +| [calendar.holiday.create_day_type_df](calendar.holiday.create_day_type_df.qmd#spotforecast2_safe.calendar.holiday.create_day_type_df) | Create a day-type refinement of the public-holiday column. | +| [calendar.holiday.get_day_type_features](calendar.holiday.get_day_type_features.qmd#spotforecast2_safe.calendar.holiday.get_day_type_features) | Build day-type indicators and align them to a regular time grid. | | [calendar.features.get_calendar_features](calendar.features.get_calendar_features.qmd#spotforecast2_safe.calendar.features.get_calendar_features) | Create calendar-based features for a contiguous time range. | | [calendar.features.get_day_night_features](calendar.features.get_day_night_features.qmd#spotforecast2_safe.calendar.features.get_day_night_features) | Create day/night features using astronomical sunrise and sunset times. | +| [calendar.features.get_ephemeris_features](calendar.features.get_ephemeris_features.qmd#spotforecast2_safe.calendar.features.get_ephemeris_features) | Create continuous solar-geometry features from the ephemeris. 
| ## Weather diff --git a/src/spotforecast2_safe/calendar/__init__.py b/src/spotforecast2_safe/calendar/__init__.py index cf23aa54..0e86b947 100644 --- a/src/spotforecast2_safe/calendar/__init__.py +++ b/src/spotforecast2_safe/calendar/__init__.py @@ -13,19 +13,25 @@ from spotforecast2_safe.calendar.features import ( get_calendar_features, get_day_night_features, + get_ephemeris_features, ) from spotforecast2_safe.calendar.holiday import ( + create_day_type_df, create_holiday_adjacency_df, create_holiday_df, + get_day_type_features, get_holiday_adjacency_features, get_holiday_features, ) __all__ = [ + "create_day_type_df", "create_holiday_adjacency_df", "create_holiday_df", "get_calendar_features", "get_day_night_features", + "get_day_type_features", + "get_ephemeris_features", "get_holiday_adjacency_features", "get_holiday_features", ] diff --git a/src/spotforecast2_safe/calendar/features.py b/src/spotforecast2_safe/calendar/features.py index 309b0f43..9211f90e 100644 --- a/src/spotforecast2_safe/calendar/features.py +++ b/src/spotforecast2_safe/calendar/features.py @@ -10,6 +10,11 @@ - `get_day_night_features()` — derive sunrise hour, sunset hour, daylight hours, and an ``is_daylight`` indicator using the `astral `_ library. +- `get_ephemeris_features()` — per-hour solar elevation plus daylight + duration and signed sunrise/sunset-relative time. Purely deterministic + (date + fixed coordinates), leakage-free, no extra dependency; sharpens + the lighting-load timing and midday PV offset the hour-of-day RBFs only + encode implicitly. 
""" from typing import List, Optional, Union @@ -17,7 +22,7 @@ import numpy as np import pandas as pd from astral import LocationInfo -from astral.sun import sun +from astral.sun import elevation, sun from feature_engine.datetime import DatetimeFeatures from spotforecast2_safe.utils.convert_to_utc import to_utc_timestamp @@ -211,3 +216,121 @@ def get_day_night_features( 0, ) return features + + +def get_ephemeris_features( + start: Union[str, pd.Timestamp], + cov_end: Union[str, pd.Timestamp], + location: LocationInfo, + freq: str = "h", + timezone: str = "UTC", +) -> pd.DataFrame: + """Create continuous solar-geometry features from the ephemeris. + + Unlike `get_day_night_features` (which rounds sunrise/sunset to whole + hours and emits a binary daylight flag), this builder exposes the + *continuous* solar geometry the hour-of-day RBFs only encode implicitly: + the per-hour solar elevation, the exact daylight duration, and the signed + time relative to sunrise and sunset. These linearise lighting-load timing + and the midday PV offset, are purely deterministic from the date and the + fixed coordinates, add no dependency, and leak nothing for any forecast + hour (Xie & Hong 2018, ``xieh18a``; López 2020, ``lope20a``). + + The returned DataFrame contains four ``float64`` columns: + + - ``solar_elevation`` — solar elevation angle in degrees (negative at + night, peaking at solar noon). + - ``daylight_duration_h`` — exact sunset−sunrise span for the date, hours. + - ``hours_since_sunrise`` — signed hours since that date's sunrise + (negative before sunrise). + - ``hours_to_sunset`` — signed hours until that date's sunset + (negative after sunset). + + Args: + start: Start of the time range. String values are parsed with + ``utc=True``. + cov_end: Inclusive end of the time range. String values are parsed + with ``utc=True``. + location: `LocationInfo` describing the geographic location. + freq: Pandas-compatible frequency string for the output index. 
+ Defaults to ``"h"`` (hourly). + timezone: Timezone label applied to the generated index. Defaults to + ``"UTC"``. + + Returns: + pd.DataFrame: Columns ``solar_elevation``, ``daylight_duration_h``, + ``hours_since_sunrise``, ``hours_to_sunset``; tz-aware + `DatetimeIndex` with the requested ``freq``. + + Examples: + ```{python} + import pandas as pd + from astral import LocationInfo + from spotforecast2_safe.calendar import get_ephemeris_features + + start = pd.Timestamp("2024-06-21", tz="UTC") + cov_end = pd.Timestamp("2024-06-21 23:00", tz="UTC") + location = LocationInfo(latitude=51.5136, longitude=7.4653, timezone="UTC") + + feats = get_ephemeris_features(start, cov_end, location) + print("columns:", feats.columns.tolist()) + print("shape:", feats.shape) + # Summer solstice: long day and a high midday sun in Dortmund. + print("max elevation:", round(feats["solar_elevation"].max(), 1)) + assert feats.shape == (24, 4) + assert feats["solar_elevation"].max() > 50.0 + assert feats["daylight_duration_h"].iloc[0] > 14.0 + ``` + """ + start = to_utc_timestamp(start) + cov_end = to_utc_timestamp(cov_end) + + extended_index = pd.date_range(start=start, end=cov_end, freq=freq, tz=timezone) + if len(extended_index) == 0: + return pd.DataFrame( + columns=[ + "solar_elevation", + "daylight_duration_h", + "hours_since_sunrise", + "hours_to_sunset", + ], + index=extended_index, + dtype="float64", + ) + + normalized_dates = extended_index.normalize() + unique_dates = normalized_dates.unique() + + # Cache sunrise/sunset per unique date so astral.sun runs once per day. 
+ sun_by_date = { + d: sun(location.observer, date=d, tzinfo=location.timezone) + for d in unique_dates + } + sunrise = pd.DatetimeIndex( + [sun_by_date[d]["sunrise"] for d in normalized_dates] + ).tz_convert("UTC") + sunset = pd.DatetimeIndex( + [sun_by_date[d]["sunset"] for d in normalized_dates] + ).tz_convert("UTC") + + idx_utc = extended_index.tz_convert("UTC") + hour = pd.Timedelta(hours=1) + hours_since_sunrise = (idx_utc - sunrise) / hour + hours_to_sunset = (sunset - idx_utc) / hour + daylight_duration_h = (sunset - sunrise) / hour + + # Per-hour solar elevation; astral computes it analytically from the + # observer and the exact timestamp (deterministic, no network). + solar_elevation = [ + elevation(location.observer, ts.to_pydatetime()) for ts in extended_index + ] + + return pd.DataFrame( + { + "solar_elevation": np.asarray(solar_elevation, dtype="float64"), + "daylight_duration_h": np.asarray(daylight_duration_h, dtype="float64"), + "hours_since_sunrise": np.asarray(hours_since_sunrise, dtype="float64"), + "hours_to_sunset": np.asarray(hours_to_sunset, dtype="float64"), + }, + index=extended_index, + ) diff --git a/src/spotforecast2_safe/calendar/holiday.py b/src/spotforecast2_safe/calendar/holiday.py index b1a20880..12e21499 100644 --- a/src/spotforecast2_safe/calendar/holiday.py +++ b/src/spotforecast2_safe/calendar/holiday.py @@ -16,6 +16,10 @@ ``is_after_holiday``, all disjoint from ``is_holiday``. - `get_holiday_adjacency_features()` — align those adjacency indicators to a forecast grid, mirroring the contract of `get_holiday_features()`. +- `create_day_type_df()` / `get_day_type_features()` — a day-type refinement + of the holiday column: a binary working-day indicator and an integer + day-type class (working day / Saturday / Sunday / public holiday), derived + purely from the weekday and the public-holiday calendar. 
""" from typing import Union @@ -425,3 +429,181 @@ def get_holiday_adjacency_features( extended_index = pd.date_range(start=start, end=cov_end, freq=freq, tz=tz) return adjacency_df.reindex(extended_index, fill_value=0).astype(int) + + +# Integer codes for the ``day_type`` column. Ordering is by "working intensity" +# (a working day differs most from a holiday), but tree models split on the +# values regardless of order. +DAY_TYPE_WORKDAY = 0 +DAY_TYPE_SATURDAY = 1 +DAY_TYPE_SUNDAY = 2 +DAY_TYPE_HOLIDAY = 3 + + +def create_day_type_df( + start: str | pd.Timestamp, + end: str | pd.Timestamp, + tz: str = "UTC", + freq: str = "h", + country_code: str = "DE", + state: str = "NW", +) -> pd.DataFrame: + """Create a day-type refinement of the public-holiday column. + + Returns two integer columns derived purely from the weekday and the + public-holiday calendar (pure calendar arithmetic — known years ahead, + leakage-free): + + - ``is_workday``: ``1`` when the day is Monday–Friday **and** not a public + holiday, else ``0``. + - ``day_type``: an integer class with public-holiday precedence — + ``0`` working day, ``1`` Saturday (non-holiday), ``2`` Sunday + (non-holiday), ``3`` public holiday (any weekday). A public holiday that + falls on a weekend is still classed as ``3``. + + These remove some of the worst single-day errors a plain holiday flag + leaves behind (Ziel 2018, ``ziel18a``). + + Args: + start: Start date/datetime. + end: End date/datetime. + tz: Timezone to use if not inferred from start/end. + freq: Frequency of the resulting DataFrame. + country_code: Country code for holidays (e.g. ``"DE"``, ``"US"``). + state: State code for holidays (e.g. ``"NW"``, ``"CA"``). + + Returns: + pd.DataFrame: Index covering ``[start, end]`` at *freq* with integer + columns ``is_workday`` and ``day_type``; no NaNs. 
+ + Examples: + ```{python} + import pandas as pd + from spotforecast2_safe.calendar import create_day_type_df + + # 2024-01-01 Mon = New Year (holiday), 02 Tue = workday, + # 06 Sat, 07 Sun. + df = create_day_type_df("2024-01-01", "2024-01-07", freq="D") + print(df["is_workday"].tolist()) + print(df["day_type"].tolist()) + assert df.loc["2024-01-01", "day_type"] == 3 # holiday + assert df.loc["2024-01-02", "is_workday"] == 1 + assert df.loc["2024-01-06", "day_type"] == 1 # Saturday + assert df.loc["2024-01-07", "day_type"] == 2 # Sunday + ``` + """ + inferred_tz = None + if isinstance(start, pd.Timestamp) and start.tz is not None: + inferred_tz = str(start.tz) + elif isinstance(end, pd.Timestamp) and end.tz is not None: + inferred_tz = str(end.tz) + + if inferred_tz is not None: + full_index = pd.date_range(start=start, end=end, freq=freq) + else: + full_index = pd.date_range(start=start, end=end, freq=freq, tz=tz) + + cal = holidays.country_holidays(country_code, subdiv=state) + unique_days = pd.DatetimeIndex(full_index.normalize().unique()) + + def _day_type(d: pd.Timestamp) -> int: + if d.date() in cal: + return DAY_TYPE_HOLIDAY + dow = d.dayofweek + if dow == 5: + return DAY_TYPE_SATURDAY + if dow == 6: + return DAY_TYPE_SUNDAY + return DAY_TYPE_WORKDAY + + day_type_series = pd.Series({d: _day_type(d) for d in unique_days}, dtype="int64") + is_workday_series = (day_type_series == DAY_TYPE_WORKDAY).astype(int) + + df_full = pd.DataFrame(index=full_index) + norm = full_index.normalize() + df_full["is_workday"] = norm.map(is_workday_series).fillna(0).astype(int) + df_full["day_type"] = norm.map(day_type_series).fillna(DAY_TYPE_WORKDAY).astype(int) + return df_full + + +def get_day_type_features( + data: pd.DataFrame, + start: Union[str, pd.Timestamp], + cov_end: Union[str, pd.Timestamp], + forecast_horizon: int, + tz: str = "UTC", + freq: str = "h", + country_code: str = "DE", + state: str = "NW", +) -> pd.DataFrame: + """Build day-type indicators and align them 
to a regular time grid. + + Generates ``is_workday`` and ``day_type`` via `create_day_type_df()`, + validates temporal coverage with `curate_holidays()`, and reindexes onto + the full ``[start, cov_end]`` grid. The day-type frame is generated over + that same range, so the reindex is not expected to fill any cell. If a + cell is filled, both columns get ``0``; that pair is inconsistent + (``is_workday=0`` alongside ``day_type=0``, the working-day class), so + ``fill_value`` only guards the degenerate empty-overlap case. + + Args: + data: Reference time series DataFrame used for temporal coverage + validation inside `curate_holidays()`. + start: Start timestamp. String values are parsed with ``utc=True``. + cov_end: Inclusive end timestamp (should cover the full forecast + horizon). String values are parsed with ``utc=True``. + forecast_horizon: Number of forecast steps ahead; passed to + `curate_holidays()`. + tz: Timezone applied to the generated index. Defaults to ``"UTC"``. + freq: Pandas-compatible frequency string. Defaults to ``"h"``. + country_code: ISO 3166-1 alpha-2 country code. Defaults to ``"DE"``. + state: Sub-national state/region code. Defaults to ``"NW"``. + + Returns: + pd.DataFrame: Integer columns ``is_workday`` and ``day_type``; tz-aware + `DatetimeIndex` with the requested *freq*. + + Examples: + ```{python} + import pandas as pd + from spotforecast2_safe.calendar import get_day_type_features + + forecast_horizon = 24 + n_data = 48 + data = pd.DataFrame( + {"load": range(n_data)}, + index=pd.date_range("2024-01-01", periods=n_data, freq="h", tz="UTC"), + ) + start = data.index[0] + cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + + feats = get_day_type_features( + data=data, start=start, cov_end=cov_end, + forecast_horizon=forecast_horizon, + ) + print("columns:", feats.columns.tolist()) + print("shape:", feats.shape) + # 2024-01-01 is New Year (holiday) → day_type 3, not a workday. 
+ assert feats.loc["2024-01-01 00:00:00+00:00", "day_type"] == 3 + assert feats.loc["2024-01-01 00:00:00+00:00", "is_workday"] == 0 + assert feats.shape == (n_data + forecast_horizon, 2) + ``` + """ + from spotforecast2_safe.preprocessing.curate_data import curate_holidays + + start = to_utc_timestamp(start) + cov_end = to_utc_timestamp(cov_end) + + day_type_df = create_day_type_df( + start=start, + end=cov_end, + tz=tz, + freq=freq, + country_code=country_code, + state=state, + ) + + curate_holidays(day_type_df, data, forecast_horizon=forecast_horizon) + + extended_index = pd.date_range(start=start, end=cov_end, freq=freq, tz=tz) + return day_type_df.reindex(extended_index, fill_value=0).astype(int) diff --git a/src/spotforecast2_safe/configurator/config_multi.py b/src/spotforecast2_safe/configurator/config_multi.py index cd704709..c25b9c6b 100644 --- a/src/spotforecast2_safe/configurator/config_multi.py +++ b/src/spotforecast2_safe/configurator/config_multi.py @@ -325,6 +325,14 @@ class ConfigMulti: include_apparent_temperature: bool = False degree_hours_base_heating: float = 15.0 degree_hours_base_cooling: float = 22.0 + # Ephemeris (continuous solar geometry) and day-type calendar refinements + # (consumed by spotforecast2.multitask.base.build_exogenous_features). Both + # default off so the pipeline stays byte-identical to the baseline. + # ``include_ephemeris_features`` adds solar_elevation + daylight_duration_h + # + signed sunrise/sunset-relative time; ``include_day_type_features`` adds + # is_workday + day_type (workday/Saturday/Sunday/holiday class). 
+ include_ephemeris_features: bool = False + include_day_type_features: bool = False poly_features_degree: int = 1 max_poly_features: int = 10 poly_mi_n_jobs: Optional[int] = -1 diff --git a/src/spotforecast2_safe/multitask/base.py b/src/spotforecast2_safe/multitask/base.py index b69c42c9..c2f809ee 100644 --- a/src/spotforecast2_safe/multitask/base.py +++ b/src/spotforecast2_safe/multitask/base.py @@ -31,6 +31,8 @@ from spotforecast2_safe.calendar import ( get_calendar_features, get_day_night_features, + get_day_type_features, + get_ephemeris_features, get_holiday_adjacency_features, get_holiday_features, ) @@ -1107,6 +1109,18 @@ def build_exogenous_features(self) -> "BaseTask": ) self.logger.info(" Day/night features: %s", sun_light_features.shape) + # 4c-bis. Ephemeris (continuous solar geometry), opt-in. + ephemeris_features = None + if getattr(self.config, "include_ephemeris_features", False): + ephemeris_features = get_ephemeris_features( + start=self.run_state.data_start, + cov_end=self.run_state.cov_end, + location=location, + freq="h", + timezone=self.config.timezone, + ) + self.logger.info(" Ephemeris features: %s", ephemeris_features.shape) + # 4d. 
Holidays holiday_features = get_holiday_features( data=self.df_pipeline, @@ -1127,6 +1141,21 @@ def build_exogenous_features(self) -> "BaseTask": weather_features, holiday_features, ] + if ephemeris_features is not None: + concat_frames.append(ephemeris_features) + if getattr(self.config, "include_day_type_features", False): + day_type_features = get_day_type_features( + data=self.df_pipeline, + start=self.run_state.data_start, + cov_end=self.run_state.cov_end, + forecast_horizon=self.config.predict_size, + tz=self.config.timezone, + freq="h", + country_code=self.config.country_code, + state=self.config.state, + ) + self.logger.info(" Day-type features: %s", day_type_features.shape) + concat_frames.append(day_type_features) if self.config.include_holiday_adjacency_features: holiday_adjacency_features = get_holiday_adjacency_features( data=self.df_pipeline, diff --git a/tests/test_calendar_day_type.py b/tests/test_calendar_day_type.py new file mode 100644 index 00000000..b4570b27 --- /dev/null +++ b/tests/test_calendar_day_type.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: 2026 bartzbeielstein +# SPDX-License-Identifier: AGPL-3.0-or-later + +"""Tests for the day-type calendar refinement (is_workday + day_type).""" + +import pandas as pd +import pytest + +from spotforecast2_safe.calendar import create_day_type_df, get_day_type_features +from spotforecast2_safe.calendar.holiday import ( + DAY_TYPE_HOLIDAY, + DAY_TYPE_SATURDAY, + DAY_TYPE_SUNDAY, + DAY_TYPE_WORKDAY, +) + + +class TestCreateDayTypeDf: + def test_classes_over_a_week(self): + # 2024-01-01 Mon = New Year (holiday); 02–05 workdays; 06 Sat; 07 Sun. 
+ df = create_day_type_df("2024-01-01", "2024-01-07", freq="D") + assert df.loc["2024-01-01", "day_type"] == DAY_TYPE_HOLIDAY + assert df.loc["2024-01-02", "day_type"] == DAY_TYPE_WORKDAY + assert df.loc["2024-01-06", "day_type"] == DAY_TYPE_SATURDAY + assert df.loc["2024-01-07", "day_type"] == DAY_TYPE_SUNDAY + + def test_is_workday_matches_day_type(self): + df = create_day_type_df("2024-01-01", "2024-01-31", freq="D") + assert ((df["day_type"] == DAY_TYPE_WORKDAY) == (df["is_workday"] == 1)).all() + + def test_holiday_on_weekend_is_holiday_class(self): + # 2025-01-01 is a Wednesday; pick a holiday landing on a weekend: + # 2022-01-01 (New Year) fell on a Saturday → still classed as holiday. + df = create_day_type_df("2022-01-01", "2022-01-01", freq="D") + assert df.loc["2022-01-01", "day_type"] == DAY_TYPE_HOLIDAY + assert df.loc["2022-01-01", "is_workday"] == 0 + + def test_hourly_broadcast(self): + df = create_day_type_df("2024-01-02", "2024-01-02 23:00", freq="h") + assert df.shape == (24, 2) + # Whole workday: every hour identical. 
+ assert (df["is_workday"] == 1).all() + assert (df["day_type"] == DAY_TYPE_WORKDAY).all() + + def test_no_nans(self): + df = create_day_type_df("2024-01-01", "2024-12-31", freq="D") + assert not df.isna().any().any() + + def test_deterministic(self): + a = create_day_type_df("2024-05-01", "2024-05-31", freq="h") + b = create_day_type_df("2024-05-01", "2024-05-31", freq="h") + pd.testing.assert_frame_equal(a, b) + + +class TestGetDayTypeFeatures: + def test_shape_and_alignment(self): + forecast_horizon = 24 + n_data = 48 + data = pd.DataFrame( + {"load": range(n_data)}, + index=pd.date_range("2024-01-01", periods=n_data, freq="h", tz="UTC"), + ) + start = data.index[0] + cov_end = start + pd.Timedelta(hours=(n_data + forecast_horizon - 1)) + feats = get_day_type_features( + data=data, + start=start, + cov_end=cov_end, + forecast_horizon=forecast_horizon, + ) + assert feats.shape == (n_data + forecast_horizon, 2) + assert feats.columns.tolist() == ["is_workday", "day_type"] + # 2024-01-01 is New Year (holiday). 
+ assert feats.loc["2024-01-01 00:00:00+00:00", "day_type"] == DAY_TYPE_HOLIDAY + assert feats.loc["2024-01-01 00:00:00+00:00", "is_workday"] == 0 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_calendar_ephemeris.py b/tests/test_calendar_ephemeris.py new file mode 100644 index 00000000..e7818cea --- /dev/null +++ b/tests/test_calendar_ephemeris.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: 2026 bartzbeielstein +# SPDX-License-Identifier: AGPL-3.0-or-later + +"""Tests for continuous ephemeris (solar-geometry) features.""" + +import pandas as pd +import pytest +from astral import LocationInfo + +from spotforecast2_safe.calendar import get_ephemeris_features + +DORTMUND = LocationInfo(latitude=51.5136, longitude=7.4653, timezone="UTC") + + +def _day(date: str) -> tuple[pd.Timestamp, pd.Timestamp]: + start = pd.Timestamp(date, tz="UTC") + return start, start + pd.Timedelta(hours=23) + + +class TestEphemerisFeatures: + def test_columns_and_shape(self): + start, cov_end = _day("2024-06-21") + feats = get_ephemeris_features(start, cov_end, DORTMUND) + assert feats.columns.tolist() == [ + "solar_elevation", + "daylight_duration_h", + "hours_since_sunrise", + "hours_to_sunset", + ] + assert feats.shape == (24, 4) + assert not feats.isna().any().any() + + def test_summer_longer_than_winter(self): + s_start, s_end = _day("2024-06-21") + w_start, w_end = _day("2024-12-21") + summer = get_ephemeris_features(s_start, s_end, DORTMUND) + winter = get_ephemeris_features(w_start, w_end, DORTMUND) + assert summer["daylight_duration_h"].iloc[0] > 15.0 + assert winter["daylight_duration_h"].iloc[0] < 9.0 + assert summer["solar_elevation"].max() > winter["solar_elevation"].max() + + def test_elevation_negative_at_night_positive_at_noon(self): + start, cov_end = _day("2024-06-21") + feats = get_ephemeris_features(start, cov_end, DORTMUND) + # Midnight/01:00 UTC: sun below horizon. Midday: well above. 
+ assert feats["solar_elevation"].iloc[1] < 0.0 + assert feats["solar_elevation"].iloc[12] > 0.0 + + def test_sunrise_relative_sign(self): + start, cov_end = _day("2024-06-21") + feats = get_ephemeris_features(start, cov_end, DORTMUND) + # Before sunrise hours_since_sunrise is negative; late day positive. + assert feats["hours_since_sunrise"].iloc[0] < 0.0 + assert feats["hours_since_sunrise"].iloc[-1] > 0.0 + # hours_to_sunset is positive early, negative after sunset. + assert feats["hours_to_sunset"].iloc[0] > 0.0 + assert feats["hours_to_sunset"].iloc[-1] < 0.0 + + def test_deterministic(self): + start, cov_end = _day("2024-03-10") + pd.testing.assert_frame_equal( + get_ephemeris_features(start, cov_end, DORTMUND), + get_ephemeris_features(start, cov_end, DORTMUND), + ) + + def test_empty_range(self): + start = pd.Timestamp("2024-06-21", tz="UTC") + feats = get_ephemeris_features(start, start - pd.Timedelta(hours=1), DORTMUND) + assert feats.shape == (0, 4) + assert feats.columns.tolist() == [ + "solar_elevation", + "daylight_duration_h", + "hours_since_sunrise", + "hours_to_sunset", + ] + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])