diff --git a/packages/gooddata-pandas/src/gooddata_pandas/arrow_convertor.py b/packages/gooddata-pandas/src/gooddata_pandas/arrow_convertor.py index 330d1f797..2192be2e0 100644 --- a/packages/gooddata-pandas/src/gooddata_pandas/arrow_convertor.py +++ b/packages/gooddata-pandas/src/gooddata_pandas/arrow_convertor.py @@ -1,10 +1,12 @@ # (C) 2026 GoodData Corporation from __future__ import annotations +import logging from typing import Callable import orjson import pandas +from gooddata_sdk.type_converter import AttributeConverterStore from gooddata_pandas.arrow_types import TypesMapper @@ -42,6 +44,63 @@ _REQUIRED_SCHEMA_KEYS = (_META_XTAB, _META_MODEL, _META_VIEW) +logger = logging.getLogger(__name__) + + +def read_model_labels(table: pa.Table) -> dict: + """Return the ``labels`` dict from the Arrow table's ``x-gdc-model-v1`` schema metadata. + + Returns an empty dict when the metadata key is absent so callers can use it + unconditionally without extra None-checks. + """ + if not table.schema.metadata or b"x-gdc-model-v1" not in table.schema.metadata: + return {} + return orjson.loads(table.schema.metadata[b"x-gdc-model-v1"]).get("labels", {}) + + +def _get_date_converter_for_label(label_id: str, model_labels: dict): + """Return a type Converter for date-granularity labels, or None for plain text attributes. + + Reads the ``granularity`` field from Arrow model metadata (``x-gdc-model-v1``) and + looks up the matching converter in ``AttributeConverterStore``. + + - ``DAY`` / ``MONTH`` / ``YEAR`` → ``DateConverter`` (→ ``pandas.Timestamp`` via external fn) + - ``WEEK`` / ``QUARTER`` → ``StringConverter`` (no-op) + - ``MINUTE`` / ``HOUR`` → ``DatetimeConverter`` + - No granularity (text attrs) → ``None`` (caller skips conversion) + """ + info = model_labels.get(label_id, {}) + granularity = info.get("granularity") + if not granularity: + return None + return AttributeConverterStore.find_converter("DATE", granularity.upper()) + + +def convert_label_values(label_id: str, values: list, model_labels: dict) -> list: + """Apply date-granularity type conversion to a list of attribute values from an Arrow column. + + Mirrors the non-Arrow execution path (``AttributeConverterStore`` in ``_typed_attribute_value``): + + - ``DAY`` / ``MONTH`` / ``YEAR`` granularity → ``pandas.Timestamp`` + - ``WEEK`` / ``QUARTER`` → ``str`` (unchanged) + - No granularity (text attributes) → values returned as the **same object** + + ``None`` values are passed through unchanged. + + Args: + label_id: Arrow column name / GoodData label local ID. + values: Raw values from ``table.column(label_id).to_pylist()``. + model_labels: The ``labels`` dict from ``x-gdc-model-v1`` schema metadata + (as returned by :func:`read_model_labels`). + + Returns: + Converted list, or the original *values* object when no conversion is needed. + """ + converter = _get_date_converter_for_label(label_id, model_labels) + if converter is None: + return values + return [converter.to_external_type(v) if v is not None else None for v in values] + def build_metric_field_index(table: pa.Table) -> dict[int, str]: """Return {metric_dimension_index: arrow_field_name} from the table schema. @@ -77,9 +136,14 @@ def _parse_schema_metadata(table: pa.Table) -> dict: raise ValueError( "Arrow table has no schema metadata. Expected GoodData metadata keys: " + ", ".join(_REQUIRED_SCHEMA_KEYS) ) - schema_meta = { - k.decode(): orjson.loads(v) for k, v in table.schema.metadata.items() if k.decode() in _REQUIRED_SCHEMA_KEYS - } + schema_meta = {} + for _k, _v in table.schema.metadata.items(): + try: + _k_str = _k.decode() + except UnicodeDecodeError: + continue + if _k_str in _REQUIRED_SCHEMA_KEYS: + schema_meta[_k_str] = orjson.loads(_v) missing = [k for k in _REQUIRED_SCHEMA_KEYS if k not in schema_meta] if missing: raise ValueError( @@ -186,10 +250,15 @@ def _build_inline_index( totals_meta = xtab_meta.get("totalsMetadata", {}) total_ref_vals: list = [None] * table.num_rows if totals_meta: - for field in table.schema: - if field.name.startswith(_COL_TOTAL_REF_PREFIX): - total_ref_vals = table.column(field.name).to_pylist() - break + total_ref_cols = [f.name for f in table.schema if f.name.startswith(_COL_TOTAL_REF_PREFIX)] + if total_ref_cols: + if len(total_ref_cols) > 1: + logger.warning( + "Arrow table has %d __total_ref* columns; only %r is used for aggregation names.", + len(total_ref_cols), + total_ref_cols[0], + ) + total_ref_vals = table.column(total_ref_cols[0]).to_pylist() # Precompute per-row aggregation name and kept-label set for total rows. agg_for_row: list[str | None] = [None] * table.num_rows @@ -212,16 +281,17 @@ def _build_inline_index( values = table.column(lid).to_pylist() processed = [] for i, v in enumerate(values): - if row_types[i] != 0 and isinstance(v, str): - if ref in kept_labels_for_row[i]: - # Outer label kept as real attribute value in a subtotal row. - processed.append(v) - elif v == "": - # Aggregated level left empty by the server — fill with agg name. - processed.append(agg_for_row[i] if agg_for_row[i] else v) + if row_types[i] != 0: + if isinstance(v, str): + if ref in kept_labels_for_row[i]: + processed.append(v) + elif v == "": + processed.append(agg_for_row[i] if agg_for_row[i] else v) + else: + processed.append(v.upper()) else: - # Aggregation function marker (e.g. 'sum') — uppercase it. - processed.append(v.upper()) + # Non-string value in a total row — replace with the aggregation name when available. + processed.append(agg_for_row[i] if agg_for_row[i] is not None else v) else: processed.append(v) arrays.append(processed) @@ -410,6 +480,11 @@ def _label_ids_in_dim(dim: dict) -> set: (dim for dim in execution_dims if col_ref_label_ids <= _label_ids_in_dim(dim)), {}, ) + if not col_dim and execution_dims: + logger.warning( + "No execution dimension contains column label IDs %s; column_totals_indexes will be empty.", + col_ref_label_ids, + ) else: col_dim = next( (dim for dim in execution_dims if any("measureGroupHeaders" in h for h in dim.get("headers", []))), @@ -486,6 +561,11 @@ def _label_ids_in_dim(dim: dict) -> set: (dim for dim in execution_dims if ref_label_ids <= _label_ids_in_dim(dim)), {}, ) + if not row_dim and execution_dims: + logger.warning( + "No execution dimension contains row label IDs %s; row_totals_indexes will be empty.", + ref_label_ids, + ) else: # Metrics-only: the dimension containing measureGroupHeaders is the output-row dim. row_dim = next( diff --git a/packages/gooddata-pandas/src/gooddata_pandas/arrow_types.py b/packages/gooddata-pandas/src/gooddata_pandas/arrow_types.py index 997437e4f..683e1b3fa 100644 --- a/packages/gooddata-pandas/src/gooddata_pandas/arrow_types.py +++ b/packages/gooddata-pandas/src/gooddata_pandas/arrow_types.py @@ -47,8 +47,12 @@ class ArrowConfig: custom_mapping is not provided. custom_mapping: Arrow type → pandas dtype mapping dict. Only used when types_mapper=TypesMapper.CUSTOM, ignored otherwise. + max_bytes: Optional byte-size limit for the Arrow response body. When set, + ``read_result_arrow`` raises ``ResultSizeBytesLimitExceeded`` if the + raw IPC payload exceeds this value before parsing begins. """ self_destruct: bool = False types_mapper: TypesMapper = TypesMapper.DEFAULT custom_mapping: dict | None = field(default=None) + max_bytes: int | None = field(default=None) diff --git a/packages/gooddata-pandas/src/gooddata_pandas/data_access.py b/packages/gooddata-pandas/src/gooddata_pandas/data_access.py index 844ca1ba8..2ca5b94c8 100644 --- a/packages/gooddata-pandas/src/gooddata_pandas/data_access.py +++ b/packages/gooddata-pandas/src/gooddata_pandas/data_access.py @@ -18,11 +18,6 @@ ) from gooddata_sdk.utils import IdObjType -try: - from gooddata_pandas.arrow_convertor import build_metric_field_index -except ImportError: - pass # Only needed when use_arrow=True; callers guard with _ARROW_AVAILABLE checks - from gooddata_pandas.utils import ( ColumnsDef, IndexDef, @@ -34,6 +29,12 @@ get_catalog_attributes_for_extract, ) +_ARROW_IMPORT_ERROR: ImportError | None = None +try: + from gooddata_pandas.arrow_convertor import build_metric_field_index, convert_label_values, read_model_labels +except ImportError as _e: # pragma: no cover + _ARROW_IMPORT_ERROR = _e # pragma: no cover + class ExecutionDefinitionBuilder: _DEFAULT_INDEX_NAME: str = "0" @@ -429,20 +430,30 @@ def _extract_from_arrow( col_to_attr_idx: dict[str, int], col_to_metric_idx: dict[str, int], index_to_attr_idx: dict[str, int], + max_bytes: int | None = None, ) -> tuple[dict, dict]: """ Arrow-path extraction for indexed() / not_indexed(). Reads the full result in one shot via the binary endpoint, then slices columns - by Arrow field name (metrics) or label id (attributes). No catalog fetch needed. + by Arrow field name (metrics) or label id (attributes). + + Date-granularity attribute columns (year/month/day) are converted to + ``pandas.Timestamp`` to match the behaviour of the non-Arrow path. + Week and quarter values remain as strings (same as non-Arrow). """ - table = execution.bare_exec_response.read_result_arrow() + if _ARROW_IMPORT_ERROR is not None: + raise ImportError( + "pyarrow is required for Arrow support. Install it with: pip install gooddata-pandas[arrow]" + ) from _ARROW_IMPORT_ERROR + table = execution.bare_exec_response.read_result_arrow(max_bytes=max_bytes) exec_def = execution.exec_def if table.num_rows == 0: return {col: [] for col in cols}, {idx: [] for idx in index_to_attr_idx} metric_dim_idx_to_field = build_metric_field_index(table) + model_labels = read_model_labels(table) data: dict[str, list] = {} for col in cols: @@ -451,12 +462,14 @@ def _extract_from_arrow( data[col] = table.column(field_name).to_pylist() else: attr = exec_def.attributes[col_to_attr_idx[col]] - data[col] = table.column(attr.label.id).to_pylist() + label_id = attr.label.id + data[col] = convert_label_values(label_id, table.column(label_id).to_pylist(), model_labels) index: dict[str, list] = {} for idx_name, attr_idx in index_to_attr_idx.items(): attr = exec_def.attributes[attr_idx] - index[idx_name] = table.column(attr.label.id).to_pylist() + label_id = attr.label.id + index[idx_name] = convert_label_values(label_id, table.column(label_id).to_pylist(), model_labels) return data, index @@ -471,6 +484,7 @@ def compute_and_extract( is_cancellable: bool = False, result_page_len: int | None = None, use_arrow: bool = False, + max_bytes: int | None = None, ) -> tuple[dict, dict]: """ Convenience function that computes and extracts data from the execution response. @@ -489,6 +503,8 @@ def compute_and_extract( Defaults to 1000. Larger values can improve performance for large result sets. use_arrow (bool, optional): When True, fetches the result via the Arrow IPC binary endpoint in one shot instead of paginating through JSON. Requires pyarrow. + max_bytes (Optional[int]): Maximum response body size in bytes for the Arrow path. + Raises ResultSizeBytesLimitExceeded when exceeded. Ignored when use_arrow=False. Returns: tuple: A tuple containing the following dictionaries: @@ -522,6 +538,7 @@ def compute_and_extract( col_to_attr_idx, col_to_metric_idx, index_to_attr_idx, + max_bytes=max_bytes, ) elif not exec_def.has_attributes(): return _extract_for_metrics_only(execution, cols, col_to_metric_idx), dict() diff --git a/packages/gooddata-pandas/src/gooddata_pandas/dataframe.py b/packages/gooddata-pandas/src/gooddata_pandas/dataframe.py index 1539a16d3..6d48f4ed0 100644 --- a/packages/gooddata-pandas/src/gooddata_pandas/dataframe.py +++ b/packages/gooddata-pandas/src/gooddata_pandas/dataframe.py @@ -159,6 +159,7 @@ def indexed( is_cancellable=is_cancellable, result_page_len=result_page_len, use_arrow=use_arrow, + max_bytes=self._arrow_config.max_bytes if use_arrow else None, ) _idx = make_pandas_index(index) @@ -210,6 +211,7 @@ def not_indexed( is_cancellable=is_cancellable, result_page_len=result_page_len, use_arrow=use_arrow, + max_bytes=self._arrow_config.max_bytes if use_arrow else None, ) return pandas.DataFrame(data=data) @@ -539,7 +541,7 @@ def for_exec_def_arrow( on_execution_submitted(execution) exec_response = execution.bare_exec_response - table = exec_response.read_result_arrow() + table = exec_response.read_result_arrow(max_bytes=self._arrow_config.max_bytes) return self._table_to_df_and_metadata(table, exec_response, label_overrides, grand_totals_position) def for_arrow_table( @@ -684,7 +686,7 @@ def for_exec_result_id( result_cache_metadata.execution_response, _check_type=False ), ) - table = exec_response.read_result_arrow() + table = exec_response.read_result_arrow(max_bytes=self._arrow_config.max_bytes) return self._table_to_df_and_metadata(table, exec_response, label_overrides, grand_totals_position) return convert_execution_response_to_dataframe( diff --git a/packages/gooddata-pandas/tests/dataframe/fixtures/arrow/manifest.json b/packages/gooddata-pandas/tests/dataframe/fixtures/arrow/manifest.json index d81fa4763..bd996f8ef 100644 --- a/packages/gooddata-pandas/tests/dataframe/fixtures/arrow/manifest.json +++ b/packages/gooddata-pandas/tests/dataframe/fixtures/arrow/manifest.json @@ -1,7 +1,7 @@ [ { "name": "flat_attrs_and_metrics", - "description": "Single dim with attributes and measureGroup \u2014 flat table", + "description": "Single dim with attributes and measureGroup — flat table", "shape": [ 96, 1 @@ -11,7 +11,7 @@ }, { "name": "two_dim_metrics_in_rows", - "description": "measureGroup in dim0 \u2192 each metric is a row; attributes fan out as columns", + "description": "measureGroup in dim0 → each metric is a row; attributes fan out as columns", "shape": [ 2, 17 @@ -21,7 +21,7 @@ }, { "name": "two_dim_metrics_in_cols", - "description": "Attributes in dim0 (rows), measureGroup in dim1 (columns) \u2014 most common layout", + "description": "Attributes in dim0 (rows), measureGroup in dim1 (columns) — most common layout", "shape": [ 48, 8 @@ -31,7 +31,7 @@ }, { "name": "wide_few_rows_many_cols", - "description": "Single attribute in dim0, multiple attributes + measureGroup in dim1 \u2192 wide DataFrame", + "description": "Single attribute in dim0, multiple attributes + measureGroup in dim1 → wide DataFrame", "shape": [ 4, 96 @@ -41,7 +41,7 @@ }, { "name": "metrics_only", - "description": "No attributes \u2014 single row of aggregate metric values", + "description": "No attributes — single row of aggregate metric values", "shape": [ 2, 1 @@ -51,7 +51,7 @@ }, { "name": "single_metric_many_rows", - "description": "Three attributes + one metric \u2014 tall narrow table", + "description": "Three attributes + one metric — tall narrow table", "shape": [ 182, 1 @@ -81,7 +81,7 @@ }, { "name": "totals_grand_row_two_col_labels", - "description": "Grand total rows; column dim has two labels \u2014 tests index padding", + "description": "Grand total rows; column dim has two labels — tests index padding", "shape": [ 96, 18 @@ -91,7 +91,7 @@ }, { "name": "totals_grand_col_two_row_labels", - "description": "Grand total columns; row dim has two labels \u2014 tests column index padding", + "description": "Grand total columns; row dim has two labels — tests column index padding", "shape": [ 18, 96 @@ -131,7 +131,7 @@ }, { "name": "dim_r_m", - "description": "dim0=[region], dim1=[measureGroup] \u2014 1 row attr, metrics in cols", + "description": "dim0=[region], dim1=[measureGroup] — 1 row attr, metrics in cols", "shape": [ 5, 2 @@ -161,7 +161,7 @@ }, { "name": "dim_m_c", - "description": "dim0=[measureGroup], dim1=[products.category] \u2014 metrics in rows, 1 col attr", + "description": "dim0=[measureGroup], dim1=[products.category] — metrics in rows, 1 col attr", "shape": [ 2, 4 @@ -251,7 +251,7 @@ }, { "name": "tot_d0_sub", - "description": "Base A: subtotal per region (rolls up category) \u2192 extra rows in dim0", + "description": "Base A: subtotal per region (rolls up category) → extra rows in dim0", "shape": [ 44, 4 @@ -261,7 +261,7 @@ }, { "name": "tot_d0_grand", - "description": "Base A: grand total of dim0 (all items) \u2192 extra column in dim1", + "description": "Base A: grand total of dim0 (all items) → extra column in dim1", "shape": [ 34, 4 @@ -281,7 +281,7 @@ }, { "name": "tot_d1_sub", - "description": "Base B: subtotal per order_status (rolls up date.year) \u2192 extra columns in dim1", + "description": "Base B: subtotal per order_status (rolls up date.year) → extra columns in dim1", "shape": [ 18, 36 @@ -291,7 +291,7 @@ }, { "name": "tot_d1_grand", - "description": "Base B: grand total of dim1 (all items) \u2192 extra row in dim0", + "description": "Base B: grand total of dim1 (all items) → extra row in dim0", "shape": [ 18, 30 @@ -321,7 +321,7 @@ }, { "name": "tot_d0sub_d1grand", - "description": "Base C: row subtotals per region + grand total of dim1 \u2192 extra col rows + SUM row", + "description": "Base C: row subtotals per region + grand total of dim1 → extra col rows + SUM row", "shape": [ 44, 16 @@ -358,5 +358,75 @@ ], "result_id": "b592a3f77e55d5e3d4450891c0d5c1fbedcac173:f9ed5a4e4b8cf21716cc409cd87664ef7da3444d4f07c4864989916d420bf85f", "dir": "totals_both_dims" + }, + { + "name": "date_year_in_rows", + "description": "Date attribute (YEAR granularity) in row dimension — Arrow path date→Timestamp parity", + "shape": [ + 3, + 1 + ], + "result_id": "0000000000000000000000000000000000000000:date_year_in_rows", + "dir": "date_year_in_rows" + }, + { + "name": "date_month_in_rows", + "description": "Date attribute (MONTH granularity) in row dimension — Arrow path date→Timestamp parity", + "shape": [ + 3, + 1 + ], + "result_id": "0000000000000000000000000000000000000000:date_month_in_rows", + "dir": "date_month_in_rows" + }, + { + "name": "date_day_in_rows", + "description": "Date attribute (DAY granularity) in row dimension — Arrow path date→Timestamp parity", + "shape": [ + 2, + 1 + ], + "result_id": "0000000000000000000000000000000000000000:date_day_in_rows", + "dir": "date_day_in_rows" + }, + { + "name": "date_week_in_rows", + "description": "Date attribute (WEEK granularity) in row dimension — stays string, no Timestamp conversion", + "shape": [ + 2, + 1 + ], + "result_id": "0000000000000000000000000000000000000000:date_week_in_rows", + "dir": "date_week_in_rows" + }, + { + "name": "date_quarter_in_rows", + "description": "Date attribute (QUARTER granularity) in row dimension — stays string, no Timestamp conversion", + "shape": [ + 2, + 1 + ], + "result_id": "0000000000000000000000000000000000000000:date_quarter_in_rows", + "dir": "date_quarter_in_rows" + }, + { + "name": "empty_two_dim_attrs_metrics", + "description": "Standard two-dim layout (attr rows, metric cols), 0 data rows", + "shape": [ + 0, + 2 + ], + "result_id": "0000000000000000000000000000000000000000:empty_two_dim_attrs_metrics", + "dir": "empty_two_dim_attrs_metrics" + }, + { + "name": "empty_flat_attrs_metrics", + "description": "Single-dim (flat) layout, attr and metrics co-dim, 0 data rows", + "shape": [ + 0, + 2 + ], + "result_id": "0000000000000000000000000000000000000000:empty_flat_attrs_metrics", + "dir": "empty_flat_attrs_metrics" } ] diff --git a/packages/gooddata-pandas/tests/dataframe/test_dataframe_for_exec_def_arrow.py b/packages/gooddata-pandas/tests/dataframe/test_dataframe_for_exec_def_arrow.py index cd0e20511..02d9ec14e 100644 --- a/packages/gooddata-pandas/tests/dataframe/test_dataframe_for_exec_def_arrow.py +++ b/packages/gooddata-pandas/tests/dataframe/test_dataframe_for_exec_def_arrow.py @@ -2,11 +2,14 @@ from __future__ import annotations import ast +import io import json +import logging import re from pathlib import Path from unittest.mock import MagicMock, patch +import gooddata_pandas.data_access as _da import numpy import pandas import pyarrow as pa @@ -16,15 +19,20 @@ _build_inline_index, _compute_primary_labels_from_fields, _compute_primary_labels_from_inline, + _metric_title, compute_column_totals_indexes, compute_primary_labels, compute_row_totals_indexes, convert_arrow_table_to_dataframe, + convert_label_values, + read_model_labels, reorder_grand_totals, ) from gooddata_pandas.arrow_types import ArrowConfig, TypesMapper from gooddata_pandas.data_access import ExecutionDefinitionBuilder from gooddata_pandas.dataframe import DataFrameFactory +from gooddata_sdk import ExecutionDefinition +from gooddata_sdk.compute.model.execution import BareExecutionResponse, ResultSizeBytesLimitExceeded from pyarrow import ipc _ARROW_FIXTURES = Path(__file__).parent / "fixtures" / "arrow" @@ -1180,8 +1188,6 @@ def test_for_exec_def_arrow_label_overrides_none_equivalent_to_empty() -> None: mock_sdk.compute.for_exec_def.return_value = mock_exec gdf = DataFrameFactory(mock_sdk, "workspace") - from gooddata_sdk import ExecutionDefinition - exec_def = MagicMock(spec=ExecutionDefinition) df_none, _ = gdf.for_exec_def_arrow(exec_def, label_overrides=None) df_empty, _ = gdf.for_exec_def_arrow(exec_def, label_overrides={}) @@ -1212,8 +1218,6 @@ def test_compute_row_totals_indexes_tolerates_field_without_metadata() -> None: table, _, meta = _load_case("tot_d0_sub") # Strip metadata from one grand_total field to simulate the defensive path. - import pyarrow as _pa - fields = [] for i in range(len(table.schema)): f = table.schema.field(i) @@ -1223,12 +1227,10 @@ def test_compute_row_totals_indexes_tolerates_field_without_metadata() -> None: fields.append(f.with_metadata(None)) else: fields.append(f) - new_schema = _pa.schema(fields, metadata=table.schema.metadata) + new_schema = pa.schema(fields, metadata=table.schema.metadata) table_no_meta = table.cast(new_schema) # Should not raise; total indexes for the stripped field will be empty. - from gooddata_pandas.arrow_convertor import compute_row_totals_indexes - result = compute_row_totals_indexes(table_no_meta, meta["dimensions"]) assert isinstance(result, list) @@ -1334,8 +1336,6 @@ def test_build_field_index_label_values_overflow() -> None: def test_metric_title_out_of_range_raises() -> None: """_metric_title raises ValueError when metric_idx exceeds requestedShape.metrics length.""" - from gooddata_pandas.arrow_convertor import _metric_title - model_meta = {"requestedShape": {"metrics": ["price"]}, "metrics": {}} with pytest.raises(ValueError, match="out of range"): _metric_title(5, model_meta, {}) @@ -1390,3 +1390,571 @@ def test_for_exec_result_id_arrow_types_mapper(tmp_path: Path) -> None: # Shape must be identical; string columns differ in dtype only. assert df_default.shape == df_arrow_strings.shape + + +# --------------------------------------------------------------------------- +# convert_label_values — date granularity type conversion +# +# Mirrors the non-Arrow path (AttributeConverterStore in _typed_attribute_value): +# DAY / MONTH / YEAR → pandas.Timestamp +# WEEK / QUARTER → str (unchanged) +# no granularity → values unchanged (same object returned) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "granularity, raw, expected", + [ + ("year", ["2023", "2024"], [pandas.Timestamp("2023-01-01"), pandas.Timestamp("2024-01-01")]), + ("month", ["2023-01", "2023-03"], [pandas.Timestamp("2023-01-01"), pandas.Timestamp("2023-03-01")]), + ("day", ["2023-01-15", "2024-06-30"], [pandas.Timestamp("2023-01-15"), pandas.Timestamp("2024-06-30")]), + ("week", ["2025-1", "2025-49"], ["2025-1", "2025-49"]), + ("quarter", ["2025-1", "2025-4"], ["2025-1", "2025-4"]), + ], +) +def test_convert_label_values_date_granularities(granularity: str, raw: list, expected: list) -> None: + """convert_label_values converts DAY/MONTH/YEAR to Timestamp; WEEK/QUARTER stays string.""" + model_labels = {"date_attr": {"granularity": granularity}} + result = convert_label_values("date_attr", raw, model_labels) + assert result == expected + + +def test_convert_label_values_no_granularity_returns_same_object() -> None: + """Plain text attributes (no granularity) are returned as the same list object — zero copy.""" + model_labels = {"region": {"granularity": None, "labelType": "TEXT"}} + values = ["East", "West"] + assert convert_label_values("region", values, model_labels) is values + + +def test_convert_label_values_unknown_label_id_returns_same_object() -> None: + """Label ID absent from model_labels (no metadata) is treated as no-op.""" + values = ["foo", "bar"] + assert convert_label_values("unknown", values, {}) is values + + +def test_convert_label_values_none_passthrough() -> None: + """None values inside a date column are preserved as None (sparse rows).""" + model_labels = {"date.year": {"granularity": "year"}} + result = convert_label_values("date.year", ["2023", None, "2025"], model_labels) + assert result[0] == pandas.Timestamp("2023-01-01") + assert result[1] is None + assert result[2] == pandas.Timestamp("2025-01-01") + + +def test_convert_label_values_empty_list() -> None: + """Empty input list returns empty list without error.""" + model_labels = {"date.year": {"granularity": "year"}} + assert convert_label_values("date.year", [], model_labels) == [] + + +# --------------------------------------------------------------------------- +# read_model_labels — schema metadata parsing +# --------------------------------------------------------------------------- + + +def test_read_model_labels_returns_labels_dict() -> None: + """read_model_labels extracts the labels sub-dict from x-gdc-model-v1.""" + model_meta = {"labels": {"region": {"granularity": None}}, "metrics": {}} + schema_meta = {b"x-gdc-model-v1": json.dumps(model_meta).encode()} + table = pa.table({"col": [1]}).replace_schema_metadata(schema_meta) + assert read_model_labels(table) == {"region": {"granularity": None}} + + +def test_read_model_labels_missing_key_returns_empty_dict() -> None: + """read_model_labels returns {} when x-gdc-model-v1 is absent.""" + table = pa.table({"col": [1]}) + assert read_model_labels(table) == {} + + +def test_read_model_labels_no_labels_key_returns_empty_dict() -> None: + """read_model_labels returns {} when model metadata has no 'labels' key.""" + schema_meta = {b"x-gdc-model-v1": json.dumps({"metrics": {}}).encode()} + table = pa.table({"col": [1]}).replace_schema_metadata(schema_meta) + assert read_model_labels(table) == {} + + +# --------------------------------------------------------------------------- +# indexed() / not_indexed() with use_arrow=True — date granularity parity +# +# The non-Arrow path converts year/month/day → datetime via AttributeConverterStore. +# The Arrow path must produce identical types for the same columns/index. +# +# We build a minimal synthetic Arrow table with the required schema metadata +# (x-gdc-model-v1 with granularity field) and wire it into a mocked execution. +# --------------------------------------------------------------------------- + + +def _make_date_attr_table( + label_id: str, + granularity: str, + str_values: list, + metric_values: list[float] | None = None, +) -> pa.Table: + """Build a minimal Arrow table for a single date attribute + one revenue metric. + + The table has the schema metadata (x-gdc-model-v1, x-gdc-xtab-v1, x-gdc-view-v1) + that _extract_from_arrow reads to determine the label's granularity. + + Args: + label_id: GoodData label local ID (used as the Arrow column name). + granularity: Lowercase granularity string, e.g. ``"year"``, ``"month"``. + str_values: Raw string values for the date attribute column. + metric_values: Optional metric values; defaults to 0.0 per row. + """ + n = len(str_values) + if metric_values is None: + metric_values = [float(i) for i in range(n)] + + model_meta = { + "labels": { + label_id: { + "granularity": granularity, + "labelTitle": label_id.capitalize(), + "labelType": None, + "primaryLabelId": label_id, + "attributeId": label_id, + } + }, + "requestedShape": {"metrics": ["revenue"]}, + "metrics": {"revenue": {"title": "Revenue"}}, + } + xtab_meta = { + "labelMetadata": {"l0": {"labelId": label_id, "primaryLabelId": label_id}}, + "computedShape": {"metrics": ["m0"], "rows": [], "cols": []}, + "totalsMetadata": {}, + } + view_meta = {"isTransposed": False} + schema_meta = { + b"x-gdc-model-v1": json.dumps(model_meta).encode(), + b"x-gdc-xtab-v1": json.dumps(xtab_meta).encode(), + b"x-gdc-view-v1": json.dumps(view_meta).encode(), + } + gdc_metric = {b"gdc": json.dumps({"type": "metric", "index": 0}).encode()} + schema = pa.schema( + [ + pa.field("__row_type", pa.int8()), + pa.field(label_id, pa.string()), + pa.field("metric_group_0", pa.float64(), metadata=gdc_metric), + ], + metadata=schema_meta, + ) + return pa.table( + { + "__row_type": pa.array([0] * n, type=pa.int8()), + label_id: pa.array(str_values, type=pa.string()), + "metric_group_0": pa.array(metric_values, type=pa.float64()), + }, + schema=schema, + ) + + +@pytest.mark.parametrize( + "case_name, index_by, columns, expected_timestamps", + [ + ( + "date_year_in_rows", + {"date": "label/date.year"}, + {"revenue": "metric/revenue"}, + [pandas.Timestamp("2023-01-01"), pandas.Timestamp("2024-01-01"), pandas.Timestamp("2025-01-01")], + ), + ( + "date_month_in_rows", + {"date": "label/date.month"}, + {"revenue": "metric/revenue"}, + [pandas.Timestamp("2023-01-01"), pandas.Timestamp("2023-06-01"), pandas.Timestamp("2023-12-01")], + ), + ( + "date_day_in_rows", + {"date": "label/date.day"}, + {"revenue": "metric/revenue"}, + [pandas.Timestamp("2023-01-15"), pandas.Timestamp("2023-06-30")], + ), + ], +) +def test_indexed_use_arrow_date_to_timestamp( + case_name: str, + index_by: dict, + columns: dict, + expected_timestamps: list, +) -> None: + """indexed() with use_arrow=True converts DAY/MONTH/YEAR date attributes to Timestamp. + + Matches the non-Arrow path behaviour where AttributeConverterStore applies + DateConverter → pandas.to_datetime for these granularities. + """ + if case_name not in _cases(): + pytest.skip(f"fixture {case_name!r} not available") + table, _, _ = _load_case(case_name) + mock_sdk, _, _ = _mock_execution(table, columns, index_by) + + gdf = DataFrameFactory(mock_sdk, "workspace", use_arrow=True) + df = gdf.indexed(index_by=index_by, columns=columns) + + assert df.index.tolist() == expected_timestamps + + +@pytest.mark.parametrize( + "case_name, index_by, columns, expected_strings", + [ + ( + "date_week_in_rows", + {"date": "label/date.week"}, + {"revenue": "metric/revenue"}, + ["2025-1", "2025-49"], + ), + ( + "date_quarter_in_rows", + {"date": "label/date.quarter"}, + {"revenue": "metric/revenue"}, + ["2025-1", "2025-4"], + ), + ], +) +def test_indexed_use_arrow_week_quarter_stays_string( + case_name: str, + index_by: dict, + columns: dict, + expected_strings: list, +) -> None: + """indexed() with use_arrow=True: WEEK and QUARTER values remain as strings. + + Matches the non-Arrow path where StringConverter is registered for these granularities. + """ + if case_name not in _cases(): + pytest.skip(f"fixture {case_name!r} not available") + table, _, _ = _load_case(case_name) + mock_sdk, _, _ = _mock_execution(table, columns, index_by) + + gdf = DataFrameFactory(mock_sdk, "workspace", use_arrow=True) + df = gdf.indexed(index_by=index_by, columns=columns) + + assert df.index.tolist() == expected_strings + + +def test_not_indexed_use_arrow_year_column_to_timestamp() -> None: + """not_indexed() with use_arrow=True converts a year-granularity column to Timestamp.""" + if "date_year_in_rows" not in _cases(): + pytest.skip("fixture date_year_in_rows not available") + table, _, _ = _load_case("date_year_in_rows") + columns = {"year_col": "label/date.year", "revenue": "metric/revenue"} + mock_sdk, _, _ = _mock_execution(table, columns) + + gdf = DataFrameFactory(mock_sdk, "workspace", use_arrow=True) + df = gdf.not_indexed(columns=columns) + + assert df["year_col"].tolist() == [ + pandas.Timestamp("2023-01-01"), + pandas.Timestamp("2024-01-01"), + pandas.Timestamp("2025-01-01"), + ] + + +def test_indexed_use_arrow_date_none_values_preserved() -> None: + """indexed() with use_arrow=True preserves None (null) entries in date columns.""" + table = _make_date_attr_table("date.year", "year", ["2023", None, "2025"], metric_values=[1.0, 2.0, 3.0]) + columns = {"revenue": "metric/revenue"} + index_by = {"date": "label/date.year"} + mock_sdk, _, _ = _mock_execution(table, columns, index_by) + + gdf = DataFrameFactory(mock_sdk, "workspace", use_arrow=True) + df = gdf.indexed(index_by=index_by, columns=columns) + + idx = df.index.tolist() + assert idx[0] == pandas.Timestamp("2023-01-01") + assert idx[1] is None or pandas.isna(idx[1]) + assert idx[2] == pandas.Timestamp("2025-01-01") + + +def test_indexed_use_arrow_text_attr_unchanged() -> None: + """indexed() with use_arrow=True: text attributes (no granularity) stay as strings.""" + # Use a fixture that has a plain text attribute (region). + if "dim_r_m" not in _cases(): + pytest.skip("fixture dim_r_m not available") + table, _, _ = _load_case("dim_r_m") + columns = {"price": "metric/price", "order_amount": "metric/order_amount"} + index_by = {"region": "label/region"} + mock_sdk, _, _ = _mock_execution(table, columns, index_by) + + gdf = DataFrameFactory(mock_sdk, "workspace", use_arrow=True) + df = gdf.indexed(index_by=index_by, columns=columns) + + # All index values should be plain strings. + assert all(isinstance(v, str) for v in df.index.tolist()) + + +def test_indexed_use_arrow_empty_result_preserves_structure() -> None: + """indexed(use_arrow=True) on a zero-row Arrow table returns empty DataFrame with correct names.""" + model_meta = { + "labels": {"region": {"granularity": None, "labelTitle": "Region", "primaryLabelId": "region"}}, + "requestedShape": {"metrics": ["revenue"]}, + "metrics": {"revenue": {"title": "Revenue"}}, + } + xtab_meta = { + "labelMetadata": {"l0": {"labelId": "region", "primaryLabelId": "region"}}, + "computedShape": {"rows": [], "cols": [], "metrics": ["m0"]}, + "totalsMetadata": {}, + } + schema_meta = { + b"x-gdc-model-v1": json.dumps(model_meta).encode(), + b"x-gdc-xtab-v1": json.dumps(xtab_meta).encode(), + b"x-gdc-view-v1": json.dumps({"isTransposed": False}).encode(), + } + gdc_metric = {b"gdc": json.dumps({"type": "metric", "index": 0}).encode()} + schema = pa.schema( + [ + pa.field("__row_type", pa.int8()), + pa.field("region", pa.string()), + pa.field("metric_group_0", pa.float64(), metadata=gdc_metric), + ], + metadata=schema_meta, + ) + empty_table = pa.table( + { + "__row_type": pa.array([], type=pa.int8()), + "region": pa.array([], type=pa.string()), + "metric_group_0": pa.array([], type=pa.float64()), + }, + schema=schema, + ) + + columns = {"revenue": "metric/revenue"} + index_by = {"reg": "label/region"} + mock_sdk, _, _ = _mock_execution(empty_table, columns, index_by) + + gdf = DataFrameFactory(mock_sdk, "workspace", use_arrow=True) + df = gdf.indexed(index_by=index_by, columns=columns) + + assert len(df) == 0 + assert list(df.columns) == ["revenue"] + assert df.index.name == "reg" + + +def test_extract_from_arrow_without_pyarrow_raises_import_error() -> None: + """_extract_from_arrow raises ImportError (not NameError) when pyarrow is unavailable.""" + original = _da._ARROW_IMPORT_ERROR + try: + _da._ARROW_IMPORT_ERROR = ImportError("pyarrow not installed") + with pytest.raises(ImportError, match="pyarrow"): + _da._extract_from_arrow(MagicMock(), [], {}, {}, {}) + finally: + _da._ARROW_IMPORT_ERROR = original + + +def test_parse_schema_metadata_non_utf8_key_is_skipped() -> None: + """_parse_schema_metadata skips non-UTF-8 byte keys without raising UnicodeDecodeError.""" + if "dim_r_m" not in _cases(): + pytest.skip("fixture dim_r_m not available") + table, _, _ = _load_case("dim_r_m") + non_utf8_meta = {b"\xff\xfe": b"some value", **table.schema.metadata} + table_with_bad_key = table.replace_schema_metadata(non_utf8_meta) + # Should not raise despite the non-UTF-8 key. + df = convert_arrow_table_to_dataframe(table_with_bad_key) + assert df is not None + + +def test_build_inline_index_total_row_numeric_label_uses_agg_name() -> None: + """Total rows with non-string (numeric/null) label values are replaced with the aggregation name.""" + table = pa.table( + { + "__row_type": pa.array([0, 2], type=pa.int8()), + "year": pa.array([2023.0, None], type=pa.float64()), + # Data row has no total ref; total row refers to "t0" in totalsMetadata. + "__total_ref": pa.array([None, [0]], type=pa.list_(pa.int32())), + } + ) + xtab_meta = { + "labelMetadata": {"l0": {"labelId": "year", "primaryLabelId": "year"}}, + "totalsMetadata": {"t0": {"aggregation": "sum", "rowLabels": []}}, + } + model_meta = { + "labels": {"year": {"labelTitle": "Year"}}, + "requestedShape": {"metrics": []}, + } + idx = _build_inline_index( + table, + row_label_refs=["l0"], + label_ref_to_id={"l0": "year"}, + model_meta=model_meta, + xtab_meta=xtab_meta, + ) + assert idx is not None + assert idx[0] == 2023.0 + assert idx[1] == "SUM" + + +def test_arrow_config_max_bytes_forwarded_to_read_result_arrow() -> None: + """ArrowConfig.max_bytes is passed through to read_result_arrow() by for_exec_def_arrow().""" + if "dim_r_m" not in _cases(): + pytest.skip("fixture dim_r_m not available") + table, _, meta = _load_case("dim_r_m") + + mock_exec = MagicMock() + mock_exec.bare_exec_response.read_result_arrow.return_value = table + mock_exec.bare_exec_response.dimensions = meta["dimensions"] + mock_sdk = MagicMock() + mock_sdk.compute.for_exec_def.return_value = mock_exec + + gdf = DataFrameFactory(mock_sdk, "workspace", arrow_config=ArrowConfig(max_bytes=10_000_000)) + gdf.for_exec_def_arrow(MagicMock(spec=ExecutionDefinition)) + + mock_exec.bare_exec_response.read_result_arrow.assert_called_once_with(max_bytes=10_000_000) + + +def test_arrow_config_max_bytes_raises_when_exceeded() -> None: + """ResultSizeBytesLimitExceeded from read_result_arrow propagates out of for_exec_def_arrow().""" + mock_exec = MagicMock() + mock_exec.bare_exec_response.read_result_arrow.side_effect = ResultSizeBytesLimitExceeded(100, 200) + mock_sdk = MagicMock() + mock_sdk.compute.for_exec_def.return_value = mock_exec + + gdf = DataFrameFactory(mock_sdk, "workspace", arrow_config=ArrowConfig(max_bytes=100)) + with pytest.raises(ResultSizeBytesLimitExceeded): + gdf.for_exec_def_arrow(MagicMock(spec=ExecutionDefinition)) + + +def test_build_inline_index_multiple_total_ref_columns_warns(caplog: pytest.LogCaptureFixture) -> None: + """_build_inline_index warns when the Arrow table has more than one __total_ref* column.""" + table = pa.table( + { + "__row_type": pa.array([0, 2], type=pa.int8()), + "region": pa.array(["East", "sum"], type=pa.string()), + "__total_ref": pa.array([None, [0]], type=pa.list_(pa.int32())), + "__total_ref_x": pa.array([None, [0]], type=pa.list_(pa.int32())), + } + ) + xtab_meta = { + "labelMetadata": {"l0": {"labelId": "region", "primaryLabelId": "region"}}, + "totalsMetadata": {"t0": {"aggregation": "sum", "rowLabels": []}}, + } + model_meta = {"labels": {"region": {"labelTitle": "Region"}}, "requestedShape": {"metrics": []}} + + with caplog.at_level(logging.WARNING, logger="gooddata_pandas.arrow_convertor"): + idx = _build_inline_index(table, ["l0"], {"l0": "region"}, model_meta, xtab_meta) + + assert idx is not None + assert idx[1] == "SUM" + assert any("__total_ref" in msg for msg in caplog.messages) + + +def test_compute_row_totals_indexes_no_matching_dim_warns(caplog: pytest.LogCaptureFixture) -> None: + """compute_row_totals_indexes warns when execution_dims is non-empty but no dim contains the row labels.""" + if "dim_r_m" not in _cases(): + pytest.skip("fixture dim_r_m not available") + table, _, _ = _load_case("dim_r_m") + bogus_dims = [{"headers": [{"measureGroupHeaders": [{"localIdentifier": "price"}]}]}] + + with caplog.at_level(logging.WARNING, logger="gooddata_pandas.arrow_convertor"): + result = compute_row_totals_indexes(table, bogus_dims) + + assert result == [] + assert any("row label IDs" in msg for msg in caplog.messages) + + +def test_read_result_arrow_max_bytes_raises_when_exceeded() -> None: + """read_result_arrow raises ResultSizeBytesLimitExceeded when the payload exceeds max_bytes.""" + # Build a valid Arrow IPC payload to pass through open_stream. + tiny_table = pa.table({"x": pa.array([1, 2, 3], type=pa.int32())}) + buf = io.BytesIO() + with ipc.new_stream(buf, tiny_table.schema) as writer: + writer.write_table(tiny_table) + payload = buf.getvalue() + + mock_response = MagicMock() + mock_response.read.return_value = payload + + mock_api_client = MagicMock() + mock_api_client.call_api.return_value = mock_response + + bare = object.__new__(BareExecutionResponse) + bare._actions_api = MagicMock() + bare._actions_api.api_client = mock_api_client + bare._workspace_id = "ws" + bare._exec_response = {"links": {"executionResult": "result-id"}} + bare._cancel_token = None + + # Below limit: succeeds and returns the table. + result = bare.read_result_arrow(max_bytes=len(payload) + 1000) + assert result.num_rows == 3 + + # At or below payload size: raises. + with pytest.raises(ResultSizeBytesLimitExceeded) as exc_info: + bare.read_result_arrow(max_bytes=1) + assert exc_info.value.result_size_bytes_limit == 1 + assert exc_info.value.actual_result_bytes_size == len(payload) + + +def test_indexed_use_arrow_mixed_date_and_text_index() -> None: + """indexed() with use_arrow=True: date attr → Timestamp, text attr → str in MultiIndex.""" + n = 3 + year_values = ["2023", "2024", "2025"] + region_values = ["East", "West", "South"] + + model_meta = { + "labels": { + "date.year": { + "granularity": "year", + "labelTitle": "Year", + "labelType": None, + "primaryLabelId": "date.year", + "attributeId": "date.year", + }, + "region": { + "granularity": None, + "labelTitle": "Region", + "labelType": "TEXT", + "primaryLabelId": "region", + "attributeId": "region", + }, + }, + "requestedShape": {"metrics": ["revenue"]}, + "metrics": {"revenue": {"title": "Revenue"}}, + } + xtab_meta = { + "labelMetadata": { + "l0": {"labelId": "date.year", "primaryLabelId": "date.year"}, + "l1": {"labelId": "region", "primaryLabelId": "region"}, + }, + "computedShape": {"metrics": ["m0"], "rows": [], "cols": []}, + "totalsMetadata": {}, + } + schema_meta = { + b"x-gdc-model-v1": json.dumps(model_meta).encode(), + b"x-gdc-xtab-v1": json.dumps(xtab_meta).encode(), + b"x-gdc-view-v1": json.dumps({"isTransposed": False}).encode(), + } + gdc_metric = {b"gdc": json.dumps({"type": "metric", "index": 0}).encode()} + schema = pa.schema( + [ + pa.field("__row_type", pa.int8()), + pa.field("date.year", pa.string()), + pa.field("region", pa.string()), + pa.field("metric_group_0", pa.float64(), metadata=gdc_metric), + ], + metadata=schema_meta, + ) + table = pa.table( + { + "__row_type": pa.array([0] * n, type=pa.int8()), + "date.year": pa.array(year_values, type=pa.string()), + "region": pa.array(region_values, type=pa.string()), + "metric_group_0": pa.array([10.0, 20.0, 30.0], type=pa.float64()), + }, + schema=schema, + ) + + columns = {"revenue": "metric/revenue"} + index_by = {"year": "label/date.year", "region": "label/region"} + mock_sdk, _, _ = _mock_execution(table, columns, index_by) + + gdf = DataFrameFactory(mock_sdk, "workspace", use_arrow=True) + df = gdf.indexed(index_by=index_by, columns=columns) + + assert isinstance(df.index, pandas.MultiIndex) + year_level = df.index.get_level_values("year").tolist() + region_level = df.index.get_level_values("region").tolist() + assert year_level == [ + pandas.Timestamp("2023-01-01"), + pandas.Timestamp("2024-01-01"), + pandas.Timestamp("2025-01-01"), + ] + assert region_level == ["East", "West", "South"] diff --git a/packages/gooddata-sdk/src/gooddata_sdk/compute/model/execution.py b/packages/gooddata-sdk/src/gooddata_sdk/compute/model/execution.py index d947bad60..df5284ec6 100644 --- a/packages/gooddata-sdk/src/gooddata_sdk/compute/model/execution.py +++ b/packages/gooddata-sdk/src/gooddata_sdk/compute/model/execution.py @@ -1,6 +1,7 @@ # (C) 2022 GoodData Corporation from __future__ import annotations +import io import logging from typing import TYPE_CHECKING, Any, Union @@ -382,12 +383,16 @@ def read_result( ) return ExecutionResult(execution_result) - def read_result_arrow(self) -> pyarrow.Table: + def read_result_arrow(self, max_bytes: int | None = None) -> pyarrow.Table: """ Reads the full execution result as a pyarrow Table. The binary endpoint returns the complete result in one shot (no paging). Requires pyarrow to be installed (pip install gooddata-sdk[arrow]). + + Args: + max_bytes: Optional byte-size limit. Raises ResultSizeBytesLimitExceeded when + the response body exceeds this value. """ if _ipc is None: raise ImportError( @@ -408,8 +413,21 @@ def read_result_arrow(self) -> pyarrow.Table: _return_http_data_only=True, ) try: + if max_bytes is not None: + # Buffer first so we can check size before parsing. + data = response.read() + if len(data) > max_bytes: + raise ResultSizeBytesLimitExceeded( + result_size_bytes_limit=max_bytes, + actual_result_bytes_size=len(data), + ) + return _ipc.open_stream(io.BytesIO(data)).read_all() return _ipc.open_stream(response).read_all() finally: + # Drain the HTTP chunked-encoding terminator (0\r\n\r\n) that pyarrow might + # leave unread after the IPC EOS marker, so the connection is fully + # consumed before being returned to the urllib3 pool. + response.read() response.release_conn() def cancel(self) -> None: