Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ jobs:
coverage report --format=markdown >> $GITHUB_STEP_SUMMARY

- name: Upload combined coverage to Codecov
uses: codecov/codecov-action@5a1091511ad55cbe89839c7260b706298ca349f7 # v5
uses: codecov/codecov-action@5975040f7f7d40edaff8d784b576fd65ae95c073 # v5.5.5
with:
token: ${{ secrets.CODECOV_TOKEN }}
slug: ${{ github.repository }}
Expand Down
18 changes: 13 additions & 5 deletions deepnote_toolkit/chart/deepnote_chart.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from deepnote_toolkit.chart.types import CHART_ROW_LIMIT, VEGA_5_MIME_TYPE, ChartError
from deepnote_toolkit.chart.utils import (
sanitize_dataframe_for_chart,
sanitize_polars_dataframe_for_chart,
serialize_values_list_for_json,
)
from deepnote_toolkit.logging import LoggerManager
Expand Down Expand Up @@ -141,11 +142,18 @@ def __init__(
if filtered_df.native_type == "pandas":
sanitized_pandas = sanitize_dataframe_for_chart(filtered_df.to_native())
oc_sanitized_df = oc.DataFrame.from_native(sanitized_pandas)
elif filtered_df.native_type in ("pyspark", "polars-eager"):
# We don't need to sanitize Spark DFs because they will processed by Spark itself and it can handle
# all data types by itself
# Polars is powered by Arrow, which is same format used internally by VegaFusion so there is no need
# to do any additional sanitization for it either
elif filtered_df.native_type == "polars-eager":
# Polars is Arrow-backed, so most types pass through to VegaFusion
# untouched. The exception is Object columns (e.g. uuid.UUID values),
# which convert to an Arrow type VegaFusion can't serialize, so they
# still need sanitizing.
sanitized_polars = sanitize_polars_dataframe_for_chart(
filtered_df.to_native()
)
oc_sanitized_df = oc.DataFrame.from_native(sanitized_polars)
elif filtered_df.native_type == "pyspark":
# Spark processes the data itself and handles all data types, so no
# sanitization is needed here.
oc_sanitized_df = filtered_df
else:
raise TypeError(
Expand Down
58 changes: 57 additions & 1 deletion deepnote_toolkit/chart/utils.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,50 @@
from typing import Any, List, Optional
import uuid
from typing import TYPE_CHECKING, Any, List, Optional

import pandas as pd

import deepnote_toolkit.ocelots as oc

if TYPE_CHECKING:
import polars as pl


def sanitize_dataframe_for_chart(pd_df: pd.DataFrame):
    """
    Build a chart-ready version of a pandas DataFrame.

    The work is done on a copy created with ``DataFrame.copy()``; every
    sanitization step below mutates that copy in place, and the copy is what
    gets returned.

    Steps, applied in this order:

    1. ``oc.pandas.utils.deduplicate_columns`` (defined outside this module;
       presumably makes duplicate column names unique -- confirm there).
    2. timedelta columns are converted to seconds.
    3. columns of ``uuid.UUID`` objects are converted to strings.
    4. column names are converted to strings.
    """
    chart_df = pd_df.copy()

    # Order matters: each step operates in place on the result of the
    # previous one.
    in_place_steps = (
        oc.pandas.utils.deduplicate_columns,
        _convert_timedelta_columns_to_seconds,
        _convert_uuid_columns_to_string,
        _convert_column_names_to_string,
    )
    for apply_step in in_place_steps:
        apply_step(chart_df)

    return chart_df


def sanitize_polars_dataframe_for_chart(pl_df: "pl.DataFrame") -> "pl.DataFrame":
    """
    Return a polars DataFrame whose ``Object`` columns have been stringified.

    Values polars has no native type for (``uuid.UUID`` instances, for
    example) end up in an ``Object`` column. Such a column converts to an
    opaque Arrow ``FixedSizeBinary`` that VegaFusion cannot serialize to JSON,
    and it is not meaningfully chartable in that form anyway. Each ``Object``
    column is therefore replaced by its ``str()`` representation, mirroring
    the UUID handling that :func:`sanitize_dataframe_for_chart` performs for
    pandas.

    When there are no ``Object`` columns the input frame itself is returned;
    otherwise a new frame is produced via ``with_columns``.
    """
    # Imported lazily: at module level polars is only imported under
    # TYPE_CHECKING.
    import polars as pl

    stringify_exprs = [
        pl.col(column_name).map_elements(str, return_dtype=pl.String)
        for column_name, column_dtype in pl_df.schema.items()
        if column_dtype == pl.Object
    ]

    if stringify_exprs:
        return pl_df.with_columns(stringify_exprs)
    return pl_df


def _convert_column_names_to_string(pd_df: pd.DataFrame):
"""
Converts dataframe column names to strings.
Expand All @@ -24,6 +54,32 @@ def _convert_column_names_to_string(pd_df: pd.DataFrame):
pd_df.columns = pd_df.columns.astype(str)


def _convert_uuid_columns_to_string(pd_df: pd.DataFrame):
    """
    Stringify object columns that hold ``uuid.UUID`` values.

    From pyarrow 24.0.0 onwards, Arrow conversion infers the canonical
    ``arrow.uuid`` extension type (backed by ``FixedSizeBinary(16)``) for
    object columns of ``uuid.UUID`` values, whereas pyarrow <= 23 produced a
    serializable result for the same data. VegaFusion's Arrow runtime cannot
    serialize ``FixedSizeBinary(16)`` to JSON (``Unsupported datatype for JSON
    serialization: FixedSizeBinary(16)``), so these columns are turned into
    strings to keep charting working across pyarrow versions.

    A column is treated as a UUID column when it has object dtype and its
    first non-null value is a ``uuid.UUID``. Within such a column only the
    ``uuid.UUID`` values are converted; every other value is kept as is.

    WARNING: This function modifies the DataFrame in-place.
    """

    def _uuid_to_str(value):
        # Leave anything that is not a UUID (nulls, stray values) untouched.
        if isinstance(value, uuid.UUID):
            return str(value)
        return value

    for column_name in pd_df.columns:
        series = pd_df[column_name]
        if pd.api.types.is_object_dtype(series.dtype):
            present_values = series.dropna()
            # Only the first non-null value is inspected to classify the column.
            if not present_values.empty and isinstance(
                present_values.iloc[0], uuid.UUID
            ):
                pd_df[column_name] = series.map(_uuid_to_str)


def _convert_timedelta_columns_to_seconds(pd_sanitized_df: pd.DataFrame):
"""
Converts timedelta columns to seconds.
Expand Down
Loading
Loading