Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/04_upgrading/upgrading_to_v4.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ Pickle could store arbitrary Python objects. JSON cannot, so the values in a req

- A `tuple` comes back as a `list`.
- Non-string `dict` keys come back as strings, so `{1: 'a'}` becomes `{'1': 'a'}`.
- A value JSON cannot represent (`datetime`, `set`, `Decimal`, a custom object) is no longer stored silently. The request is skipped and the failure is logged. Pydantic models are still supported and are dumped with `model_dump()`.
- A value JSON cannot represent (`datetime`, `set`, `Decimal`, a custom object) is no longer stored silently. The request is skipped and the failure is logged. Pydantic models are still supported and are dumped with `model_dump(mode='json')`, so model fields that JSON cannot natively represent (such as `datetime`) are stored in their JSON form and come back in that form: a `datetime` field, for example, comes back as an ISO 8601 string such as `'2020-01-02T03:04:05Z'`.

Convert such values to a JSON-friendly form before yielding the request:

Expand Down
9 changes: 6 additions & 3 deletions src/apify/scrapy/_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
Only `body` (`bytes`) and `headers` (`{bytes: [bytes]}`) are not natively JSON-serializable; both sit at fixed keys
and are base64-encoded in place. A `str` `body` is encoded as its UTF-8 bytes and comes back as `bytes`, matching
Scrapy, which always stores `body` as `bytes`. Pydantic models such as Crawlee's `UserData` are dumped via
`model_dump()`. Everything else, notably `meta` and `cb_kwargs`, must already be JSON-serializable, otherwise
`model_dump(mode='json')`, so model fields JSON cannot natively represent (e.g. `datetime`) are stored in their
JSON form. Everything else, notably `meta` and `cb_kwargs`, must already be JSON-serializable, otherwise
serialization fails with a clear error naming the offending value. No in-band sentinel is used, so no user value
can collide with the encoding.

Expand Down Expand Up @@ -60,7 +61,9 @@ def encode_to_json(data: dict[str, Any]) -> str:
# `ensure_ascii=False` keeps non-ASCII URLs/meta as their UTF-8 form instead of `\uXXXX` escapes, which
# would otherwise roughly double the size of non-Latin text in storage.
return json.dumps(safe, default=_json_default, ensure_ascii=False)
except TypeError as exc:
# `ValueError` covers pydantic's `PydanticSerializationError`, raised when a model field cannot be dumped
# to JSON even in JSON mode.
except (TypeError, ValueError) as exc:
raise TypeError(
'Failed to JSON-serialize a Scrapy request/response for storage on the Apify platform. '
'All values in `meta` and `cb_kwargs` must be JSON-serializable (str, int, float, bool, None, '
Expand Down Expand Up @@ -100,7 +103,7 @@ def _json_default(obj: Any) -> Any:
at the bad `meta`/`cb_kwargs` entry instead of just reporting that something failed.
"""
if isinstance(obj, BaseModel):
return obj.model_dump(by_alias=True)
return obj.model_dump(mode='json', by_alias=True)
value_repr = repr(obj)
if len(value_repr) > _MAX_ERROR_VALUE_REPR_LEN:
value_repr = value_repr[:_MAX_ERROR_VALUE_REPR_LEN] + '...'
Expand Down
2 changes: 1 addition & 1 deletion src/apify/scrapy/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
# None per this function's contract), rather than crashing the crawl.
try:
scrapy_request_json = encode_to_json(scrapy_request_dict)
except TypeError:
except (TypeError, ValueError):
logger.exception(
f'Failed to serialize Scrapy request {scrapy_request} for storage on the Apify platform; skipping it. '
'Ensure all values in `meta` and `cb_kwargs` are JSON-serializable.'
Expand Down
24 changes: 23 additions & 1 deletion tests/unit/scrapy/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from datetime import UTC, datetime

import pytest
from pydantic import BaseModel, Field
from pydantic import BaseModel, ConfigDict, Field

from apify.scrapy._serialization import _MAX_ERROR_VALUE_REPR_LEN, decode_from_json, encode_to_json

Expand Down Expand Up @@ -71,6 +71,28 @@ class Model(BaseModel):
assert decode_from_json(encoded)['meta']['m'] == {'First': 1}


def test_pydantic_model_with_datetime_field_round_trips() -> None:
    """A pydantic model carrying a `datetime` field is encoded and decoded instead of being rejected.

    The decoded value is the plain JSON form of the model: the `datetime` comes back as an ISO 8601 string,
    not as a `datetime` object.
    """

    class Model(BaseModel):
        when: datetime

    moment = datetime(2020, 1, 2, 3, 4, 5, tzinfo=UTC)
    payload = {'meta': {'m': Model(when=moment)}}

    decoded = decode_from_json(encode_to_json(payload))

    assert decoded['meta']['m'] == {'when': '2020-01-02T03:04:05Z'}


def test_pydantic_model_with_non_serializable_field_raises() -> None:
    """A model field that cannot be dumped to JSON surfaces as the descriptive `TypeError`.

    The error must mention `JSON-serializable`, i.e. it is the wrapped error raised by `encode_to_json`
    rather than a bare pydantic serialization error.
    """

    class Model(BaseModel):
        # Let pydantic accept a field whose value is an arbitrary Python object.
        model_config = ConfigDict(arbitrary_types_allowed=True)

        obj: object

    # Build the input up front so that only the call under test runs inside `pytest.raises`.
    request_dict = {'meta': {'m': Model(obj=object())}}

    with pytest.raises(TypeError, match='JSON-serializable'):
        encode_to_json(request_dict)


def test_tuple_is_coerced_to_list() -> None:
"""Documented limitation: JSON has no tuple type, so a tuple round-trips as a list."""
assert _round_trip({'meta': {'coords': (1, 2, 3)}})['meta']['coords'] == [1, 2, 3]
Expand Down
Loading