diff --git a/docs/04_upgrading/upgrading_to_v4.md b/docs/04_upgrading/upgrading_to_v4.md index 32cc13ca..a7854a07 100644 --- a/docs/04_upgrading/upgrading_to_v4.md +++ b/docs/04_upgrading/upgrading_to_v4.md @@ -218,7 +218,7 @@ Pickle could store arbitrary Python objects. JSON cannot, so the values in a req - A `tuple` comes back as a `list`. - Non-string `dict` keys come back as strings, so `{1: 'a'}` becomes `{'1': 'a'}`. -- A value JSON cannot represent (`datetime`, `set`, `Decimal`, a custom object) is no longer stored silently. The request is skipped and the failure is logged. Pydantic models are still supported and are dumped with `model_dump()`. +- Non-JSON-serializable values, such as `datetime`, `set`, `Decimal`, or custom objects, are skipped and logged. Pydantic models are supported via `model_dump(mode='json')`, which converts non-JSON-native fields into JSON-compatible values, such as ISO-8601 strings for `datetime` fields. Convert such values to a JSON-friendly form before yielding the request: @@ -230,4 +230,4 @@ yield scrapy.Request(url, meta={'since': datetime(2024, 1, 1)}) # After (v4): store a JSON-serializable value. yield scrapy.Request(url, meta={'since': datetime(2024, 1, 1).isoformat()}) -``` +```Pickle diff --git a/src/apify/scrapy/_serialization.py b/src/apify/scrapy/_serialization.py index 0cd66b35..7cac76bd 100644 --- a/src/apify/scrapy/_serialization.py +++ b/src/apify/scrapy/_serialization.py @@ -1,20 +1,23 @@ """JSON serialization of Scrapy requests and cached responses for storage on the Apify platform. -Scrapy requests and cached responses are stored in the Apify request queue and key-value store which hold JSON, -so they are serialized as JSON here rather than pickled. - -Only `body` (`bytes`) and `headers` (`{bytes: [bytes]}`) are not natively JSON-serializable; both sit at fixed keys -and are base64-encoded in place. A `str` `body` is encoded as its UTF-8 bytes and comes back as `bytes`, matching -Scrapy, which always stores `body` as `bytes`. Pydantic models such as Crawlee's `UserData` are dumped via -`model_dump()`. Everything else, notably `meta` and `cb_kwargs`, must already be JSON-serializable, otherwise -serialization fails with a clear error naming the offending value. No in-band sentinel is used, so no user value -can collide with the encoding. - -Known limitations of the pickle -> JSON switch (a documented breaking change): JSON has fewer types than pickle, -so values in `meta`/`cb_kwargs` are subject to JSON's coercions. A `tuple` round-trips as a `list` and non-string -`dict` keys round-trip as strings (e.g. `{1: 'a'}` becomes `{'1': 'a'}`). Values JSON cannot represent at all -(`datetime`, `set`, `Decimal`, arbitrary objects, ...) are not coerced silently: serialization raises and the request -is skipped loudly rather than stored in a corrupted form. +Scrapy requests and cached responses are stored in the Apify request queue and key-value store, which hold JSON. +They are therefore serialized as JSON here. + +Only `body` (`bytes`) and `headers` (`{bytes: [bytes]}`) are not natively JSON-serializable. Both live at fixed keys +and are base64-encoded in place. A `str` `body` is encoded as UTF-8 bytes and deserialized as `bytes`, matching +Scrapy, which always stores `body` as `bytes`. + +Pydantic models, such as Crawlee's `UserData`, are dumped with `model_dump(mode='json')`, which converts +non-JSON-native fields into JSON-compatible values. For example, `datetime` fields are stored as ISO-8601 strings. + +All other values, notably `meta` and `cb_kwargs`, must already be JSON-serializable. Non-JSON-serializable values, +such as `datetime`, `set`, `Decimal`, or arbitrary objects, fail serialization with a clear error naming the +offending value. The request is skipped rather than stored in a corrupted form. + +No in-band sentinel is used, so user values cannot collide with the encoding. + +Known JSON limitations: values in `meta` and `cb_kwargs` are subject to JSON coercions. A `tuple` round-trips as +a `list`, and non-string `dict` keys round-trip as strings; for example, `{1: 'a'}` becomes `{'1': 'a'}`. """ from __future__ import annotations @@ -60,7 +63,9 @@ def encode_to_json(data: dict[str, Any]) -> str: # `ensure_ascii=False` keeps non-ASCII URLs/meta as their UTF-8 form instead of `\uXXXX` escapes, which # would otherwise roughly double the size of non-Latin text in storage. return json.dumps(safe, default=_json_default, ensure_ascii=False) - except TypeError as exc: + # `ValueError` covers pydantic's `PydanticSerializationError`, raised when a model field cannot be dumped + # to JSON even in JSON mode. + except (TypeError, ValueError) as exc: raise TypeError( 'Failed to JSON-serialize a Scrapy request/response for storage on the Apify platform. ' 'All values in `meta` and `cb_kwargs` must be JSON-serializable (str, int, float, bool, None, ' @@ -100,7 +105,7 @@ def _json_default(obj: Any) -> Any: at the bad `meta`/`cb_kwargs` entry instead of just reporting that something failed. """ if isinstance(obj, BaseModel): - return obj.model_dump(by_alias=True) + return obj.model_dump(mode='json', by_alias=True) value_repr = repr(obj) if len(value_repr) > _MAX_ERROR_VALUE_REPR_LEN: value_repr = value_repr[:_MAX_ERROR_VALUE_REPR_LEN] + '...' diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 38d6648d..1524d7aa 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -128,7 +128,7 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ # None per this function's contract), rather than crashing the crawl. try: scrapy_request_json = encode_to_json(scrapy_request_dict) - except TypeError: + except (TypeError, ValueError): logger.exception( f'Failed to serialize Scrapy request {scrapy_request} for storage on the Apify platform; skipping it. ' 'Ensure all values in `meta` and `cb_kwargs` are JSON-serializable.' diff --git a/tests/unit/scrapy/test_serialization.py b/tests/unit/scrapy/test_serialization.py index e290af98..0658dc60 100644 --- a/tests/unit/scrapy/test_serialization.py +++ b/tests/unit/scrapy/test_serialization.py @@ -3,7 +3,7 @@ from datetime import UTC, datetime import pytest -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field from apify.scrapy._serialization import _MAX_ERROR_VALUE_REPR_LEN, decode_from_json, encode_to_json @@ -71,6 +71,28 @@ class Model(BaseModel): assert decode_from_json(encoded)['meta']['m'] == {'First': 1} +def test_pydantic_model_with_datetime_field_round_trips() -> None: + """A pydantic model with a `datetime` field is dumped in JSON mode, so the request is stored, not dropped.""" + + class Model(BaseModel): + when: datetime + + encoded = encode_to_json({'meta': {'m': Model(when=datetime(2020, 1, 2, 3, 4, 5, tzinfo=UTC))}}) + assert decode_from_json(encoded)['meta']['m'] == {'when': '2020-01-02T03:04:05Z'} + + +def test_pydantic_model_with_non_serializable_field_raises() -> None: + """A model field that even JSON mode cannot dump raises the clear `TypeError`, not a bare pydantic error.""" + + class Model(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + obj: object + + with pytest.raises(TypeError, match='JSON-serializable'): + encode_to_json({'meta': {'m': Model(obj=object())}}) + + def test_tuple_is_coerced_to_list() -> None: """Documented limitation: JSON has no tuple type, so a tuple round-trips as a list.""" assert _round_trip({'meta': {'coords': (1, 2, 3)}})['meta']['coords'] == [1, 2, 3]