From d498da5c606d30da5bdc202694aae3a3f62bd077 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 11 Jun 2026 14:25:05 +0200 Subject: [PATCH 1/2] fix(scrapy): dump pydantic models in JSON mode when serializing requests A pydantic model in `meta`/`cb_kwargs` with a non-JSON-native field (`datetime`, `UUID`, `HttpUrl`) failed `json.dumps` and the request was dropped, contradicting the v4 upgrade guide. Also fixes the PyPI Release Notes URL (pointed at the v2 guide) and stale Python 3.10 claims in the docs. --- CONTRIBUTING.md | 2 +- docs/01_introduction/index.mdx | 2 +- docs/04_upgrading/upgrading_to_v4.md | 2 +- pyproject.toml | 2 +- src/apify/scrapy/_serialization.py | 9 ++++++--- src/apify/scrapy/requests.py | 2 +- tests/unit/scrapy/test_serialization.py | 24 +++++++++++++++++++++++- 7 files changed, 34 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 927d4a344..cbcaa8b21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ Here you'll find a contributing guide to get started with development. ## Environment -For local development, it is required to have Python 3.10 (or a later version) installed. +For local development, it is required to have Python 3.11 (or a later version) installed. We use [uv](https://docs.astral.sh/uv/) for project management. Install it and set up your IDE accordingly. diff --git a/docs/01_introduction/index.mdx b/docs/01_introduction/index.mdx index 2e8039799..27f8cca0f 100644 --- a/docs/01_introduction/index.mdx +++ b/docs/01_introduction/index.mdx @@ -29,7 +29,7 @@ Explore the Guides section in the sidebar for a deeper understanding of the SDK' ## Installation -The Apify SDK for Python requires Python version 3.10 or above. It is typically installed when you create a new Actor project using the [Apify CLI](https://docs.apify.com/cli). To install it manually in an existing project, use: +The Apify SDK for Python requires Python version 3.11 or above. It is typically installed when you create a new Actor project using the [Apify CLI](https://docs.apify.com/cli). To install it manually in an existing project, use: ```bash pip install apify diff --git a/docs/04_upgrading/upgrading_to_v4.md b/docs/04_upgrading/upgrading_to_v4.md index 32cc13ca7..421d38808 100644 --- a/docs/04_upgrading/upgrading_to_v4.md +++ b/docs/04_upgrading/upgrading_to_v4.md @@ -218,7 +218,7 @@ Pickle could store arbitrary Python objects. JSON cannot, so the values in a req - A `tuple` comes back as a `list`. - Non-string `dict` keys come back as strings, so `{1: 'a'}` becomes `{'1': 'a'}`. -- A value JSON cannot represent (`datetime`, `set`, `Decimal`, a custom object) is no longer stored silently. The request is skipped and the failure is logged. Pydantic models are still supported and are dumped with `model_dump()`. +- A value JSON cannot represent (`datetime`, `set`, `Decimal`, a custom object) is no longer stored silently. The request is skipped and the failure is logged. Pydantic models are still supported and are dumped with `model_dump(mode='json')`, so model fields JSON cannot natively represent (such as `datetime`) are stored in their JSON form. Convert such values to a JSON-friendly form before yielding the request: diff --git a/pyproject.toml b/pyproject.toml index be95eda8d..6ce930bfb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,7 @@ scrapy = ["scrapy>=2.14.0"] "Documentation" = "https://docs.apify.com/sdk/python/docs/overview" "Homepage" = "https://docs.apify.com/sdk/python/" "Issue Tracker" = "https://github.com/apify/apify-sdk-python/issues" -"Release Notes" = "https://docs.apify.com/sdk/python/docs/upgrading/upgrading-to-v2" +"Release Notes" = "https://docs.apify.com/sdk/python/docs/upgrading/upgrading-to-v4" "Source Code" = "https://github.com/apify/apify-sdk-python" [dependency-groups] diff --git a/src/apify/scrapy/_serialization.py b/src/apify/scrapy/_serialization.py index 0cd66b35d..f96ed2115 100644 --- a/src/apify/scrapy/_serialization.py +++ b/src/apify/scrapy/_serialization.py @@ -6,7 +6,8 @@ Only `body` (`bytes`) and `headers` (`{bytes: [bytes]}`) are not natively JSON-serializable; both sit at fixed keys and are base64-encoded in place. A `str` `body` is encoded as its UTF-8 bytes and comes back as `bytes`, matching Scrapy, which always stores `body` as `bytes`. Pydantic models such as Crawlee's `UserData` are dumped via -`model_dump()`. Everything else, notably `meta` and `cb_kwargs`, must already be JSON-serializable, otherwise +`model_dump(mode='json')`, so model fields JSON cannot natively represent (e.g. `datetime`) are stored in their +JSON form. Everything else, notably `meta` and `cb_kwargs`, must already be JSON-serializable, otherwise serialization fails with a clear error naming the offending value. No in-band sentinel is used, so no user value can collide with the encoding. @@ -60,7 +61,9 @@ def encode_to_json(data: dict[str, Any]) -> str: # `ensure_ascii=False` keeps non-ASCII URLs/meta as their UTF-8 form instead of `\uXXXX` escapes, which # would otherwise roughly double the size of non-Latin text in storage. return json.dumps(safe, default=_json_default, ensure_ascii=False) - except TypeError as exc: + # `ValueError` covers pydantic's `PydanticSerializationError`, raised when a model field cannot be dumped + # to JSON even in JSON mode. + except (TypeError, ValueError) as exc: raise TypeError( 'Failed to JSON-serialize a Scrapy request/response for storage on the Apify platform. ' 'All values in `meta` and `cb_kwargs` must be JSON-serializable (str, int, float, bool, None, ' @@ -100,7 +103,7 @@ def _json_default(obj: Any) -> Any: at the bad `meta`/`cb_kwargs` entry instead of just reporting that something failed. """ if isinstance(obj, BaseModel): - return obj.model_dump(by_alias=True) + return obj.model_dump(mode='json', by_alias=True) value_repr = repr(obj) if len(value_repr) > _MAX_ERROR_VALUE_REPR_LEN: value_repr = value_repr[:_MAX_ERROR_VALUE_REPR_LEN] + '...' diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 38d6648df..1524d7aab 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -128,7 +128,7 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ # None per this function's contract), rather than crashing the crawl. try: scrapy_request_json = encode_to_json(scrapy_request_dict) - except TypeError: + except (TypeError, ValueError): logger.exception( f'Failed to serialize Scrapy request {scrapy_request} for storage on the Apify platform; skipping it. ' 'Ensure all values in `meta` and `cb_kwargs` are JSON-serializable.' diff --git a/tests/unit/scrapy/test_serialization.py b/tests/unit/scrapy/test_serialization.py index e290af986..0658dc60f 100644 --- a/tests/unit/scrapy/test_serialization.py +++ b/tests/unit/scrapy/test_serialization.py @@ -3,7 +3,7 @@ from datetime import UTC, datetime import pytest -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field from apify.scrapy._serialization import _MAX_ERROR_VALUE_REPR_LEN, decode_from_json, encode_to_json @@ -71,6 +71,28 @@ class Model(BaseModel): assert decode_from_json(encoded)['meta']['m'] == {'First': 1} +def test_pydantic_model_with_datetime_field_round_trips() -> None: + """A pydantic model with a `datetime` field is dumped in JSON mode, so the request is stored, not dropped.""" + + class Model(BaseModel): + when: datetime + + encoded = encode_to_json({'meta': {'m': Model(when=datetime(2020, 1, 2, 3, 4, 5, tzinfo=UTC))}}) + assert decode_from_json(encoded)['meta']['m'] == {'when': '2020-01-02T03:04:05Z'} + + +def test_pydantic_model_with_non_serializable_field_raises() -> None: + """A model field that even JSON mode cannot dump raises the clear `TypeError`, not a bare pydantic error.""" + + class Model(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + obj: object + + with pytest.raises(TypeError, match='JSON-serializable'): + encode_to_json({'meta': {'m': Model(obj=object())}}) + + def test_tuple_is_coerced_to_list() -> None: """Documented limitation: JSON has no tuple type, so a tuple round-trips as a list.""" assert _round_trip({'meta': {'coords': (1, 2, 3)}})['meta']['coords'] == [1, 2, 3] From 140fff96a5f0007aede940d9e37412b2f497cc04 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 11 Jun 2026 14:29:37 +0200 Subject: [PATCH 2/2] revert: Move docs and metadata fixes to PR #921 --- CONTRIBUTING.md | 2 +- docs/01_introduction/index.mdx | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cbcaa8b21..927d4a344 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ Here you'll find a contributing guide to get started with development. ## Environment -For local development, it is required to have Python 3.11 (or a later version) installed. +For local development, it is required to have Python 3.10 (or a later version) installed. We use [uv](https://docs.astral.sh/uv/) for project management. Install it and set up your IDE accordingly. diff --git a/docs/01_introduction/index.mdx b/docs/01_introduction/index.mdx index 27f8cca0f..2e8039799 100644 --- a/docs/01_introduction/index.mdx +++ b/docs/01_introduction/index.mdx @@ -29,7 +29,7 @@ Explore the Guides section in the sidebar for a deeper understanding of the SDK' ## Installation -The Apify SDK for Python requires Python version 3.11 or above. It is typically installed when you create a new Actor project using the [Apify CLI](https://docs.apify.com/cli). To install it manually in an existing project, use: +The Apify SDK for Python requires Python version 3.10 or above. It is typically installed when you create a new Actor project using the [Apify CLI](https://docs.apify.com/cli). To install it manually in an existing project, use: ```bash pip install apify diff --git a/pyproject.toml b/pyproject.toml index 6ce930bfb..be95eda8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,7 @@ scrapy = ["scrapy>=2.14.0"] "Documentation" = "https://docs.apify.com/sdk/python/docs/overview" "Homepage" = "https://docs.apify.com/sdk/python/" "Issue Tracker" = "https://github.com/apify/apify-sdk-python/issues" -"Release Notes" = "https://docs.apify.com/sdk/python/docs/upgrading/upgrading-to-v4" +"Release Notes" = "https://docs.apify.com/sdk/python/docs/upgrading/upgrading-to-v2" "Source Code" = "https://github.com/apify/apify-sdk-python" [dependency-groups]