Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import asyncio

from crawlee import SkippedReason
from crawlee import Request, SkippedReason
from crawlee.crawlers import (
BeautifulSoupCrawler,
BeautifulSoupCrawlingContext,
Expand All @@ -18,7 +18,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
# highlight-start
# This handler is called when a request is skipped
@crawler.on_skipped_request
async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
async def skipped_request_handler(request: Request, reason: SkippedReason) -> None:
url = request.url

# Check if the request was skipped due to robots.txt rules
if reason == 'robots_txt':
crawler.log.info(f'Skipped {url} due to robots.txt rules.')
Expand Down
39 changes: 26 additions & 13 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,18 @@ async def extract_links(
**kwargs: Unpack[EnqueueLinksKwargs],
) -> list[Request]:
requests = list[Request]()
skipped = list[Request]()

def create_request(request_options: RequestOptions) -> Request | None:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

create_request is now duplicated verbatim (including the multi-line debug message) with PlaywrightCrawler.extract_links (_playwright_crawler.py:464). Both crawlers extend BasicCrawler, so this looks like a good candidate for a single shared helper in crawlee/_utils (taking a logger) to keep the two copies from drifting.

try:
return Request.from_url(**request_options)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that this helper is also used for robots-skipped, auto-discovered links (not just user enqueues), the "Please ensure the URL is correct and retry." wording is misleading — the operator never submitted this URL and there's nothing to retry. Consider a neutral message, or a separate one for the skip path.

)
return None

base_user_data = user_data or {}

Expand All @@ -226,11 +238,19 @@ async def extract_links(
else context.request.loaded_url or context.request.url
)
links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
skipped_iterator = iter([])

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: skipped_iterator = iter([]) followed by a conditional reassignment inside if robots_txt_file: reads a little awkwardly (and iter([]) infers Iterator[Never]). The previous if/else, or guarding the skipped-building loop under the same if robots_txt_file:, is clearer.


if robots_txt_file:
skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
else:
skipped = iter([])
skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)

for url in skipped_iterator:
request_options = RequestOptions(
url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
)
request = create_request(request_options)

if request is not None:
skipped.append(request)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two things about this skipped-building loop:

  1. Behavior change / silent drop: previously every robots-disallowed URL reached the callback as a raw string; now they go through Request.from_url, which enforces http/https (validate_http_url). A disallowed-but-non-http(s) or malformed-but-absolute URL (one that passes to_absolute_url_iterator but fails from_url) is now silently dropped and the skip callback never fires for it. Intended?

  2. Consistency: these skipped requests are built directly and don't pass through transform_request_function, while the enqueued ones do. So a skipped Request carries the base label/user_data/enqueue_strategy but not the user's per-request transform — inconsistent with enqueued requests from the same links.


for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
request_options = RequestOptions(
Expand All @@ -244,17 +264,10 @@ async def extract_links(
if transform_request_options != 'unchanged':
request_options = transform_request_options

try:
request = Request.from_url(**request_options)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{url}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'
)
continue
request = create_request(request_options)

requests.append(request)
if request is not None:
requests.append(request)

skipped_tasks = [
asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
Expand Down
8 changes: 3 additions & 5 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@

ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
SkippedRequestCallback = Callable[[Request, SkippedReason], Awaitable[None]]

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a breaking change to a public API. Existing callbacks like async def cb(url: str, reason) will now receive a Request; string usage like url.startswith(...) will crash. It should work with both str and Request.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you can start sending Request instances into existing callbacks that expect str without breaking backward compatibility, unless we resort to runtime type inspection — do we really want to go that way?



class _BasicCrawlerOptions(TypedDict):
Expand Down Expand Up @@ -1210,17 +1210,15 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling
raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e

async def _handle_skipped_request(
self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
self, request: Request, reason: SkippedReason, *, need_mark: bool = False
) -> None:
if need_mark and isinstance(request, Request):
request.state = RequestState.SKIPPED
await self._mark_request_as_handled(request)

url = request.url if isinstance(request, Request) else request

if self._on_skipped_request:
try:
await self._on_skipped_request(url, reason)
await self._on_skipped_request(request, reason)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This now forwards request straight to the callback, but add_requests() (just below, ~line 841) wasn't updated — it still builds skipped from the original Sequence[str | Request] and passes a possibly-str item here:

for request in requests:
    check_url = request.url if isinstance(request, Request) else request
    if await self._is_allowed_based_on_robots_txt_file(check_url):
        allowed_requests.append(request)
    else:
        skipped.append(request)  # <- can be a plain str

So with respect_robots_txt_file=True, await crawler.add_requests(['https://disallowed/...']) on a robots-disallowed URL delivers a str to the callback, and request.url (as in the docs example) raises AttributeError, which is then re-raised as UserDefinedErrorHandlerError.

Suggest normalizing str → Request once at the choke point (here, or in add_requests) instead of only in the two extract_links impls. That also makes the isinstance(request, Request) guard a few lines up dead code (it now contradicts the request: Request annotation). This path also has no test coverage.

except Exception as e:
raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e

Expand Down
40 changes: 27 additions & 13 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,18 @@ async def extract_links(
The `PlaywrightCrawler` implementation of the `ExtractLinksFunction` function.
"""
requests = list[Request]()
skipped = list[Request]()

def create_request(request_options: RequestOptions) -> Request | None:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as the HTTP crawler: this create_request is a verbatim duplicate (candidate for a shared helper), and the skipped-building loop below has the same silent-drop and transform_request_function-not-applied behavior. See the comments on _abstract_http_crawler.py.

try:
return Request.from_url(**request_options)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'
)
return None

base_user_data = user_data or {}

Expand All @@ -478,10 +490,19 @@ async def extract_links(

links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

skipped_iterator = iter([])

if robots_txt_file:
skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
else:
skipped = iter([])
skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)

for url in skipped_iterator:
request_options = RequestOptions(
url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
)
request = create_request(request_options)

if request is not None:
skipped.append(request)

for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
request_options = RequestOptions(
Expand All @@ -495,17 +516,10 @@ async def extract_links(
if transform_request_options != 'unchanged':
request_options = transform_request_options

try:
request = Request.from_url(**request_options)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{url}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'
)
continue
request = create_request(request_options)

requests.append(request)
if request is not None:
requests.append(request)

skipped_tasks = [
asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
Expand Down
22 changes: 13 additions & 9 deletions tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,18 +246,22 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await context.enqueue_links()

@crawler.on_skipped_request
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
skip(url)
async def skipped_hook(request: Request, _reason: SkippedReason) -> None:
skip(request)

await crawler.run([str(server_url / 'start_enqueue')])

expected_skip_calls = [
mock.call(str(server_url / 'page_1')),
mock.call(str(server_url / 'page_2')),
mock.call(str(server_url / 'page_3')),
mock.call(str(server_url / 'page_4')),
]
skip.assert_has_calls(expected_skip_calls, any_order=True)
expected_skip_urls = {
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
}

requests = [call.args[0] for call in skip.call_args_list]

all(isinstance(request, Request) for request in requests)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

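No assert? The result of all(...) is discarded here, so this line never actually checks that the callback received Request instances.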

assert {request.url for request in requests} == expected_skip_urls


async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:
Expand Down
22 changes: 13 additions & 9 deletions tests/unit/crawlers/_parsel/test_parsel_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,18 +330,22 @@ async def request_handler(context: ParselCrawlingContext) -> None:
await context.enqueue_links()

@crawler.on_skipped_request
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
skip(url)
async def skipped_hook(request: Request, _reason: SkippedReason) -> None:
skip(request)

await crawler.run([str(server_url / 'start_enqueue')])

expected_skip_calls = [
mock.call(str(server_url / 'page_1')),
mock.call(str(server_url / 'page_2')),
mock.call(str(server_url / 'page_3')),
mock.call(str(server_url / 'page_4')),
]
skip.assert_has_calls(expected_skip_calls, any_order=True)
expected_skip_urls = {
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
}

requests = [call.args[0] for call in skip.call_args_list]

all(isinstance(request, Request) for request in requests)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

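No assert? The result of all(...) is discarded here, so this line never actually checks that the callback received Request instances.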

assert {request.url for request in requests} == expected_skip_urls


async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:
Expand Down
22 changes: 13 additions & 9 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -765,18 +765,22 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
await context.enqueue_links()

@crawler.on_skipped_request
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
skip(url)
async def skipped_hook(request: Request, _reason: SkippedReason) -> None:
skip(request)

await crawler.run([str(server_url / 'start_enqueue')])

expected_skip_calls = [
mock.call(str(server_url / 'page_1')),
mock.call(str(server_url / 'page_2')),
mock.call(str(server_url / 'page_3')),
mock.call(str(server_url / 'page_4')),
]
skip.assert_has_calls(expected_skip_calls, any_order=True)
expected_skip_urls = {
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
}

requests = [call.args[0] for call in skip.call_args_list]

all(isinstance(request, Request) for request in requests)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

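No assert? The result of all(...) is discarded here, so this line never actually checks that the callback received Request instances.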

assert {request.url for request in requests} == expected_skip_urls


async def test_send_request(server_url: URL) -> None:
Expand Down
Loading