-
Notifications
You must be signed in to change notification settings - Fork 749
feat: provide Request instances in skipped request callbacks #1927
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -206,6 +206,18 @@ async def extract_links( | |
| **kwargs: Unpack[EnqueueLinksKwargs], | ||
| ) -> list[Request]: | ||
| requests = list[Request]() | ||
| skipped = list[Request]() | ||
|
|
||
| def create_request(request_options: RequestOptions) -> Request | None: | ||
| try: | ||
| return Request.from_url(**request_options) | ||
| except ValidationError as exc: | ||
| context.log.debug( | ||
| f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. ' | ||
| 'This may be caused by a malformed URL or unsupported URL scheme. ' | ||
| 'Please ensure the URL is correct and retry.' | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now that this helper is also used for robots-skipped, auto-discovered links (not just user enqueues), the "Please ensure the URL is correct and retry." wording is misleading — the operator never submitted this URL and there's nothing to retry. Consider a neutral message, or a separate one for the skip path. |
||
| ) | ||
| return None | ||
|
|
||
| base_user_data = user_data or {} | ||
|
|
||
|
|
@@ -226,11 +238,19 @@ async def extract_links( | |
| else context.request.loaded_url or context.request.url | ||
| ) | ||
| links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) | ||
| skipped_iterator = iter([]) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor: |
||
|
|
||
| if robots_txt_file: | ||
| skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) | ||
| else: | ||
| skipped = iter([]) | ||
| skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) | ||
|
|
||
| for url in skipped_iterator: | ||
| request_options = RequestOptions( | ||
| url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy | ||
| ) | ||
| request = create_request(request_options) | ||
|
|
||
| if request is not None: | ||
| skipped.append(request) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Two things about this skipped-building loop:
|
||
|
|
||
| for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs): | ||
| request_options = RequestOptions( | ||
|
|
@@ -244,17 +264,10 @@ async def extract_links( | |
| if transform_request_options != 'unchanged': | ||
| request_options = transform_request_options | ||
|
|
||
| try: | ||
| request = Request.from_url(**request_options) | ||
| except ValidationError as exc: | ||
| context.log.debug( | ||
| f'Skipping URL "{url}" due to invalid format: {exc}. ' | ||
| 'This may be caused by a malformed URL or unsupported URL scheme. ' | ||
| 'Please ensure the URL is correct and retry.' | ||
| ) | ||
| continue | ||
| request = create_request(request_options) | ||
|
|
||
| requests.append(request) | ||
| if request is not None: | ||
| requests.append(request) | ||
|
|
||
| skipped_tasks = [ | ||
| asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -110,7 +110,7 @@ | |
|
|
||
| ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]] | ||
| FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]] | ||
| SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]] | ||
| SkippedRequestCallback = Callable[[Request, SkippedReason], Awaitable[None]] | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a breaking change to a public API. Existing callbacks like
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think you can start sending |
||
|
|
||
|
|
||
| class _BasicCrawlerOptions(TypedDict): | ||
|
|
@@ -1210,17 +1210,15 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling | |
| raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e | ||
|
|
||
| async def _handle_skipped_request( | ||
| self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False | ||
| self, request: Request, reason: SkippedReason, *, need_mark: bool = False | ||
| ) -> None: | ||
| if need_mark and isinstance(request, Request): | ||
| request.state = RequestState.SKIPPED | ||
| await self._mark_request_as_handled(request) | ||
|
|
||
| url = request.url if isinstance(request, Request) else request | ||
|
|
||
| if self._on_skipped_request: | ||
| try: | ||
| await self._on_skipped_request(url, reason) | ||
| await self._on_skipped_request(request, reason) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This now forwards for request in requests:
check_url = request.url if isinstance(request, Request) else request
if await self._is_allowed_based_on_robots_txt_file(check_url):
allowed_requests.append(request)
else:
skipped.append(request) # <- can be a plain strSo with Suggest normalizing |
||
| except Exception as e: | ||
| raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -459,6 +459,18 @@ async def extract_links( | |
| The `PlaywrightCrawler` implementation of the `ExtractLinksFunction` function. | ||
| """ | ||
| requests = list[Request]() | ||
| skipped = list[Request]() | ||
|
|
||
| def create_request(request_options: RequestOptions) -> Request | None: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as the HTTP crawler: this |
||
| try: | ||
| return Request.from_url(**request_options) | ||
| except ValidationError as exc: | ||
| context.log.debug( | ||
| f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. ' | ||
| 'This may be caused by a malformed URL or unsupported URL scheme. ' | ||
| 'Please ensure the URL is correct and retry.' | ||
| ) | ||
| return None | ||
|
|
||
| base_user_data = user_data or {} | ||
|
|
||
|
|
@@ -478,10 +490,19 @@ async def extract_links( | |
|
|
||
| links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) | ||
|
|
||
| skipped_iterator = iter([]) | ||
|
|
||
| if robots_txt_file: | ||
| skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) | ||
| else: | ||
| skipped = iter([]) | ||
| skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) | ||
|
|
||
| for url in skipped_iterator: | ||
| request_options = RequestOptions( | ||
| url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy | ||
| ) | ||
| request = create_request(request_options) | ||
|
|
||
| if request is not None: | ||
| skipped.append(request) | ||
|
|
||
| for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs): | ||
| request_options = RequestOptions( | ||
|
|
@@ -495,17 +516,10 @@ async def extract_links( | |
| if transform_request_options != 'unchanged': | ||
| request_options = transform_request_options | ||
|
|
||
| try: | ||
| request = Request.from_url(**request_options) | ||
| except ValidationError as exc: | ||
| context.log.debug( | ||
| f'Skipping URL "{url}" due to invalid format: {exc}. ' | ||
| 'This may be caused by a malformed URL or unsupported URL scheme. ' | ||
| 'Please ensure the URL is correct and retry.' | ||
| ) | ||
| continue | ||
| request = create_request(request_options) | ||
|
|
||
| requests.append(request) | ||
| if request is not None: | ||
| requests.append(request) | ||
|
|
||
| skipped_tasks = [ | ||
| asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -246,18 +246,22 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: | |
| await context.enqueue_links() | ||
|
|
||
| @crawler.on_skipped_request | ||
| async def skipped_hook(url: str, _reason: SkippedReason) -> None: | ||
| skip(url) | ||
| async def skipped_hook(request: Request, _reason: SkippedReason) -> None: | ||
| skip(request) | ||
|
|
||
| await crawler.run([str(server_url / 'start_enqueue')]) | ||
|
|
||
| expected_skip_calls = [ | ||
| mock.call(str(server_url / 'page_1')), | ||
| mock.call(str(server_url / 'page_2')), | ||
| mock.call(str(server_url / 'page_3')), | ||
| mock.call(str(server_url / 'page_4')), | ||
| ] | ||
| skip.assert_has_calls(expected_skip_calls, any_order=True) | ||
| expected_skip_urls = { | ||
| str(server_url / 'page_1'), | ||
| str(server_url / 'page_2'), | ||
| str(server_url / 'page_3'), | ||
| str(server_url / 'page_4'), | ||
| } | ||
|
|
||
| requests = [call.args[0] for call in skip.call_args_list] | ||
|
|
||
| all(isinstance(request, Request) for request in requests) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No assert? |
||
| assert {request.url for request in requests} == expected_skip_urls | ||
|
|
||
|
|
||
| async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -330,18 +330,22 @@ async def request_handler(context: ParselCrawlingContext) -> None: | |
| await context.enqueue_links() | ||
|
|
||
| @crawler.on_skipped_request | ||
| async def skipped_hook(url: str, _reason: SkippedReason) -> None: | ||
| skip(url) | ||
| async def skipped_hook(request: Request, _reason: SkippedReason) -> None: | ||
| skip(request) | ||
|
|
||
| await crawler.run([str(server_url / 'start_enqueue')]) | ||
|
|
||
| expected_skip_calls = [ | ||
| mock.call(str(server_url / 'page_1')), | ||
| mock.call(str(server_url / 'page_2')), | ||
| mock.call(str(server_url / 'page_3')), | ||
| mock.call(str(server_url / 'page_4')), | ||
| ] | ||
| skip.assert_has_calls(expected_skip_calls, any_order=True) | ||
| expected_skip_urls = { | ||
| str(server_url / 'page_1'), | ||
| str(server_url / 'page_2'), | ||
| str(server_url / 'page_3'), | ||
| str(server_url / 'page_4'), | ||
| } | ||
|
|
||
| requests = [call.args[0] for call in skip.call_args_list] | ||
|
|
||
| all(isinstance(request, Request) for request in requests) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No assert? |
||
| assert {request.url for request in requests} == expected_skip_urls | ||
|
|
||
|
|
||
| async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -765,18 +765,22 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: | |
| await context.enqueue_links() | ||
|
|
||
| @crawler.on_skipped_request | ||
| async def skipped_hook(url: str, _reason: SkippedReason) -> None: | ||
| skip(url) | ||
| async def skipped_hook(request: Request, _reason: SkippedReason) -> None: | ||
| skip(request) | ||
|
|
||
| await crawler.run([str(server_url / 'start_enqueue')]) | ||
|
|
||
| expected_skip_calls = [ | ||
| mock.call(str(server_url / 'page_1')), | ||
| mock.call(str(server_url / 'page_2')), | ||
| mock.call(str(server_url / 'page_3')), | ||
| mock.call(str(server_url / 'page_4')), | ||
| ] | ||
| skip.assert_has_calls(expected_skip_calls, any_order=True) | ||
| expected_skip_urls = { | ||
| str(server_url / 'page_1'), | ||
| str(server_url / 'page_2'), | ||
| str(server_url / 'page_3'), | ||
| str(server_url / 'page_4'), | ||
| } | ||
|
|
||
| requests = [call.args[0] for call in skip.call_args_list] | ||
|
|
||
| all(isinstance(request, Request) for request in requests) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No assert? |
||
| assert {request.url for request in requests} == expected_skip_urls | ||
|
|
||
|
|
||
| async def test_send_request(server_url: URL) -> None: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
create_requestis now duplicated verbatim (including the multi-line debug message) withPlaywrightCrawler.extract_links(_playwright_crawler.py:464). Both crawlers extendBasicCrawler, so this looks like a good candidate for a single shared helper incrawlee/_utils(taking alogger) to keep the two copies from drifting.