Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/crawlee/_utils/robots.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ async def parse_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> Sitemap:
if not self._http_client:
raise ValueError('HTTP client is required to parse sitemaps.')

return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)
return await Sitemap.load(
sitemaps, self._http_client, self._proxy_info, parse_sitemap_options={'enqueue_strategy': enqueue_strategy}
)

async def parse_urls_from_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
"""Parse the sitemaps in the robots.txt file and return a list of URLs.
Expand Down
54 changes: 49 additions & 5 deletions src/crawlee/_utils/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,15 @@
from typing_extensions import NotRequired, override
from yarl import URL

from crawlee._utils.urls import filter_url
from crawlee._utils.web import is_status_code_successful
from crawlee.errors import ProxyError

if TYPE_CHECKING:
from collections.abc import AsyncGenerator
from xml.sax.xmlreader import AttributesImpl

from crawlee import EnqueueStrategy
from crawlee.http_clients import HttpClient
from crawlee.proxy_configuration import ProxyInfo

Expand Down Expand Up @@ -55,6 +57,7 @@ class ParseSitemapOptions(TypedDict, total=False):
emit_nested_sitemaps: bool
max_depth: int
sitemap_retries: int
enqueue_strategy: EnqueueStrategy
timeout: timedelta | None


Expand Down Expand Up @@ -230,6 +233,7 @@ async def _process_sitemap_item(
sources: list[SitemapSource],
*,
emit_nested_sitemaps: bool,
enqueue_strategy: EnqueueStrategy,
) -> AsyncGenerator[SitemapUrl | NestedSitemap | None, None]:
"""Process a sitemap item and yield appropriate results."""
item_copy = item.copy() # Work with a copy to avoid modifying the original
Expand All @@ -243,21 +247,35 @@ async def _process_sitemap_item(
if item_type == 'sitemap_url' and 'url' in item_copy:
sitemap_url = item_copy['url']
if sitemap_url and sitemap_url not in visited_sitemap_urls:
if parent_url := source.get('url'):
ok, reason = filter_url(target=sitemap_url, strategy=enqueue_strategy, origin=parent_url)
if not ok:
logger.warning(f'Skipping nested sitemap {sitemap_url!r} (parent {parent_url!r}): {reason}.')
return

# Add to processing queue
sources.append(SitemapSource(type='url', url=sitemap_url, depth=depth + 1))

# Output the nested sitemap reference if requested
if emit_nested_sitemaps:
yield NestedSitemap(loc=sitemap_url, origin_sitemap_url=None)
yield NestedSitemap(loc=sitemap_url, origin_sitemap_url=parent_url)

# Handle individual URL entries
elif item_type == 'url' and 'loc' in item_copy:
# Determine the origin sitemap URL for tracking purposes
origin_url = _get_origin_url(source)

loc = item_copy['loc']
parent_url = source.get('url')
if parent_url and loc:
ok, reason = filter_url(target=loc, strategy=enqueue_strategy, origin=parent_url)
if not ok:
logger.warning(f'Skipping sitemap URL {loc!r} (parent {parent_url!r}): {reason}.')
return

# Create and yield the sitemap URL object
yield SitemapUrl(
loc=item_copy['loc'],
loc=loc,
lastmod=item_copy.get('lastmod'),
changefreq=item_copy.get('changefreq'),
priority=item_copy.get('priority'),
Expand All @@ -272,6 +290,7 @@ async def _process_raw_source(
sources: list[SitemapSource],
*,
emit_nested_sitemaps: bool,
enqueue_strategy: EnqueueStrategy,
) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
"""Process a raw content sitemap source."""
if 'content' not in source:
Expand All @@ -285,15 +304,27 @@ async def _process_raw_source(
# Process the content
async for item in parser.process_chunk(content):
async for result in _process_sitemap_item(
item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
item,
source,
depth,
visited_sitemap_urls,
sources,
emit_nested_sitemaps=emit_nested_sitemaps,
enqueue_strategy=enqueue_strategy,
):
if result:
yield result

# Process any remaining content
async for item in parser.flush():
async for result in _process_sitemap_item(
item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
item,
source,
depth,
visited_sitemap_urls,
sources,
emit_nested_sitemaps=emit_nested_sitemaps,
enqueue_strategy=enqueue_strategy,
):
if result:
yield result
Expand All @@ -314,6 +345,7 @@ async def _fetch_and_process_sitemap(
proxy_info: ProxyInfo | None = None,
timeout: timedelta | None = None,
emit_nested_sitemaps: bool,
enqueue_strategy: EnqueueStrategy,
) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
"""Fetch a sitemap from a URL and process its content."""
if 'url' not in source:
Expand Down Expand Up @@ -354,6 +386,7 @@ async def _fetch_and_process_sitemap(
visited_sitemap_urls,
sources,
emit_nested_sitemaps=emit_nested_sitemaps,
enqueue_strategy=enqueue_strategy,
):
if result:
yield result
Expand All @@ -367,6 +400,7 @@ async def _fetch_and_process_sitemap(
visited_sitemap_urls,
sources,
emit_nested_sitemaps=emit_nested_sitemaps,
enqueue_strategy=enqueue_strategy,
):
if result:
yield result
Expand Down Expand Up @@ -438,13 +472,17 @@ async def parse_sitemap(
This function coordinates the process of fetching and parsing sitemaps,
handling both URL-based and raw content sources. It follows nested sitemaps
up to the specified maximum depth.

The default `ParseSitemapOptions.enqueue_strategy` is `same-hostname`, which skips cross-host URLs.
Use the `all` strategy to process all links.
"""
# Set default options
options = options or {}
emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)
max_depth = options.get('max_depth', float('inf'))
sitemap_retries = options.get('sitemap_retries', 3)
timeout = options.get('timeout', timedelta(seconds=30))
enqueue_strategy = options.get('enqueue_strategy', 'same-hostname')

# Setup working state
sources = list(initial_sources)
Expand All @@ -463,7 +501,12 @@ async def parse_sitemap(
# Process based on source type
if source['type'] == 'raw':
async for result in _process_raw_source(
source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
source,
depth,
visited_sitemap_urls,
sources,
emit_nested_sitemaps=emit_nested_sitemaps,
enqueue_strategy=enqueue_strategy,
):
yield result

Expand All @@ -482,6 +525,7 @@ async def parse_sitemap(
sources,
sitemap_retries,
emit_nested_sitemaps=emit_nested_sitemaps,
enqueue_strategy=enqueue_strategy,
proxy_info=proxy_info,
timeout=timeout,
):
Expand Down
7 changes: 6 additions & 1 deletion src/crawlee/request_loaders/_sitemap_request_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,12 @@ async def _load_sitemaps(self) -> None:
continue
state.in_progress_sitemap_url = sitemap_url

parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)
parse_options = ParseSitemapOptions(
max_depth=0,
emit_nested_sitemaps=True,
sitemap_retries=3,
enqueue_strategy=self._enqueue_strategy,
)
parsed_sitemap_url = URL(sitemap_url)

async for item in parse_sitemap(
Expand Down
Loading
Loading