diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py
index d4a5e00cbc..2c0d70b1c0 100644
--- a/src/crawlee/_utils/robots.py
+++ b/src/crawlee/_utils/robots.py
@@ -135,7 +135,9 @@ async def parse_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> Sitemap:
if not self._http_client:
raise ValueError('HTTP client is required to parse sitemaps.')
- return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)
+ return await Sitemap.load(
+ sitemaps, self._http_client, self._proxy_info, parse_sitemap_options={'enqueue_strategy': enqueue_strategy}
+ )
async def parse_urls_from_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
"""Parse the sitemaps in the robots.txt file and return a list URLs.
diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 05eaa6e726..d7844e9168 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -18,6 +18,7 @@
from typing_extensions import NotRequired, override
from yarl import URL
+from crawlee._utils.urls import filter_url
from crawlee._utils.web import is_status_code_successful
from crawlee.errors import ProxyError
@@ -25,6 +26,7 @@
from collections.abc import AsyncGenerator
from xml.sax.xmlreader import AttributesImpl
+ from crawlee import EnqueueStrategy
from crawlee.http_clients import HttpClient
from crawlee.proxy_configuration import ProxyInfo
@@ -55,6 +57,7 @@ class ParseSitemapOptions(TypedDict, total=False):
emit_nested_sitemaps: bool
max_depth: int
sitemap_retries: int
+ enqueue_strategy: EnqueueStrategy
timeout: timedelta | None
@@ -230,6 +233,7 @@ async def _process_sitemap_item(
sources: list[SitemapSource],
*,
emit_nested_sitemaps: bool,
+ enqueue_strategy: EnqueueStrategy,
) -> AsyncGenerator[SitemapUrl | NestedSitemap | None, None]:
"""Process a sitemap item and yield appropriate results."""
item_copy = item.copy() # Work with a copy to avoid modifying the original
@@ -243,21 +247,35 @@ async def _process_sitemap_item(
if item_type == 'sitemap_url' and 'url' in item_copy:
sitemap_url = item_copy['url']
if sitemap_url and sitemap_url not in visited_sitemap_urls:
+ if parent_url := source.get('url'):
+ ok, reason = filter_url(target=sitemap_url, strategy=enqueue_strategy, origin=parent_url)
+ if not ok:
+ logger.warning(f'Skipping nested sitemap {sitemap_url!r} (parent {parent_url!r}): {reason}.')
+ return
+
# Add to processing queue
sources.append(SitemapSource(type='url', url=sitemap_url, depth=depth + 1))
# Output the nested sitemap reference if requested
if emit_nested_sitemaps:
- yield NestedSitemap(loc=sitemap_url, origin_sitemap_url=None)
+ yield NestedSitemap(loc=sitemap_url, origin_sitemap_url=parent_url)
# Handle individual URL entries
elif item_type == 'url' and 'loc' in item_copy:
# Determine the origin sitemap URL for tracking purposes
origin_url = _get_origin_url(source)
+ loc = item_copy['loc']
+ parent_url = source.get('url')
+ if parent_url and loc:
+ ok, reason = filter_url(target=loc, strategy=enqueue_strategy, origin=parent_url)
+ if not ok:
+ logger.warning(f'Skipping sitemap URL {loc!r} (parent {parent_url!r}): {reason}.')
+ return
+
# Create and yield the sitemap URL object
yield SitemapUrl(
- loc=item_copy['loc'],
+ loc=loc,
lastmod=item_copy.get('lastmod'),
changefreq=item_copy.get('changefreq'),
priority=item_copy.get('priority'),
@@ -272,6 +290,7 @@ async def _process_raw_source(
sources: list[SitemapSource],
*,
emit_nested_sitemaps: bool,
+ enqueue_strategy: EnqueueStrategy,
) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
"""Process a raw content sitemap source."""
if 'content' not in source:
@@ -285,7 +304,13 @@ async def _process_raw_source(
# Process the content
async for item in parser.process_chunk(content):
async for result in _process_sitemap_item(
- item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
+ item,
+ source,
+ depth,
+ visited_sitemap_urls,
+ sources,
+ emit_nested_sitemaps=emit_nested_sitemaps,
+ enqueue_strategy=enqueue_strategy,
):
if result:
yield result
@@ -293,7 +318,13 @@ async def _process_raw_source(
# Process any remaining content
async for item in parser.flush():
async for result in _process_sitemap_item(
- item, source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
+ item,
+ source,
+ depth,
+ visited_sitemap_urls,
+ sources,
+ emit_nested_sitemaps=emit_nested_sitemaps,
+ enqueue_strategy=enqueue_strategy,
):
if result:
yield result
@@ -314,6 +345,7 @@ async def _fetch_and_process_sitemap(
proxy_info: ProxyInfo | None = None,
timeout: timedelta | None = None,
emit_nested_sitemaps: bool,
+ enqueue_strategy: EnqueueStrategy,
) -> AsyncGenerator[SitemapUrl | NestedSitemap, None]:
"""Fetch a sitemap from a URL and process its content."""
if 'url' not in source:
@@ -354,6 +386,7 @@ async def _fetch_and_process_sitemap(
visited_sitemap_urls,
sources,
emit_nested_sitemaps=emit_nested_sitemaps,
+ enqueue_strategy=enqueue_strategy,
):
if result:
yield result
@@ -367,6 +400,7 @@ async def _fetch_and_process_sitemap(
visited_sitemap_urls,
sources,
emit_nested_sitemaps=emit_nested_sitemaps,
+ enqueue_strategy=enqueue_strategy,
):
if result:
yield result
@@ -438,6 +472,9 @@ async def parse_sitemap(
This function coordinates the process of fetching and parsing sitemaps,
handling both URL-based and raw content sources. It follows nested sitemaps
up to the specified maximum depth.
+
+ `ParseSitemapOptions.enqueue_strategy` defaults to `same-hostname`, so entries whose host differs from that of
+ the sitemap listing them are skipped. Set it to `all` to process every link.
"""
# Set default options
options = options or {}
@@ -445,6 +482,7 @@ async def parse_sitemap(
max_depth = options.get('max_depth', float('inf'))
sitemap_retries = options.get('sitemap_retries', 3)
timeout = options.get('timeout', timedelta(seconds=30))
+ enqueue_strategy = options.get('enqueue_strategy', 'same-hostname')
# Setup working state
sources = list(initial_sources)
@@ -463,7 +501,12 @@ async def parse_sitemap(
# Process based on source type
if source['type'] == 'raw':
async for result in _process_raw_source(
- source, depth, visited_sitemap_urls, sources, emit_nested_sitemaps=emit_nested_sitemaps
+ source,
+ depth,
+ visited_sitemap_urls,
+ sources,
+ emit_nested_sitemaps=emit_nested_sitemaps,
+ enqueue_strategy=enqueue_strategy,
):
yield result
@@ -482,6 +525,7 @@ async def parse_sitemap(
sources,
sitemap_retries,
emit_nested_sitemaps=emit_nested_sitemaps,
+ enqueue_strategy=enqueue_strategy,
proxy_info=proxy_info,
timeout=timeout,
):
diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py
index 664686b23f..84051d2164 100644
--- a/src/crawlee/request_loaders/_sitemap_request_loader.py
+++ b/src/crawlee/request_loaders/_sitemap_request_loader.py
@@ -352,7 +352,12 @@ async def _load_sitemaps(self) -> None:
continue
state.in_progress_sitemap_url = sitemap_url
- parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)
+ parse_options = ParseSitemapOptions(
+ max_depth=0,
+ emit_nested_sitemaps=True,
+ sitemap_retries=3,
+ enqueue_strategy=self._enqueue_strategy,
+ )
parsed_sitemap_url = URL(sitemap_url)
async for item in parse_sitemap(
diff --git a/tests/unit/_utils/test_sitemap.py b/tests/unit/_utils/test_sitemap.py
index f56094460a..8471331d7d 100644
--- a/tests/unit/_utils/test_sitemap.py
+++ b/tests/unit/_utils/test_sitemap.py
@@ -18,48 +18,11 @@
parse_sitemap,
)
from crawlee.http_clients._base import HttpClient, HttpResponse
+from tests.unit.utils import DEFAULT_URL, get_basic_results, get_basic_sitemap
if TYPE_CHECKING:
from collections.abc import AsyncIterator
-BASIC_SITEMAP = """
-
-
-
-http://not-exists.com/
-2005-02-03
-monthly
-0.8
-
-
-http://not-exists.com/catalog?item=12&desc=vacation_hawaii
-weekly
-
-
-http://not-exists.com/catalog?item=73&desc=vacation_new_zealand
-2004-12-23
-weekly
-
-
-http://not-exists.com/catalog?item=74&desc=vacation_newfoundland
-2004-12-23T18:00:15+00:00
-0.3
-
-
-http://not-exists.com/catalog?item=83&desc=vacation_usa
-2004-11-23
-
-
-""".strip()
-
-BASIC_RESULTS = {
- 'http://not-exists.com/',
- 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
- 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
- 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
- 'http://not-exists.com/catalog?item=83&desc=vacation_usa',
-}
-
def _make_mock_client(url_map: dict[str, tuple[int, bytes]]) -> AsyncMock:
async def send_request(url: str, **_kwargs: Any) -> HttpResponse:
@@ -115,24 +78,49 @@ def encode_base64(data: bytes) -> str:
async def test_sitemap(server_url: URL, http_client: HttpClient) -> None:
"""Test loading a basic sitemap."""
sitemap_url = (server_url / 'sitemap.xml').with_query(
- base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8'
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode()), c_type='application/xml; charset=utf-8'
+ )
+ sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+
+ assert len(sitemap.urls) == 5
+ assert set(sitemap.urls) == get_basic_results(server_url)
+
+
+async def test_sitemap_different_url(server_url: URL, http_client: HttpClient) -> None:
+ """Test that sitemap entries pointing at a different host are skipped when no enqueue strategy is given."""
+ different_url = 'https://other.com/'
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=different_url).encode()), c_type='application/xml; charset=utf-8'
)
sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
+ assert len(sitemap.urls) == 0  # no strategy passed, so the default one filters every other.com entry
+
+
+async def test_sitemap_different_url_allowed(server_url: URL, http_client: HttpClient) -> None:
+ """Test that sitemap entries pointing at a different host are kept with the `all` enqueue strategy."""
+ different_url = 'https://other.com/'
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=different_url).encode()), c_type='application/xml; charset=utf-8'
+ )
+ sitemap = await Sitemap.load(
+ str(sitemap_url), http_client=http_client, parse_sitemap_options={'enqueue_strategy': 'all'}
+ )
+
assert len(sitemap.urls) == 5
- assert set(sitemap.urls) == BASIC_RESULTS
+ assert set(sitemap.urls) == get_basic_results(different_url)
async def test_extract_metadata_sitemap(server_url: URL, http_client: HttpClient) -> None:
"""Test extracting item metadata from a sitemap."""
sitemap_url = (server_url / 'sitemap.xml').with_query(
- base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/xml; charset=utf-8'
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode()), c_type='application/xml; charset=utf-8'
)
items = [item async for item in parse_sitemap([{'type': 'url', 'url': str(sitemap_url)}], http_client=http_client)]
assert len(items) == 5
assert items[0] == SitemapUrl(
- loc='http://not-exists.com/',
+ loc=str(server_url),
priority=0.8,
changefreq='monthly',
lastmod=datetime.fromisoformat('2005-02-03'),
@@ -142,16 +130,16 @@ async def test_extract_metadata_sitemap(server_url: URL, http_client: HttpClient
async def test_gzipped_sitemap(server_url: URL, http_client: HttpClient) -> None:
"""Test loading a gzipped sitemap with correct type and .xml.gz url."""
- gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP))
+ gzipped_data = encode_base64(compress_gzip(get_basic_sitemap(url=server_url)))
sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=gzipped_data, c_type='application/gzip')
sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
assert len(sitemap.urls) == 5
- assert set(sitemap.urls) == BASIC_RESULTS
+ assert set(sitemap.urls) == get_basic_results(server_url)
async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: HttpClient) -> None:
"""Test loading a invalid gzipped sitemap with correct type and .xml.gz url."""
- compress_data = compress_gzip(BASIC_SITEMAP)
+ compress_data = compress_gzip(get_basic_sitemap(url=server_url))
invalid_gzipped_data = encode_base64(compress_data[:30])
sitemap_url = (server_url / 'sitemap.xml.gz').with_query(base64=invalid_gzipped_data, c_type='application/gzip')
sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
@@ -163,34 +151,34 @@ async def test_gzipped_sitemap_with_invalid_data(server_url: URL, http_client: H
async def test_gz_sitemap_with_non_gzipped(server_url: URL, http_client: HttpClient) -> None:
"""Test loading a sitemap with gzip type and .xml.gz url, but without gzipped data."""
sitemap_url = (server_url / 'sitemap.xml.gz').with_query(
- base64=encode_base64(BASIC_SITEMAP.encode()), c_type='application/gzip'
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode()), c_type='application/gzip'
)
sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
assert len(sitemap.urls) == 5
- assert set(sitemap.urls) == BASIC_RESULTS
+ assert set(sitemap.urls) == get_basic_results(server_url)
async def test_gzipped_sitemap_with_bad_type(server_url: URL, http_client: HttpClient) -> None:
"""Test loading a gzipped sitemap with bad type and .xml.gz url."""
- gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP))
+ gzipped_data = encode_base64(compress_gzip(get_basic_sitemap(url=server_url)))
sitemap_url = (server_url / 'sitemap.xml.gz').with_query(
base64=gzipped_data, c_type='application/xml; charset=utf-8'
)
sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
assert len(sitemap.urls) == 5
- assert set(sitemap.urls) == BASIC_RESULTS
+ assert set(sitemap.urls) == get_basic_results(server_url)
async def test_xml_sitemap_with_gzipped_data(server_url: URL, http_client: HttpClient) -> None:
"""Test loading a gzipped sitemap with correct type and .xml url."""
- gzipped_data = encode_base64(compress_gzip(BASIC_SITEMAP))
+ gzipped_data = encode_base64(compress_gzip(get_basic_sitemap(url=server_url)))
sitemap_url = (server_url / 'sitemap.xml').with_query(base64=gzipped_data, c_type='application/gzip')
sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
assert len(sitemap.urls) == 5
- assert set(sitemap.urls) == BASIC_RESULTS
+ assert set(sitemap.urls) == get_basic_results(server_url)
async def test_parent_sitemap(server_url: URL, http_client: HttpClient) -> None:
@@ -208,8 +196,12 @@ async def test_parent_sitemap(server_url: URL, http_client: HttpClient) -> None:
""".strip()
- child_sitemap = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
- child_sitemap_2 = (server_url / 'sitemap.xml.gz').with_query(base64=encode_base64(compress_gzip(BASIC_SITEMAP)))
+ child_sitemap = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
+ child_sitemap_2 = (server_url / 'sitemap.xml.gz').with_query(
+ base64=encode_base64(compress_gzip(get_basic_sitemap(url=server_url)))
+ )
parent_sitemap_content = parent_sitemap.format(child_sitemap=child_sitemap, child_sitemap_2=child_sitemap_2)
encoded_parent_sitemap_content = encode_base64(parent_sitemap_content.encode())
parent_sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encoded_parent_sitemap_content)
@@ -217,7 +209,7 @@ async def test_parent_sitemap(server_url: URL, http_client: HttpClient) -> None:
sitemap = await Sitemap.load(str(parent_sitemap_url), http_client=http_client)
assert len(sitemap.urls) == 10
- assert set(sitemap.urls) == BASIC_RESULTS
+ assert set(sitemap.urls) == get_basic_results(server_url)
async def test_non_sitemap_url(server_url: URL, http_client: HttpClient) -> None:
@@ -230,11 +222,11 @@ async def test_non_sitemap_url(server_url: URL, http_client: HttpClient) -> None
async def test_cdata_sitemap(server_url: URL, http_client: HttpClient) -> None:
"""Test loading a sitemap with CDATA sections."""
- cdata_sitemap = """
+ cdata_sitemap = f"""
-
+
""".strip()
@@ -244,14 +236,14 @@ async def test_cdata_sitemap(server_url: URL, http_client: HttpClient) -> None:
sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
assert len(sitemap.urls) == 1
- assert sitemap.urls == ['http://not-exists.com/catalog']
+ assert sitemap.urls == [f'{server_url}catalog']
async def test_txt_sitemap(server_url: URL, http_client: HttpClient) -> None:
"""Test loading a plain text sitemap."""
urls = [
- 'http://not-exists.com/catalog?item=78&desc=vacation_crete',
- 'http://not-exists.com/catalog?item=79&desc=vacation_somalia',
+ f'{server_url}catalog?item=78&desc=vacation_crete',
+ f'{server_url}catalog?item=79&desc=vacation_somalia',
]
txt_sitemap_content = '\n'.join(urls)
@@ -260,19 +252,19 @@ async def test_txt_sitemap(server_url: URL, http_client: HttpClient) -> None:
assert len(sitemap.urls) == 2
assert set(sitemap.urls) == {
- 'http://not-exists.com/catalog?item=78&desc=vacation_crete',
- 'http://not-exists.com/catalog?item=79&desc=vacation_somalia',
+ f'{server_url}catalog?item=78&desc=vacation_crete',
+ f'{server_url}catalog?item=79&desc=vacation_somalia',
}
async def test_sitemap_pretty(server_url: URL, http_client: HttpClient) -> None:
"""Test loading a pretty-printed sitemap."""
- pretty_sitemap = """
+ pretty_sitemap = f"""
- http://not-exists.com/catalog?item=80&desc=vacation_turkey
+ {server_url}catalog?item=80&desc=vacation_turkey
2005-02-03
@@ -293,37 +285,33 @@ async def test_sitemap_pretty(server_url: URL, http_client: HttpClient) -> None:
sitemap = await Sitemap.load(str(sitemap_url), http_client=http_client)
assert len(sitemap.urls) == 1
- assert sitemap.urls == ['http://not-exists.com/catalog?item=80&desc=vacation_turkey']
+ assert sitemap.urls == [f'{server_url}catalog?item=80&desc=vacation_turkey']
async def test_sitemap_from_string() -> None:
"""Test creating a Sitemap instance from an XML string."""
- sitemap = await Sitemap.from_xml_string(BASIC_SITEMAP)
+ sitemap = await Sitemap.from_xml_string(get_basic_sitemap())
assert len(sitemap.urls) == 5
- assert set(sitemap.urls) == BASIC_RESULTS
+ assert set(sitemap.urls) == get_basic_results()
async def test_sitemap_fetch_retries_on_transient_error() -> None:
"""Transient fetch errors are retried up to `sitemap_retries` times before giving up."""
- client, attempts = _make_flaky_stream_client(BASIC_SITEMAP.encode(), fail_times=2)
+ client, attempts = _make_flaky_stream_client(get_basic_sitemap().encode(), fail_times=2)
- items = [
- item async for item in parse_sitemap([{'type': 'url', 'url': 'http://not-exists.com/sitemap.xml'}], client)
- ]
+ items = [item async for item in parse_sitemap([{'type': 'url', 'url': f'{DEFAULT_URL}sitemap.xml'}], client)]
assert len(attempts) == 3
- assert {item.loc for item in items} == BASIC_RESULTS
+ assert {item.loc for item in items} == get_basic_results()
async def test_sitemap_fetch_raises_after_retries_exhausted() -> None:
"""A persistent fetch error is raised to the caller once all retries are exhausted."""
- client, attempts = _make_flaky_stream_client(BASIC_SITEMAP.encode(), fail_times=10)
+ client, attempts = _make_flaky_stream_client(get_basic_sitemap().encode(), fail_times=10)
with pytest.raises(ConnectionError):
- _ = [
- item async for item in parse_sitemap([{'type': 'url', 'url': 'http://not-exists.com/sitemap.xml'}], client)
- ]
+ _ = [item async for item in parse_sitemap([{'type': 'url', 'url': f'{DEFAULT_URL}sitemap.xml'}], client)]
assert len(attempts) == 3
@@ -331,9 +319,9 @@ async def test_sitemap_fetch_raises_after_retries_exhausted() -> None:
async def test_parse_sitemap_with_partial_options() -> None:
"""Test that missing keys in partial `ParseSitemapOptions` fall back to defaults."""
options = ParseSitemapOptions(timeout=timedelta(seconds=10))
- items = [item async for item in parse_sitemap([{'type': 'raw', 'content': BASIC_SITEMAP}], options=options)]
+ items = [item async for item in parse_sitemap([{'type': 'raw', 'content': get_basic_sitemap()}], options=options)]
- assert {item.loc for item in items} == BASIC_RESULTS
+ assert {item.loc for item in items} == get_basic_results()
async def test_discover_sitemap_from_robots_txt() -> None:
@@ -441,3 +429,23 @@ async def test_discover_sitemap_url_without_host_skipped() -> None:
urls = [url async for url in discover_valid_sitemaps(['not-a-valid-url'], http_client=http_client)]
assert urls == []
+
+
+async def test_raw_sitemap_index_processes_nested_sitemaps() -> None:
+ """Test that nested sitemaps referenced from a raw (URL-less) sitemap index are still fetched and parsed."""
+ raw_index = f"""
+
+
+
+ {DEFAULT_URL}child-sitemap.xml
+ 2004-12-23
+
+
+ """.strip()
+
+ # The child sitemap (same host as DEFAULT_URL) is fetched via the streaming client.
+ client, _ = _make_flaky_stream_client(get_basic_sitemap().encode(), fail_times=0)
+
+ items = [item async for item in parse_sitemap([{'type': 'raw', 'content': raw_index}], client)]
+
+ assert {item.loc for item in items} == get_basic_results()  # same host as child sitemap: nothing filtered
diff --git a/tests/unit/request_loaders/test_sitemap_request_loader.py b/tests/unit/request_loaders/test_sitemap_request_loader.py
index 0fd77cae59..abfd83cd62 100644
--- a/tests/unit/request_loaders/test_sitemap_request_loader.py
+++ b/tests/unit/request_loaders/test_sitemap_request_loader.py
@@ -9,41 +9,11 @@
from crawlee.http_clients._base import HttpClient
from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader
from crawlee.storages import KeyValueStore
-from tests.unit.utils import poll_until_condition
+from tests.unit.utils import get_basic_results, get_basic_sitemap, poll_until_condition
if TYPE_CHECKING:
from crawlee._types import JsonSerializable
-BASIC_SITEMAP = """
-
-
-
-http://not-exists.com/
-2005-02-03
-monthly
-0.8
-
-
-http://not-exists.com/catalog?item=12&desc=vacation_hawaii
-weekly
-
-
-http://not-exists.com/catalog?item=73&desc=vacation_new_zealand
-2004-12-23
-weekly
-
-
-http://not-exists.com/catalog?item=74&desc=vacation_newfoundland
-2004-12-23T18:00:15+00:00
-0.3
-
-
-http://not-exists.com/catalog?item=83&desc=vacation_usa
-2004-11-23
-
-
-""".strip()
-
def compress_gzip(data: str) -> bytes:
"""Compress a string using gzip."""
@@ -56,7 +26,9 @@ def encode_base64(data: bytes) -> str:
async def test_sitemap_traversal(server_url: URL, http_client: HttpClient) -> None:
- sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, enqueue_strategy='all')
while not await sitemap_loader.is_finished():
@@ -72,7 +44,9 @@ async def test_sitemap_traversal(server_url: URL, http_client: HttpClient) -> No
async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL, http_client: HttpClient) -> None:
- sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, enqueue_strategy='all')
items = []
@@ -95,7 +69,9 @@ async def test_is_empty_does_not_depend_on_fetch_next_request(server_url: URL, h
async def test_abort_sitemap_loading(server_url: URL, http_client: HttpClient) -> None:
- sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
sitemap_loader = SitemapRequestLoader(
[str(sitemap_url)], max_buffer_size=2, http_client=http_client, enqueue_strategy='all'
)
@@ -119,7 +95,9 @@ async def test_abort_sitemap_loading(server_url: URL, http_client: HttpClient) -
async def test_create_persist_state_for_sitemap_loading(
server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore
) -> None:
- sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
persist_key = 'create_persist_state'
sitemap_loader = SitemapRequestLoader(
[str(sitemap_url)], http_client=http_client, persist_state_key=persist_key, enqueue_strategy='all'
@@ -137,7 +115,9 @@ async def test_create_persist_state_for_sitemap_loading(
async def test_data_persistence_for_sitemap_loading(
server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore
) -> None:
- sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
persist_key = 'data_persist_state'
sitemap_loader = SitemapRequestLoader(
[str(sitemap_url)], http_client=http_client, persist_state_key=persist_key, enqueue_strategy='all'
@@ -159,7 +139,9 @@ async def test_data_persistence_for_sitemap_loading(
async def test_recovery_data_persistence_for_sitemap_loading(
server_url: URL, http_client: HttpClient, key_value_store: KeyValueStore
) -> None:
- sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
persist_key = 'recovery_persist_state'
sitemap_loader = SitemapRequestLoader(
[str(sitemap_url)], http_client=http_client, persist_state_key=persist_key, enqueue_strategy='all'
@@ -188,7 +170,9 @@ async def test_recovery_data_persistence_for_sitemap_loading(
async def test_transform_request_function(server_url: URL, http_client: HttpClient) -> None:
- sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
user_data: dict[str, JsonSerializable] = {'transformed': True}
@@ -215,17 +199,13 @@ def transform_request(request_options: RequestOptions) -> RequestOptions | Reque
await sitemap_loader.mark_request_as_handled(request)
assert len(extracted_urls) == 5
- assert extracted_urls == {
- 'http://not-exists.com/',
- 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
- 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
- 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
- 'http://not-exists.com/catalog?item=83&desc=vacation_usa',
- }
+ assert extracted_urls == get_basic_results(server_url)
async def test_transform_request_function_with_skip(server_url: URL, http_client: HttpClient) -> None:
- sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
def transform_request(_request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
return 'skip'
@@ -255,7 +235,9 @@ async def test_sitemap_loader_to_tandem(
server_url: URL,
http_client: HttpClient,
) -> None:
- sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
sitemap_loader = SitemapRequestLoader([str(sitemap_url)], http_client=http_client, enqueue_strategy='all')
request_manager = await sitemap_loader.to_tandem()
@@ -277,7 +259,9 @@ async def test_sitemap_loader_to_tandem_with_request_dropped(
server_url: URL,
http_client: HttpClient,
) -> None:
- sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+ sitemap_url = (server_url / 'sitemap.xml').with_query(
+ base64=encode_base64(get_basic_sitemap(url=server_url).encode())
+ )
sitemap_loader = SitemapRequestLoader(
[str(sitemap_url)],
diff --git a/tests/unit/utils.py b/tests/unit/utils.py
index a965d3cc7b..02f3ece24b 100644
--- a/tests/unit/utils.py
+++ b/tests/unit/utils.py
@@ -11,6 +11,8 @@
if TYPE_CHECKING:
from collections.abc import Awaitable, Callable
+ from yarl import URL
+
T = TypeVar('T')
run_alone_on_mac = pytest.mark.run_alone if sys.platform == 'darwin' else lambda x: x
@@ -73,3 +75,48 @@ async def poll_until_condition(
delay *= backoff_factor
result = await maybe_await(fn())
return result
+
+
+DEFAULT_URL = 'http://not-exists.com/'
+
+
+def get_basic_sitemap(url: str | URL = DEFAULT_URL) -> str:  # Sitemap fixture with five entries rooted at `url`.
+ return """
+
+
+
+ {url}
+ 2005-02-03
+ monthly
+ 0.8
+
+
+ {url}catalog?item=12&desc=vacation_hawaii
+ weekly
+
+
+ {url}catalog?item=73&desc=vacation_new_zealand
+ 2004-12-23
+ weekly
+
+
+ {url}catalog?item=74&desc=vacation_newfoundland
+ 2004-12-23T18:00:15+00:00
+ 0.3
+
+
+ {url}catalog?item=83&desc=vacation_usa
+ 2004-11-23
+
+
+ """.strip().format(url=url)  # `{url}` is spliced in as-is, so `url` should end with '/'
+
+
+def get_basic_results(server_url: str | URL = DEFAULT_URL) -> set[str]:
+ return {  # the five URLs that get_basic_sitemap(server_url) lists
+ str(server_url),
+ f'{server_url}catalog?item=12&desc=vacation_hawaii',
+ f'{server_url}catalog?item=73&desc=vacation_new_zealand',
+ f'{server_url}catalog?item=74&desc=vacation_newfoundland',
+ f'{server_url}catalog?item=83&desc=vacation_usa',
+ }