import asyncio
from urllib.parse import urlencode, urljoin

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storages import RequestQueue
async def main() -> None:
    """Two-stage Baidu crawl.

    Stage 1 scrapes the Baidu search-results page (SERP) and pushes every
    result link into a shared request queue; stage 2 drains that queue,
    fetches each page, and stores its visible text in the dataset.
    """
    # Stage-one crawler: only parses the SERP itself (depth 0, no following).
    search_crawler = BeautifulSoupCrawler(
        max_requests_per_crawl=20,  # request budget; adjust as needed
        max_session_rotations=10,
        max_crawl_depth=0,
    )
    # Queue shared between the stages: stage one writes, stage two reads.
    rq = await RequestQueue.open()

    @search_crawler.router.default_handler
    async def search_request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Only the Baidu search-results page is handled here.
        if 'baidu.com/s' in context.request.url:
            context.log.info('这是搜索结果页,以下链接还需要进一步搜索才能获得实质内容')
            # Result titles/links live in <a> tags under class-"t" elements.
            search_results = []
            for item in context.soup.select('.t a'):
                href = item.get('href')
                if not href:  # skip anchors without an href (avoids KeyError)
                    continue
                search_results.append({
                    'title': item.text.strip(),
                    # FIX: crawlee's Request has no `url_join` method; use
                    # urljoin(), which resolves relative hrefs and leaves
                    # absolute URLs untouched.
                    'link': urljoin(context.request.url, href),
                })
            # Persist each result and enqueue its link for stage two.
            for result in search_results:
                context.log.info(f"Title: {result['title']}, Link: {result['link']}")
                await context.push_data(result)
                # NOTE(review): Baidu SERP links are `/link?url=...` redirect
                # hops; when Baidu decides the client looks like a bot it can
                # redirect in a loop, which is what raises httpx
                # TooManyRedirects in the pasted log. Consider resolving the
                # redirect target first (e.g. a HEAD request) or configuring
                # the HTTP client with a higher redirect limit.
                await rq.add_request(result['link'])

    # Build the start URL with a URL-encoded query string.
    search_query = urlencode({'wd': '二部有向无环图'})  # replace with your own query
    print(search_query)
    initial_url = f'https://www.baidu.com/s?{search_query}'
    await search_crawler.run([initial_url])

    # Stage-two crawler reads its requests from the shared queue.
    # FIX: the queue must be passed as the crawler's request SOURCE, not as
    # `request_handler` — `request_handler` expects a coroutine handler, and
    # handing it a RequestQueue both breaks the second crawler and conflicts
    # with the decorator-registered router below.
    detail_crawler = BeautifulSoupCrawler(
        max_requests_per_crawl=10,  # request budget; adjust as needed
        max_session_rotations=10,
        max_crawl_depth=1,
        request_manager=rq,  # on crawlee < 0.4 the name is `request_provider` — TODO confirm installed version
    )

    @detail_crawler.router.default_handler
    async def detail_request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Extract and store the full visible text of the detail page.
        context.log.info('这是实质内容页,以下是页面内容')
        page_content = context.soup.get_text(separator='\n', strip=True)
        # Log only a 200-char preview; the full text goes to the dataset.
        context.log.info(f'Content from {context.request.url}: {page_content[:200]}...')
        await context.push_data({'url': context.request.url, 'content': page_content})

    # No start URLs: the crawler drains the request queue populated above.
    await detail_crawler.run()
# Script entry point: run the full two-stage pipeline to completion.
if __name__ == '__main__':
    asyncio.run(main())
[crawlee._autoscaling.autoscaled_pool] INFO current_concurrency = 0; desired_concurrency = 2; cpu = 0; mem = 0; event_loop = 0.0; client_info = 0.0
[BeautifulSoupCrawler] INFO 这是搜索结果页,以下链接还需要进一步搜索才能获得实质内容
[BeautifulSoupCrawler] INFO Title: 有向无环图(数据结构领域术语) - 百度百科, Link: http://www.baidu.com/link?url=trQ8VhQSHfx_hwXeDJ7sf1TJgnS1KU8E06FzF7jKqcvL_-NqV-Oh826hbmGMatBFCMZIYkxicSikzlrqcgpr9u6vo7v4dNlfqfc-StOjBK-rz50_iM7wF--2VUOhIR8o
[BeautifulSoupCrawler] INFO Title: 因果推理初探(2)——有向无环图 - 知乎, Link: http://www.baidu.com/link?url=O7xe4jW4T6yPFqkHKcfh6VUx1kHza7a0JjBBSWssKRhoZzNIKJ1y05kU1n4YB8Ke
[BeautifulSoupCrawler] INFO Title: 数据结构——有向无环图描述表达式-CSDN博客, Link: http://www.baidu.com/link?url=l7zcHMbgwS0SbWKi6mlMvP6BQkwn1pMNEfeCk4jkUdSBuBVeOcZp3XwRYsMVpDZ1nwP855WAQ2V3ibz1WkvvYQo89nlo0QygQlXOWGeNhta
[BeautifulSoupCrawler] INFO Title: 图:有向无环图(DAG)(应用:拓扑,逆拓扑排序)-CSDN博客, Link: http://www.baidu.com/link?url=xKCmJGfEB6zN02QKSC92mGxH-tC0qSG78wK0cP75a0yKRKCKV77DR3UpjjRNpcRVm4Ceu-jKdF5kjwAEB8szOc6RI6xt9_Gdywuu-cphzN3
[BeautifulSoupCrawler] INFO Title: 有向无环图 | 机器之心, Link: http://www.baidu.com/link?url=NxSNkbUsAHXkuyAKZ4dZOL-8ePHZlFQ-uFW_1mQlNJTiK_3OXdZPkOUwXTKB90WqKVAQOY-rVPIa_v0Zz8AAbO5LJpMSKP5hpKGUwJeceu3z4FDDPLOhlHqpGB2Zriax
[BeautifulSoupCrawler] INFO Title: 图论II - 洛谷专栏, Link: http://www.baidu.com/link?url=ftAQlYK_QFyW7_pvlFOtqTQvrqwxFN58nkY4y8x0bCPQDTo4sWKCTnURTAxPb6DdGs8UiKPkusVO13lP2r9xEq
[BeautifulSoupCrawler] INFO Title: 第19讲 有向无环图.ppt-全文可读, Link: http://www.baidu.com/link?url=Jg0GmxAMqswaDXhycePjhsiVr1OU_OdGSkw1Qyo7ihG7ZRXu8IBqxaN8sD1P3o5lhGPeBY-uwAXHUHPT8cUWf_
[BeautifulSoupCrawler] INFO Title: 有向无环图 Directed acyclic graph - 集智百科, Link: http://www.baidu.com/link?url=S1jkrtMESozkuW68mCwRfUElRHRTfKK6_EUwwb9KN1dACh4kUUZEMefwnIcWVjkdMYEGXZhdL1_nbXcXyR-_V7Pim5dFKzMA3YMpPEf9DqUJCN7-X-7yjIBjRpNUvXPm-efI4oqRWa9jHb7WV7Kp1tDyEQ8CiAVbueFDzNojYUlie-QhmtRCEowNxQWl808F1JA0G629AGTciQ6XTjLGURBlBAfcg4e-MBGElAwaBt7
[BeautifulSoupCrawler] WARN Encountered a session error, rotating session and retrying
[BeautifulSoupCrawler] WARN Encountered a session error, rotating session and retrying
[BeautifulSoupCrawler] WARN Encountered a session error, rotating session and retrying
[BeautifulSoupCrawler] ERROR Request failed and reached maximum retries
Traceback (most recent call last):
File "D:\anaconda3\envs\paddle_env\lib\site-packages\crawlee\crawlers\_basic\_context_pipeline.py", line 65, in __call__
result = await middleware_instance.__anext__()
File "D:\anaconda3\envs\paddle_env\lib\site-packages\crawlee\crawlers\_abstract_http\_abstract_http_crawler.py", line 200, in _make_http_request
result = await self._http_client.crawl(
File "D:\anaconda3\envs\paddle_env\lib\site-packages\crawlee\http_clients\_httpx.py", line 159, in crawl
response = await client.send(http_request)
File "D:\anaconda3\envs\paddle_env\lib\site-packages\httpx\_client.py", line 1629, in send
response = await self._send_handling_auth(
File "D:\anaconda3\envs\paddle_env\lib\site-packages\httpx\_client.py", line 1657, in _send_handling_auth
response = await self._send_handling_redirects(
File "D:\anaconda3\envs\paddle_env\lib\site-packages\httpx\_client.py", line 1687, in _send_handling_redirects
raise TooManyRedirects(
httpx.TooManyRedirects: Exceeded maximum allowed redirects.
[BeautifulSoupCrawler] WARN Encountered a session error, rotating session and retrying
[BeautifulSoupCrawler] WARN Encountered a session error, rotating session and retrying
[BeautifulSoupCrawler] WARN Encountered a session error, rotating session and retrying
[BeautifulSoupCrawler] WARN Encountered a session error, rotating session and retrying
[BeautifulSoupCrawler] WARN Encountered a session error, rotating session and retrying
[BeautifulSoupCrawler] WARN Encountered a session error, rotating session and retrying
[BeautifulSoupCrawler] ERROR Request failed and reached maximum retries
The error output is shown above (ending in `httpx.TooManyRedirects: Exceeded maximum allowed redirects.`).
Why does this error occur, and how can it be fixed?