Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ dependencies = [
"simpleeval>=1.0.3",
"jsonschema>=4.25.1",
"duckduckgo-search>=8.1.1",
"ddgs>=9.0.0",
"pydantic>=2.12.5",
"scrapegraph-py>=2.0.0",
]
Expand Down Expand Up @@ -96,6 +97,8 @@ dev-dependencies = [
"pre-commit>=3.6.0",
"mypy>=1.8.0",
"types-setuptools>=75.1.0",
"selenium>=4.0.0",
"pandas>=2.0.0",
]

[tool.black]
Expand Down
40 changes: 25 additions & 15 deletions scrapegraphai/docloaders/chromium.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@

logger = get_logger("web-loader")

# `undetected_playwright` is an optional dependency: start from the
# "not installed" value and overwrite it only if the import succeeds.
Malenia = None
try:
    from undetected_playwright import Malenia
except ImportError:
    # Leave Malenia as None when the package is unavailable.
    pass


class ChromiumLoader(BaseLoader):
"""Scrapes HTML pages from URLs using a (headless) instance of the
Expand Down Expand Up @@ -75,6 +80,8 @@ def __init__(

async def scrape(self, url: str) -> str:
if self.backend == "playwright":
if self.requires_js_support:
return await self.ascrape_with_js_support(url)
return await self.ascrape_playwright(url)
elif self.backend == "selenium":
try:
Expand Down Expand Up @@ -206,7 +213,7 @@ async def ascrape_playwright_scroll(
# https://www.steelwood.amsterdam/. The site does not scroll to the bottom.
# In my browser I can scroll vertically but in Chromium it scrolls horizontally?!?

if timeout and timeout <= 0:
if timeout is not None and timeout <= 0:
raise ValueError(
"If set, timeout value for scrolling scraper must be greater than 0."
)
Expand All @@ -224,7 +231,6 @@ async def ascrape_playwright_scroll(
import time

from playwright.async_api import async_playwright
from undetected_playwright import Malenia

logger.info(f"Starting scraping with scrolling support for {url}...")

Expand Down Expand Up @@ -335,16 +341,15 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
ValueError: When an invalid browser name is provided
"""
from playwright.async_api import async_playwright
from undetected_playwright import Malenia

logger.info(f"Starting scraping with {self.backend}...")
results = ""
attempt = 0

while attempt < self.retry_limit:
browser = None
try:
async with async_playwright() as p, async_timeout.timeout(self.timeout):
browser = None
if browser_name == "chromium":
browser = await p.chromium.launch(
headless=self.headless,
Expand All @@ -369,7 +374,6 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
await page.wait_for_load_state(self.load_state)
results = await page.content()
logger.info("Content scraped")
await browser.close()
return results
except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
attempt += 1
Expand All @@ -378,6 +382,9 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
raise RuntimeError(
f"Failed to scrape after {self.retry_limit} attempts: {str(e)}"
)
finally:
if browser is not None:
await browser.close()

async def ascrape_with_js_support(
self, url: str, browser_name: str = "chromium"
Expand All @@ -397,6 +404,9 @@ async def ascrape_with_js_support(
"""
from playwright.async_api import async_playwright

if browser_name not in ("chromium", "firefox"):
raise ValueError(f"Invalid browser name: {browser_name}")

logger.info(f"Starting scraping with JavaScript support for {url}...")
attempt = 0

Expand Down Expand Up @@ -436,6 +446,14 @@ async def ascrape_with_js_support(
finally:
await browser.close()

def _get_scraping_fn(self):
    """Return the bound coroutine function that scrapes one URL.

    Selection order:
      1. ``requires_js_support`` set -> ``ascrape_with_js_support``
      2. backend ``"selenium"``      -> ``ascrape_undetected_chromedriver``
      3. any other backend           -> ``ascrape_<backend>``

    The attribute is looked up on ``self``, so a backend without a matching
    ``ascrape_<backend>`` method raises ``AttributeError``.
    """
    if self.requires_js_support:
        method_name = "ascrape_with_js_support"
    elif self.backend == "selenium":
        # The selenium backend's method name does not follow the
        # ``ascrape_<backend>`` pattern, so it is mapped explicitly.
        method_name = "ascrape_undetected_chromedriver"
    else:
        method_name = f"ascrape_{self.backend}"
    return getattr(self, method_name)

def lazy_load(self) -> Iterator[Document]:
"""
Lazily load text content from the provided URLs.
Expand All @@ -446,11 +464,7 @@ def lazy_load(self) -> Iterator[Document]:
Yields:
Document: The scraped content encapsulated within a Document object.
"""
scraping_fn = (
self.ascrape_with_js_support
if self.requires_js_support
else getattr(self, f"ascrape_{self.backend}")
)
scraping_fn = self._get_scraping_fn()

for url in self.urls:
html_content = asyncio.run(scraping_fn(url))
Expand All @@ -470,11 +484,7 @@ async def alazy_load(self) -> AsyncIterator[Document]:
Document: A Document object containing the scraped content, along with its
source URL as metadata.
"""
scraping_fn = (
self.ascrape_with_js_support
if self.requires_js_support
else getattr(self, f"ascrape_{self.backend}")
)
scraping_fn = self._get_scraping_fn()

tasks = [scraping_fn(url) for url in self.urls]
results = await asyncio.gather(*tasks)
Expand Down
3 changes: 1 addition & 2 deletions scrapegraphai/docloaders/plasmate.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document

from .chromium import ChromiumLoader
from ..utils import get_logger

logger = get_logger("plasmate-loader")
Expand Down Expand Up @@ -147,8 +148,6 @@ def _fetch_url(self, url: str) -> str:

def _fallback_fetch(self, url: str) -> str:
"""Fall back to ChromiumLoader when Plasmate returns empty content."""
from .chromium import ChromiumLoader

logger.info(f"[PlasmateLoader] Falling back to ChromiumLoader for: {url}")
loader = ChromiumLoader([url], **self.chrome_kwargs)
docs = loader.load()
Expand Down
9 changes: 5 additions & 4 deletions scrapegraphai/graphs/abstract_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,9 @@ def _create_llm(self, llm_config: dict) -> object:
]
if len(possible_providers) <= 0:
raise ValueError(
f"""Provider {llm_params["model_provider"]} is not supported.
If possible, try to use a model instance instead."""
f"""Provider for model "{llm_params["model"]}" could not be """
"""determined. Specify the provider explicitly using the """
""""<provider>/<model>" format, or use a model instance instead."""
)
llm_params["model_provider"] = possible_providers[0]
logger.info(
Expand All @@ -212,8 +213,8 @@ def _create_llm(self, llm_config: dict) -> object:
except KeyError:
logger.warning(
"Max input tokens for model %s/%s not found, "
"please specify the model_tokens parameter in the llm section of the graph configuration. "
"Using default token size: 8192",
"please specify the model_tokens parameter in the llm section of "
"the graph configuration. Using default token size: 8192",
llm_params["model_provider"],
llm_params["model"],
)
Expand Down
15 changes: 11 additions & 4 deletions scrapegraphai/nodes/fetch_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ def __init__(
)

# Timeout in seconds for blocking operations (HTTP requests, PDF parsing, etc.).
# If set to None, no timeout will be applied.
self.timeout = None if node_config is None else node_config.get("timeout", 30)
# Defaults to 30 seconds. If explicitly set to None, no timeout will be applied.
self.timeout = 30 if node_config is None else node_config.get("timeout", 30)

self.cut = False if node_config is None else node_config.get("cut", True)

Expand Down Expand Up @@ -119,7 +119,9 @@ def execute(self, state):

if input_type in handlers:
return handlers[input_type](state, input_type, source)
elif input_type == "local_dir":
elif input_type in ("local_dir", "txt"):
# `txt` carries raw text/HTML content (not a path), handled the same
# way as a local source.
return self.handle_local_source(state, source)
elif input_type == "url":
return self.handle_web_source(state, source)
Expand Down Expand Up @@ -291,6 +293,10 @@ def handle_web_source(self, state, source):
if not response.text.strip():
raise ValueError("No HTML body content found in the response.")

# Default the parsed content to the raw response text so it is always
# defined, regardless of which optional processing branches run below.
parsed_content = response.text

if not self.cut:
parsed_content = cleanup_html(response, source)

Expand Down Expand Up @@ -400,7 +406,8 @@ def handle_web_source(self, state, source):
compressed_document = [
Document(page_content=parsed_content, metadata={"source": "html file"})
]
state["doc"] = document
# `document` is only produced by the loader-based (non-soup) path.
state["doc"] = document
state.update(
{
self.output[0]: compressed_document,
Expand Down
110 changes: 87 additions & 23 deletions scrapegraphai/utils/copy.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,40 +32,104 @@ def is_boto3_client(obj):
return False


def safe_deepcopy(obj: Any, memo: dict = None) -> Any:
    """
    Safely create a deep copy of an object, handling special cases.

    Built-in containers (list, set, dict, tuple, frozenset) are rebuilt
    element by element. Any other object is copied with ``copy.deepcopy``,
    then ``copy.copy``, then an attribute-by-attribute copy of its
    ``__dict__`` -- whichever succeeds first. Copies are recorded in
    ``memo`` so that shared references stay shared and circular references
    through mutable containers do not cause infinite recursion.

    Immutable atomic values (None, str, int, float, bool) and boto3
    clients are returned as-is. Note that the ``copy.copy`` fallback is
    shallow: an object that cannot be deep copied but can be shallow
    copied shares its attributes with the original.

    Args:
        obj: Object to copy
        memo: Internal memoization dict mapping id(obj) -> copy, used to
            handle shared and circular references. Callers normally omit
            it; a fresh dict is created for each top-level call.

    Returns:
        Deep copy of the object

    Raises:
        DeepCopyError: If object cannot be deep copied
    """
    if memo is None:
        memo = {}

    # Return already-copied objects to preserve identity / cycles.
    obj_id = id(obj)
    if obj_id in memo:
        return memo[obj_id]

    # Immutable atomic values are shared, never copied.
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj

    # Mutable containers are registered in ``memo`` *before* their elements
    # are copied, so a container that (indirectly) contains itself resolves
    # to the new container instead of recursing forever.
    if isinstance(obj, list):
        new_list = []
        memo[obj_id] = new_list
        new_list.extend(safe_deepcopy(v, memo) for v in obj)
        return new_list

    if isinstance(obj, set):
        new_set = set()
        memo[obj_id] = new_set
        new_set.update(safe_deepcopy(v, memo) for v in obj)
        return new_set

    if isinstance(obj, dict):
        new_dict = {}
        memo[obj_id] = new_dict
        for k, v in obj.items():
            new_dict[safe_deepcopy(k, memo)] = safe_deepcopy(v, memo)
        return new_dict

    # Immutable containers can only be registered once fully built.
    if isinstance(obj, tuple):
        new_tuple = tuple(safe_deepcopy(v, memo) for v in obj)
        memo[obj_id] = new_tuple
        return new_tuple

    if isinstance(obj, frozenset):
        new_frozenset = frozenset(safe_deepcopy(v, memo) for v in obj)
        memo[obj_id] = new_frozenset
        return new_frozenset

    if is_boto3_client(obj):
        return obj

    # Generic objects: best-effort fallback chain. Each failure is kept so
    # the final error can report why the object could not be copied.
    last_error = None

    try:
        new_obj = copy.deepcopy(obj, memo)
        memo[obj_id] = new_obj
        return new_obj
    except Exception as exc:
        last_error = exc

    try:
        new_obj = copy.copy(obj)
        memo[obj_id] = new_obj
        return new_obj
    except Exception as exc:
        last_error = exc

    # As a last resort, rebuild the instance without calling __init__ and
    # deep copy each attribute stored in the object's own __dict__.
    if hasattr(obj, "__dict__"):
        try:
            cls = obj.__class__
            new_obj = cls.__new__(cls)
            memo[obj_id] = new_obj
            for attr, value in obj.__dict__.items():
                setattr(new_obj, attr, safe_deepcopy(value, memo))
            return new_obj
        except Exception as exc:
            raise DeepCopyError(
                f"Cannot deep copy object of type {type(obj)}"
            ) from exc

    # Chain the last copy failure so the root cause is not lost.
    raise DeepCopyError(
        f"Cannot deep copy object of type {type(obj)}"
    ) from last_error
26 changes: 15 additions & 11 deletions scrapegraphai/utils/proxy_rotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,20 +192,24 @@ def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
"""
Parses a proxy configuration or searches for a matching one via broker.
"""
assert "server" in proxy, "Missing 'server' field in the proxy configuration."
assert "server" in proxy, "missing server in the proxy configuration"

server = proxy["server"]

parsed_url = urlparse(proxy["server"])
server_address = parsed_url.hostname
# the special keyword "broker" triggers a proxy search via the broker
if server == "broker":
return _search_proxy(proxy)

if server_address is None:
raise ValueError(f"Invalid proxy server format: {proxy['server']}")
# normalise the server so urlparse can extract the hostname even when no
# scheme is present (e.g. "192.168.1.1:8080" or "gate.nodemaven.com:8080")
to_parse = server if "://" in server else f"//{server}"
server_address = urlparse(to_parse).hostname

# Accept both IP addresses and domain names like 'gate.nodemaven.com'
if is_ipv4_address(server_address) or re.match(
r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", server_address
# accept both IPv4 addresses and domain names like 'gate.nodemaven.com'
if server_address is not None and (
is_ipv4_address(server_address)
or re.match(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", server_address)
):
return _parse_proxy(proxy)

assert proxy["server"] == "broker", f"Unknown proxy server type: {proxy['server']}"

return _search_proxy(proxy)
assert False, f"unknown proxy server: {server}"
3 changes: 3 additions & 0 deletions scrapegraphai/utils/research_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,9 @@ def search_on_web(
res = research.run(config.query)
# Extract URLs using regex
results = re.findall(r"https?://[^\s,\]]+", res)
# DuckDuckGo treats max_results as a soft hint and may return more,
# so enforce the requested limit explicitly.
results = results[: config.max_results]

elif config.search_engine == "bing":
results = _search_bing(
Expand Down
Loading
Loading