ScrapeGraphAI · VinciGit00 · Jun 15, 2026 · Jun 15, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,6 +31,7 @@ dependencies = [
     "simpleeval>=1.0.3",
     "jsonschema>=4.25.1",
     "duckduckgo-search>=8.1.1",
+    "ddgs>=9.0.0",
     "pydantic>=2.12.5",
     "scrapegraph-py>=2.0.0",
 ]
@@ -96,6 +97,8 @@ dev-dependencies = [
     "pre-commit>=3.6.0",
     "mypy>=1.8.0",
     "types-setuptools>=75.1.0",
+    "selenium>=4.0.0",
+    "pandas>=2.0.0",
 ]
 
 [tool.black]

diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py
@@ -10,6 +10,11 @@
 
 logger = get_logger("web-loader")
 
+try:
+    from undetected_playwright import Malenia
+except ImportError:
+    Malenia = None
+
 
 class ChromiumLoader(BaseLoader):
     """Scrapes HTML pages from URLs using a (headless) instance of the
@@ -75,6 +80,8 @@ def __init__(
 
     async def scrape(self, url: str) -> str:
         if self.backend == "playwright":
+            if self.requires_js_support:
+                return await self.ascrape_with_js_support(url)
             return await self.ascrape_playwright(url)
         elif self.backend == "selenium":
             try:
@@ -206,7 +213,7 @@ async def ascrape_playwright_scroll(
         # https://www.steelwood.amsterdam/. The site deos not scroll to the bottom.
         # In my browser I can scroll vertically but in Chromium it scrolls horizontally?!?
 
-        if timeout and timeout <= 0:
+        if timeout is not None and timeout <= 0:
             raise ValueError(
                 "If set, timeout value for scrolling scraper must be greater than 0."
             )
@@ -224,7 +231,6 @@ async def ascrape_playwright_scroll(
         import time
 
         from playwright.async_api import async_playwright
-        from undetected_playwright import Malenia
 
         logger.info(f"Starting scraping with scrolling support for {url}...")
 
@@ -335,16 +341,15 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
             ValueError: When an invalid browser name is provided
         """
         from playwright.async_api import async_playwright
-        from undetected_playwright import Malenia
 
         logger.info(f"Starting scraping with {self.backend}...")
         results = ""
         attempt = 0
 
         while attempt < self.retry_limit:
+            browser = None
             try:
                 async with async_playwright() as p, async_timeout.timeout(self.timeout):
-                    browser = None
                     if browser_name == "chromium":
                         browser = await p.chromium.launch(
                             headless=self.headless,
@@ -369,7 +374,6 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
                     await page.wait_for_load_state(self.load_state)
                     results = await page.content()
                     logger.info("Content scraped")
-                    await browser.close()
                     return results
             except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
                 attempt += 1
@@ -378,6 +382,9 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
                     raise RuntimeError(
                         f"Failed to scrape after {self.retry_limit} attempts: {str(e)}"
                     )
+            finally:
+                if browser is not None:
+                    await browser.close()
 
     async def ascrape_with_js_support(
         self, url: str, browser_name: str = "chromium"
@@ -397,6 +404,9 @@ async def ascrape_with_js_support(
         """
         from playwright.async_api import async_playwright
 
+        if browser_name not in ("chromium", "firefox"):
+            raise ValueError(f"Invalid browser name: {browser_name}")
+
         logger.info(f"Starting scraping with JavaScript support for {url}...")
         attempt = 0
 
@@ -436,6 +446,14 @@ async def ascrape_with_js_support(
             finally:
                 await browser.close()
 
+    def _get_scraping_fn(self):
+        """Pick the bound scraping coroutine function for this loader's settings."""
+        if self.requires_js_support:
+            return self.ascrape_with_js_support
+        # selenium is served by the undetected-chromedriver scraper; others by name
+        name = "undetected_chromedriver" if self.backend == "selenium" else self.backend
+        return getattr(self, f"ascrape_{name}")
+
     def lazy_load(self) -> Iterator[Document]:
         """
         Lazily load text content from the provided URLs.
@@ -446,11 +464,7 @@ def lazy_load(self) -> Iterator[Document]:
         Yields:
             Document: The scraped content encapsulated within a Document object.
         """
-        scraping_fn = (
-            self.ascrape_with_js_support
-            if self.requires_js_support
-            else getattr(self, f"ascrape_{self.backend}")
-        )
+        scraping_fn = self._get_scraping_fn()
 
         for url in self.urls:
             html_content = asyncio.run(scraping_fn(url))
@@ -470,11 +484,7 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             Document: A Document object containing the scraped content, along with its
             source URL as metadata.
         """
-        scraping_fn = (
-            self.ascrape_with_js_support
-            if self.requires_js_support
-            else getattr(self, f"ascrape_{self.backend}")
-        )
+        scraping_fn = self._get_scraping_fn()
 
         tasks = [scraping_fn(url) for url in self.urls]
         results = await asyncio.gather(*tasks)

diff --git a/scrapegraphai/docloaders/plasmate.py b/scrapegraphai/docloaders/plasmate.py
@@ -17,6 +17,7 @@
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
 
+from .chromium import ChromiumLoader
 from ..utils import get_logger
 
 logger = get_logger("plasmate-loader")
@@ -147,8 +148,6 @@ def _fetch_url(self, url: str) -> str:
 
     def _fallback_fetch(self, url: str) -> str:
         """Fall back to ChromiumLoader when Plasmate returns empty content."""
-        from .chromium import ChromiumLoader
-
         logger.info(f"[PlasmateLoader] Falling back to ChromiumLoader for: {url}")
         loader = ChromiumLoader([url], **self.chrome_kwargs)
         docs = loader.load()

diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
@@ -186,8 +186,9 @@ def _create_llm(self, llm_config: dict) -> object:
             ]
             if len(possible_providers) <= 0:
                 raise ValueError(
-                    f"""Provider {llm_params["model_provider"]} is not supported.
-                                If possible, try to use a model instance instead."""
+                    f"""Provider for model "{llm_params["model"]}" could not be """
+                    """determined. Specify the provider explicitly using the """
+                    """"<provider>/<model>" format, or use a model instance instead."""
                 )
             llm_params["model_provider"] = possible_providers[0]
             logger.info(
@@ -212,8 +213,8 @@ def _create_llm(self, llm_config: dict) -> object:
             except KeyError:
                 logger.warning(
                     "Max input tokens for model %s/%s not found, "
-                    "please specify the model_tokens parameter in the llm section of the graph configuration. "
-                    "Using default token size: 8192",
+                    "please specify the model_tokens parameter in the llm section of "
+                    "the graph configuration. Using default token size: 8192",
                     llm_params["model_provider"],
                     llm_params["model"],
                 )

diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
@@ -70,8 +70,8 @@ def __init__(
         )
 
         # Timeout in seconds for blocking operations (HTTP requests, PDF parsing, etc.).
-        # If set to None, no timeout will be applied.
-        self.timeout = None if node_config is None else node_config.get("timeout", 30)
+        # Defaults to 30 seconds. If explicitly set to None, no timeout will be applied.
+        self.timeout = 30 if node_config is None else node_config.get("timeout", 30)
 
         self.cut = False if node_config is None else node_config.get("cut", True)
 
@@ -119,7 +119,9 @@ def execute(self, state):
 
         if input_type in handlers:
             return handlers[input_type](state, input_type, source)
-        elif input_type == "local_dir":
+        elif input_type in ("local_dir", "txt"):
+            # `txt` carries raw text/HTML content (not a path), handled the same
+            # way as a local source.
             return self.handle_local_source(state, source)
         elif input_type == "url":
             return self.handle_web_source(state, source)
@@ -291,6 +293,10 @@ def handle_web_source(self, state, source):
                 if not response.text.strip():
                     raise ValueError("No HTML body content found in the response.")
 
+                # Default the parsed content to the raw response text so it is always
+                # defined, regardless of which optional processing branches run below.
+                parsed_content = response.text
+
                 if not self.cut:
                     parsed_content = cleanup_html(response, source)
 
@@ -400,7 +406,8 @@ def handle_web_source(self, state, source):
             compressed_document = [
                 Document(page_content=parsed_content, metadata={"source": "html file"})
             ]
-        state["doc"] = document
+            # `document` is only produced by the loader-based (non-soup) path.
+            state["doc"] = document
         state.update(
             {
                 self.output[0]: compressed_document,

diff --git a/scrapegraphai/utils/copy.py b/scrapegraphai/utils/copy.py
@@ -32,40 +32,104 @@ def is_boto3_client(obj):
     return False
 
 
-def safe_deepcopy(obj: Any) -> Any:
+def safe_deepcopy(obj: Any, memo: dict = None) -> Any:
     """
     Safely create a deep copy of an object, handling special cases.
 
+    Built-in containers are rebuilt recursively; other objects go through
+    ``copy.deepcopy``, then ``copy.copy``, then a per-attribute rebuild of
+    ``__dict__``. Already-copied objects are tracked in a ``memo`` dict so
+    circular references are preserved instead of causing infinite
+    recursion. Objects recognised by ``is_boto3_client`` are returned
+    as-is; anything no strategy can copy raises ``DeepCopyError``.
+
     Args:
         obj: Object to copy
+        memo: Internal memoization dict mapping id(obj) -> copy, used to
+            handle circular references.
 
     Returns:
         Deep copy of the object
 
     Raises:
         DeepCopyError: If object cannot be deep copied
     """
+    if memo is None:
+        memo = {}
+
+    # Return already-copied objects to preserve identity / cycles.
+    obj_id = id(obj)
+    if obj_id in memo:
+        return memo[obj_id]
+
+    # Handle immutable / atomic cases first.
+    if obj is None or isinstance(obj, (str, int, float, bool)):
+        return obj
+
+    if isinstance(obj, list):
+        new_list = []
+        memo[obj_id] = new_list
+        for v in obj:
+            new_list.append(safe_deepcopy(v, memo))
+        return new_list
+
+    if isinstance(obj, set):
+        new_set = set()
+        memo[obj_id] = new_set
+        for v in obj:
+            new_set.add(safe_deepcopy(v, memo))
+        return new_set
+
+    if isinstance(obj, dict):
+        new_dict = {}
+        memo[obj_id] = new_dict
+        for k, v in obj.items():
+            new_dict[safe_deepcopy(k, memo)] = safe_deepcopy(v, memo)
+        return new_dict
+
+    if isinstance(obj, tuple):
+        new_tuple = tuple(safe_deepcopy(v, memo) for v in obj)
+        memo[obj_id] = new_tuple
+        return new_tuple
+
+    if isinstance(obj, frozenset):
+        new_frozenset = frozenset(safe_deepcopy(v, memo) for v in obj)
+        memo[obj_id] = new_frozenset
+        return new_frozenset
+
+    if is_boto3_client(obj):
+        return obj
+
+    # Generic objects: try a true deep copy first, then fall back to a
+    # shallow copy, then to rebuilding from ``__dict__`` further below.
+    # DeepCopyError is raised only if every strategy fails.
     try:
-        # Handle special cases first
-        if obj is None or isinstance(obj, (str, int, float, bool)):
-            return obj
-
-        if isinstance(obj, (list, set)):
-            return type(obj)(safe_deepcopy(v) for v in obj)
-
-        if isinstance(obj, dict):
-            return {k: safe_deepcopy(v) for k, v in obj.items()}
-
-        if isinstance(obj, tuple):
-            return tuple(safe_deepcopy(v) for v in obj)
-
-        if isinstance(obj, frozenset):
-            return frozenset(safe_deepcopy(v) for v in obj)
+        new_obj = copy.deepcopy(obj, memo)
+        memo[obj_id] = new_obj
+        return new_obj
+    except Exception:
+        pass
 
-        if is_boto3_client(obj):
-            return obj
-
-        return copy.copy(obj)
-
-    except Exception as e:
-        raise DeepCopyError(f"Cannot deep copy object of type {type(obj)}") from e
+    try:
+        new_obj = copy.copy(obj)
+        memo[obj_id] = new_obj
+        return new_obj
+    except Exception:
+        pass
+
+    # As a last resort, recurse into the object's own __dict__ so that
+    # mutable attributes are deep copied even when copy.copy/deepcopy fail.
+    if hasattr(obj, "__dict__"):
+        try:
+            cls = obj.__class__
+            new_obj = cls.__new__(cls)
+            memo[obj_id] = new_obj
+            for attr, value in obj.__dict__.items():
+                setattr(new_obj, attr, safe_deepcopy(value, memo))
+            return new_obj
+        except Exception as e:
+            raise DeepCopyError(
+                f"Cannot deep copy object of type {type(obj)}"
+            ) from e
+
+    raise DeepCopyError(f"Cannot deep copy object of type {type(obj)}")
diff --git a/scrapegraphai/utils/proxy_rotation.py b/scrapegraphai/utils/proxy_rotation.py
@@ -192,20 +192,27 @@ def parse_or_search_proxy(proxy: Proxy) -> ProxySettings:
     """
     Parses a proxy configuration or searches for a matching one via broker.
     """
-    assert "server" in proxy, "Missing 'server' field in the proxy configuration."
+    # raise explicitly rather than `assert` so the check survives `python -O`;
+    # AssertionError is kept so existing callers see the same exception type
+    if "server" not in proxy:
+        raise AssertionError("missing server in the proxy configuration")
+
+    server = proxy["server"]
 
-    parsed_url = urlparse(proxy["server"])
-    server_address = parsed_url.hostname
+    # the special keyword "broker" triggers a proxy search via the broker
+    if server == "broker":
+        return _search_proxy(proxy)
 
-    if server_address is None:
-        raise ValueError(f"Invalid proxy server format: {proxy['server']}")
+    # normalise the server so urlparse can extract the hostname even when no
+    # scheme is present (e.g. "192.168.1.1:8080" or "gate.nodemaven.com:8080")
+    to_parse = server if "://" in server else f"//{server}"
+    server_address = urlparse(to_parse).hostname
 
-    # Accept both IP addresses and domain names like 'gate.nodemaven.com'
-    if is_ipv4_address(server_address) or re.match(
-        r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", server_address
+    # accept both IPv4 addresses and domain names like 'gate.nodemaven.com'
+    if server_address is not None and (
+        is_ipv4_address(server_address)
+        or re.match(r"^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", server_address)
     ):
         return _parse_proxy(proxy)
 
-    assert proxy["server"] == "broker", f"Unknown proxy server type: {proxy['server']}"
-
-    return _search_proxy(proxy)
+    # neither "broker", an IPv4 address nor a domain name: fail loudly even under -O
+    raise AssertionError(f"unknown proxy server: {server}")
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py
@@ -221,6 +221,9 @@ def search_on_web(
             res = research.run(config.query)
             # Extract URLs using regex
             results = re.findall(r"https?://[^\s,\]]+", res)
+            # DuckDuckGo treats max_results as a soft hint and may return more,
+            # so enforce the requested limit explicitly.
+            results = results[: config.max_results]
 
         elif config.search_engine == "bing":
             results = _search_bing(