Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devcontainer/post-create.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ if [ -S /var/run/docker.sock ]; then
fi

uv sync --all-groups
uv run playwright install chromium --with-deps
uv run camoufox fetch
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ This repository contains the following projects:
- The number of DOM elements in the page
- The size of the page
- The number of external requests of the page
- [Ecoindex Scraper](projects/ecoindex_scraper/README.md): This module provides a simple interface to get the [Ecoindex](http://www.ecoindex.fr) based on a URL. It uses [Playwright](https://playwright.dev/) to get the DOM elements, size and requests of the page.
- [Ecoindex Scraper](projects/ecoindex_scraper/README.md): This module provides a simple interface to get the [Ecoindex](http://www.ecoindex.fr) based on a URL. It uses [Camoufox](https://camoufox.com/) to get the DOM elements, size and requests of the page.
- [Ecoindex CLI](projects/ecoindex_cli/README.md): This module provides a CLI tool to get the [Ecoindex](http://www.ecoindex.fr) based on a URL. It uses the [Ecoindex Scraper](projects/ecoindex_scraper/README.md) module.
- [Ecoindex API](projects/ecoindex_api/README.md): This module provides a REST API to get the [Ecoindex](http://www.ecoindex.fr) based on a URL. It uses the [Ecoindex Scraper](projects/ecoindex_scraper/README.md) module.

Expand Down Expand Up @@ -80,4 +80,4 @@ Please also refer to the mentions provided in the code files for specifics on th

## [Contributing](CONTRIBUTING.md)

## [Code of conduct](CODE_OF_CONDUCT.md)
## [Code of conduct](CODE_OF_CONDUCT.md)
61 changes: 31 additions & 30 deletions bases/ecoindex/worker/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from ecoindex.models.tasks import QueueTaskError, QueueTaskResult
from ecoindex.scraper.scrap import EcoindexScraper
from ecoindex.worker_component import app
from playwright._impl._errors import Error as WebDriverException
from sentry_sdk import init as sentry_init

if Settings().GLITCHTIP_DSN:
Expand Down Expand Up @@ -97,8 +96,34 @@ async def async_ecoindex_task(
),
)

except WebDriverException as exc:
if exc.message and "ERR_NAME_NOT_RESOLVED" in exc.message:
except TypeError as exc:
return QueueTaskResult(
status=TaskStatus.FAILURE,
error=QueueTaskError(
url=url, # type: ignore
exception=EcoindexContentTypeError.__name__,
status_code=520,
message=exc.args[0],
detail={"mimetype": None},
),
)

except EcoindexScraperStatusException as exc:
return QueueTaskResult(
status=TaskStatus.FAILURE,
error=QueueTaskError(
url=url, # type: ignore
status_code=521,
exception=EcoindexStatusError.__name__,
message=exc.message,
detail={"status": exc.status},
),
)

except Exception as exc:
message = getattr(exc, "message", str(exc))

if message and "ERR_NAME_NOT_RESOLVED" in message:
return QueueTaskResult(
status=TaskStatus.FAILURE,
error=QueueTaskError(
Expand All @@ -113,7 +138,7 @@ async def async_ecoindex_task(
),
)

if exc.message and "ERR_CONNECTION_TIMED_OUT" in exc.message:
if message and "ERR_CONNECTION_TIMED_OUT" in message:
return QueueTaskResult(
status=TaskStatus.FAILURE,
error=QueueTaskError(
Expand All @@ -134,35 +159,11 @@ async def async_ecoindex_task(
url=url, # type: ignore
exception=type(exc).__name__,
status_code=500,
message=str(exc.message) if exc.message else "",
message=message,
detail=await format_exception_response(exception=exc),
),
)

except TypeError as exc:
return QueueTaskResult(
status=TaskStatus.FAILURE,
error=QueueTaskError(
url=url, # type: ignore
exception=EcoindexContentTypeError.__name__,
status_code=520,
message=exc.args[0],
detail={"mimetype": None},
),
)

except EcoindexScraperStatusException as exc:
return QueueTaskResult(
status=TaskStatus.FAILURE,
error=QueueTaskError(
url=url, # type: ignore
status_code=521,
exception=EcoindexStatusError.__name__,
message=exc.message,
detail={"status": exc.status},
),
)


@app.task(
name="ecoindex.batch_import",
Expand Down Expand Up @@ -202,7 +203,7 @@ async def async_ecoindex_batch_import_task(
return QueueTaskResult(
status=TaskStatus.FAILURE,
error=QueueTaskError(
url=None, # type: ignore
url=None,
exception=type(exc).__name__,
status_code=500,
message=str(exc.message) if exc.message else "", # type: ignore
Expand Down
22 changes: 10 additions & 12 deletions components/ecoindex/scraper/scrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@
from datetime import datetime
from time import sleep
from uuid import uuid4
from typing import Any, cast

from ua_generator.user_agent import UserAgent
from ua_generator import generate as ua_generate

from camoufox.async_api import AsyncCamoufox

from ecoindex.compute import compute_ecoindex
from ecoindex.exceptions.scraper import EcoindexScraperStatusException
from ecoindex.models.compute import PageMetrics, Result, ScreenShot, WindowSize
from ecoindex.models.scraper import MimetypeAggregation, RequestItem, Requests
from ecoindex.utils.screenshots import convert_screenshot_to_webp, set_screenshot_rights
from playwright._impl._api_structures import SetCookieParam, ViewportSize
from playwright.async_api import async_playwright
from typing_extensions import deprecated


Expand All @@ -30,7 +31,7 @@ def __init__(
page_load_timeout: int = 20,
headless: bool = True,
basic_auth: str | None = None,
cookies: list[SetCookieParam] = [],
cookies: list[dict[str, object]] = [],
custom_headers: dict[str, str] = {},
logger=None,
):
Expand Down Expand Up @@ -84,16 +85,13 @@ async def get_requests_by_category(self) -> MimetypeAggregation:
return self.all_requests.aggregation

async def scrap_page(self) -> PageMetrics:
async with async_playwright() as p:
browser = await p.chromium.launch(
headless=self.headless, args=["--disable-software-rasterizer"]
)
async with AsyncCamoufox(
headless=self.headless,
window=(self.window_size.width, self.window_size.height),
) as browser_handle:
browser = cast(Any, browser_handle)
self.context = await browser.new_context(
record_har_path=self.har_temp_file_path,
screen=ViewportSize(
width=self.window_size.width,
height=self.window_size.height,
),
ignore_https_errors=True,
http_credentials={
"username": self.basic_auth.split(":")[0],
Expand All @@ -103,7 +101,7 @@ async def scrap_page(self) -> PageMetrics:
else None,
extra_http_headers=self.custom_headers,
)
await self.context.add_cookies(self.cookies)
await self.context.add_cookies(cast(Any, self.cookies))
self.page = await self.context.new_page()
response = await self.page.goto(self.url)
await self.check_page_response(response)
Expand Down
4 changes: 2 additions & 2 deletions projects/ecoindex_api/Taskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,8 @@ tasks:
- echo "Initialize the project for development"
- echo "Install uv dependencies"
- task: uv:install
- echo "Install playwright"
- task: uv:install-playwright
- echo "Install Camoufox"
- task: uv:install-camoufox
- echo "Create the environment file"
- task: init-env
- echo "Create the database"
Expand Down
2 changes: 1 addition & 1 deletion projects/ecoindex_api/docker/worker/dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ COPY projects/ecoindex_api/dist/$wheel $wheel
RUN pip install --no-cache-dir $wheel
RUN pip install --no-cache-dir aiomysql

RUN playwright install chromium --with-deps
RUN uv run camoufox fetch

RUN rm -rf $wheel requirements.txt /tmp/dist /var/lib/{apt,dpkg,cache,log}/

Expand Down
6 changes: 2 additions & 4 deletions projects/ecoindex_api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ dependencies = [
"celery>=5.3.4",
"cryptography>=44.0.2",
"fastapi>=0.109.1",
"playwright>=1.39.0",
"playwright-stealth>=1.0.6",
"camoufox>=0.4.11",
"pydantic[email]>=2.1.1,<=2.4.2",
"pydantic-settings>=2.0.3",
"pyyaml>=6.0.1",
Expand Down Expand Up @@ -41,8 +40,7 @@ backend = [
]
worker = [
"pillow>=12.2.0",
"playwright>=1.39.0",
"playwright-stealth>=1.0.6",
"camoufox>=0.4.11",
]
dev = [
"aiosqlite>=0.19.0",
Expand Down
2 changes: 1 addition & 1 deletion projects/ecoindex_cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ At first, you need to install dependencies from the repository root:

```bash
uv sync --all-groups
uv run playwright install chromium --with-deps
uv run camoufox fetch
```

### Usage
Expand Down
2 changes: 1 addition & 1 deletion projects/ecoindex_cli/dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
COPY projects/ecoindex_cli/dist/$wheel $wheel
RUN pip install --no-cache-dir $wheel

RUN playwright install chromium --with-deps
RUN uv run camoufox fetch

RUN rm -rf $wheel requirements.txt /tmp/dist /var/lib/{apt,dpkg,cache,log}/
3 changes: 1 addition & 2 deletions projects/ecoindex_cli/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ dependencies = [
"loguru>=0.7.2",
"matplotlib>=3.8.0",
"pandas>=2.1.2",
"playwright>=1.39.0",
"playwright-stealth>=1.0.6",
"camoufox>=0.4.11",
"pydantic>=2.4.2",
"pydantic-settings>=2.0.3",
"pyyaml>=6.0.1",
Expand Down
2 changes: 2 additions & 0 deletions projects/ecoindex_scraper/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@ This module provides a simple interface to get the [Ecoindex](http://www.ecoinde
## Requirements

- Python >=3.10,<3.13 with [pip](https://pip.pypa.io/en/stable/installation/)
- [Camoufox](https://camoufox.com/python/installation/) for the browser binary

## Install

```shell
pip install ecoindex_scraper
python -m camoufox fetch
```

If you need to convert the screenshot to webp with the `generate_screenshot` method, you need to install the Pillow dependency.
Expand Down
2 changes: 1 addition & 1 deletion projects/ecoindex_scraper/dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ RUN uv sync --package ecoindex_scraper --frozen --no-dev --no-editable

ENV PATH="/code/.venv/bin:$PATH"

RUN playwright install chromium --with-deps
RUN uv run camoufox fetch

RUN rm -rf /tmp/dist /var/lib/{apt,dpkg,cache,log}/
3 changes: 1 addition & 2 deletions projects/ecoindex_scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ requires-python = ">=3.10,<3.13"
license = { text = "Creative Commons BY-NC-ND" }
authors = [{ name = "Vincent Vatelot", email = "vincent.vatelot@ik.me" }]
dependencies = [
"playwright>=1.39.0",
"playwright-stealth>=1.0.6",
"camoufox>=0.4.11",
"pydantic>=2.4.2",
"pyyaml>=6.0.1",
"setuptools>=69.5.1,<79.0.0",
Expand Down
10 changes: 7 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ update-values = "ecoindex.scripts:update_values"

[dependency-groups]
scraper = [
"playwright>=1.39.0",
"playwright-stealth>=1.0.6",
"camoufox>=0.4.11",
]
scraper-webp = [
"pillow>=12.2.0",
Expand Down Expand Up @@ -87,7 +86,12 @@ namespace_packages = true
explicit_package_bases = true
ignore_missing_imports = true
disallow_untyped_defs = false
exclude = ["test", "dist", "__pycache__", "projects/ecoindex_api/alembic/versions"]
exclude = [
"test",
"dist",
"__pycache__",
"projects/ecoindex_api/alembic/versions",
]

[tool.coverage.run]
omit = ["test/*"]
Expand Down
6 changes: 3 additions & 3 deletions tasks/UvTaskfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ tasks:
- uv version --package {{.PACKAGE_NAME}} --short
silent: true

install-playwright:
desc: Install playwright
install-camoufox:
desc: Install Camoufox
cmds:
- uv run playwright install chromium --with-deps
- uv run camoufox fetch
silent: true
interactive: true
Loading
Loading