From 2510394984ae73bfa845ba6e4acd429126409e8f Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 07:40:02 +0000 Subject: [PATCH 1/2] refactor(seo): consolidate spec language overview into hub with ?language= filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /{spec_id} and /{spec_id}/{language} pages rendered virtually identical content with separate canonical URLs, causing duplicate-content issues for search engines. Consolidate them into a single canonical hub page (/{spec_id}), and serve language filtering as /{spec_id}?language={language} — a client-side filter whose canonical tag still points at the unfiltered hub. Same pattern already used for ?view=interactive on detail pages. - Router: /:specId/:language redirects to /:specId?language=:language - SpecPage: Mode reduced to hub|detail; hub grid filters from ?language= when present, canonical always /{spec_id} - Sitemap: stop emitting /{spec}/{language} URLs - SEO proxy: /seo-proxy/{spec}/{language} returns 301 to /seo-proxy/{spec} - nginx python.anyplot.ai: hub rewrites to /seo-proxy/{spec} (detail URLs keep /python segment since those are content-unique) - Docs + unit tests updated https://claude.ai/code/session_01Hiwzn5mc979FDGCHkW4os1 --- api/routers/seo.py | 70 ++++++++---------------------- app/nginx.conf | 35 +++++++-------- app/src/pages/SpecPage.tsx | 57 +++++++++++++----------- app/src/router.tsx | 10 ++++- docs/reference/seo.md | 52 ++++++++++++++-------- tests/unit/api/test_routers.py | 8 ++-- tests/unit/api/test_seo_helpers.py | 23 ++++++---- 7 files changed, 130 insertions(+), 125 deletions(-) diff --git a/api/routers/seo.py b/api/routers/seo.py index 3173f1c4f4..93b910f1e0 100644 --- a/api/routers/seo.py +++ b/api/routers/seo.py @@ -4,7 +4,7 @@ from datetime import datetime from fastapi import APIRouter, Depends, HTTPException, Request -from fastapi.responses import HTMLResponse, Response +from fastapi.responses import HTMLResponse, RedirectResponse, Response from sqlalchemy.ext.asyncio import AsyncSession from api.cache import cache_key, get_cache, get_or_set_cache, set_cache @@ -25,10 +25,13 @@ def _lastmod(dt: datetime | None) -> str: def _build_sitemap_xml(specs: list) -> str: """Build sitemap XML string from specs. - Emits three URL tiers per spec: - - /{spec_id} Cross-language hub - - /{spec_id}/{language} Language overview + Emits two URL tiers per spec: + - /{spec_id} Cross-language hub (canonical overview) - /{spec_id}/{language}/{library} Implementation detail + + The /{spec_id}/{language} tier is intentionally omitted: language filtering + is served as /{spec_id}?language={language} (filtered hub, same canonical), + so listing it would create duplicate-content entries for Google. """ xml_lines = [ '', @@ -48,21 +51,7 @@ def _build_sitemap_xml(specs: list) -> str: if not spec.impls: continue spec_id = html.escape(spec.id) - # Cross-language hub xml_lines.append(f" https://anyplot.ai/{spec_id}{_lastmod(spec.updated)}") - # Language overviews + implementation details, grouped per language - languages = sorted({impl.library.language for impl in spec.impls if impl.library}) - for language in languages: - language_esc = html.escape(language) - language_updates = [ - impl.updated - for impl in spec.impls - if impl.library and impl.library.language == language and impl.updated is not None - ] - language_lastmod = max(language_updates) if language_updates else spec.updated - xml_lines.append( - f" https://anyplot.ai/{spec_id}/{language_esc}{_lastmod(language_lastmod)}" - ) for impl in spec.impls: if not impl.library: continue @@ -318,40 +307,17 @@ async def seo_spec_hub(spec_id: str, db: AsyncSession | None = Depends(optional_ @router.get("/seo-proxy/{spec_id}/{language}") -async def seo_spec_language(spec_id: str, language: str, db: AsyncSession | None = Depends(optional_db)): - """Bot-optimized language-specific spec overview.""" - if db is None: - return HTMLResponse( - BOT_HTML_TEMPLATE.format( - title=f"{html.escape(spec_id)} - {html.escape(language)} | anyplot.ai", - description=DEFAULT_DESCRIPTION, - image=DEFAULT_HOME_IMAGE, - url=f"https://anyplot.ai/{html.escape(spec_id)}/{html.escape(language)}", - ) - ) - - key = cache_key("seo", spec_id, language) - cached = get_cache(key) - if cached: - return HTMLResponse(cached) - - repo = SpecRepository(db) - spec = await repo.get_by_id(spec_id) - if not spec: - raise HTTPException(status_code=404, detail="Spec not found") - - lang_impls = [i for i in spec.impls if i.library and i.library.language == language] - has_previews = any(i.preview_url for i in lang_impls) - image = f"https://api.anyplot.ai/og/{spec_id}.png" if has_previews else DEFAULT_HOME_IMAGE - - result = BOT_HTML_TEMPLATE.format( - title=f"{html.escape(spec.title)} - {html.escape(language)} | anyplot.ai", - description=html.escape(spec.description or DEFAULT_DESCRIPTION), - image=html.escape(image, quote=True), - url=f"https://anyplot.ai/{html.escape(spec_id)}/{html.escape(language)}", - ) - set_cache(key, result) - return HTMLResponse(result) +async def seo_spec_language(spec_id: str, language: str): + """Permanent redirect: language-overview URLs now live on the hub with ?language=. + + The /{spec_id}/{language} tier was consolidated into /{spec_id} to eliminate + duplicate content. Bots following this endpoint get a 301 to the hub proxy; + humans get the SPA redirect configured in app/src/router.tsx. The `language` + query parameter is dropped because the hub's canonical tag does not include + it — Google should consolidate the page, not a filtered variant. + """ + del language # referenced for route matching only; deliberately not forwarded + return RedirectResponse(url=f"/seo-proxy/{spec_id}", status_code=301) @router.get("/seo-proxy/{spec_id}/{language}/{library}") diff --git a/app/nginx.conf b/app/nginx.conf index 06b50606fb..48b3b14d3b 100644 --- a/app/nginx.conf +++ b/app/nginx.conf @@ -124,20 +124,20 @@ server { # python.anyplot.ai — language-filtered marketing subdomain. # -# Strategy: serve identical content to anyplot.ai with a /python language -# segment injected into spec routes. The canonical tag in the rendered HTML -# always points back to anyplot.ai, so Google consolidates link equity on the -# main domain while the subdomain stays available for marketing campaigns. +# Strategy: serve identical content to anyplot.ai. The canonical tag always +# points back to the main-domain hub (anyplot.ai/{spec_id}) so Google +# consolidates link equity on a single URL while the subdomain stays available +# for marketing campaigns. # -# Bot path (SEO crawlers): nginx rewrites the requested URI by injecting -# `/python` and proxies to /seo-proxy/{spec}/python[/{library}] on the API. -# The SEO HTML emits canonical=https://anyplot.ai/{spec}/python[/{library}]. +# Bot path (SEO crawlers): nginx proxies the hub to /seo-proxy/{spec} (no +# language segment — the language-filtered hub is /{spec}?language=python on +# the main domain, with the same canonical as the unfiltered hub). Library +# detail URLs keep the /python segment because those are content-unique. # -# Human path: serves index.html for any spec route. The SPA must detect -# `window.location.hostname === 'python.anyplot.ai'` and inject `python` as -# the language when resolving routes / building canonical tags. Without that -# detection the SPA would treat `library` as `language` and 404. Subdomain -# rollout is therefore gated on a follow-up SPA change. +# Human path: serves index.html for any spec route. The SPA may detect +# `window.location.hostname === 'python.anyplot.ai'` and append +# `?language=python` on spec routes so the grid renders filtered without +# changing the canonical. server { listen 8080; server_name python.anyplot.ai; @@ -165,9 +165,9 @@ server { add_header Expires "0"; } - # Bot SEO proxy — rewrite to inject /python and proxy to backend. - # /scatter-basic -> /seo-proxy/scatter-basic/python - # /scatter-basic/matplotlib -> /seo-proxy/scatter-basic/python/matplotlib + # Bot SEO proxy — proxy to backend. + # /scatter-basic -> /seo-proxy/scatter-basic (hub, no language segment) + # /scatter-basic/matplotlib -> /seo-proxy/scatter-basic/python/matplotlib (detail, language in path) location @seo_proxy_python { set $seo_backend https://api.anyplot.ai; proxy_pass $seo_backend$python_seo_uri; @@ -186,14 +186,15 @@ server { try_files $uri $uri/ /index.html; } - # /:specId[/:library] -> internally /:specId/python[/:library] + # /:specId -> hub on main domain (no language segment; canonical is /{spec_id}) location ~ "^/(?[A-Za-z0-9][A-Za-z0-9-]*)/?$" { - set $python_seo_uri /seo-proxy/$spec_id/python; + set $python_seo_uri /seo-proxy/$spec_id; error_page 418 = @seo_proxy_python; if ($is_bot) { return 418; } try_files /index.html =404; } + # /:specId/:library -> detail on main domain (language stays in path) location ~ "^/(?[A-Za-z0-9][A-Za-z0-9-]*)/(?[A-Za-z0-9][A-Za-z0-9-]*)/?$" { set $python_seo_uri /seo-proxy/$spec_id/python/$library; error_page 418 = @seo_proxy_python; diff --git a/app/src/pages/SpecPage.tsx b/app/src/pages/SpecPage.tsx index 28f3448607..b9a1968eec 100644 --- a/app/src/pages/SpecPage.tsx +++ b/app/src/pages/SpecPage.tsx @@ -34,7 +34,7 @@ interface SpecDetail { implementations: Implementation[]; } -type Mode = 'hub' | 'language' | 'detail'; +type Mode = 'hub' | 'detail'; export function SpecPage() { const { specId, language: urlLanguage, library: urlLibrary } = useParams(); @@ -53,8 +53,9 @@ export function SpecPage() { const [highlightedTags, setHighlightedTags] = useState([]); const { fetchCode, getCode } = useCodeFetch(); - const mode: Mode = urlLibrary ? 'detail' : urlLanguage ? 'language' : 'hub'; + const mode: Mode = urlLibrary ? 'detail' : 'hub'; const selectedLibrary = urlLibrary || null; + const languageFilter = mode === 'hub' ? searchParams.get('language') : null; const getLibraryMeta = useCallback( (libraryId: string) => librariesData.find((lib) => lib.id === libraryId), @@ -81,19 +82,14 @@ export function SpecPage() { const data: SpecDetail = await res.json(); setSpecData(data); - // Validate language matches at least one impl - if (urlLanguage && !data.implementations.some((i) => i.language === urlLanguage)) { - navigate(specPath(specId!), { replace: true }); - return; - } - - // Validate library matches an impl in the requested language + // Detail mode: validate library matches an impl in the requested language. + // If no match, fall back to the hub with a language filter to preserve intent. if (urlLibrary && urlLanguage) { const matched = data.implementations.find( (i) => i.library_id === urlLibrary && i.language === urlLanguage, ); if (!matched) { - navigate(specPath(specId!, urlLanguage), { replace: true }); + navigate({ pathname: specPath(specId!), search: `?language=${encodeURIComponent(urlLanguage)}` }, { replace: true }); return; } } @@ -108,7 +104,7 @@ export function SpecPage() { fetchSpec(); }, [specId, urlLanguage, urlLibrary, navigate]); - // Implementations for the selected language (used in language + detail modes) + // Implementations for the selected language (used in detail mode for library pills) const langImpls = useMemo(() => { if (!specData || !urlLanguage) return specData?.implementations || []; return specData.implementations.filter((i) => i.language === urlLanguage); @@ -120,6 +116,15 @@ export function SpecPage() { return Array.from(new Set(specData.implementations.map((i) => i.language))).sort(); }, [specData]); + // If ?language= points at a language that has no implementations, drop it. + useEffect(() => { + if (mode !== 'hub' || !specData || !languageFilter) return; + if (availableLanguages.includes(languageFilter)) return; + const params = new URLSearchParams(searchParams); + params.delete('language'); + setSearchParams(params, { replace: true }); + }, [mode, specData, languageFilter, availableLanguages, searchParams, setSearchParams]); + // Get current implementation (only in detail mode) const currentImpl = useMemo(() => { if (!specData || !selectedLibrary) return null; @@ -199,7 +204,7 @@ export function SpecPage() { trackEvent('download_image', { spec: specId, library: impl.library_id, - page: mode === 'detail' ? 'spec_detail' : mode === 'language' ? 'spec_language' : 'spec_hub', + page: mode === 'detail' ? 'spec_detail' : 'spec_hub', }); }, [specId, trackEvent, mode], @@ -216,7 +221,7 @@ export function SpecPage() { spec: specId, library: impl.library_id, method: 'image', - page: mode === 'detail' ? 'spec_detail' : mode === 'language' ? 'spec_language' : 'spec_hub', + page: mode === 'detail' ? 'spec_detail' : 'spec_hub', }); setTimeout(() => setCodeCopied(null), 2000); } catch (err) { @@ -237,10 +242,12 @@ export function SpecPage() { // Track page view useEffect(() => { if (!specData || !specId) return; - if (mode === 'hub') trackPageview(`/${specId}`); - else if (mode === 'language') trackPageview(`/${specId}/${urlLanguage}`); - else if (mode === 'detail' && selectedLibrary) trackPageview(`/${specId}/${urlLanguage}/${selectedLibrary}`); - }, [specData, mode, specId, urlLanguage, selectedLibrary, trackPageview]); + if (mode === 'hub') { + trackPageview(languageFilter ? `/${specId}?language=${languageFilter}` : `/${specId}`); + } else if (mode === 'detail' && selectedLibrary) { + trackPageview(`/${specId}/${urlLanguage}/${selectedLibrary}`); + } + }, [specData, mode, specId, urlLanguage, selectedLibrary, languageFilter, trackPageview]); // Keyboard shortcuts: left/right arrows switch libraries in detail mode useEffect(() => { @@ -303,15 +310,15 @@ export function SpecPage() { const canonical = mode === 'detail' ? `https://anyplot.ai/${specId}/${urlLanguage}/${selectedLibrary}` - : mode === 'language' - ? `https://anyplot.ai/${specId}/${urlLanguage}` - : `https://anyplot.ai/${specId}`; + : `https://anyplot.ai/${specId}`; - const titleSuffix = - mode === 'detail' ? ` - ${selectedLibrary}` : mode === 'language' ? ` - ${urlLanguage}` : ''; + const titleSuffix = mode === 'detail' ? ` - ${selectedLibrary}` : ''; - // Implementations to render in the grid: language mode → only that lang; hub → all - const gridImpls = mode === 'hub' ? specData.implementations : langImpls; + // Implementations to render in the grid: hub mode optionally filtered by ?language= + const gridImpls = + languageFilter + ? specData.implementations.filter((i) => i.language === languageFilter) + : specData.implementations; return ( <> @@ -445,7 +452,7 @@ export function SpecPage() { /> - ( const lazySpec = () => import('./pages/SpecPage').then(m => ({ Component: m.SpecPage, HydrateFallback: LazyFallback })); +function SpecLanguageRedirect() { + const { specId, language } = useParams(); + if (!specId || !language) return ; + return ; +} + const router = createBrowserRouter([ { element: , @@ -30,7 +36,7 @@ const router = createBrowserRouter([ { path: 'mcp', lazy: () => import('./pages/McpPage').then(m => ({ Component: m.McpPage })) }, { path: 'stats', lazy: () => import('./pages/StatsPage').then(m => ({ Component: m.StatsPage })) }, { path: ':specId', lazy: lazySpec }, - { path: ':specId/:language', lazy: lazySpec }, + { path: ':specId/:language', element: }, { path: ':specId/:language/:library', lazy: lazySpec }, { path: '*', element: }, ], diff --git a/docs/reference/seo.md b/docs/reference/seo.md index 0296785bfb..3a7484e082 100644 --- a/docs/reference/seo.md +++ b/docs/reference/seo.md @@ -139,8 +139,9 @@ Backend endpoints that serve HTML with correct meta tags for bots. | `GET /seo-proxy/plots` | Plots page | Default | | `GET /seo-proxy/specs` | Specs page | Default | | `GET /seo-proxy/legal` | Legal page | Default | -| `GET /seo-proxy/{spec_id}` | Spec overview | Collage (2x3 grid) | -| `GET /seo-proxy/{spec_id}/{library}` | Implementation | Single branded | +| `GET /seo-proxy/{spec_id}` | Spec overview (cross-language hub) | Collage (2x3 grid) | +| `GET /seo-proxy/{spec_id}/{language}` | **301** → `/seo-proxy/{spec_id}` (consolidated) | — | +| `GET /seo-proxy/{spec_id}/{language}/{library}` | Implementation | Single branded | ### HTML Template @@ -259,18 +260,23 @@ Dynamic XML sitemap for search engine indexing. https://anyplot.ai/legal https://anyplot.ai/{spec_id} - https://anyplot.ai/{spec_id}/{library} + https://anyplot.ai/{spec_id}/{language}/{library} ``` +The `/{spec_id}/{language}` tier is intentionally **not** listed: language +filtering is served as `/{spec_id}?language={language}` (the hub with a filter +query param, same canonical as the unfiltered hub), so listing it would create +duplicate-content entries for Google. + ### Included URLs 1. Home page (`/`) 2. Plots page (`/plots`) 3. Legal page (`/legal`) -4. Spec overview pages (`/{spec_id}`) - only if spec has implementations -5. Implementation pages (`/{spec_id}/{library}`) - all implementations +4. Spec overview pages (`/{spec_id}`) — only if spec has implementations +5. Implementation pages (`/{spec_id}/{language}/{library}`) — all implementations ### nginx Proxy @@ -328,15 +334,22 @@ existing Python URLs. |-----|---------|-----------| | `/` | Landing | self | | `/{spec_id}` | Cross-language hub — lists every implementation across all languages | self | -| `/{spec_id}/{language}` | Language overview — all libraries for that language | self | +| `/{spec_id}?language={language}` | Hub filtered to one language (client-side filter) | `/{spec_id}` (without query) | | `/{spec_id}/{language}/{library}` | Implementation detail — preview ↔ interactive toggle | self | | `/{spec_id}/{language}/{library}?view=interactive` | Same page, interactive iframe pre-selected | base URL without query | | `/plots`, `/specs`, `/libraries`, `/palette`, `/about`, `/legal`, `/mcp`, `/stats` | Static pages | self | -The interactive view is no longer a separate route — the spec detail page -toggles between a static preview image and the interactive HTML iframe in -place. `?view=interactive` is a deep-link parameter only; the canonical tag -always points at the base URL without the query string. +There is intentionally no canonical `/{spec_id}/{language}` URL. Language +filtering is served via a `?language=` query param on the hub, and the hub's +canonical tag omits the query — so the hub and its language-filtered variants +all consolidate on the same canonical URL. Legacy links to +`/{spec_id}/{language}` redirect to `/{spec_id}?language={language}` (SPA +client-side redirect via `app/src/router.tsx`; bots get a 301 from +`/seo-proxy/{spec_id}/{language}` to `/seo-proxy/{spec_id}`). + +The interactive view follows the same pattern: `?view=interactive` is a +deep-link parameter only; the canonical tag always points at the base URL +without the query string. ### Reserved Spec Slugs @@ -359,19 +372,20 @@ sitemap stops listing those URLs, and Google removes them on next crawl. ### Marketing Subdomain `python.anyplot.ai` is served by a dedicated nginx server block -(`app/nginx.conf`) that internally rewrites incoming requests so each spec -URL gains a `/python` language segment before it reaches the SEO proxy: +(`app/nginx.conf`) that proxies bot requests to the main-domain hub / detail +proxies: | Subdomain URL | Internal rewrite | Canonical (in HTML) | |---|---|---| -| `python.anyplot.ai/{spec_id}` | `/seo-proxy/{spec_id}/python` | `https://anyplot.ai/{spec_id}/python` | +| `python.anyplot.ai/{spec_id}` | `/seo-proxy/{spec_id}` | `https://anyplot.ai/{spec_id}` | | `python.anyplot.ai/{spec_id}/{library}` | `/seo-proxy/{spec_id}/python/{library}` | `https://anyplot.ai/{spec_id}/python/{library}` | The user keeps the marketing-friendly hostname; Google sees a canonical on the -main domain so authority and ranking signals stay consolidated. Human visitors -require the SPA to detect `window.location.hostname === 'python.anyplot.ai'` -and inject `python` as the language when resolving routes — that hostname-aware -SPA layer is gated as a follow-up before flipping DNS. +main-domain hub so authority and ranking signals stay consolidated on a single +URL. Human visitors: the SPA may detect +`window.location.hostname === 'python.anyplot.ai'` and append +`?language=python` on spec routes so the grid renders filtered without +changing the canonical. ### Path Utility @@ -390,7 +404,9 @@ When adding Julia, R, or MATLAB: `/{spec_id}/julia/{library_id}`; sitemap and OG image routes pick them up. 3. The cross-language hub `/{spec_id}` lists the new language's implementations alongside Python's — no per-spec migration needed. -4. Optionally add a `julia.anyplot.ai` server block mirroring the Python one. +4. Users can filter the hub to a single language via `/{spec_id}?language=julia` + (no new canonical URL is created; the filter is UX-only). +5. Optionally add a `julia.anyplot.ai` server block mirroring the Python one. ## Security diff --git a/tests/unit/api/test_routers.py b/tests/unit/api/test_routers.py index 6e0aba3c77..e69ffafc4f 100644 --- a/tests/unit/api/test_routers.py +++ b/tests/unit/api/test_routers.py @@ -464,13 +464,15 @@ def test_sitemap_with_db(self, db_client, mock_spec) -> None: ): response = client.get("/sitemap.xml") assert response.status_code == 200 - # URL format: /, /plots, /specs, /{spec_id}, /{spec_id}/{language}, /{spec_id}/{language}/{library} + # URL format: /, /plots, /specs, /{spec_id}, /{spec_id}/{language}/{library} + # The /{spec_id}/{language} tier is NOT listed — language filtering is + # served as /{spec_id}?language={language} (filtered hub, same canonical). assert "https://anyplot.ai/plots" in response.text assert "https://anyplot.ai/specs" in response.text # Cross-language hub assert "https://anyplot.ai/scatter-basic" in response.text - # Language overview - assert "https://anyplot.ai/scatter-basic/python" in response.text + # Language-overview tier must NOT appear (consolidated onto hub) + assert "https://anyplot.ai/scatter-basic/python" not in response.text # Implementation detail assert "https://anyplot.ai/scatter-basic/python/matplotlib" in response.text # Legacy /python/{spec} prefix must NOT appear diff --git a/tests/unit/api/test_seo_helpers.py b/tests/unit/api/test_seo_helpers.py index cc16542d1a..4f343a0132 100644 --- a/tests/unit/api/test_seo_helpers.py +++ b/tests/unit/api/test_seo_helpers.py @@ -46,7 +46,7 @@ def test_empty_specs(self) -> None: assert "" in result def test_spec_with_impls(self) -> None: - """Spec with impls should emit hub, language-overview, and detail URLs.""" + """Spec with impls should emit hub and detail URLs (no per-language tier).""" library = MagicMock() library.language = "python" @@ -63,10 +63,11 @@ def test_spec_with_impls(self) -> None: result = _build_sitemap_xml([spec]) # Cross-language hub assert "https://anyplot.ai/scatter-basic" in result - # Language overview - assert "https://anyplot.ai/scatter-basic/python" in result # Implementation detail assert "https://anyplot.ai/scatter-basic/python/matplotlib" in result + # Language-overview URL is consolidated onto the hub via ?language=; it + # must NOT appear as its own sitemap entry (duplicate content for Google). + assert "https://anyplot.ai/scatter-basic/python" not in result # Legacy /python/{spec} path must NOT appear assert "https://anyplot.ai/python/scatter-basic" not in result assert "2025-03-14" in result @@ -111,8 +112,13 @@ def test_multiple_specs(self) -> None: assert "https://anyplot.ai/scatter-basic/python/matplotlib" in result assert "https://anyplot.ai/bar-grouped/python/seaborn" in result - def test_language_overview_deduplicated(self) -> None: - """Multiple impls sharing a language should yield one language-overview URL.""" + def test_no_language_overview_emitted(self) -> None: + """Multiple impls sharing a language must NOT emit a /{spec}/{language} URL. + + Language filtering is served as /{spec}?language={language} (filtered hub, + same canonical as the unfiltered hub), so sitemap entries for the + language tier would create duplicate-content URLs for search engines. + """ library = MagicMock() library.language = "python" @@ -132,9 +138,10 @@ def test_language_overview_deduplicated(self) -> None: spec.updated = None result = _build_sitemap_xml([spec]) - # Language overview appears exactly once despite two impls - assert result.count("https://anyplot.ai/scatter-basic/python") == 1 - # Both implementations are present + # No language-overview URL + assert "https://anyplot.ai/scatter-basic/python" not in result + # Hub + both implementations are present + assert "https://anyplot.ai/scatter-basic" in result assert "https://anyplot.ai/scatter-basic/python/matplotlib" in result assert "https://anyplot.ai/scatter-basic/python/seaborn" in result From 3622d04e7c723d6bf0603eafd3aa9fce7b5db4af Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 21 Apr 2026 10:39:25 +0000 Subject: [PATCH 2/2] =?UTF-8?q?fix(seo):=20address=20PR=20review=20?= =?UTF-8?q?=E2=80=94=20validate=20spec=5Fid,=20track=20=3Flanguage=3D=20vi?= =?UTF-8?q?a=20path=20segment?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes for PR #5297: 1. seo_spec_language (api/routers/seo.py): validate spec_id against the canonical `^[a-z0-9]+(-[a-z0-9]+)*$` pattern before embedding it in the Location header. Closes the CodeQL "Untrusted URL redirection" alert. 2. Add unit tests asserting the 301 + Location behaviour and the 404 response for malformed spec_ids. 3. Fix analytics tracking for the filtered hub: buildPlausibleUrl now includes `language` in its orderedKeys list, so ?language=python is converted to the /{spec}/language/python path-segment form that matches every other filter. SpecPage.tsx hub mode calls trackPageview() without an override so the new path-segment URL is picked up. This unblocks pageview tracking that was silently dropped by the urlOverride validation regex (which rejects ? and =). https://claude.ai/code/session_01Hiwzn5mc979FDGCHkW4os1 --- api/routers/seo.py | 8 ++++++++ app/src/hooks/useAnalytics.ts | 6 +++++- app/src/pages/SpecPage.tsx | 8 ++++++-- tests/unit/api/test_routers.py | 15 +++++++++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/api/routers/seo.py b/api/routers/seo.py index 93b910f1e0..90580bcda2 100644 --- a/api/routers/seo.py +++ b/api/routers/seo.py @@ -1,6 +1,7 @@ """SEO endpoints (sitemap, bot-optimized pages).""" import html +import re from datetime import datetime from fastapi import APIRouter, Depends, HTTPException, Request @@ -16,6 +17,11 @@ router = APIRouter(tags=["seo"]) +# Canonical spec-id shape — lowercase alphanumerics with hyphen separators. +# Same pattern enforced in automation/scripts/sync_to_postgres.py. Used here to +# constrain user-controlled path segments before they land in Location headers. +_SPEC_ID_RE = re.compile(r"^[a-z0-9]+(-[a-z0-9]+)*$") + def _lastmod(dt: datetime | None) -> str: """Format datetime as XML element, or empty string if None.""" @@ -317,6 +323,8 @@ async def seo_spec_language(spec_id: str, language: str): it — Google should consolidate the page, not a filtered variant. """ del language # referenced for route matching only; deliberately not forwarded + if not _SPEC_ID_RE.fullmatch(spec_id): + raise HTTPException(status_code=404, detail="Spec not found") return RedirectResponse(url=f"/seo-proxy/{spec_id}", status_code=301) diff --git a/app/src/hooks/useAnalytics.ts b/app/src/hooks/useAnalytics.ts index 34d97db7cb..71cd19c10c 100644 --- a/app/src/hooks/useAnalytics.ts +++ b/app/src/hooks/useAnalytics.ts @@ -32,7 +32,10 @@ function buildPlausibleUrl(): string { const pathPrefix = parts.length > 0 && !RESERVED_TOP_LEVEL.has(parts[0]) ? `/${parts.join("/")}` : ""; - // Definierte Reihenfolge der Filter-Kategorien (inkl. impl-level tags) + // Definierte Reihenfolge der Filter-Kategorien (inkl. impl-level tags). + // `language` is included so the hub's ?language= filter is tracked as a + // distinct pageview path (/{spec}/language/python), matching the path-segment + // convention used for all other filter params. const orderedKeys = [ "lib", "spec", @@ -45,6 +48,7 @@ function buildPlausibleUrl(): string { "pat", "prep", "style", + "language", ]; for (const key of orderedKeys) { diff --git a/app/src/pages/SpecPage.tsx b/app/src/pages/SpecPage.tsx index b9a1968eec..88623e9e80 100644 --- a/app/src/pages/SpecPage.tsx +++ b/app/src/pages/SpecPage.tsx @@ -239,11 +239,15 @@ export function SpecPage() { return `${GITHUB_URL}/issues/new?${params.toString()}`; }, [specId]); - // Track page view + // Track page view. Hub mode calls trackPageview() without an override so + // buildPlausibleUrl() picks up ?language= from window.location and converts + // it to the path-segment form (e.g. /{spec}/language/python). The URL + // override path used for detail mode cannot carry query strings — see + // useAnalytics.ts sendPageview() validation regex. useEffect(() => { if (!specData || !specId) return; if (mode === 'hub') { - trackPageview(languageFilter ? `/${specId}?language=${languageFilter}` : `/${specId}`); + trackPageview(); } else if (mode === 'detail' && selectedLibrary) { trackPageview(`/${specId}/${urlLanguage}/${selectedLibrary}`); } diff --git a/tests/unit/api/test_routers.py b/tests/unit/api/test_routers.py index e69ffafc4f..bf3278a569 100644 --- a/tests/unit/api/test_routers.py +++ b/tests/unit/api/test_routers.py @@ -537,6 +537,21 @@ def test_seo_spec_overview_not_found(self, db_client) -> None: response = client.get("/seo-proxy/nonexistent-spec") assert response.status_code == 404 + def test_seo_spec_language_redirects_to_hub(self, client: TestClient) -> None: + """Language-overview URL should 301-redirect to the cross-language hub. + + The /{spec}/{language} tier was consolidated onto /{spec} to remove + duplicate content; the language segment is intentionally dropped. + """ + response = client.get("/seo-proxy/scatter-basic/python", follow_redirects=False) + assert response.status_code == 301 + assert response.headers["location"] == "/seo-proxy/scatter-basic" + + def test_seo_spec_language_rejects_malformed_spec_id(self, client: TestClient) -> None: + """Malformed spec_ids must 404, not redirect — prevents header injection.""" + response = client.get("/seo-proxy/Invalid_Spec/python", follow_redirects=False) + assert response.status_code == 404 + def test_seo_spec_implementation_without_db(self, client: TestClient) -> None: """SEO spec implementation should return fallback HTML when DB unavailable.""" with patch(DB_CONFIG_PATCH, return_value=False):