From 96f7f6802cf161be3d7e22fd53054e8b39653ffc Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Fri, 1 May 2026 11:32:48 +0100
Subject: [PATCH 1/3] Update existing link-check issue instead of creating
 weekly duplicates

The weekly Link Checker run now finds the most recent open
"link-check"-labeled issue and edits its body in place, posting a short
comment so subscribers see the refresh. It falls back to creating a
fresh issue only when none is open.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/link-check.yaml | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/link-check.yaml b/.github/workflows/link-check.yaml
index 467e1c75322..bbc5c15ca4e 100644
--- a/.github/workflows/link-check.yaml
+++ b/.github/workflows/link-check.yaml
@@ -5,7 +5,7 @@ name: Link Checker
 # =======================
 # Purpose: Downloads the latest built site and checks all links (internal + external)
 # Triggers: Weekly on Mondays at 01:30 UTC or manual dispatch
-# Reports: Creates a GitHub issue with label "link-check" when broken links are found
+# Reports: Updates the existing open "link-check"-labeled issue (or creates one if none exists)
 # Config: See .lychee.toml for exclusion patterns and request settings

 on:
@@ -164,11 +164,22 @@ jobs:
             echo "found=false" >> "$GITHUB_OUTPUT"
           fi

-      - name: Create issue from lychee output
+      - name: Create or update Link Checker Report issue
         if: steps.lychee.outputs.exit_code != 0 || steps.doi-check.outputs.found == 'true'
-        uses: peter-evans/create-issue-from-file@v5
-        with:
-          title: "Link Checker Report"
-          content-filepath: /tmp/lychee/out.md
-          labels: link-check
-          token: ${{ secrets.GITHUB_TOKEN }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GH_REPO: ${{ github.repository }}
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          set -euo pipefail
+          TITLE="Link Checker Report"
+          BODY_FILE=/tmp/lychee/out.md
+          EXISTING=$(gh issue list --label link-check --state open --limit 1 --json number --jq '.[0].number // empty')
+          if [ -n "$EXISTING" ]; then
+            echo "Updating existing issue #$EXISTING"
+            gh issue edit "$EXISTING" --body-file "$BODY_FILE"
+            gh issue comment "$EXISTING" --body "Report refreshed by [workflow run]($RUN_URL)."
+          else
+            echo "No open link-check issue found; creating a new one"
+            gh issue create --title "$TITLE" --body-file "$BODY_FILE" --label link-check
+          fi

From 6e0d0ccfb388405835e04b13d6ef2dc815ed8dd8 Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Fri, 1 May 2026 12:39:27 +0100
Subject: [PATCH 2/3] Validate DOI links via the Crossref API instead of
 following redirects
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lychee was following doi.org redirects to publisher sites and getting
bot-blocked there, producing 403 noise and masking DOIs with genuine
typos. The workflow now extracts every doi.org URL from the rendered
site and checks it against the Crossref REST API (which doesn't
bot-block); doi.org / dx.doi.org are excluded from lychee so the
redirect path isn't double-checked.

Implementation notes:

- DOIs in the HTML are sometimes URL-encoded (e.g. %2F for /) — decode
  before re-encoding for the Crossref URL to avoid double-encoding
  (illustrated below).
- Crossref rate-limits HEAD bursts even within the polite pool, so
  concurrency is capped at 4 and 429 responses are retried with
  exponential backoff.
- A local sample of ~2700 DOIs runs in roughly 3 minutes.

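To make the first note concrete, here is the pitfall in a Python REPL
(the DOI is a made-up example):

    >>> import urllib.parse
    >>> doi = "10.1371%2Fjournal.pone.0000000"   # as it appears in an href
    >>> urllib.parse.quote(doi, safe="/")        # naive re-encoding: '%' -> '%25'
    '10.1371%252Fjournal.pone.0000000'
    >>> urllib.parse.quote(urllib.parse.unquote(doi), safe="/")
    '10.1371/journal.pone.0000000'
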
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/link-check.yaml | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 .lychee.toml                      |   5 ++
 2 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/link-check.yaml b/.github/workflows/link-check.yaml
index bbc5c15ca4e..a100b293966 100644
--- a/.github/workflows/link-check.yaml
+++ b/.github/workflows/link-check.yaml
@@ -134,6 +134,107 @@ jobs:
               f.write(output)
           PYEOF

+      - name: Validate DOI links via Crossref API
+        id: doi-validate
+        run: |
+          mkdir -p /tmp/lychee
+          python3 << 'PYEOF'
+          import os, re, sys, time, urllib.error, urllib.parse, urllib.request
+          from collections import defaultdict
+          from concurrent.futures import ThreadPoolExecutor, as_completed
+          from pathlib import Path
+
+          SITE_ROOT = Path("/tmp/site")
+          # DOI body: any non-whitespace char except quotes / angle brackets / closers /
+          # backslash. Backslashes show up in JSON-escaped strings inside rendered HTML.
+          DOI_RE = re.compile(r'https?://(?:dx\.)?doi\.org/(10\.[^\s"\'<>)\]\\]+)', re.I)
+          # Strip trailing punctuation that's almost always sentence/markup, not DOI
+          TRAIL = '.,);:]}>"\''
+
+          # doi -> set of source pages
+          doi_pages = defaultdict(set)
+          for html in SITE_ROOT.rglob("*.html"):
+              try:
+                  text = html.read_text(errors="ignore")
+              except Exception:
+                  continue
+              # Strip the artifact-name dir: /tmp/site/<artifact>/<page> -> /<page>
+              rel_parts = html.relative_to(SITE_ROOT).parts[1:]
+              page = "/" + "/".join(rel_parts)
+              page = re.sub(r"/index\.html$", "/", page)
+              for m in DOI_RE.finditer(text):
+                  doi = m.group(1).rstrip(TRAIL).lower()
+                  doi_pages[doi].add(page)
+
+          if not doi_pages:
+              print("No DOIs found in site")
+              with open(os.environ["GITHUB_OUTPUT"], "a") as gh:
+                  gh.write("found=false\n")
+              sys.exit(0)
+
+          print(f"Validating {len(doi_pages)} unique DOIs via Crossref...")
+
+          # Crossref polite pool: include a contact email in the UA.
+          # https://api.crossref.org/swagger-ui/index.html
+          headers = {
+              "User-Agent": "forrt-link-checker/1.0 (+https://forrt.org; mailto:info@forrt.org)",
+              "Accept": "application/json",
+          }
+
+          def check(doi, retries=3):
+              # DOIs in HTML are sometimes URL-encoded (e.g. %2F for /, %28 for ().
+              # Decode first to avoid double-encoding when we hand it to Crossref.
+              clean = urllib.parse.unquote(doi)
+              url = "https://api.crossref.org/works/" + urllib.parse.quote(clean, safe="/")
+              delay = 1.5
+              for attempt in range(retries):
+                  req = urllib.request.Request(url, headers=headers, method="HEAD")
+                  try:
+                      with urllib.request.urlopen(req, timeout=20) as r:
+                          return doi, r.status, None
+                  except urllib.error.HTTPError as e:
+                      if e.code == 429 and attempt < retries - 1:
+                          time.sleep(delay)
+                          delay *= 2
+                          continue
+                      return doi, e.code, None
+                  except Exception as e:
+                      return doi, None, str(e)
+              return doi, 429, None
+
+          broken = {}
+          transient = []
+          # Keep concurrency modest — Crossref rate-limits HEAD bursts even
+          # within the polite pool, and the cron job is not time-critical.
+          with ThreadPoolExecutor(max_workers=4) as pool:
+              futures = {pool.submit(check, d): d for d in sorted(doi_pages)}
+              for fut in as_completed(futures):
+                  doi, code, err = fut.result()
+                  if code == 404:
+                      broken[doi] = sorted(doi_pages[doi])
+                  elif err is not None or (code is not None and code >= 500):
+                      transient.append((doi, code or err))
+
+          if transient:
+              print(f" ({len(transient)} DOIs returned transient errors - treating as valid)")
+
+          if broken:
+              with open("/tmp/lychee/out.md", "a") as f:
+                  f.write(f"\n## Broken DOIs ({len(broken)} found via Crossref API)\n\n")
+                  f.write("These DOIs are not registered with Crossref - they likely contain a typo, were withdrawn, or were never minted.\n\n")
+                  for doi in sorted(broken):
+                      pages = broken[doi]
+                      page_str = f" (in {pages[0]})" if len(pages) == 1 else f" (in {len(pages)} pages)"
+                      f.write(f"* https://doi.org/{doi}{page_str}\n")
+              with open(os.environ["GITHUB_OUTPUT"], "a") as gh:
+                  gh.write("found=true\n")
+              print(f"Found {len(broken)} broken DOIs")
+          else:
+              with open(os.environ["GITHUB_OUTPUT"], "a") as gh:
+                  gh.write("found=false\n")
+              print(f"All {len(doi_pages)} DOIs valid")
+          PYEOF
+
       - name: Find publisher URLs that should use doi.org
         id: doi-check
         run: |
@@ -165,7 +266,7 @@ jobs:
           fi

       - name: Create or update Link Checker Report issue
-        if: steps.lychee.outputs.exit_code != 0 || steps.doi-check.outputs.found == 'true'
+        if: steps.lychee.outputs.exit_code != 0 || steps.doi-check.outputs.found == 'true' || steps.doi-validate.outputs.found == 'true'
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GH_REPO: ${{ github.repository }}

diff --git a/.lychee.toml b/.lychee.toml
index c9ee28b3fe8..6fe1c576d86 100644
--- a/.lychee.toml
+++ b/.lychee.toml
@@ -25,6 +25,11 @@ exclude = [
   # Web Archive — often slow or flaky
   "web\\.archive\\.org",

+  # DOI resolvers — validated separately via the Crossref REST API in the
+  # workflow, since following the redirect to the publisher just hits
+  # bot-blocking and produces noisy 403/404s.
+  "(?:dx\\.)?doi\\.org",
+
   # GitHub edit links with templated paths
   "github\\.com/.*/edit/",
 ]

From 003f4aca6e743ca0a623f114331e281477f9a22a Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Fri, 1 May 2026 17:14:52 +0100
Subject: [PATCH 3/3] Switch DOI validator to the registry-agnostic Handle API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Crossref's API only knows Crossref-registered DOIs and returns 404 for
DOIs minted by other agencies (DataCite for Zenodo / OSF / institutional
repositories, JaLC, mEDRA, etc.), which produced 43/58 false positives
on the local site. The DOI Handle API at doi.org/api/handles/{doi} is
the authoritative cross-registrar resolver — responseCode 1 means the
handle exists, 100 means it does not.

Also fixes a regex bug that truncated SICI-style DOIs at the first ')':
the extractor now allows parens in the DOI body and strips trailing
unbalanced ')' / ']' afterwards, so DOIs like 10.1016/0277-9536(95)00127-S
are captured intact.

On the local site this reduced the broken count from 58 -> 11, and the
validation step now runs in ~25s instead of ~160s.

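For reference, a minimal sketch of the lookup the step now performs,
using the SICI example above (retry and error handling omitted; an
unregistered DOI instead raises HTTPError 404, with responseCode 100 in
the JSON body):

    import json, urllib.parse, urllib.request

    doi = "10.1016/0277-9536(95)00127-S"
    url = "https://doi.org/api/handles/" + urllib.parse.quote(doi, safe="/")
    with urllib.request.urlopen(url, timeout=20) as r:
        print(json.load(r)["responseCode"])  # 1 -> handle is registered
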
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/link-check.yaml | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 50 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/link-check.yaml b/.github/workflows/link-check.yaml
index a100b293966..8b4b6972e84 100644
--- a/.github/workflows/link-check.yaml
+++ b/.github/workflows/link-check.yaml
@@ -134,22 +134,32 @@ jobs:
               f.write(output)
           PYEOF

-      - name: Validate DOI links via Crossref API
+      - name: Validate DOI links via the DOI Handle API
         id: doi-validate
         run: |
           mkdir -p /tmp/lychee
           python3 << 'PYEOF'
-          import os, re, sys, time, urllib.error, urllib.parse, urllib.request
+          import json, os, re, sys, time, urllib.error, urllib.parse, urllib.request
           from collections import defaultdict
           from concurrent.futures import ThreadPoolExecutor, as_completed
           from pathlib import Path

           SITE_ROOT = Path("/tmp/site")
-          # DOI body: any non-whitespace char except quotes / angle brackets / closers /
-          # backslash. Backslashes show up in JSON-escaped strings inside rendered HTML.
-          DOI_RE = re.compile(r'https?://(?:dx\.)?doi\.org/(10\.[^\s"\'<>)\]\\]+)', re.I)
-          # Strip trailing punctuation that's almost always sentence/markup, not DOI
-          TRAIL = '.,);:]}>"\''
+          # DOI body: any non-whitespace char except quotes, angle brackets, backslash.
+          # Parens are allowed because SICI-style DOIs (e.g. 10.1016/0277-9536(95)00127-S)
+          # legitimately contain balanced parens — we strip unbalanced trailing ones below.
+          DOI_RE = re.compile(r'https?://(?:dx\.)?doi\.org/(10\.[^\s"\'<>\\]+)', re.I)
+          TRAIL = '.,;:}>"\''
+
+          def balance(s):
+              """Strip trailing ')' / ']' if they are unbalanced (i.e. enclosing punctuation)."""
+              while s and s[-1] in ')]':
+                  opener = '(' if s[-1] == ')' else '['
+                  if s.count(opener) < s.count(s[-1]):
+                      s = s[:-1]
+                  else:
+                      break
+              return s

           # doi -> set of source pages
           doi_pages = defaultdict(set)
@@ -163,8 +173,11 @@ jobs:
               page = "/" + "/".join(rel_parts)
               page = re.sub(r"/index\.html$", "/", page)
               for m in DOI_RE.finditer(text):
-                  doi = m.group(1).rstrip(TRAIL).lower()
-                  doi_pages[doi].add(page)
+                  doi = m.group(1).rstrip(TRAIL)
+                  doi = balance(doi)
+                  doi = doi.rstrip(TRAIL)
+                  if doi:
+                      doi_pages[doi.lower()].add(page)

           if not doi_pages:
               print("No DOIs found in site")
@@ -172,56 +185,64 @@ jobs:
               gh.write("found=false\n")
               sys.exit(0)

-          print(f"Validating {len(doi_pages)} unique DOIs via Crossref...")
+          print(f"Validating {len(doi_pages)} unique DOIs via the DOI Handle API...")

-          # Crossref polite pool: include a contact email in the UA.
-          # https://api.crossref.org/swagger-ui/index.html
+          # The Handle API is registry-agnostic — works for Crossref, DataCite,
+          # mEDRA, JaLC, etc. responseCode 1 = handle found, 100 = not registered.
+          # https://www.doi.org/factsheets/DOIProxy.html
           headers = {
               "User-Agent": "forrt-link-checker/1.0 (+https://forrt.org; mailto:info@forrt.org)",
-              "Accept": "application/json",
           }

           def check(doi, retries=3):
               # DOIs in HTML are sometimes URL-encoded (e.g. %2F for /, %28 for ().
-              # Decode first to avoid double-encoding when we hand it to Crossref.
+              # Decode first to avoid double-encoding when we hand it to the API.
               clean = urllib.parse.unquote(doi)
-              url = "https://api.crossref.org/works/" + urllib.parse.quote(clean, safe="/")
-              delay = 1.5
+              url = "https://doi.org/api/handles/" + urllib.parse.quote(clean, safe="/")
+              delay = 1.0
               for attempt in range(retries):
-                  req = urllib.request.Request(url, headers=headers, method="HEAD")
+                  req = urllib.request.Request(url, headers=headers)
                   try:
                       with urllib.request.urlopen(req, timeout=20) as r:
-                          return doi, r.status, None
+                          data = json.load(r)
+                          return doi, data.get("responseCode"), None
                   except urllib.error.HTTPError as e:
-                      if e.code == 429 and attempt < retries - 1:
+                      if e.code in (429, 500, 502, 503, 504) and attempt < retries - 1:
                           time.sleep(delay)
                           delay *= 2
                           continue
-                      return doi, e.code, None
+                      # 404 from the proxy means the DOI is not registered.
+                      if e.code == 404:
+                          return doi, 100, None
+                      return doi, None, f"HTTP {e.code}"
                   except Exception as e:
+                      if attempt < retries - 1:
+                          time.sleep(delay)
+                          delay *= 2
+                          continue
                       return doi, None, str(e)
-              return doi, 429, None
+              return doi, None, "exhausted retries"

           broken = {}
           transient = []
-          # Keep concurrency modest — Crossref rate-limits HEAD bursts even
-          # within the polite pool, and the cron job is not time-critical.
-          with ThreadPoolExecutor(max_workers=4) as pool:
+          with ThreadPoolExecutor(max_workers=6) as pool:
               futures = {pool.submit(check, d): d for d in sorted(doi_pages)}
               for fut in as_completed(futures):
                   doi, code, err = fut.result()
-                  if code == 404:
+                  if code == 1:
+                      pass
+                  elif code == 100:
                       broken[doi] = sorted(doi_pages[doi])
-                  elif err is not None or (code is not None and code >= 500):
-                      transient.append((doi, code or err))
+                  else:
+                      transient.append((doi, err or code))

           if transient:
               print(f" ({len(transient)} DOIs returned transient errors - treating as valid)")

           if broken:
               with open("/tmp/lychee/out.md", "a") as f:
-                  f.write(f"\n## Broken DOIs ({len(broken)} found via Crossref API)\n\n")
-                  f.write("These DOIs are not registered with Crossref - they likely contain a typo, were withdrawn, or were never minted.\n\n")
+                  f.write(f"\n## Broken DOIs ({len(broken)} not registered)\n\n")
+                  f.write("These DOIs are not registered with any DOI agency (Crossref, DataCite, etc.) per the DOI Handle System — they likely contain a typo, were withdrawn, or were never minted.\n\n")
                   for doi in sorted(broken):
                       pages = broken[doi]
                       page_str = f" (in {pages[0]})" if len(pages) == 1 else f" (in {len(pages)} pages)"