From 96f7f6802cf161be3d7e22fd53054e8b39653ffc Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Fri, 1 May 2026 11:32:48 +0100
Subject: [PATCH 1/3] Update existing link-check issue instead of creating
 weekly duplicates

The weekly Link Checker run now finds the most recent open
"link-check"-labeled issue and edits its body in place, posting a short
comment so subscribers see the refresh. It falls back to creating a
fresh issue only when none is open.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/link-check.yaml | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/link-check.yaml b/.github/workflows/link-check.yaml
index 467e1c75322..bbc5c15ca4e 100644
--- a/.github/workflows/link-check.yaml
+++ b/.github/workflows/link-check.yaml
@@ -5,7 +5,7 @@ name: Link Checker
 # =======================
 # Purpose: Downloads the latest built site and checks all links (internal + external)
 # Triggers: Weekly on Mondays at 01:30 UTC or manual dispatch
-# Reports: Creates a GitHub issue with label "link-check" when broken links are found
+# Reports: Updates the existing open "link-check"-labeled issue (or creates one if none exists)
 # Config: See .lychee.toml for exclusion patterns and request settings

 on:
@@ -164,11 +164,22 @@ jobs:
             echo "found=false" >> "$GITHUB_OUTPUT"
           fi

-      - name: Create issue from lychee output
+      - name: Create or update Link Checker Report issue
         if: steps.lychee.outputs.exit_code != 0 || steps.doi-check.outputs.found == 'true'
-        uses: peter-evans/create-issue-from-file@v5
-        with:
-          title: "Link Checker Report"
-          content-filepath: /tmp/lychee/out.md
-          labels: link-check
-          token: ${{ secrets.GITHUB_TOKEN }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GH_REPO: ${{ github.repository }}
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          set -euo pipefail
+          TITLE="Link Checker Report"
+          BODY_FILE=/tmp/lychee/out.md
+          EXISTING=$(gh issue list --label link-check --state open --limit 1 --json number --jq '.[0].number // empty')
+          if [ -n "$EXISTING" ]; then
+            echo "Updating existing issue #$EXISTING"
+            gh issue edit "$EXISTING" --body-file "$BODY_FILE"
+            gh issue comment "$EXISTING" --body "Report refreshed by [workflow run]($RUN_URL)."
+          else
+            echo "No open link-check issue found; creating a new one"
+            gh issue create --title "$TITLE" --body-file "$BODY_FILE" --label link-check
+          fi

From 6e0d0ccfb388405835e04b13d6ef2dc815ed8dd8 Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Fri, 1 May 2026 12:39:27 +0100
Subject: [PATCH 2/3] Validate DOI links via the Crossref API instead of
 following redirects
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lychee was following doi.org redirects to publisher sites and getting
bot-blocked there, producing 403 noise and masking DOIs with genuine
typos. The workflow now extracts every doi.org URL from the rendered
site and checks it against the Crossref REST API (which doesn't
bot-block); doi.org / dx.doi.org are excluded from lychee so the
redirect path isn't double-checked.

Implementation notes:

- DOIs in the HTML are sometimes URL-encoded (e.g. %2F for /) — decode
  before re-encoding for the Crossref URL to avoid double-encoding
  (illustrated below).
- Crossref rate-limits HEAD bursts even within the polite pool, so
  concurrency is capped at 4 and 429 responses are retried with
  exponential backoff.
- A local sample of ~2700 DOIs runs in roughly 3 minutes.

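To make the first note concrete, here is the pitfall in a Python REPL
(the DOI is a made-up example):

    >>> import urllib.parse
    >>> doi = "10.1371%2Fjournal.pone.0000000"   # as it appears in an href
    >>> urllib.parse.quote(doi, safe="/")        # naive re-encoding: '%' -> '%25'
    '10.1371%252Fjournal.pone.0000000'
    >>> urllib.parse.quote(urllib.parse.unquote(doi), safe="/")
    '10.1371/journal.pone.0000000'
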
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/link-check.yaml | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 .lychee.toml                      |   5 ++
 2 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/link-check.yaml b/.github/workflows/link-check.yaml
index bbc5c15ca4e..a100b293966 100644
--- a/.github/workflows/link-check.yaml
+++ b/.github/workflows/link-check.yaml
@@ -134,6 +134,107 @@ jobs:
               f.write(output)
           PYEOF

+      - name: Validate DOI links via Crossref API
+        id: doi-validate
+        run: |
+          mkdir -p /tmp/lychee
+          python3 << 'PYEOF'
+          import os, re, sys, time, urllib.error, urllib.parse, urllib.request
+          from collections import defaultdict
+          from concurrent.futures import ThreadPoolExecutor, as_completed
+          from pathlib import Path
+
+          SITE_ROOT = Path("/tmp/site")
+          # DOI body: any non-whitespace char except quotes / angle brackets / closers /
+          # backslash. Backslashes show up in JSON-escaped strings inside rendered HTML.
+          DOI_RE = re.compile(r'https?://(?:dx\.)?doi\.org/(10\.[^\s"\'<>)\]\\]+)', re.I)
+          # Strip trailing punctuation that's almost always sentence/markup, not DOI
+          TRAIL = '.,);:]}>"\''
+
+          # doi -> set of source pages
+          doi_pages = defaultdict(set)
+          for html in SITE_ROOT.rglob("*.html"):
+              try:
+                  text = html.read_text(errors="ignore")
+              except Exception:
+                  continue
+              # Strip the artifact-name dir: /tmp/site/<artifact>/<page> -> /<page>
+              rel_parts = html.relative_to(SITE_ROOT).parts[1:]
+              page = "/" + "/".join(rel_parts)
+              page = re.sub(r"/index\.html$", "/", page)
+              for m in DOI_RE.finditer(text):
+                  doi = m.group(1).rstrip(TRAIL).lower()
+                  doi_pages[doi].add(page)
+
+          if not doi_pages:
+              print("No DOIs found in site")
+              with open(os.environ["GITHUB_OUTPUT"], "a") as gh:
+                  gh.write("found=false\n")
+              sys.exit(0)
+
+          print(f"Validating {len(doi_pages)} unique DOIs via Crossref...")
+
+          # Crossref polite pool: include a contact email in the UA.
+          # https://api.crossref.org/swagger-ui/index.html
+          headers = {
+              "User-Agent": "forrt-link-checker/1.0 (+https://forrt.org; mailto:info@forrt.org)",
+              "Accept": "application/json",
+          }
+
+          def check(doi, retries=3):
+              # DOIs in HTML are sometimes URL-encoded (e.g. %2F for /, %28 for ().
+              # Decode first to avoid double-encoding when we hand it to Crossref.
+              clean = urllib.parse.unquote(doi)
+              url = "https://api.crossref.org/works/" + urllib.parse.quote(clean, safe="/")
+              delay = 1.5
+              for attempt in range(retries):
+                  req = urllib.request.Request(url, headers=headers, method="HEAD")
+                  try:
+                      with urllib.request.urlopen(req, timeout=20) as r:
+                          return doi, r.status, None
+                  except urllib.error.HTTPError as e:
+                      if e.code == 429 and attempt < retries - 1:
+                          time.sleep(delay)
+                          delay *= 2
+                          continue
+                      return doi, e.code, None
+                  except Exception as e:
+                      return doi, None, str(e)
+              return doi, 429, None
+
+          broken = {}
+          transient = []
+          # Keep concurrency modest — Crossref rate-limits HEAD bursts even
+          # within the polite pool, and the cron job is not time-critical.
+          with ThreadPoolExecutor(max_workers=4) as pool:
+              futures = {pool.submit(check, d): d for d in sorted(doi_pages)}
+              for fut in as_completed(futures):
+                  doi, code, err = fut.result()
+                  if code == 404:
+                      broken[doi] = sorted(doi_pages[doi])
+                  elif err is not None or (code is not None and code >= 500):
+                      transient.append((doi, code or err))
+
+          if transient:
+              print(f" ({len(transient)} DOIs returned transient errors - treating as valid)")
+
+          if broken:
+              with open("/tmp/lychee/out.md", "a") as f:
+                  f.write(f"\n## Broken DOIs ({len(broken)} found via Crossref API)\n\n")
+                  f.write("These DOIs are not registered with Crossref - they likely contain a typo, were withdrawn, or were never minted.\n\n")
+                  for doi in sorted(broken):
+                      pages = broken[doi]
+                      page_str = f" (in {pages[0]})" if len(pages) == 1 else f" (in {len(pages)} pages)"
+                      f.write(f"* https://doi.org/{doi}{page_str}\n")
+              with open(os.environ["GITHUB_OUTPUT"], "a") as gh:
+                  gh.write("found=true\n")
+              print(f"Found {len(broken)} broken DOIs")
+          else:
+              with open(os.environ["GITHUB_OUTPUT"], "a") as gh:
+                  gh.write("found=false\n")
+              print(f"All {len(doi_pages)} DOIs valid")
+          PYEOF
+
       - name: Find publisher URLs that should use doi.org
         id: doi-check
         run: |
@@ -165,7 +266,7 @@ jobs:
           fi

       - name: Create or update Link Checker Report issue
-        if: steps.lychee.outputs.exit_code != 0 || steps.doi-check.outputs.found == 'true'
+        if: steps.lychee.outputs.exit_code != 0 || steps.doi-check.outputs.found == 'true' || steps.doi-validate.outputs.found == 'true'
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GH_REPO: ${{ github.repository }}

diff --git a/.lychee.toml b/.lychee.toml
index c9ee28b3fe8..6fe1c576d86 100644
--- a/.lychee.toml
+++ b/.lychee.toml
@@ -25,6 +25,11 @@ exclude = [
   # Web Archive — often slow or flaky
   "web\\.archive\\.org",

+  # DOI resolvers — validated separately via the Crossref REST API in the
+  # workflow, since following the redirect to the publisher just hits
+  # bot-blocking and produces noisy 403/404s.
+  "(?:dx\\.)?doi\\.org",
+
   # GitHub edit links with templated paths
   "github\\.com/.*/edit/",
 ]

From 003f4aca6e743ca0a623f114331e281477f9a22a Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Fri, 1 May 2026 17:14:52 +0100
Subject: [PATCH 3/3] Switch DOI validator to the registry-agnostic Handle API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Crossref's API only knows Crossref-registered DOIs and returns 404 for
DOIs minted by other agencies (DataCite for Zenodo / OSF / institutional
repositories, JaLC, mEDRA, etc.), which produced 43/58 false positives
on the local site. The DOI Handle API at doi.org/api/handles/{doi} is
the authoritative cross-registrar resolver — responseCode 1 means the
handle exists, 100 means it does not.

Also fixes a regex bug that truncated SICI-style DOIs at the first ')':
the extractor now allows parens in the DOI body and strips trailing
unbalanced ')' / ']' afterwards, so DOIs like 10.1016/0277-9536(95)00127-S
are captured intact.

On the local site this reduced the broken count from 58 -> 11, and the
validation step now runs in ~25s instead of ~160s.

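For reference, a minimal sketch of the lookup the step now performs,
using the SICI example above (retry and error handling omitted; an
unregistered DOI instead raises HTTPError 404, with responseCode 100 in
the JSON body):

    import json, urllib.parse, urllib.request

    doi = "10.1016/0277-9536(95)00127-S"
    url = "https://doi.org/api/handles/" + urllib.parse.quote(doi, safe="/")
    with urllib.request.urlopen(url, timeout=20) as r:
        print(json.load(r)["responseCode"])  # 1 -> handle is registered
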
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .github/workflows/link-check.yaml | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 50 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/link-check.yaml b/.github/workflows/link-check.yaml
index a100b293966..8b4b6972e84 100644
--- a/.github/workflows/link-check.yaml
+++ b/.github/workflows/link-check.yaml
@@ -134,22 +134,32 @@ jobs:
               f.write(output)
           PYEOF

-      - name: Validate DOI links via Crossref API
+      - name: Validate DOI links via the DOI Handle API
         id: doi-validate
         run: |
           mkdir -p /tmp/lychee
           python3 << 'PYEOF'
-          import os, re, sys, time, urllib.error, urllib.parse, urllib.request
+          import json, os, re, sys, time, urllib.error, urllib.parse, urllib.request
           from collections import defaultdict
           from concurrent.futures import ThreadPoolExecutor, as_completed
           from pathlib import Path

           SITE_ROOT = Path("/tmp/site")
-          # DOI body: any non-whitespace char except quotes / angle brackets / closers /
-          # backslash. Backslashes show up in JSON-escaped strings inside rendered HTML.
-          DOI_RE = re.compile(r'https?://(?:dx\.)?doi\.org/(10\.[^\s"\'<>)\]\\]+)', re.I)
-          # Strip trailing punctuation that's almost always sentence/markup, not DOI
-          TRAIL = '.,);:]}>"\''
+          # DOI body: any non-whitespace char except quotes, angle brackets, backslash.
+          # Parens are allowed because SICI-style DOIs (e.g. 10.1016/0277-9536(95)00127-S)
+          # legitimately contain balanced parens — we strip unbalanced trailing ones below.
+          DOI_RE = re.compile(r'https?://(?:dx\.)?doi\.org/(10\.[^\s"\'<>\\]+)', re.I)
+          TRAIL = '.,;:}>"\''
+
+          def balance(s):
+              """Strip trailing ')' / ']' if they are unbalanced (i.e. enclosing punctuation)."""
+              while s and s[-1] in ')]':
+                  opener = '(' if s[-1] == ')' else '['
+                  if s.count(opener) < s.count(s[-1]):
+                      s = s[:-1]
+                  else:
+                      break
+              return s

           # doi -> set of source pages
           doi_pages = defaultdict(set)
@@ -163,8 +173,11 @@ jobs:
               page = "/" + "/".join(rel_parts)
               page = re.sub(r"/index\.html$", "/", page)
               for m in DOI_RE.finditer(text):
-                  doi = m.group(1).rstrip(TRAIL).lower()
-                  doi_pages[doi].add(page)
+                  doi = m.group(1).rstrip(TRAIL)
+                  doi = balance(doi)
+                  doi = doi.rstrip(TRAIL)
+                  if doi:
+                      doi_pages[doi.lower()].add(page)

           if not doi_pages:
               print("No DOIs found in site")
@@ -172,56 +185,64 @@ jobs:
               gh.write("found=false\n")
               sys.exit(0)

-          print(f"Validating {len(doi_pages)} unique DOIs via Crossref...")
+          print(f"Validating {len(doi_pages)} unique DOIs via the DOI Handle API...")

-          # Crossref polite pool: include a contact email in the UA.
-          # https://api.crossref.org/swagger-ui/index.html
+          # The Handle API is registry-agnostic — works for Crossref, DataCite,
+          # mEDRA, JaLC, etc. responseCode 1 = handle found, 100 = not registered.
+          # https://www.doi.org/factsheets/DOIProxy.html
           headers = {
               "User-Agent": "forrt-link-checker/1.0 (+https://forrt.org; mailto:info@forrt.org)",
-              "Accept": "application/json",
           }

           def check(doi, retries=3):
               # DOIs in HTML are sometimes URL-encoded (e.g. %2F for /, %28 for ().
-              # Decode first to avoid double-encoding when we hand it to Crossref.
+              # Decode first to avoid double-encoding when we hand it to the API.
               clean = urllib.parse.unquote(doi)
-              url = "https://api.crossref.org/works/" + urllib.parse.quote(clean, safe="/")
-              delay = 1.5
+              url = "https://doi.org/api/handles/" + urllib.parse.quote(clean, safe="/")
+              delay = 1.0
               for attempt in range(retries):
-                  req = urllib.request.Request(url, headers=headers, method="HEAD")
+                  req = urllib.request.Request(url, headers=headers)
                   try:
                       with urllib.request.urlopen(req, timeout=20) as r:
-                          return doi, r.status, None
+                          data = json.load(r)
+                          return doi, data.get("responseCode"), None
                   except urllib.error.HTTPError as e:
-                      if e.code == 429 and attempt < retries - 1:
+                      if e.code in (429, 500, 502, 503, 504) and attempt < retries - 1:
                           time.sleep(delay)
                           delay *= 2
                           continue
-                      return doi, e.code, None
+                      # 404 from the proxy means the DOI is not registered.
+                      if e.code == 404:
+                          return doi, 100, None
+                      return doi, None, f"HTTP {e.code}"
                   except Exception as e:
+                      if attempt < retries - 1:
+                          time.sleep(delay)
+                          delay *= 2
+                          continue
                       return doi, None, str(e)
-              return doi, 429, None
+              return doi, None, "exhausted retries"

           broken = {}
           transient = []
-          # Keep concurrency modest — Crossref rate-limits HEAD bursts even
-          # within the polite pool, and the cron job is not time-critical.
-          with ThreadPoolExecutor(max_workers=4) as pool:
+          with ThreadPoolExecutor(max_workers=6) as pool:
               futures = {pool.submit(check, d): d for d in sorted(doi_pages)}
               for fut in as_completed(futures):
                   doi, code, err = fut.result()
-                  if code == 404:
+                  if code == 1:
+                      pass
+                  elif code == 100:
                       broken[doi] = sorted(doi_pages[doi])
-                  elif err is not None or (code is not None and code >= 500):
-                      transient.append((doi, code or err))
+                  else:
+                      transient.append((doi, err or code))

           if transient:
               print(f" ({len(transient)} DOIs returned transient errors - treating as valid)")

           if broken:
               with open("/tmp/lychee/out.md", "a") as f:
-                  f.write(f"\n## Broken DOIs ({len(broken)} found via Crossref API)\n\n")
-                  f.write("These DOIs are not registered with Crossref - they likely contain a typo, were withdrawn, or were never minted.\n\n")
+                  f.write(f"\n## Broken DOIs ({len(broken)} not registered)\n\n")
+                  f.write("These DOIs are not registered with any DOI agency (Crossref, DataCite, etc.) per the DOI Handle System — they likely contain a typo, were withdrawn, or were never minted.\n\n")
                   for doi in sorted(broken):
                       pages = broken[doi]
                       page_str = f" (in {pages[0]})" if len(pages) == 1 else f" (in {len(pages)} pages)"