"""Resolve CSV/EML missed-paper titles to Semantic Scholar paperIds and
inject the new ones into the crawl frontier.

Background: the seed titles come from external Google Scholar exports
(Publish-or-Perish title CSVs and "sign language" alert EMLs) that were
cross-referenced against the crawl to find papers it never reached.

Input : a JSON list of {title, year, src} (path via argv[1]).
Output: state/seed_csv_eml.json — {normalised_title: paperId|null} checkpoint,
        and the resolved new ids appended to state/frontier.json.

Resumable: re-running skips titles already in seed_csv_eml.json. A title is
only checkpointed once its lookup *completed* (a match, or a definitive
"no match"); lookups that failed for another reason are left out so the next
run retries them.
No year filter here — caller pre-filters to post-2014.
"""

from __future__ import annotations

import json
import re
import sys
from pathlib import Path

STATE = Path(__file__).resolve().parent.parent / "state"
SEED_OUT = STATE / "seed_csv_eml.json"
FRONTIER = STATE / "frontier.json"
VISITED = STATE / "visited.json"

_NON_ALNUM = re.compile(r"[^a-z0-9]+")
_MIN_PREFIX_LEN = 25  # shortest normalised title accepted for a prefix match
_MAX_QUERY_CHARS = 300  # titles are truncated to this length before querying
_CHECKPOINT_EVERY = 20  # flush the checkpoint after this many lookups


def norm(t: str | None) -> str:
    """Return *t* lower-cased with every non ``[a-z0-9]`` run collapsed to one space.

    ``None`` and titles without any ASCII letter or digit (e.g. titles written
    entirely in a non-Latin script) normalise to the empty string.
    """
    return " ".join(_NON_ALNUM.sub(" ", (t or "").lower()).split())


def title_ok(a: str, b: str) -> bool:
    """Return True when titles *a* and *b* plausibly name the same paper.

    Titles match when their normalised forms are equal, or when one is a
    prefix of the other and the shorter one has at least 25 characters
    (covers truncated titles and appended subtitles).

    A title that normalises to the empty string never matches: there is
    nothing left to compare, and accepting it would make any two non-Latin
    titles "equal".
    """
    a, b = norm(a), norm(b)
    if not a or not b:
        return False
    if a == b:
        return True
    shortest = min(len(a), len(b))
    return shortest >= _MIN_PREFIX_LEN and (a.startswith(b) or b.startswith(a))


def _read_json(path: Path, default):
    """Load UTF-8 JSON from *path*, or return *default* if the file is absent."""
    if not path.exists():
        return default
    return json.loads(path.read_text(encoding="utf-8"))


def _write_json_atomic(path: Path, obj) -> None:
    """Write *obj* as UTF-8 JSON via a sibling ``.tmp`` file and a rename."""
    tmp = path.with_suffix(path.suffix + ".tmp")
    tmp.write_text(json.dumps(obj, ensure_ascii=False), encoding="utf-8")
    tmp.replace(path)


def _match_title(client, title: str) -> tuple[bool, str | None]:
    """Look up *title* through the title-match endpoint.

    Returns ``(completed, paper_id)``. ``completed`` is False when the lookup
    failed for a reason other than "no hit", in which case the caller must not
    checkpoint the title.
    """
    try:
        resp = client.get(
            "/paper/search/match",
            {"query": title[:_MAX_QUERY_CHARS], "fields": "paperId,title,year"},
        )
        data = resp.get("data") or []
        if data and title_ok(title, data[0].get("title", "")):
            return True, data[0]["paperId"]
        return True, None
    except Exception as e:  # noqa: BLE001 — match returns 404 when no hit
        # NOTE(review): the 404 is detected from the exception text because the
        # exception type raised by SSClient is not visible here — confirm.
        if "404" in str(e):
            return True, None
        print(f"  err {norm(title)[:40]}: {e}", flush=True)
        return False, None


def _resolve_titles(client, todo: dict[str, str], resolved: dict[str, str | None]) -> int:
    """Resolve every ``{key: title}`` in *todo* into *resolved*, checkpointing as it goes.

    Returns the number of lookups that failed and were left for a later run.
    """
    failed = 0
    try:
        for i, (key, title) in enumerate(todo.items(), start=1):
            completed, pid = _match_title(client, title)
            if completed:
                resolved[key] = pid
            else:
                failed += 1
            if i % _CHECKPOINT_EVERY == 0:
                _write_json_atomic(SEED_OUT, resolved)
                found = sum(1 for v in resolved.values() if v)
                print(f"  {i}/{len(todo)} resolved; {found} have paperIds", flush=True)
    finally:
        # Also reached on Ctrl-C, so an interrupted run keeps its progress.
        _write_json_atomic(SEED_OUT, resolved)
    return failed


def _inject_into_frontier(resolved: dict[str, str | None]) -> None:
    """Append resolved ids that are neither visited nor queued to the frontier."""
    visited_ids = set(_read_json(VISITED, {}))
    frontier = _read_json(FRONTIER, [])
    # An id stored under the empty key was never verified against a real
    # title, so it is not trusted as a seed.
    ids = {pid for key, pid in resolved.items() if key and pid}
    # Sorted so repeated runs extend the frontier in a reproducible order.
    new = sorted(ids - visited_ids - set(frontier))
    frontier.extend(new)
    _write_json_atomic(FRONTIER, frontier)

    print(
        f"DONE: {len(ids)} titles resolved to ids; "
        f"{len(ids & visited_ids)} already in crawl; "
        f"{len(new)} NEW ids added to frontier (now {len(frontier)})",
        flush=True,
    )


def main(seeds_path: str) -> None:
    """Resolve the seed titles in *seeds_path* and inject new ids into the frontier.

    *seeds_path* is a JSON file holding a list of objects with a ``title`` key.
    """
    # Imported here so the pure title helpers stay importable without the
    # API client.
    from ss_client import SSClient

    seeds = json.loads(Path(seeds_path).read_text(encoding="utf-8"))
    resolved = _read_json(SEED_OUT, {})
    client = SSClient()

    # Key by normalised title: duplicates are queried once, and titles with
    # nothing left after normalisation are skipped since a hit for them could
    # never be verified.
    todo: dict[str, str] = {}
    unmatchable = 0
    for s in seeds:
        key = norm(s["title"])
        if not key:
            unmatchable += 1
        elif key not in resolved:
            todo.setdefault(key, s["title"])

    print(f"{len(seeds)} seeds, {len(resolved)} already resolved, {len(todo)} to do", flush=True)
    if unmatchable:
        print(f"  {unmatchable} seeds skipped: title is empty after normalisation", flush=True)

    failed = _resolve_titles(client, todo, resolved)
    if failed:
        print(f"  {failed} lookups failed and were not checkpointed; re-run to retry", flush=True)

    _inject_into_frontier(resolved)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit(f"usage: {Path(sys.argv[0]).name} SEEDS_JSON")
    main(sys.argv[1])