From 963b0abfe8a33445d5d19c68168837bd004cd6ac Mon Sep 17 00:00:00 2001
From: Gabriele Battimelli
Date: Wed, 22 Apr 2026 21:04:03 +0200
Subject: [PATCH] Add incremental vector-db and weekly CI workflow

---
 .github/workflows/weekly-index.yml | 176 +++++++++++++++++++++++++++++
 database/vector_db.py              |  37 +++---
 2 files changed, 200 insertions(+), 13 deletions(-)
 create mode 100644 .github/workflows/weekly-index.yml

diff --git a/.github/workflows/weekly-index.yml b/.github/workflows/weekly-index.yml
new file mode 100644
index 0000000..15e732e
--- /dev/null
+++ b/.github/workflows/weekly-index.yml
@@ -0,0 +1,176 @@
+name: Weekly Index
+
+on:
+  schedule:
+    - cron: '0 3 * * 1'  # Every Monday at 03:00 UTC
+  workflow_dispatch:     # Manual trigger via GitHub UI
+
+jobs:
+  index:
+    runs-on: ubuntu-latest
+
+    env:
+      PHYSLIB_REPO: https://github.com/leanprover-community/physlib
+      JIXIA_REPO: https://github.com/frenzymath/jixia
+      MODULE_NAMES: Physlib
+      DRY_RUN: 'false'
+      CHROMA_PATH: chroma
+      CONNECTION_STRING: ${{ secrets.DATABASE_URL }}
+      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+      GEMINI_MODEL: ${{ vars.GEMINI_MODEL || 'gemini-2.5-flash-preview-04-17' }}
+      GEMINI_FAST_MODEL: ${{ vars.GEMINI_FAST_MODEL || 'gemini-2.5-flash-preview-04-17' }}
+      GEMINI_EMBEDDING_MODEL: ${{ vars.GEMINI_EMBEDDING_MODEL || 'gemini-embedding-2-preview' }}
+
+    steps:
+      - name: Checkout main
+        uses: actions/checkout@v4
+        with:
+          ref: main
+
+      # Add an authenticated heroku remote (actions/checkout only configures origin)
+      - name: Authenticate Heroku remote
+        env:
+          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
+        run: |
+          git remote add heroku "https://heroku:$HEROKU_API_KEY@git.heroku.com/physlibsearch.git"
+
+      # Restore chroma/ and the SHA tracker from the heroku git remote
+      - name: Restore ChromaDB and SHA tracker
+        run: |
+          git fetch heroku main 2>/dev/null || true  # a fresh heroku repo has no main yet
+          git checkout heroku/main -- chroma/ .last-physlib-sha 2>/dev/null \
+            || echo "No prior index on heroku/main — starting fresh."
+ + # Skip everything if PhysLib has not changed since the last run + - name: Check PhysLib for new commits + id: check + run: | + CURRENT_SHA=$(git ls-remote "$PHYSLIB_REPO" HEAD | cut -f1) + LAST_SHA=$(cat .last-physlib-sha 2>/dev/null || echo "") + echo "current_sha=$CURRENT_SHA" >> "$GITHUB_OUTPUT" + if [ "$CURRENT_SHA" = "$LAST_SHA" ]; then + echo "has_changes=false" >> "$GITHUB_OUTPUT" + echo "PhysLib unchanged at $CURRENT_SHA — nothing to do." + else + echo "has_changes=true" >> "$GITHUB_OUTPUT" + echo "PhysLib changed: $LAST_SHA -> $CURRENT_SHA" + fi + + - name: Set up Python + if: steps.check.outputs.has_changes == 'true' + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: pip + + - name: Install Python dependencies + if: steps.check.outputs.has_changes == 'true' + run: pip install -r requirements.txt + + # PhysLib must be cloned before the cache steps so hashFiles() can read lean-toolchain + - name: Clone PhysLib + if: steps.check.outputs.has_changes == 'true' + run: git clone --depth 1 "$PHYSLIB_REPO" physlib + + - name: Cache elan toolchains + if: steps.check.outputs.has_changes == 'true' + uses: actions/cache@v4 + with: + path: ~/.elan + key: elan-${{ hashFiles('physlib/lean-toolchain') }} + + - name: Cache PhysLib lake build + if: steps.check.outputs.has_changes == 'true' + uses: actions/cache@v4 + with: + path: physlib/.lake/build + key: physlib-lake-${{ steps.check.outputs.current_sha }} + restore-keys: physlib-lake- + + - name: Install elan (Lean version manager) + if: steps.check.outputs.has_changes == 'true' + run: | + if ! 
command -v elan &>/dev/null; then + curl -sSf https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh \ + | sh -s -- -y --no-modify-path + fi + echo "$HOME/.elan/bin" >> "$GITHUB_PATH" + + - name: Build PhysLib + if: steps.check.outputs.has_changes == 'true' + run: cd physlib && lake build + timeout-minutes: 90 + + # jixia must be cloned before its cache step so hashFiles() can read its lakefile + - name: Clone jixia + if: steps.check.outputs.has_changes == 'true' + run: git clone --depth 1 "$JIXIA_REPO" jixia + + - name: Cache jixia build + if: steps.check.outputs.has_changes == 'true' + uses: actions/cache@v4 + with: + path: jixia/.lake/build + key: jixia-${{ hashFiles('physlib/lean-toolchain') }}-${{ hashFiles('jixia/lakefile.lean', 'jixia/lakefile.toml') }} + restore-keys: jixia-${{ hashFiles('physlib/lean-toolchain') }}- + + - name: Build jixia + if: steps.check.outputs.has_changes == 'true' + run: cd jixia && lake build + timeout-minutes: 30 + + - name: Set JIXIA_PATH and LEAN_SYSROOT + if: steps.check.outputs.has_changes == 'true' + run: | + echo "JIXIA_PATH=$(pwd)/jixia/.lake/build/bin/jixia" >> "$GITHUB_ENV" + TOOLCHAIN=$(cat physlib/lean-toolchain) + echo "LEAN_SYSROOT=$HOME/.elan/toolchains/$TOOLCHAIN" >> "$GITHUB_ENV" + + # --- Incremental pipeline --- + # Each step skips rows/vectors that already exist, so only new theorems are processed. 
+
+      - name: Create/update schema
+        if: steps.check.outputs.has_changes == 'true'
+        run: python3 -m database schema
+
+      - name: Load jixia data into PostgreSQL
+        if: steps.check.outputs.has_changes == 'true'
+        run: python3 -m database jixia ./physlib "$MODULE_NAMES"
+
+      - name: Informalize new declarations
+        if: steps.check.outputs.has_changes == 'true'
+        run: python3 -m database informal --batch-size 50
+
+      - name: Embed new declarations into ChromaDB
+        if: steps.check.outputs.has_changes == 'true'
+        run: python3 -m database vector-db --batch-size 8
+
+      # Persist the updated chroma/ and SHA tracker back to the heroku git remote
+      - name: Push updated index to Heroku git remote
+        if: steps.check.outputs.has_changes == 'true'
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git worktree add /tmp/heroku-tree heroku/main
+          rsync -a --delete chroma/ /tmp/heroku-tree/chroma/
+          echo "${{ steps.check.outputs.current_sha }}" > /tmp/heroku-tree/.last-physlib-sha
+          cd /tmp/heroku-tree
+          git add -f chroma/ .last-physlib-sha
+          git diff --cached --quiet \
+            || git commit -m "chore: weekly index $(date -u +%Y-%m-%d) [physlib ${{ steps.check.outputs.current_sha }}]"
+          git push heroku HEAD:main
+          cd -
+          git worktree remove /tmp/heroku-tree
+
+      # Rebuild and release the Docker image so the live app gets the new chroma/
+      - name: Install Heroku CLI
+        if: steps.check.outputs.has_changes == 'true'
+        run: curl https://cli-assets.heroku.com/install.sh | sh
+
+      - name: Build and release Docker image
+        if: steps.check.outputs.has_changes == 'true'
+        env:
+          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
+        run: |
+          heroku container:login && heroku container:push web --app physlibsearch
+          heroku container:release web --app physlibsearch
diff --git a/database/vector_db.py b/database/vector_db.py
index 0229e5a..669805c 100644
--- a/database/vector_db.py
+++ b/database/vector_db.py
@@ -14,16 +14,19 @@ def create_vector_db(conn: Connection, path: 
str, batch_size: int): client = chromadb.PersistentClient(path) try: - client.delete_collection("physlibsearch") - logger.info("deleted existing physlibsearch collection") + collection = client.get_collection(name="physlibsearch", embedding_function=None) + existing_ids = set(collection.get(include=[])["ids"]) + logger.warning("using existing physlibsearch collection (%d vectors)", len(existing_ids)) except Exception: - pass - collection = client.create_collection( - name="physlibsearch", - metadata={"hnsw:space": "cosine"}, - embedding_function=None, - ) - + collection = client.create_collection( + name="physlibsearch", + metadata={"hnsw:space": "cosine"}, + embedding_function=None, + ) + existing_ids = set() + logger.warning("created new physlibsearch collection") + + added = 0 with conn.cursor() as cursor: cursor.execute(""" SELECT s.name, d.module_name, d.index, s.kind, d.signature, s.type, i.name, i.description @@ -40,12 +43,20 @@ def create_vector_db(conn: Connection, path: str, batch_size: int): for name, module_name, index, kind, signature, tp, informal_name, informal_description in batch: if signature is None: signature = tp - batch_doc.append(f"{kind} {name} {signature}\n{informal_name}: {informal_description}") # NOTE: the space character is not used in names from Physlib and its dependencies - batch_id.append(" ".join(str(x) for x in name)) - if os.environ["DRY_RUN"] == "true": - logger.info("DRY_RUN:skipped embedding: %s", f"{kind} {name} {signature} {informal_name}") + vec_id = " ".join(str(x) for x in name) + if vec_id in existing_ids: + continue + batch_doc.append(f"{kind} {name} {signature}\n{informal_name}: {informal_description}") + batch_id.append(vec_id) + if not batch_doc: + continue if os.environ["DRY_RUN"] == "true": + for doc in batch_doc: + logger.info("DRY_RUN:skipped embedding: %s", doc) return batch_embedding = embedding.embed(batch_doc) collection.add(embeddings=batch_embedding, ids=batch_id) + added += len(batch_id) + + 
logger.warning("vector-db: added %d new vectors, %d already existed", added, len(existing_ids))