-
Notifications
You must be signed in to change notification settings - Fork 1
Add incremental vector-db and weekly CI workflow #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,176 @@ | ||||||||||||||||||||||||||||||
| name: Weekly Index | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| on: | ||||||||||||||||||||||||||||||
| schedule: | ||||||||||||||||||||||||||||||
| - cron: '0 3 * * 1' # Every Monday at 03:00 UTC | ||||||||||||||||||||||||||||||
| workflow_dispatch: # Manual trigger via GitHub UI | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| jobs: | ||||||||||||||||||||||||||||||
| index: | ||||||||||||||||||||||||||||||
| runs-on: ubuntu-latest | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| env: | ||||||||||||||||||||||||||||||
| PHYSLIB_REPO: https://github.com/leanprover-community/physlib | ||||||||||||||||||||||||||||||
| JIXIA_REPO: https://github.com/frenzymath/jixia | ||||||||||||||||||||||||||||||
| MODULE_NAMES: Physlib | ||||||||||||||||||||||||||||||
| DRY_RUN: 'false' | ||||||||||||||||||||||||||||||
| CHROMA_PATH: chroma | ||||||||||||||||||||||||||||||
| CONNECTION_STRING: ${{ secrets.DATABASE_URL }} | ||||||||||||||||||||||||||||||
| GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} | ||||||||||||||||||||||||||||||
| GEMINI_MODEL: ${{ vars.GEMINI_MODEL || 'gemini-2.5-flash-preview-04-17' }} | ||||||||||||||||||||||||||||||
| GEMINI_FAST_MODEL: ${{ vars.GEMINI_FAST_MODEL || 'gemini-2.5-flash-preview-04-17' }} | ||||||||||||||||||||||||||||||
| GEMINI_EMBEDDING_MODEL: ${{ vars.GEMINI_EMBEDDING_MODEL || 'gemini-embedding-2-preview' }} | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| steps: | ||||||||||||||||||||||||||||||
| - name: Checkout main | ||||||||||||||||||||||||||||||
| uses: actions/checkout@v4 | ||||||||||||||||||||||||||||||
| with: | ||||||||||||||||||||||||||||||
| ref: main | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| # Authenticate the heroku remote so we can read/write chroma/ storage | ||||||||||||||||||||||||||||||
| - name: Authenticate Heroku remote | ||||||||||||||||||||||||||||||
| env: | ||||||||||||||||||||||||||||||
| HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }} | ||||||||||||||||||||||||||||||
| run: | | ||||||||||||||||||||||||||||||
| git remote set-url heroku "https://heroku:$HEROKU_API_KEY@git.heroku.com/physlibsearch.git" | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
| git remote set-url heroku "https://heroku:$HEROKU_API_KEY@git.heroku.com/physlibsearch.git" | |
| git remote add heroku "https://heroku:$HEROKU_API_KEY@git.heroku.com/physlibsearch.git" \ | |
| || git remote set-url heroku "https://heroku:$HEROKU_API_KEY@git.heroku.com/physlibsearch.git" |
Copilot
AI
Apr 22, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`git fetch heroku main` will hard-fail the workflow if the Heroku remote/branch doesn't exist yet (e.g., first run or app not initialized), even though later steps try to handle a missing prior index. Consider making the fetch/checkout resilient (e.g., tolerate a missing branch and proceed with an empty `chroma/` and no `.last-physlib-sha`) so the workflow can bootstrap from scratch.
| git fetch heroku main | |
| git checkout heroku/main -- chroma/ .last-physlib-sha 2>/dev/null \ | |
| || echo "No prior index on heroku/main — starting fresh." | |
| if git fetch heroku main; then | |
| git checkout heroku/main -- chroma/ .last-physlib-sha 2>/dev/null || { | |
| echo "No prior index files on heroku/main — starting fresh." | |
| mkdir -p "$CHROMA_PATH" | |
| rm -f .last-physlib-sha | |
| } | |
| else | |
| echo "No heroku/main branch available yet — starting fresh." | |
| mkdir -p "$CHROMA_PATH" | |
| rm -f .last-physlib-sha | |
| fi |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,16 +14,19 @@ def create_vector_db(conn: Connection, path: str, batch_size: int): | |
|
|
||
| client = chromadb.PersistentClient(path) | ||
| try: | ||
| client.delete_collection("physlibsearch") | ||
| logger.info("deleted existing physlibsearch collection") | ||
| collection = client.get_collection(name="physlibsearch", embedding_function=None) | ||
| existing_ids = set(collection.get(include=[])["ids"]) | ||
| logger.warning("using existing physlibsearch collection (%d vectors)", len(existing_ids)) | ||
|
Comment on lines +18 to +19
|
||
| except Exception: | ||
| pass | ||
| collection = client.create_collection( | ||
| name="physlibsearch", | ||
| metadata={"hnsw:space": "cosine"}, | ||
| embedding_function=None, | ||
| ) | ||
|
|
||
| collection = client.create_collection( | ||
| name="physlibsearch", | ||
| metadata={"hnsw:space": "cosine"}, | ||
| embedding_function=None, | ||
| ) | ||
|
Comment on lines 16 to +25
|
||
| existing_ids = set() | ||
| logger.warning("created new physlibsearch collection") | ||
|
|
||
| added = 0 | ||
| with conn.cursor() as cursor: | ||
| cursor.execute(""" | ||
| SELECT s.name, d.module_name, d.index, s.kind, d.signature, s.type, i.name, i.description | ||
|
|
@@ -40,12 +43,20 @@ def create_vector_db(conn: Connection, path: str, batch_size: int): | |
| for name, module_name, index, kind, signature, tp, informal_name, informal_description in batch: | ||
| if signature is None: | ||
| signature = tp | ||
| batch_doc.append(f"{kind} {name} {signature}\n{informal_name}: {informal_description}") | ||
| # NOTE: the space character is not used in names from Physlib and its dependencies | ||
| batch_id.append(" ".join(str(x) for x in name)) | ||
| if os.environ["DRY_RUN"] == "true": | ||
| logger.info("DRY_RUN:skipped embedding: %s", f"{kind} {name} {signature} {informal_name}") | ||
| vec_id = " ".join(str(x) for x in name) | ||
| if vec_id in existing_ids: | ||
| continue | ||
| batch_doc.append(f"{kind} {name} {signature}\n{informal_name}: {informal_description}") | ||
|
Comment on lines +47 to +50
|
||
| batch_id.append(vec_id) | ||
| if not batch_doc: | ||
| continue | ||
| if os.environ["DRY_RUN"] == "true": | ||
| for doc in batch_doc: | ||
| logger.info("DRY_RUN:skipped embedding: %s", doc) | ||
| return | ||
| batch_embedding = embedding.embed(batch_doc) | ||
| collection.add(embeddings=batch_embedding, ids=batch_id) | ||
| added += len(batch_id) | ||
|
|
||
| logger.warning("vector-db: added %d new vectors, %d already existed", added, len(existing_ids)) | ||
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This workflow runs `python3 -m database ...`, which executes `database/__main__.py`. That module currently requires `LOG_FILENAME`, `LOG_FILEMODE`, and `LOG_LEVEL` to be present in the environment (it uses `os.environ[...]`), but they are not set here, so the job will crash with `KeyError`. Either set these env vars in the workflow (e.g., `LOG_LEVEL: WARNING`, etc.) or make `database/__main__.py` use `os.environ.get(...)` defaults.