Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 48 additions & 10 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ on:
- .github/workflows/docker.yml
- uv.lock

# Required for Workload Identity Federation to GCP.
# `id-token: write` lets the runner mint an OIDC token that GCP exchanges for
# short-lived credentials. `contents: read` is needed for actions/checkout.
permissions:
contents: read
id-token: write

jobs:
push_to_registry:
name: Build and push Docker images
Expand All @@ -29,6 +36,17 @@ jobs:
strategy:
matrix:
backend: [vllm, sglang]
env:
# These are read from GitHub Actions repository variables (Settings ->
# Secrets and variables -> Actions -> Variables). Set them once for the
# repo and they apply to every workflow run.
#
# GCP_AR_REGION e.g. us-central1, northamerica-northeast1
# GCP_PROJECT_ID e.g. my-gcp-project-123456
# GCP_AR_REPOSITORY e.g. vector-inference (must already exist in GAR)
GCP_AR_REGION: ${{ vars.GCP_AR_REGION }}
GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID }}
GCP_AR_REPOSITORY: ${{ vars.GCP_AR_REPOSITORY }}
steps:
- name: Checkout repository
uses: actions/checkout@v6.0.2
Expand All @@ -39,38 +57,58 @@ jobs:
VERSION=$(grep -A 1 "name = \"${{ matrix.backend }}\"" uv.lock | grep version | cut -d '"' -f 2)
echo "version=$VERSION" >> $GITHUB_OUTPUT

- name: Compute image base path
id: image
run: |
BASE="${GCP_AR_REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/${GCP_AR_REPOSITORY}/vector-inference-${{ matrix.backend }}"
echo "base=${BASE}" >> $GITHUB_OUTPUT

- name: Maximize build space
run: |
echo "Disk space before cleanup:"
df -h
# Remove unnecessary pre-installed software
sudo rm -rf /usr/share/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
# Clean apt cache
sudo apt-get clean
# Remove docker images
docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true
echo "Disk space after cleanup:"
df -h

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v4

- name: Log in to Docker Hub
uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121
# Authenticate to Google Cloud using Workload Identity Federation.
# No long-lived service-account JSON key is stored in GitHub.
#
# Required secrets:
# GCP_WIF_PROVIDER Full resource name of the Workload Identity
# Provider, e.g.
# projects/123456789/locations/global/workloadIdentityPools/github-pool/providers/github-provider
# GCP_WIF_SERVICE_ACCOUNT Email of the service account to impersonate,
# e.g. gh-actions-pusher@my-project.iam.gserviceaccount.com
- name: Authenticate to Google Cloud
id: gcp-auth
uses: google-github-actions/auth@v2
with:
workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER }}
service_account: ${{ secrets.GCP_WIF_SERVICE_ACCOUNT }}

- name: Log in to Google Artifact Registry
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
registry: ${{ env.GCP_AR_REGION }}-docker.pkg.dev
username: oauth2accesstoken
password: ${{ steps.gcp-auth.outputs.access_token }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@030e881283bb7a6894de51c315a6bfe6a94e05cf
with:
images: vectorinstitute/vector-inference-${{ matrix.backend }}
images: ${{ steps.image.outputs.base }}

- name: Build and push Docker image
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294
Expand All @@ -80,6 +118,6 @@ jobs:
push: true
tags: |
${{ steps.meta.outputs.tags }}
vectorinstitute/vector-inference-${{ matrix.backend }}:${{ steps.backend-version.outputs.version }}
vectorinstitute/vector-inference-${{ matrix.backend }}:latest
${{ steps.image.outputs.base }}:${{ steps.backend-version.outputs.version }}
${{ steps.image.outputs.base }}:latest
labels: ${{ steps.meta.outputs.labels }}
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,18 @@ If you are using the Vector cluster environment, and you don't need any customiz
```bash
pip install vec-inf
```
Otherwise, we recommend using the provided [`vllm.Dockerfile`](vllm.Dockerfile) and [`sglang.Dockerfile`](sglang.Dockerfile) to set up your own environment with the package. The built images are available through [Docker Hub](https://hub.docker.com/orgs/vectorinstitute/repositories)
Otherwise, we recommend using the provided [`vllm.Dockerfile`](vllm.Dockerfile) and [`sglang.Dockerfile`](sglang.Dockerfile) to set up your own environment with the package. The built images are published to **Google Artifact Registry** at:

```
<REGION>-docker.pkg.dev/<PROJECT_ID>/<REPOSITORY>/vector-inference-vllm
<REGION>-docker.pkg.dev/<PROJECT_ID>/<REPOSITORY>/vector-inference-sglang
```

After running `gcloud auth configure-docker <REGION>-docker.pkg.dev` once, pull an image with:

```bash
docker pull <REGION>-docker.pkg.dev/<PROJECT_ID>/<REPOSITORY>/vector-inference-vllm:latest
```

If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files. There are 3 ways to do it:
* Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](vec_inf/config/), then install from source by running `pip install .`.
Expand Down
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
pip install vec-inf
```

Otherwise, we recommend using the provided [`vllm.Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/vllm.Dockerfile) and [`sglang.Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/sglang.Dockerfile) to set up your own environment with the package. The built images are available through [Docker Hub](https://hub.docker.com/orgs/vectorinstitute/repositories)
Otherwise, we recommend using the provided [`vllm.Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/vllm.Dockerfile) and [`sglang.Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/sglang.Dockerfile) to set up your own environment with the package. The built images are published to **Google Artifact Registry** at `<REGION>-docker.pkg.dev/<PROJECT_ID>/<REPOSITORY>/vector-inference-{vllm,sglang}`. Run `gcloud auth configure-docker <REGION>-docker.pkg.dev` once, then `docker pull` the image you want.

If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files. There are 3 ways to do it:

Expand Down
16 changes: 13 additions & 3 deletions sglang.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,21 @@ ENV NCCL_DEBUG=INFO
WORKDIR /vec-inf
COPY . /vec-inf

# Install project dependencies with sglang backend and inference group
# Use --no-cache to prevent uv from storing both downloaded and extracted packages
RUN uv pip install --system -e .[sglang] --group inference --prerelease=allow --no-cache && \
# Install project dependencies pinned to uv.lock.
# See vllm.Dockerfile for the full rationale; same logic, sglang extra.
RUN uv export --frozen --no-emit-project --no-hashes \
--extra sglang --group inference \
-o /tmp/requirements.txt && \
uv pip install --system --no-cache --no-deps --prerelease=allow \
-r /tmp/requirements.txt && \
uv pip install --system --no-cache --no-deps -e . && \
rm -f /tmp/requirements.txt && \
rm -rf /root/.cache/uv /tmp/*

# Build-time canary: fail the build if the locked deps cannot be imported.
RUN python3.12 -c "import sglang, torch; \
print('sglang', sglang.__version__, '/ torch', torch.__version__)"

# Install a single, system NCCL (from NVIDIA CUDA repo in base image)
RUN apt-get update && apt-get install -y --allow-change-held-packages\
libnccl2 libnccl-dev \
Expand Down
27 changes: 24 additions & 3 deletions vllm.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,32 @@ ENV NCCL_DEBUG=INFO
WORKDIR /vec-inf
COPY . /vec-inf

# Install project dependencies with vllm backend and inference group
# Use --no-cache to prevent uv from storing both downloaded and extracted packages
RUN uv pip install --system -e .[vllm] --group inference --prerelease=allow --no-cache && \
# Install project dependencies pinned to uv.lock.
#
# `uv pip install` does NOT consult uv.lock -- only `uv sync` does, and
# `uv sync` requires a venv (incompatible with --system). Without this,
# every image build does fresh PyPI resolution and may pick a different
# transitive set than what the lockfile records (this is how :0.19.0
# shipped with the pyarrow/datasets ABI mismatch). Instead:
# 1. Export uv.lock to a fully-pinned requirements.txt (no resolver).
# 2. Install transitives with --no-deps so nothing is re-resolved.
# 3. Install the project itself editable, also --no-deps.
RUN uv export --frozen --no-emit-project --no-hashes \
--extra vllm --group inference \
-o /tmp/requirements.txt && \
uv pip install --system --no-cache --no-deps --prerelease=allow \
-r /tmp/requirements.txt && \
uv pip install --system --no-cache --no-deps -e . && \
rm -f /tmp/requirements.txt && \
rm -rf /root/.cache/uv /tmp/*

# Build-time canary: fail the build if the locked deps cannot be imported
# together. This is the check that would have caught the pyarrow/datasets
# ABI mismatch in :0.19.0 at build time instead of at job start.
RUN python3.12 -c "import vllm, datasets, pyarrow, transformers, torch; \
print('vllm', vllm.__version__, '/ datasets', datasets.__version__, \
'/ pyarrow', pyarrow.__version__, '/ torch', torch.__version__)"

# Install a single, system NCCL (from NVIDIA CUDA repo in base image)
RUN apt-get update && apt-get install -y --allow-change-held-packages\
libnccl2 libnccl-dev \
Expand Down
Loading