Skip to content
86 changes: 86 additions & 0 deletions lib/services/distributedLock.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/**
* Distributed lock primitive for multi-pod cron jobs.
*
* Uses a MongoDB TTL collection (`cron_locks`) to ensure mutual exclusion
* across replicas. A lock document expires automatically after `ttlMs`
* milliseconds — pod crashes therefore never permanently block scheduling.
*
* Usage:
* const holder = `${process.env.HOSTNAME ?? 'unknown'}:${randomUUID()}`
* const acquired = await acquireLock({ name: 'billing.weeklyReset', ttlMs: 10 * 60 * 1000, holder })
* if (!acquired) return // another pod holds the lock
* try {
* // ... work
* } finally {
* await releaseLock({ name: 'billing.weeklyReset', holder })
* }
*/

import mongoose from 'mongoose';

// Every field on a lock document is mandatory; this builds the repeated
// `{ type, required: true }` definition for a given type.
const requiredField = (type) => ({ type, required: true });

// Shape of one lock document. `_id` is the lock name, so the default unique
// index on `_id` is what allows only one document per lock.
const lockFields = {
  _id: requiredField(String),
  lockedAt: requiredField(Date),
  lockedUntil: requiredField(Date),
  holder: requiredField(String),
};

// Stored in `cron_locks`; no `__v` version key is written on lock documents.
const lockSchemaOptions = { collection: 'cron_locks', versionKey: false };

const LockSchema = new mongoose.Schema(lockFields, lockSchemaOptions);

// TTL index on `lockedUntil` with a zero offset: MongoDB removes each document
// once the date stored in that field has passed, so stale locks don't pile up.
LockSchema.index({ lockedUntil: 1 }, { expireAfterSeconds: 0 });

// Reuse the model when one named `CronLock` is already registered on this
// mongoose instance; otherwise compile and register it now.
export const CronLock = mongoose.models.CronLock ?? mongoose.model('CronLock', LockSchema);

Comment thread
PierreBrisorgueil marked this conversation as resolved.
/**
 * @function acquireLock
 * @description Attempt to acquire a named lock. Resolves to true if the caller
 * now holds the lock, false if it is currently held by another holder.
 *
 * Implementation: one findOneAndUpdate with upsert, filtered on
 * `{ _id: name, lockedUntil: { $lt: now } }`. Three outcomes:
 *   - the lock doc exists and has expired: the filter matches and the doc is
 *     overwritten with the new holder and expiry;
 *   - no lock doc exists: the upsert inserts one;
 *   - the lock doc exists and has not expired: the filter does not match, the
 *     upsert tries to insert a second doc with the same _id and fails with a
 *     duplicate-key error (E11000), which is reported as false. The loser of
 *     a simultaneous race ends up on this path too.
 *
 * `name` and `holder` are checked here because this write goes through
 * findOneAndUpdate, so a missing value would not be rejected before reaching
 * the database. An undefined holder is especially misleading: the final
 * `=== holder` comparison could then report success for a lock that nobody
 * can identify as theirs.
 *
 * @param {object} opts
 * @param {string} opts.name - Unique lock name (e.g. 'billing.weeklyReset')
 * @param {number} opts.ttlMs - Lock duration in milliseconds
 * @param {string} opts.holder - Unique identifier for the calling pod/process
 * @returns {Promise<boolean>}
 * @throws {Error} If `name` or `holder` is not a non-empty string, or if
 * `ttlMs` is not a finite number greater than zero.
 * @throws Any database error other than duplicate-key is rethrown unchanged.
 */
export async function acquireLock({ name, ttlMs, holder }) {
  const requiredStrings = [
    ['name', name],
    ['holder', holder],
  ];
  for (const [label, value] of requiredStrings) {
    if (typeof value !== 'string' || value.length === 0) {
      throw new Error(`acquireLock: ${label} must be a non-empty string, received ${String(value)}`);
    }
  }
  if (!Number.isFinite(ttlMs) || ttlMs <= 0) {
    throw new Error(`acquireLock: ttlMs must be a positive number, received ${ttlMs}`);
  }

  // A single timestamp serves as both the expiry cut-off in the filter and
  // the new lockedAt, so the two can never disagree.
  const now = new Date();
  const lockedUntil = new Date(now.getTime() + ttlMs);

  try {
    const result = await CronLock.findOneAndUpdate(
      { _id: name, lockedUntil: { $lt: now } },
      { $set: { lockedAt: now, lockedUntil, holder } },
      { upsert: true, returnDocument: 'after' },
    );
    // Only report success when the post-update document names this caller.
    return result?.holder === holder;
  } catch (err) {
    // Duplicate key on _id: a live lock document already exists.
    if (err?.code === 11000) return false;
    throw err;
  }
}
Comment thread
PierreBrisorgueil marked this conversation as resolved.

/**
 * @function releaseLock
 * @description Delete the lock document for `name`, but only when the holder
 * recorded on it is `holder`. When the lock belongs to someone else (for
 * example this caller's TTL ran out and another pod re-acquired it) nothing
 * matches and nothing is deleted.
 *
 * @param {object} opts
 * @param {string} opts.name - Lock name to release
 * @param {string} opts.holder - Must match the holder that acquired the lock
 * @returns {Promise<void>} Resolves once the delete has completed; errors from
 * the delete propagate to the caller.
 */
export async function releaseLock({ name, holder }) {
  // Filtering on holder as well as _id is what makes the release ownership-safe.
  const ownedLock = { _id: name, holder };
  await CronLock.deleteOne(ownedLock);
}

// Default export: the model and both lock operations gathered on one object,
// for callers that prefer `import distributedLock from ...` over named imports.
const distributedLock = { CronLock, acquireLock, releaseLock };

export default distributedLock;
160 changes: 160 additions & 0 deletions lib/services/tests/distributedLock.unit.tests.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
/**
* Module dependencies.
*/
import { jest, describe, test, beforeEach, afterEach, expect } from '@jest/globals';

/**
* Unit tests for lib/services/distributedLock.js
*
* All Mongoose interactions are mocked — no real DB connection required.
* Tests verify the acquire / release contract and contention handling.
*/
describe('distributedLock — acquireLock:', () => {
let acquireLock;
let mockFindOneAndUpdate;
let mockDeleteOne;

// Every test gets a freshly imported module wired to fresh mocks.
beforeEach(async () => {
// Clear the module registry so the dynamic import below re-evaluates
// distributedLock.js against the mongoose mock registered in this hook.
jest.resetModules();

mockFindOneAndUpdate = jest.fn();
mockDeleteOne = jest.fn();

// Stand-in for the CronLock model: only the two methods the module calls.
const mockCronLock = {
findOneAndUpdate: mockFindOneAndUpdate,
deleteOne: mockDeleteOne,
};

// Replace mongoose entirely. `models` is empty, so the module's
// `mongoose.models.CronLock ?? mongoose.model(...)` falls through to
// `model()`, which hands back the stand-in above. Schema is a no-op shell
// that accepts the constructor call and the `index()` call.
jest.unstable_mockModule('mongoose', () => ({
default: {
Schema: class MockSchema {
constructor() {}
index() {}
},
models: {},
model: jest.fn(() => mockCronLock),
},
}));

// Imported only after the mock is registered so the module binds to it.
({ acquireLock } = await import('../distributedLock.js'));
});

afterEach(() => {
jest.restoreAllMocks();
});

// Success path: the returned doc names the caller. Also pins the shape of the
// filter, update and options handed to findOneAndUpdate.
test('returns true when findOneAndUpdate resolves with matching holder', async () => {
mockFindOneAndUpdate.mockResolvedValue({ holder: 'pod-1' });

const ok = await acquireLock({ name: 'job-a', ttlMs: 60_000, holder: 'pod-1' });

expect(ok).toBe(true);
expect(mockFindOneAndUpdate).toHaveBeenCalledTimes(1);
const [filter, update, opts] = mockFindOneAndUpdate.mock.calls[0];
expect(filter._id).toBe('job-a');
expect(filter.lockedUntil.$lt).toBeInstanceOf(Date);
expect(update.$set.holder).toBe('pod-1');
expect(opts.upsert).toBe(true);
});

// The resolved doc names pod-1 while the caller is pod-2, so the holder
// comparison on the returned doc must yield false.
test('returns false when findOneAndUpdate returns doc held by different holder', async () => {
mockFindOneAndUpdate.mockResolvedValue({ holder: 'pod-1' });

const ok = await acquireLock({ name: 'job-b', ttlMs: 60_000, holder: 'pod-2' });

expect(ok).toBe(false);
});

// A rejection carrying code 11000 is treated as "lock is taken", not as a failure.
test('returns false on E11000 duplicate-key (concurrent upsert race)', async () => {
const dupErr = new Error('E11000 duplicate key');
dupErr.code = 11000;
mockFindOneAndUpdate.mockRejectedValue(dupErr);

const ok = await acquireLock({ name: 'job-c', ttlMs: 60_000, holder: 'pod-1' });

expect(ok).toBe(false);
});

// Any other error code must surface to the caller unchanged.
test('re-throws non-duplicate errors', async () => {
const dbErr = new Error('network timeout');
dbErr.code = 13;
mockFindOneAndUpdate.mockRejectedValue(dbErr);

await expect(acquireLock({ name: 'job-d', ttlMs: 60_000, holder: 'pod-1' })).rejects.toThrow('network timeout');
});

// ttlMs guard: each row is [value, label]; the label only feeds the test title.
// The guard must reject before the model is touched.
test.each([
[0, 'zero'],
[-1, 'negative'],
[Number.NaN, 'NaN'],
[Infinity, 'Infinity'],
[undefined, 'undefined'],
[null, 'null'],
])('throws when ttlMs is %s (%s)', async (ttlMs) => {
await expect(acquireLock({ name: 'job-guard', ttlMs, holder: 'pod-1' })).rejects.toThrow(
'acquireLock: ttlMs must be a positive number',
);
expect(mockFindOneAndUpdate).not.toHaveBeenCalled();
});

// Bracket the call with Date.now() readings: the module takes its own
// timestamp somewhere between them, so lockedUntil must fall within
// [before + ttlMs, after + ttlMs].
test('lockedUntil is set to now + ttlMs', async () => {
const before = Date.now();
mockFindOneAndUpdate.mockResolvedValue({ holder: 'pod-1' });

await acquireLock({ name: 'job-e', ttlMs: 10_000, holder: 'pod-1' });

const after = Date.now();
const { lockedUntil } = mockFindOneAndUpdate.mock.calls[0][1].$set;
expect(lockedUntil.getTime()).toBeGreaterThanOrEqual(before + 10_000);
expect(lockedUntil.getTime()).toBeLessThanOrEqual(after + 10_000);
});
});

describe('distributedLock — releaseLock:', () => {
let releaseLock;
let mockDeleteOne;

// Every test gets a freshly imported module wired to a fresh deleteOne mock.
beforeEach(async () => {
// Clear the module registry so the dynamic import below re-evaluates
// distributedLock.js against the mongoose mock registered in this hook.
jest.resetModules();

// Resolves by default; individual tests override it to simulate failures.
mockDeleteOne = jest.fn().mockResolvedValue({});

// Stand-in for the CronLock model. findOneAndUpdate is present only for
// shape; releaseLock calls deleteOne alone.
const mockCronLock = {
findOneAndUpdate: jest.fn(),
deleteOne: mockDeleteOne,
};

// Replace mongoose entirely. `models` is empty, so the module's
// `mongoose.models.CronLock ?? mongoose.model(...)` falls through to
// `model()`, which hands back the stand-in above.
jest.unstable_mockModule('mongoose', () => ({
default: {
Schema: class MockSchema {
constructor() {}
index() {}
},
models: {},
model: jest.fn(() => mockCronLock),
},
}));

// Imported only after the mock is registered so the module binds to it.
({ releaseLock } = await import('../distributedLock.js'));
});

afterEach(() => {
jest.restoreAllMocks();
});

// The delete filter must carry both the lock name and the holder, so a
// caller can only remove a lock document that names it.
test('calls deleteOne with name and holder', async () => {
await releaseLock({ name: 'job-a', holder: 'pod-1' });

expect(mockDeleteOne).toHaveBeenCalledWith({ _id: 'job-a', holder: 'pod-1' });
});

// releaseLock resolves to undefined; the delete result is not passed back.
test('does not throw when deleteOne resolves', async () => {
await expect(releaseLock({ name: 'job-b', holder: 'pod-2' })).resolves.toBeUndefined();
});

// Database failures are not swallowed: the caller sees the rejection.
test('propagates deleteOne errors to the caller', async () => {
const dbErr = new Error('network timeout');
mockDeleteOne.mockRejectedValue(dbErr);
await expect(releaseLock({ name: 'job-c', holder: 'pod-1' })).rejects.toThrow('network timeout');
});
});
43 changes: 43 additions & 0 deletions modules/billing/RUNBOOKS.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,46 @@ Operational runbooks for the billing module. Each runbook references real endpoi
4. Monitor `billing.plans.stale` event frequency — if the stale cache is 24h+, alert the on-call to decide whether to take the plans endpoint down entirely or serve a static fallback.
5. Once Stripe recovers: `POST /api/admin/billing/sync/:orgId` on any org that attempted a subscription change during the outage.
6. Check dead-letter queue for events that exhausted retries during the outage window: `GET /api/admin/billing/dead-letters`.

---

## 6 — Cron lock stuck

**Symptom:** All billing crons emit `lock held by another pod, skipping` for longer than the lock TTL duration, meaning no billing cron is running at all.

**Cause:** A pod crashed mid-job without reaching the `finally` block that calls `releaseLock`. The TTL has not yet expired on the stale lock doc in `cron_locks`.

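**Wait first:** Lock TTLs are sized 2–3× typical exec time. Wait for the TTL to expire (max 15 min for `dunningSweep`). Once `lockedUntil` is in the past, the next `acquireLock` call takes the lock over immediately; it does not need the stale document to be deleted first. MongoDB's TTL monitor runs every 60 seconds, so the expired document itself may linger for up to 60 s after expiry before it is cleaned up.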

**If urgent — drop the stale lock manually:**

**Before dropping:** verify the holder and TTL window first to avoid kicking out a running cron.

```js
db.cron_locks.findOne({ _id: "billing.weeklyReset" })
// If lockedUntil is in the past → safe to drop.
// If in the future → the lock is genuinely held; wait for TTL unless the holder pod is confirmed dead.
```

Then drop:

```js
// weeklyReset
db.cron_locks.deleteOne({ _id: "billing.weeklyReset" })

// dunningSweep
db.cron_locks.deleteOne({ _id: "billing.dunningSweep" })

// extrasExpiration
db.cron_locks.deleteOne({ _id: "billing.extrasExpiration" })
```

Or via `kubectl exec` on the mongo pod:

```bash
kubectl exec -n pierreb-projects mongo-0 -- mongosh \
"mongodb://localhost:27017/<your-db>" \
--eval 'db.cron_locks.deleteOne({ _id: "billing.weeklyReset" })'
```

**Prevention:** Lock TTLs are intentionally conservative. If you see frequent stuck-lock incidents, investigate cron duration (slow query? tenant scale?) rather than lowering the TTL — a TTL that is too short defeats the mutual-exclusion guarantee.
19 changes: 19 additions & 0 deletions modules/billing/crons/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,22 @@ const orgs = allOrgs.filter(o => {
## Dependency: meterMode flag

All scripts check `config.billing.meterMode` at startup. Downstream projects must set this flag to `true` in their project config to activate billing crons. The devkit default is `false` — all crons are no-ops until explicitly enabled.

## Concurrency control

All billing crons acquire a distributed lock (`lib/services/distributedLock.js`) before
mutating state. The lock auto-expires after TTL (5–15 min depending on cron)
so that pod crashes don't permanently block scheduling.

Lock names and TTLs:

| Lock name | TTL | Cron |
|-----------|-----|------|
| `billing.weeklyReset` | 10 min | `billing.weeklyReset.js` |
| `billing.dunningSweep` | 15 min | `billing.dunningSweep.js` |
| `billing.extrasExpiration` | 5 min | `billing.extrasExpiration.js` |

If you see `lock held by another pod, skipping` in logs, that is expected when
two pods race after a K8s `concurrencyPolicy` bypass (e.g. pod crash after
jitter but before finalize). See the runbook entry `## 6 — Cron lock stuck` in
`modules/billing/RUNBOOKS.md` for manual resolution.
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Loading
Loading