Skip to content
44 changes: 44 additions & 0 deletions modules/billing/RUNBOOKS.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,47 @@ kubectl exec -n pierreb-projects mongo-0 -- mongosh \
```

**Prevention:** Lock TTLs are intentionally conservative. If you see frequent stuck-lock incidents, investigate cron duration (slow query? tenant scale?) rather than lowering the TTL — a TTL that is too short defeats the mutual-exclusion guarantee.

---

## 7 — Refund Correlation Backfill Failure

**Symptom:** ERROR log `[billing.webhook] PI metadata backfill failed after retries — refund correlation at risk`.
A later refund webhook may additionally log `refund unresolved — no stripeSessionId on charge metadata`.

**Cause:** `stripe.paymentIntents.update` failed on all 3 attempts (the initial call plus 2 retries) during
`checkout.session.completed` handling. Likely cause: transient Stripe API outage.

**Triage:**

1. Query unresolved entries:

```javascript
db.billing_failed_backfills.find({ resolvedAt: null })
```

Each document contains `paymentIntentId`, `stripeSessionId`, `error`, and `failedAt`.

2. For each entry, manually patch the PI via Stripe CLI or Dashboard:

```bash
stripe payment_intents update pi_xxx \
--metadata stripeSessionId=cs_xxx
```

Verify in Stripe Dashboard → Payments → PaymentIntent → Metadata.

3. Mark the record resolved to close the loop:

```javascript
db.billing_failed_backfills.updateOne(
{ _id: ObjectId('...') },
{ $set: { resolvedAt: new Date(), resolvedBy: 'admin' } }
)
```

4. Confirm refund correlation: if a refund was already processed while the PI metadata was missing,
check for `billing.refund.unresolved` alerts and follow **Runbook #2** (Dead-Letter Investigation)
to replay the refund event after the PI metadata is patched.

**Escalate if:** frequency exceeds 1 per week → promote to cron-based auto-retry.
36 changes: 36 additions & 0 deletions modules/billing/lib/billing.retry.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/**
 * Retry an async operation with exponential backoff.
 *
 * The delay before retry number i (0-based) is `baseMs * 2^i`, clamped to
 * `maxMs`. With the defaults (attempts=3, baseMs=200) the delays are 200ms
 * and then 400ms; there is no delay after the final attempt.
 *
 * Resolves with the result of the first successful call, or rejects with the
 * error of the last attempt once all attempts are exhausted.
 *
 * @template T
 * @param {() => Promise<T>} fn - Async function to attempt.
 * @param {object} [opts]
 * @param {number} [opts.attempts=3] - Maximum number of attempts (including the first call).
 * @param {number} [opts.baseMs=200] - Base delay in ms for the first retry.
 * @param {number} [opts.maxMs=2147483647] - Upper bound in ms for any single delay.
 * @returns {Promise<T>}
 * @throws {TypeError} If `fn` is not a function or an option is out of range.
 */
export async function retryWithBackoff(fn, { attempts = 3, baseMs = 200, maxMs = 2 ** 31 - 1 } = {}) {
  // setTimeout stores the delay as a signed 32-bit integer; anything larger is
  // silently replaced by 1ms, which would turn a long backoff into no backoff.
  const TIMER_LIMIT_MS = 2 ** 31 - 1;

  // Fail fast: without this check a non-callable `fn` would only surface its
  // TypeError after every backoff delay had elapsed.
  if (typeof fn !== 'function') {
    throw new TypeError(`retryWithBackoff: fn must be a function, received ${typeof fn}`);
  }
  if (!Number.isInteger(attempts) || attempts < 1) {
    throw new TypeError(`retryWithBackoff: attempts must be a positive integer, received ${attempts}`);
  }
  if (!Number.isFinite(baseMs) || baseMs < 0) {
    throw new TypeError(`retryWithBackoff: baseMs must be a non-negative finite number, received ${baseMs}`);
  }
  if (typeof maxMs !== 'number' || Number.isNaN(maxMs) || maxMs < 0) {
    throw new TypeError(`retryWithBackoff: maxMs must be a non-negative number, received ${maxMs}`);
  }

  let lastErr;
  for (let i = 0; i < attempts; i++) {
    try {
      return await fn();
    } catch (err) {
      lastErr = err;
      // Sleep only between attempts, never after the final one.
      if (i < attempts - 1) {
        const delayMs = Math.min(baseMs * 2 ** i, maxMs, TIMER_LIMIT_MS);
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      }
    }
  }
  throw lastErr;
}
Comment thread
PierreBrisorgueil marked this conversation as resolved.
Comment thread
coderabbitai[bot] marked this conversation as resolved.
94 changes: 94 additions & 0 deletions modules/billing/models/billing.failedBackfill.model.mongoose.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/**
* Module dependencies
*/
import mongoose from 'mongoose';

const { Schema } = mongoose;

/**
 * BillingFailedBackfill Data Model Mongoose
 *
 * Dead-letter store for PaymentIntent metadata backfill failures. A document
 * is written when the refund-correlation backfill (stripe.paymentIntents.update
 * in handleCheckoutPaymentCompleted) still fails after all retry attempts.
 *
 * Documents carry no TTL: they stay in the collection so operators can
 * reconcile unresolved entries by hand, and `resolvedAt` is filled in once the
 * manual fix is done.
 */
const failedBackfillFields = {
  // Stripe PaymentIntent whose metadata could not be updated.
  paymentIntentId: {
    type: String,
    required: true,
    index: true,
  },
  // Checkout session id that should have been written to the PI metadata.
  stripeSessionId: {
    type: String,
    required: true,
  },
  // Serialised error message from the last failed attempt.
  error: {
    type: String,
    default: null,
  },
  // Time of the failure; defaults to the moment the document is created.
  failedAt: {
    type: Date,
    required: true,
    default: () => new Date(),
  },
  // Set by the operator once the PI metadata has been patched manually and
  // the refund-correlation risk is gone. `null` means "still unresolved".
  resolvedAt: {
    type: Date,
    default: null,
  },
  // Operator tag explaining how the record was resolved, e.g. 'admin', 'cron'.
  resolvedBy: {
    type: String,
    default: null,
  },
};

// Explicit collection name; createdAt/updatedAt are not maintained by mongoose.
const failedBackfillOptions = {
  collection: 'billing_failed_backfills',
  timestamps: false,
};

const BillingFailedBackfillMongoose = new Schema(failedBackfillFields, failedBackfillOptions);

// Partial index restricted to unresolved documents, so it stays small however
// many resolved entries pile up. A sparse index would not help: `resolvedAt`
// defaults to null and is therefore present on every document, while sparse
// only skips documents where the field is absent.
BillingFailedBackfillMongoose.index(
  { resolvedAt: 1 },
  { partialFilterExpression: { resolvedAt: null } },
);

/**
 * Getter used for the `id` virtual: converts the document's ObjectId into its
 * hexadecimal string form. Must be invoked with the document as `this`.
 * @returns {string} Hex string of the ObjectId.
 */
function addID() {
  const { _id: objectId } = this;
  return objectId.toHexString();
}

/**
 * Model configuration: expose the hex ObjectId as an `id` virtual and include
 * virtuals whenever a document is serialised with toJSON.
 */
BillingFailedBackfillMongoose.virtual('id').get(addID);
BillingFailedBackfillMongoose.set('toJSON', { virtuals: true });

// Reuse the model when it is already registered on this mongoose instance;
// otherwise compile it from the schema.
const registeredModel = mongoose.models.BillingFailedBackfill;

export const BillingFailedBackfill =
  registeredModel ?? mongoose.model('BillingFailedBackfill', BillingFailedBackfillMongoose);
30 changes: 30 additions & 0 deletions modules/billing/repositories/billing.failedBackfill.repository.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/**
* Module dependencies
*/
import mongoose from 'mongoose';

/**
 * @function BillingFailedBackfill
 * @description Looks up the BillingFailedBackfill Mongoose model on every call
 * instead of at import time, so this module can be imported (e.g. by unit
 * tests) before the model has been registered.
 * @returns {import('mongoose').Model} The registered BillingFailedBackfill model.
 */
// biome-ignore lint/correctness/useQwikValidLexicalScope: false positive — Node.js repository, not Qwik
const BillingFailedBackfill = () => {
  const modelName = 'BillingFailedBackfill';
  return mongoose.model(modelName);
};

/**
 * @function record
 * @description Persist a dead-letter entry describing a PaymentIntent metadata
 * backfill that could not be completed. Invoked by billing.webhook.service once
 * every retry attempt has failed.
 * @param {object} opts
 * @param {string} opts.paymentIntentId - Stripe PaymentIntent id (pi_*).
 * @param {string} opts.stripeSessionId - Stripe checkout session id (cs_*).
 * @param {string|null} [opts.error] - Serialised error message from the last failed attempt.
 * @param {Date} [opts.failedAt] - Timestamp of the failure (defaults to now).
 * @returns {Promise<import('mongoose').Document>}
 */
// biome-ignore lint/correctness/useQwikValidLexicalScope: false positive — Node.js repository, not Qwik
const record = ({ paymentIntentId, stripeSessionId, error = null, failedAt = new Date() }) => {
  const Model = BillingFailedBackfill();
  const entry = { paymentIntentId, stripeSessionId, error, failedAt };
  return Model.create(entry);
};

export default { record };
45 changes: 31 additions & 14 deletions modules/billing/services/billing.webhook.service.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ import logger from '../../../lib/services/logger.js';
import SubscriptionRepository from '../repositories/billing.subscription.repository.js';
import ProcessedStripeEventRepository from '../repositories/billing.processedStripeEvent.repository.js';
import OrganizationRepository from '../../organizations/repositories/organizations.repository.js';
import BillingFailedBackfillRepository from '../repositories/billing.failedBackfill.repository.js';
import BillingExtraService from './billing.extra.service.js';
import BillingResetService from './billing.reset.service.js';
import billingEvents from '../lib/events.js';
import { SENTINEL_PENDING } from '../lib/billing.constants.js';
import { retryWithBackoff } from '../lib/billing.retry.js';

/**
* Treats a stripeSessionId as "unresolved" when absent, empty, or still the
Expand Down Expand Up @@ -311,21 +313,36 @@ const handleCheckoutPaymentCompleted = async (session) => {
const stripe = getStripe();
if (stripe) {
try {
await stripe.paymentIntents.update(paymentIntentId, {
metadata: {
organizationId,
packId,
kind: 'extras',
stripeSessionId, // real cs_* ID (replaces SENTINEL_PENDING)
},
});
await retryWithBackoff(
() =>
stripe.paymentIntents.update(paymentIntentId, {
metadata: {
organizationId,
packId,
kind: 'extras',
stripeSessionId, // real cs_* ID (replaces SENTINEL_PENDING)
},
}),
{ attempts: 3, baseMs: 200 },
);
} catch (err) {
// Log but don't fail — refund correlation may use the backfill resolver path
logger.warn('[billing.webhook] PaymentIntent metadata update failed', {
paymentIntentId,
error: err?.message ?? String(err),
stack: err?.stack,
});
logger.error(
'[billing.webhook] PI metadata backfill failed after retries — refund correlation at risk',
{ paymentIntentId, stripeSessionId, error: err?.message ?? String(err), stack: err?.stack },
);
Comment thread
PierreBrisorgueil marked this conversation as resolved.
try {
await BillingFailedBackfillRepository.record({
paymentIntentId,
stripeSessionId,
error: err?.message ?? String(err),
failedAt: new Date(),
});
} catch (dlqErr) {
logger.error(
'[billing.webhook] dead-letter write failed — manual reconciliation required',
{ paymentIntentId, stripeSessionId, error: dlqErr?.message ?? String(dlqErr), stack: dlqErr?.stack },
);
Comment thread
PierreBrisorgueil marked this conversation as resolved.
}
}
}
}
Expand Down
Loading
Loading