diff --git a/modules/billing/RUNBOOKS.md b/modules/billing/RUNBOOKS.md index e932319c5..3a6bafb05 100644 --- a/modules/billing/RUNBOOKS.md +++ b/modules/billing/RUNBOOKS.md @@ -218,3 +218,47 @@ kubectl exec -n pierreb-projects mongo-0 -- mongosh \ ``` **Prevention:** Lock TTLs are intentionally conservative. If you see frequent stuck-lock incidents, investigate cron duration (slow query? tenant scale?) rather than lower the TTL — a TTL too short defeats the mutual-exclusion guarantee. + +--- + +## 7 — Refund Correlation Backfill Failure + +**Symptom:** ERROR log `[billing.webhook] PI metadata backfill failed after retries — refund correlation at risk`. +A later refund webhook may additionally log `refund unresolved — no stripeSessionId on charge metadata`. + +**Cause:** `stripe.paymentIntents.update` failed for all 3 retry attempts during +`checkout.session.completed` handling. Likely cause: transient Stripe API outage. + +**Triage:** + +1. Query unresolved entries: + + ```bash + db.billing_failed_backfills.find({ resolvedAt: null }) + ``` + + Each document contains `paymentIntentId`, `stripeSessionId`, `error`, and `failedAt`. + +2. For each entry, manually patch the PI via Stripe CLI or Dashboard: + + ```bash + stripe payment_intents update pi_xxx \ + --metadata stripeSessionId=cs_xxx + ``` + + Verify in Stripe Dashboard → Payments → PaymentIntent → Metadata. + +3. Mark the record resolved to close the loop: + + ```bash + db.billing_failed_backfills.updateOne( + { _id: ObjectId('...') }, + { $set: { resolvedAt: new Date(), resolvedBy: 'admin' } } + ) + ``` + +4. Confirm refund correlation: if a refund was already processed while the PI metadata was missing, + check for `billing.refund.unresolved` alerts and follow **Runbook #2** (Dead-Letter Investigation) + to replay the refund event after the PI metadata is patched. + +**Escalate if:** frequency exceeds 1 per week → promote to cron-based auto-retry. diff --git a/modules/billing/lib/billing.retry.js b/modules/billing/lib/billing.retry.js new file mode 100644 index 000000000..a238157bd --- /dev/null +++ b/modules/billing/lib/billing.retry.js @@ -0,0 +1,36 @@ +/** + * Retry an async operation with exponential backoff. + * + * For default opts (attempts=3, baseMs=200), delays are 200ms then 400ms + * (no delay after the final attempt). General formula: baseMs * 2^i for + * each non-final attempt i. + * + * Returns the result of the first successful call, or throws the last + * error after all attempts are exhausted. + * + * @param {() => Promise} fn - Async function to attempt. + * @param {object} [opts] + * @param {number} [opts.attempts=3] - Maximum number of attempts (including the first call). + * @param {number} [opts.baseMs=200] - Base delay in ms for the first retry. + * @returns {Promise} + */ +export async function retryWithBackoff(fn, { attempts = 3, baseMs = 200 } = {}) { + if (!Number.isInteger(attempts) || attempts < 1) { + throw new TypeError(`retryWithBackoff: attempts must be a positive integer, received ${attempts}`); + } + if (!Number.isFinite(baseMs) || baseMs < 0) { + throw new TypeError(`retryWithBackoff: baseMs must be a non-negative finite number, received ${baseMs}`); + } + let lastErr; + for (let i = 0; i < attempts; i++) { + try { + return await fn(); + } catch (err) { + lastErr = err; + if (i < attempts - 1) { + await new Promise((resolve) => setTimeout(resolve, baseMs * 2 ** i)); + } + } + } + throw lastErr; +} diff --git a/modules/billing/models/billing.failedBackfill.model.mongoose.js b/modules/billing/models/billing.failedBackfill.model.mongoose.js new file mode 100644 index 000000000..2d94a887d --- /dev/null +++ b/modules/billing/models/billing.failedBackfill.model.mongoose.js @@ -0,0 +1,94 @@ +/** + * Module dependencies + */ +import mongoose from 'mongoose'; + +const Schema = mongoose.Schema; + +/** + * BillingFailedBackfill Data Model Mongoose + * + * Dead-letter store for PaymentIntent metadata backfill failures. + * Records are written when the refund-correlation backfill (stripe.paymentIntents.update + * in handleCheckoutPaymentCompleted) fails after all retry attempts. + * + * Kept permanently so operators can manually reconcile unresolved entries. + * Never auto-expired — resolvedAt is set by the operator after manual fix. + */ +const BillingFailedBackfillMongoose = new Schema( + { + paymentIntentId: { + type: String, + required: true, + index: true, + }, + stripeSessionId: { + type: String, + required: true, + }, + /** + * Serialised error message from the last failed attempt. + */ + error: { + type: String, + default: null, + }, + /** + * Timestamp of the first failure (when the record was created). + */ + failedAt: { + type: Date, + required: true, + default: () => new Date(), + }, + /** + * Timestamp set by the operator after the PI metadata has been manually patched + * and the refund correlation risk resolved. + */ + resolvedAt: { + type: Date, + default: null, + }, + /** + * Operator tag explaining how the record was resolved. + * E.g. 'admin', 'cron'. + */ + resolvedBy: { + type: String, + default: null, + }, + }, + { + collection: 'billing_failed_backfills', + timestamps: false, + }, +); + +// Partial index — only unresolved documents are indexed, so this stays small +// even after the collection accumulates many resolved entries. +// (Sparse would be a no-op here: resolvedAt has default: null, so every document +// has the field present — sparse skips only docs where the field is absent.) +BillingFailedBackfillMongoose.index( + { resolvedAt: 1 }, + { partialFilterExpression: { resolvedAt: null } }, +); + +/** + * Returns the hex string representation of the document ObjectId. + * @returns {string} Hex string of the ObjectId. + */ +function addID() { + return this._id.toHexString(); +} + +/** + * Model configuration + */ +BillingFailedBackfillMongoose.virtual('id').get(addID); +BillingFailedBackfillMongoose.set('toJSON', { + virtuals: true, +}); + +export const BillingFailedBackfill = + mongoose.models.BillingFailedBackfill ?? + mongoose.model('BillingFailedBackfill', BillingFailedBackfillMongoose); diff --git a/modules/billing/repositories/billing.failedBackfill.repository.js b/modules/billing/repositories/billing.failedBackfill.repository.js new file mode 100644 index 000000000..9144324e6 --- /dev/null +++ b/modules/billing/repositories/billing.failedBackfill.repository.js @@ -0,0 +1,30 @@ +/** + * Module dependencies + */ +import mongoose from 'mongoose'; + +/** + * @function BillingFailedBackfill + * @description Lazily resolves the BillingFailedBackfill Mongoose model. + * Deferred to keep unit tests importable before model registration. + * @returns {import('mongoose').Model} The registered BillingFailedBackfill model. + */ +// biome-ignore lint/correctness/useQwikValidLexicalScope: false positive — Node.js repository, not Qwik +const BillingFailedBackfill = () => mongoose.model('BillingFailedBackfill'); + +/** + * @function record + * @description Write a dead-letter entry for a PaymentIntent metadata backfill failure. + * Called by billing.webhook.service after all retry attempts are exhausted. + * @param {object} opts + * @param {string} opts.paymentIntentId - Stripe PaymentIntent id (pi_*). + * @param {string} opts.stripeSessionId - Stripe checkout session id (cs_*). + * @param {string|null} [opts.error] - Serialised error message from the last failed attempt. + * @param {Date} [opts.failedAt] - Timestamp of the failure (defaults to now). + * @returns {Promise} + */ +// biome-ignore lint/correctness/useQwikValidLexicalScope: false positive — Node.js repository, not Qwik +const record = ({ paymentIntentId, stripeSessionId, error = null, failedAt = new Date() }) => + BillingFailedBackfill().create({ paymentIntentId, stripeSessionId, error, failedAt }); + +export default { record }; diff --git a/modules/billing/services/billing.webhook.service.js b/modules/billing/services/billing.webhook.service.js index 8d1ba64b3..987edc874 100644 --- a/modules/billing/services/billing.webhook.service.js +++ b/modules/billing/services/billing.webhook.service.js @@ -9,10 +9,12 @@ import logger from '../../../lib/services/logger.js'; import SubscriptionRepository from '../repositories/billing.subscription.repository.js'; import ProcessedStripeEventRepository from '../repositories/billing.processedStripeEvent.repository.js'; import OrganizationRepository from '../../organizations/repositories/organizations.repository.js'; +import BillingFailedBackfillRepository from '../repositories/billing.failedBackfill.repository.js'; import BillingExtraService from './billing.extra.service.js'; import BillingResetService from './billing.reset.service.js'; import billingEvents from '../lib/events.js'; import { SENTINEL_PENDING } from '../lib/billing.constants.js'; +import { retryWithBackoff } from '../lib/billing.retry.js'; /** * Treats a stripeSessionId as "unresolved" when absent, empty, or still the @@ -311,21 +313,36 @@ const handleCheckoutPaymentCompleted = async (session) => { const stripe = getStripe(); if (stripe) { try { - await stripe.paymentIntents.update(paymentIntentId, { - metadata: { - organizationId, - packId, - kind: 'extras', - stripeSessionId, // real cs_* ID (replaces SENTINEL_PENDING) - }, - }); + await retryWithBackoff( + () => + stripe.paymentIntents.update(paymentIntentId, { + metadata: { + organizationId, + packId, + kind: 'extras', + stripeSessionId, // real cs_* ID (replaces SENTINEL_PENDING) + }, + }), + { attempts: 3, baseMs: 200 }, + ); } catch (err) { - // Log but don't fail — refund correlation may use the backfill resolver path - logger.warn('[billing.webhook] PaymentIntent metadata update failed', { - paymentIntentId, - error: err?.message ?? String(err), - stack: err?.stack, - }); + logger.error( + '[billing.webhook] PI metadata backfill failed after retries — refund correlation at risk', + { paymentIntentId, stripeSessionId, error: err?.message ?? String(err), stack: err?.stack }, + ); + try { + await BillingFailedBackfillRepository.record({ + paymentIntentId, + stripeSessionId, + error: err?.message ?? String(err), + failedAt: new Date(), + }); + } catch (dlqErr) { + logger.error( + '[billing.webhook] dead-letter write failed — manual reconciliation required', + { paymentIntentId, stripeSessionId, error: dlqErr?.message ?? String(dlqErr), stack: dlqErr?.stack }, + ); + } } } } diff --git a/modules/billing/tests/billing.refund-correlation.unit.tests.js b/modules/billing/tests/billing.refund-correlation.unit.tests.js new file mode 100644 index 000000000..2593e2aad --- /dev/null +++ b/modules/billing/tests/billing.refund-correlation.unit.tests.js @@ -0,0 +1,206 @@ +/** + * Module dependencies. + */ +import { jest, describe, test, beforeEach, afterEach, expect } from '@jest/globals'; +import { retryWithBackoff } from '../lib/billing.retry.js'; + +/** + * Unit tests — checkout.session.completed PaymentIntent metadata backfill retry + * + * Covers: + * - retryWithBackoff: 3 attempts on Stripe API failure before escalating + * - Dead-letter: BillingFailedBackfillRepository.record called after 3 consecutive failures + * - Dead-letter write failure: logger.error emitted, handler does not throw + */ + +describe('checkout.session.completed — metadata backfill retry:', () => { + let BillingWebhookService; + let mockStripeInstance; + let mockExtraService; + let mockLogger; + let mockBillingFailedBackfill; + + const orgId = '507f1f77bcf86cd799439011'; + const stripeSessionId = 'cs_test_session_abc'; + const paymentIntentId = 'pi_test_retry_001'; + + /** + * Build a minimal Stripe checkout.session fixture for backfill tests. + * @returns {{ id: string, payment_status: string, payment_intent: string, metadata: { organizationId: string, packId: string, kind: string } }} + */ + const makeSession = () => ({ + id: stripeSessionId, + payment_status: 'paid', + payment_intent: paymentIntentId, + metadata: { organizationId: orgId, packId: 'pack_500k', kind: 'extras' }, + }); + + beforeEach(async () => { + jest.resetModules(); + // Use fake timers so backoff delays don't slow the suite. + jest.useFakeTimers(); + + mockLogger = { info: jest.fn(), error: jest.fn(), warn: jest.fn(), debug: jest.fn() }; + + mockExtraService = { + creditPack: jest.fn().mockResolvedValue({ doc: {}, applied: true }), + refundPartial: jest.fn(), + }; + + mockStripeInstance = { + paymentIntents: { + update: jest.fn().mockResolvedValue({}), + retrieve: jest.fn().mockResolvedValue({ id: paymentIntentId, metadata: { stripeSessionId } }), + }, + }; + + // BillingFailedBackfillRepository.record() is the repository boundary used by the service. + mockBillingFailedBackfill = { + record: jest.fn().mockResolvedValue({}), + }; + + jest.unstable_mockModule('../lib/stripe.js', () => ({ + default: jest.fn(() => mockStripeInstance), + })); + + jest.unstable_mockModule('../services/billing.extra.service.js', () => ({ + default: mockExtraService, + })); + + jest.unstable_mockModule('../repositories/billing.subscription.repository.js', () => ({ + default: { + findByOrganization: jest.fn(), + findByStripeCustomerId: jest.fn(), + findByStripeSubscriptionId: jest.fn(), + create: jest.fn(), + update: jest.fn(), + updateIfEventNewer: jest.fn().mockResolvedValue(null), + }, + })); + + jest.unstable_mockModule('../repositories/billing.processedStripeEvent.repository.js', () => ({ + default: { + tryRecord: jest.fn().mockResolvedValue({ recorded: true }), + incrementAttempts: jest.fn(), + markDeadLetter: jest.fn(), + deleteByEventId: jest.fn(), + }, + })); + + jest.unstable_mockModule('../../organizations/repositories/organizations.repository.js', () => ({ + default: { setPlan: jest.fn().mockResolvedValue({}) }, + })); + + jest.unstable_mockModule('../services/billing.reset.service.js', () => ({ + default: { resetWeek: jest.fn(), forceRotateForPlanChange: jest.fn() }, + })); + + jest.unstable_mockModule('../lib/events.js', () => ({ + default: { emit: jest.fn() }, + })); + + jest.unstable_mockModule('../../../lib/services/logger.js', () => ({ + default: mockLogger, + })); + + jest.unstable_mockModule('../../../config/index.js', () => ({ + default: { billing: { plans: ['free', 'starter', 'pro', 'enterprise'] } }, + })); + + jest.unstable_mockModule('../repositories/billing.failedBackfill.repository.js', () => ({ + default: mockBillingFailedBackfill, + })); + + const mod = await import('../services/billing.webhook.service.js'); + BillingWebhookService = mod.default; + }); + + afterEach(() => { + jest.useRealTimers(); + jest.restoreAllMocks(); + }); + + test('retries up to 3 times on Stripe API failure before escalating', async () => { + // Fail twice, succeed on the third attempt. + mockStripeInstance.paymentIntents.update + .mockRejectedValueOnce(new Error('Stripe 500')) + .mockRejectedValueOnce(new Error('Stripe 500')) + .mockResolvedValueOnce({}); + + const promise = BillingWebhookService.handleCheckoutPaymentCompleted(makeSession()); + + // Advance fake timers past the two backoff delays (200ms + 400ms). + await jest.runAllTimersAsync(); + + await promise; + + // 3 total calls (2 retries + 1 success) + expect(mockStripeInstance.paymentIntents.update).toHaveBeenCalledTimes(3); + // No dead-letter record created because the third attempt succeeded. + expect(mockBillingFailedBackfill.record).not.toHaveBeenCalled(); + // No error log — outcome was successful. + expect(mockLogger.error).not.toHaveBeenCalled(); + }); + + test('escalates to dead-letter after 3 failed retries', async () => { + const stripeErr = new Error('Stripe persistently down'); + mockStripeInstance.paymentIntents.update.mockRejectedValue(stripeErr); + + const promise = BillingWebhookService.handleCheckoutPaymentCompleted(makeSession()); + + // Advance past all backoff delays. + await jest.runAllTimersAsync(); + + await promise; + + // All 3 attempts exhausted. + expect(mockStripeInstance.paymentIntents.update).toHaveBeenCalledTimes(3); + + // logger.error should be called for the final failure. + expect(mockLogger.error).toHaveBeenCalledWith( + expect.stringContaining('backfill failed after retries'), + expect.objectContaining({ paymentIntentId }), + ); + + // Dead-letter record created with the right shape. + expect(mockBillingFailedBackfill.record).toHaveBeenCalledWith( + expect.objectContaining({ + paymentIntentId, + stripeSessionId, + error: stripeErr.message, + }), + ); + }); + + test('handler does not throw when dead-letter write itself fails', async () => { + mockStripeInstance.paymentIntents.update.mockRejectedValue(new Error('Stripe down')); + mockBillingFailedBackfill.record.mockRejectedValue(new Error('Mongo down')); + + const promise = BillingWebhookService.handleCheckoutPaymentCompleted(makeSession()); + await jest.runAllTimersAsync(); + + // Must not throw even if both the backfill and dead-letter write fail. + await expect(promise).resolves.toBeUndefined(); + + // Both error paths should be logged. + expect(mockLogger.error).toHaveBeenCalledTimes(2); + expect(mockLogger.error).toHaveBeenCalledWith( + expect.stringContaining('backfill failed after retries'), + expect.objectContaining({ paymentIntentId }), + ); + expect(mockLogger.error).toHaveBeenCalledWith( + expect.stringContaining('dead-letter write failed'), + expect.objectContaining({ paymentIntentId }), + ); + }); +}); + +describe('retryWithBackoff guards:', () => { + test('throws TypeError when attempts < 1', async () => { + await expect(retryWithBackoff(async () => 'x', { attempts: 0 })).rejects.toThrow(TypeError); + }); + + test('throws TypeError when baseMs is negative', async () => { + await expect(retryWithBackoff(async () => 'x', { baseMs: -1 })).rejects.toThrow(TypeError); + }); +});