Shorten the GLM session rate-limit window from 20 hours to 12 hours (#553)

jahooma · web-flow · commit b7c0155716e1 · 2026-04-26T18:17:16.000-07:00
diff --git a/cli/src/components/waiting-room-screen.tsx b/cli/src/components/waiting-room-screen.tsx
@@ -230,7 +230,7 @@ export const WaitingRoomScreen: React.FC<WaitingRoomScreenProps> = ({
                   <span>Elapsed </span>
                   {formatElapsed(elapsedMs)}
                 </text>
-                {/* Per-model session quota (e.g. GLM 5.1 caps at 5/20h). Only
+                {/* Per-model session quota (e.g. GLM 5.1 caps at 5/12h). Only
                     rendered for rate-limited models so the Minimax queue stays
                     clutter-free. */}
                 {session.rateLimit && (
@@ -298,7 +298,7 @@ export const WaitingRoomScreen: React.FC<WaitingRoomScreenProps> = ({
           )}
 
           {/* Per-model session quota exhausted (e.g. 5+ GLM sessions in the
-              last 20h). Terminal for this run — the user can exit and come
+              last 12h). Terminal for this run — the user can exit and come
               back once the oldest session in the window rolls off. */}
           {session?.status === 'rate_limited' && (
             <>
diff --git a/cli/src/hooks/use-freebuff-session.ts b/cli/src/hooks/use-freebuff-session.ts
@@ -101,7 +101,7 @@ async function callSession(
     }
   }
   // 429 from POST is the per-model session-quota reject (e.g. too many GLM
-  // sessions in the last 20h). Terminal for the current poll — the CLI shows
+  // sessions in the last 12h). Terminal for the current poll — the CLI shows
   // a screen explaining the limit and when the user can try again. The 429
   // status (rather than 200) keeps older CLIs in their error path so they
   // back off instead of tight-polling an unrecognized 200 body.
diff --git a/common/src/types/freebuff-session.ts b/common/src/types/freebuff-session.ts
@@ -10,7 +10,7 @@
  * Per-model usage counter surfaced to the CLI so the waiting-room UI can
  * render "N of M sessions used" alongside queue/active state. Present when
  * the joined model has a rate limit applied (today: GLM 5.1 with 5 admits
- * per 20-hour window). `recentCount` is the number of admissions inside
+ * per 12-hour window). `recentCount` is the number of admissions inside
  * `windowHours` at the time the response was produced — see also the
  * standalone `rate_limited` status for the reject path.
  */
@@ -132,7 +132,7 @@ export type FreebuffSessionServerResponse =
     }
   | {
       /** User has used up their per-model admission quota in the rolling
-       *  window (GLM 5.1: 5 one-hour sessions per 20h). Returned from POST
+       *  window (GLM 5.1: 5 one-hour sessions per 12h). Returned from POST
        *  /session before the user is placed in the queue. `retryAfterMs` is
        *  the time until the oldest admission inside the window falls off
        *  and one quota slot opens up — clients should show the user when
diff --git a/packages/internal/src/db/schema.ts b/packages/internal/src/db/schema.ts
@@ -873,7 +873,7 @@ export const freeSession = pgTable(
 
 /**
  * Audit log of every admission — one row per queued→active transition. Used
- * to rate-limit heavy users (e.g. no more than 5 GLM sessions per 20h).
+ * to rate-limit heavy users (e.g. no more than 5 GLM sessions per 12h).
  *
  * Separate from `free_session` because that table is one-row-per-user (state,
  * not history); the UPSERT path there would otherwise destroy prior admissions.
diff --git a/web/src/server/free-session/__tests__/public-api.test.ts b/web/src/server/free-session/__tests__/public-api.test.ts
@@ -323,23 +323,23 @@ describe('requestSession', () => {
     expect(s3.status).toBe('active')
   })
 
-  // Per-user rate limit (5 GLM admissions per 20h) — the wire limit is
+  // Per-user rate limit (5 GLM admissions per 12h) — the wire limit is
   // hard-coded in public-api.ts, so tests seed the fake admit log directly
   // rather than configuring it. GLM also has deployment-hours gating, so
   // these tests bump `now` into the open window (12pm ET on a weekday)
   // before issuing the request.
   const GLM_MODEL = 'z-ai/glm-5.1'
   const GLM_LIMIT = 5
-  const GLM_WINDOW_HOURS = 20
+  const GLM_WINDOW_HOURS = 12
   const GLM_OPEN_TIME = new Date('2026-04-17T16:00:00Z')
 
   test('rate_limited: 5th GLM admit in window blocks the 6th attempt', async () => {
     deps._tick(GLM_OPEN_TIME)
-    // Seed 5 admits inside the 20h window, spaced so we can verify retryAfter
+    // Seed 5 admits inside the 12h window, spaced so we can verify retryAfter
     // points at the oldest one sliding off.
     const now = deps._now()
-    // Oldest: 19h ago (still in window). Next 4: 1h, 2h, 3h, 4h ago.
-    const ages = [19, 4, 3, 2, 1]
+    // Oldest: 11h ago (still in window). Next 4: 1h, 2h, 3h, 4h ago.
+    const ages = [11, 4, 3, 2, 1]
     for (const hoursAgo of ages) {
       deps.admits.push({
         user_id: 'u1',
@@ -359,15 +359,15 @@ describe('requestSession', () => {
     expect(state.limit).toBe(GLM_LIMIT)
     expect(state.windowHours).toBe(GLM_WINDOW_HOURS)
     expect(state.recentCount).toBe(GLM_LIMIT)
-    // Oldest admit is 19h ago; slot opens when it hits 20h, i.e. in 1h.
+    // Oldest admit is 11h ago; slot opens when it hits 12h, i.e. in 1h.
     expect(state.retryAfterMs).toBe(60 * 60 * 1000)
     // Blocked before any row is written — the user doesn't take a queue slot.
     expect(deps.rows.has('u1')).toBe(false)
   })
 
-  test('rate_limited: admits outside the 20h window do not count', async () => {
+  test('rate_limited: admits outside the 12h window do not count', async () => {
     deps._tick(GLM_OPEN_TIME)
-    // 5 admits, each just over 20h old → all fall off the window.
+    // 5 admits, each just over 12h old → all fall off the window.
     const now = deps._now()
     for (let i = 0; i < 5; i++) {
       deps.admits.push({
@@ -446,7 +446,7 @@ describe('requestSession', () => {
     const now = deps._now()
     // Seed 5 prior admits (the cap), with the latest one matching the
     // active row we're about to install.
-    const ages = [19, 4, 3, 2, 0]
+    const ages = [11, 4, 3, 2, 0]
     for (const hoursAgo of ages) {
       deps.admits.push({
         user_id: 'u1',
@@ -527,7 +527,7 @@ describe('requestSession', () => {
     // must be blocked by the quota.
     deps._tick(GLM_OPEN_TIME)
     const now = deps._now()
-    const ages = [19, 4, 3, 2, 1]
+    const ages = [11, 4, 3, 2, 1]
     for (const hoursAgo of ages) {
       deps.admits.push({
         user_id: 'u1',
@@ -660,7 +660,7 @@ describe('getSessionState', () => {
     expect(state.rateLimit).toEqual({
       model: 'z-ai/glm-5.1',
       limit: 5,
-      windowHours: 20,
+      windowHours: 12,
       recentCount: 1,
     })
   })
diff --git a/web/src/server/free-session/public-api.ts b/web/src/server/free-session/public-api.ts
@@ -41,25 +41,28 @@ import type { InternalSessionRow, SessionStateResponse } from './types'
  * queued/active responses — changing them is a deliberate, typed edit.
  */
 const RATE_LIMITS: Record<string, { limit: number; windowHours: number }> = {
-  'z-ai/glm-5.1': { limit: 5, windowHours: 20 },
+  'z-ai/glm-5.1': { limit: 5, windowHours: 12 },
 }
 
 /** Fetch the caller's current quota snapshot for `model`, or undefined if the
  *  model isn't rate-limited. Used by both POST (after admit) and GET polls so
  *  the CLI's "N of M sessions used" line stays live instead of disappearing
- *  after the first poll. Also returns the oldest admit in-window so callers
- *  that need `retryAfterMs` don't have to re-query. */
+ *  after the first poll. Also returns the oldest admit in-window and the
+ *  window duration so callers that need `retryAfterMs` don't have to re-query
+ *  or duplicate the window math. */
 async function fetchRateLimitSnapshot(
   userId: string,
   model: string,
   deps: SessionDeps,
 ): Promise<
-  { info: FreebuffSessionRateLimit; oldest: Date | null } | undefined
+  | { info: FreebuffSessionRateLimit; oldest: Date | null; windowMs: number }
+  | undefined
 > {
   const cfg = RATE_LIMITS[model]
   if (!cfg) return undefined
   const now = nowOf(deps)
-  const since = new Date(now.getTime() - cfg.windowHours * 60 * 60 * 1000)
+  const windowMs = cfg.windowHours * 60 * 60 * 1000
+  const since = new Date(now.getTime() - windowMs)
   const admits = await deps.listRecentAdmits({
     userId,
     model,
@@ -74,6 +77,7 @@ async function fetchRateLimitSnapshot(
       recentCount: admits.length,
     },
     oldest: admits[0] ?? null,
+    windowMs,
   }
 }
 
@@ -271,10 +275,9 @@ export async function requestSession(params: {
     if (snapshot && snapshot.info.recentCount >= snapshot.info.limit) {
       // Oldest admit's window-anniversary is when one slot opens back up.
       // Clamped at 0 so a clock skew can't surface a negative retry-after.
-      const windowMs = snapshot.info.windowHours * 60 * 60 * 1000
       const retryAfterMs = Math.max(
         0,
-        (snapshot.oldest?.getTime() ?? 0) + windowMs - now.getTime(),
+        (snapshot.oldest?.getTime() ?? 0) + snapshot.windowMs - now.getTime(),
       )
       return {
         status: 'rate_limited',
diff --git a/web/src/server/free-session/store.ts b/web/src/server/free-session/store.ts
@@ -436,7 +436,7 @@ export async function promoteQueuedUser(params: {
  * so one query covers both the check and the reject path.
  *
  * Drives the per-user, per-model rate limit (e.g. at most 5 GLM sessions in
- * the last 20h) enforced before `joinOrTakeOver`.
+ * the last 12h) enforced before `joinOrTakeOver`.
  */
 export async function listRecentAdmits(params: {
   userId: string

Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@ async function callSession(`
`101`	`101`	`}`
`102`	`102`	`}`
`103`	`103`	`// 429 from POST is the per-model session-quota reject (e.g. too many GLM`
`104`		`- // sessions in the last 20h). Terminal for the current poll — the CLI shows`
	`104`	`+ // sessions in the last 12h). Terminal for the current poll — the CLI shows`
`105`	`105`	`// a screen explaining the limit and when the user can try again. The 429`
`106`	`106`	`// status (rather than 200) keeps older CLIs in their error path so they`
`107`	`107`	`// back off instead of tight-polling an unrecognized 200 body.`
Original file line number	Diff line number	Diff line change
`@@ -873,7 +873,7 @@ export const freeSession = pgTable(`
`873`	`873`
`874`	`874`	`/**`
`875`	`875`	`* Audit log of every admission — one row per queued→active transition. Used`
`876`		`- * to rate-limit heavy users (e.g. no more than 5 GLM sessions per 20h).`
	`876`	`+ * to rate-limit heavy users (e.g. no more than 5 GLM sessions per 12h).`
`877`	`877`	`*`
`878`	`878`	``* Separate from `free_session` because that table is one-row-per-user (state,``
`879`	`879`	`* not history); the UPSERT path there would otherwise destroy prior admissions.`
Original file line number	Diff line number	Diff line change
`@@ -436,7 +436,7 @@ export async function promoteQueuedUser(params: {`
`436`	`436`	`* so one query covers both the check and the reject path.`
`437`	`437`	`*`
`438`	`438`	`* Drives the per-user, per-model rate limit (e.g. at most 5 GLM sessions in`
`439`		``- * the last 20h) enforced before `joinOrTakeOver`.``
	`439`	``+ * the last 12h) enforced before `joinOrTakeOver`.``
`440`	`440`	`*/`
`441`	`441`	`export async function listRecentAdmits(params: {`
`442`	`442`	`userId: string`