From aa1cd1dbd50e075a6f0ad08055c59be3adc8b3e5 Mon Sep 17 00:00:00 2001 From: Juraj Majerik Date: Fri, 8 May 2026 15:37:11 +0200 Subject: [PATCH] feat(code): suggest experiment opportunities in setup discovery Adds an experiment-category tier to setup discovery, gated by the posthog-code-experiment-suggestions flag. When enabled, the agent scans for A/B-testable surfaces alongside the existing bug and PostHog-instrumentation tiers, and the slot-reservation rule keeps at least one experiment card if a credible candidate exists. Accepted experiment cards build a task prompt that reuses the run-experiment skill template to scaffold flag, draft experiment, and code wiring end-to-end. Defaults off in production; on in dev via import.meta.env.DEV. --- .../src/renderer/features/setup/prompts.ts | 28 +++- .../setup/services/setupRunService.ts | 25 ++- .../code/src/renderer/features/setup/types.ts | 142 ++++++++++-------- .../setup/utils/buildDiscoveredTaskPrompt.ts | 34 +++++ .../features/setup/utils/categoryConfig.ts | 2 + apps/code/src/shared/constants.ts | 2 + apps/code/src/shared/types/analytics.ts | 3 +- 7 files changed, 163 insertions(+), 73 deletions(-) diff --git a/apps/code/src/renderer/features/setup/prompts.ts b/apps/code/src/renderer/features/setup/prompts.ts index a3816fbd4..3f3c7646b 100644 --- a/apps/code/src/renderer/features/setup/prompts.ts +++ b/apps/code/src/renderer/features/setup/prompts.ts @@ -4,7 +4,7 @@ After the integration is wired up, also instrument error tracking and session re Run autonomously with sensible defaults — do not ask the user questions. If the PostHog API key isn't already in the project's env files and you can't read it from the PostHog MCP server, leave a placeholder env var and note it in the PR body rather than blocking.`; -export const DISCOVERY_PROMPT = `You are analyzing this codebase to find the highest-value first tasks for the developer. 
+const DISCOVERY_PROMPT_BASE = `You are analyzing this codebase to find the highest-value first tasks for the developer. Scan the codebase for issues in two tiers. Tier 1 applies to every repo. Tier 2 only applies when PostHog is already installed (look for posthog-js, posthog-node, posthog-react-native or similar PostHog SDK imports). @@ -21,7 +21,22 @@ Scan the codebase for issues in two tiers. Tier 1 applies to every repo. Tier 2 - **Stale feature flags**: Flags that are always evaluated the same way, flags referenced in code but never toggled, flags guarding code that shipped long ago. Category: stale_feature_flag - **Error tracking gaps**: Catch blocks that swallow errors without reporting, missing error boundaries, untracked 5xx responses. Category: error_tracking - **Event tracking improvements**: Key user actions (signup, purchase, invite, upgrade) with no analytics event, events missing useful properties (plan, user role, page context). Category: event_tracking -- **Funnel weak spots**: Multi-step flows (onboarding, checkout, activation) where intermediate steps have no tracking, making drop-off invisible. Category: funnel +- **Funnel weak spots**: Multi-step flows (onboarding, checkout, activation) where intermediate steps have no tracking, making drop-off invisible. Category: funnel`; + +const DISCOVERY_PROMPT_EXPERIMENT_TIER = ` + +## Tier 3 -- Experiment opportunities (only when PostHog SDK is detected) + +- **Experimentable surfaces**: User-facing surfaces where an A/B test would meaningfully inform a product decision — pricing pages, paywalls, primary CTAs, signup/onboarding flows, empty states, recommendation lists, upgrade prompts. Category: experiment + - Title: a one-line hypothesis ("Test 'Get started free' vs 'Sign up' on landing CTA") + - Description: state the hypothesis as a sentence — what you would change and why you think it would move the metric + - Impact: name the primary metric you would measure (e.g. 
"Sign-up conversion on /landing") and what a winning variant would look like + - Recommendation: describe the control and test variants concretely (exact copy, layout change, or behavior), and note any flag wiring required (\`posthog.getFeatureFlag\`) + - Only suggest experiments where: (a) the surface is in code you can point at, (b) the variant is implementable without backend changes you can't see, and (c) the metric is something a typical PostHog event would capture + +If you find at least one credible Tier 3 experiment opportunity, include at least one experiment-category task in your output — even if doing so displaces a lower-impact Tier 1/2 finding. Do not fabricate an experiment to fill the slot: if no credible candidate exists, omit the category entirely.`; + +const DISCOVERY_PROMPT_RULES = ` ## Rules @@ -35,3 +50,12 @@ Scan the codebase for issues in two tiers. Tier 1 applies to every repo. Tier 2 - Maximum 4 tasks. Quality over quantity. When you are done analyzing, call create_output with your findings.`; + +export function buildDiscoveryPrompt({ + includeExperiments, +}: { + includeExperiments: boolean; +}): string { + const middle = includeExperiments ? 
DISCOVERY_PROMPT_EXPERIMENT_TIER : ""; + return `${DISCOVERY_PROMPT_BASE}${middle}${DISCOVERY_PROMPT_RULES}`; +} diff --git a/apps/code/src/renderer/features/setup/services/setupRunService.ts b/apps/code/src/renderer/features/setup/services/setupRunService.ts index 1ae496e36..70c55c33f 100644 --- a/apps/code/src/renderer/features/setup/services/setupRunService.ts +++ b/apps/code/src/renderer/features/setup/services/setupRunService.ts @@ -1,10 +1,10 @@ import { getAuthenticatedClient } from "@features/auth/hooks/authClient"; import { fetchAuthState } from "@features/auth/hooks/authQueries"; -import { DISCOVERY_PROMPT, WIZARD_PROMPT } from "@features/setup/prompts"; +import { buildDiscoveryPrompt, WIZARD_PROMPT } from "@features/setup/prompts"; import { useSetupStore } from "@features/setup/stores/setupStore"; import { + buildTaskDiscoverySchema, type DiscoveredTask, - TASK_DISCOVERY_JSON_SCHEMA, } from "@features/setup/types"; import type { PostHogAPIClient } from "@renderer/api/posthogClient"; import { @@ -12,10 +12,15 @@ import { TaskCreationSaga, } from "@renderer/sagas/task/task-creation"; import { trpcClient } from "@renderer/trpc/client"; +import { EXPERIMENT_SUGGESTIONS_FLAG } from "@shared/constants"; import { isTerminalStatus, type Task } from "@shared/types"; import { ANALYTICS_EVENTS } from "@shared/types/analytics"; import { getCloudUrlFromRegion } from "@shared/utils/urls"; -import { captureException, track } from "@utils/analytics"; +import { + captureException, + isFeatureFlagEnabled, + track, +} from "@utils/analytics"; import { logger } from "@utils/logger"; import { queryClient } from "@utils/queryClient"; import { injectable } from "inversify"; @@ -487,10 +492,16 @@ export class SetupRunService { return; } + const includeExperiments = + isFeatureFlagEnabled(EXPERIMENT_SUGGESTIONS_FLAG) || + import.meta.env.DEV; + const discoveryPrompt = buildDiscoveryPrompt({ includeExperiments }); + const discoverySchema = buildTaskDiscoverySchema({ 
includeExperiments }); + const task = (await client.createTask({ title: "Discover first tasks", - description: DISCOVERY_PROMPT, - json_schema: TASK_DISCOVERY_JSON_SCHEMA as Record<string, unknown>, + description: discoveryPrompt, + json_schema: discoverySchema, })) as unknown as Task; if (abort.signal.aborted) return; @@ -514,14 +525,14 @@ export class SetupRunService { apiHost, projectId, permissionMode: "bypassPermissions", - jsonSchema: TASK_DISCOVERY_JSON_SCHEMA as Record<string, unknown>, + jsonSchema: discoverySchema, }); if (abort.signal.aborted) return; trpcClient.agent.prompt .mutate({ sessionId: taskRun.id, - prompt: [{ type: "text", text: DISCOVERY_PROMPT }], + prompt: [{ type: "text", text: discoveryPrompt }], }) .catch((err) => { log.error("Failed to send discovery prompt", { error: err }); diff --git a/apps/code/src/renderer/features/setup/types.ts b/apps/code/src/renderer/features/setup/types.ts index eb61a4008..15481e625 100644 --- a/apps/code/src/renderer/features/setup/types.ts +++ b/apps/code/src/renderer/features/setup/types.ts @@ -11,76 +11,92 @@ export interface DiscoveredTask { | "stale_feature_flag" | "error_tracking" | "event_tracking" - | "funnel"; + | "funnel" + | "experiment"; file?: string; lineHint?: number; impact?: string; recommendation?: string; } -export const TASK_DISCOVERY_JSON_SCHEMA = { - type: "object", - properties: { - tasks: { - type: "array", - items: { - type: "object", - properties: { - id: { type: "string", description: "A short kebab-case identifier" }, - title: { - type: "string", - description: - "Short, action-oriented header — under 60 characters. No file paths or line numbers.", - }, - description: { - type: "string", - description: - "A clear paragraph (2–4 sentences) describing the problem: what's wrong and the conditions under which it manifests. 
Do NOT include the file path or line number — those go in the file/lineHint fields.", - }, - category: { - type: "string", - enum: [ - "bug", - "security", - "dead_code", - "duplication", - "performance", - "stale_feature_flag", - "error_tracking", - "event_tracking", - "funnel", - ], - }, - file: { - type: "string", - description: "Relative file path where the issue lives", - }, - lineHint: { - type: "integer", - description: "Approximate line number", - }, - impact: { - type: "string", - description: - "Why this matters — concrete impact, blast radius, or risk. 1–3 sentences. Be specific (e.g. 'silently drops auth errors so users see a successful login UI even when backend rejects them').", - }, - recommendation: { - type: "string", - description: - "Suggested approach to fix, in plain prose. 2–4 sentences pointing at the right shape of the fix without writing the patch. Reference any specific functions, types, or files involved.", +const BASE_CATEGORY_ENUM = [ + "bug", + "security", + "dead_code", + "duplication", + "performance", + "stale_feature_flag", + "error_tracking", + "event_tracking", + "funnel", +] as const; + +export function buildTaskDiscoverySchema({ + includeExperiments, +}: { + includeExperiments: boolean; +}): Record<string, unknown> { + const categoryEnum = includeExperiments + ? [...BASE_CATEGORY_ENUM, "experiment"] + : [...BASE_CATEGORY_ENUM]; + + return { + type: "object", + properties: { + tasks: { + type: "array", + items: { + type: "object", + properties: { + id: { + type: "string", + description: "A short kebab-case identifier", + }, + title: { + type: "string", + description: + "Short, action-oriented header — under 60 characters. No file paths or line numbers.", + }, + description: { + type: "string", + description: + "A clear paragraph (2–4 sentences) describing the problem: what's wrong and the conditions under which it manifests. Do NOT include the file path or line number — those go in the file/lineHint fields. 
For experiment-category tasks, state the hypothesis being tested instead of a problem.", + }, + category: { + type: "string", + enum: categoryEnum, + }, + file: { + type: "string", + description: "Relative file path where the issue lives", + }, + lineHint: { + type: "integer", + description: "Approximate line number", + }, + impact: { + type: "string", + description: + "Why this matters — concrete impact, blast radius, or risk. 1–3 sentences. For experiment-category tasks, state the metric you would measure and the outcome a winning variant would produce.", + }, + recommendation: { + type: "string", + description: + "Suggested approach to fix, in plain prose. 2–4 sentences pointing at the right shape of the fix without writing the patch. Reference any specific functions, types, or files involved. For experiment-category tasks, describe the proposed control and test variants concretely.", + }, }, + required: [ + "id", + "title", + "description", + "category", + "impact", + "recommendation", + ], }, - required: [ - "id", - "title", - "description", - "category", - "impact", - "recommendation", - ], + maxItems: 4, }, - maxItems: 4, }, - }, - required: ["tasks"], -} as const; + required: ["tasks"], + }; +} diff --git a/apps/code/src/renderer/features/setup/utils/buildDiscoveredTaskPrompt.ts b/apps/code/src/renderer/features/setup/utils/buildDiscoveredTaskPrompt.ts index 8ec6d0e42..a2c0d1224 100644 --- a/apps/code/src/renderer/features/setup/utils/buildDiscoveredTaskPrompt.ts +++ b/apps/code/src/renderer/features/setup/utils/buildDiscoveredTaskPrompt.ts @@ -1,6 +1,40 @@ import type { DiscoveredTask } from "@features/setup/types"; +import { SKILL_BUTTONS } from "@features/skill-buttons/prompts"; + +function buildExperimentTaskPrompt(task: DiscoveredTask): string { + const sections: string[] = [ + SKILL_BUTTONS["run-experiment"].prompt, + "", + "Use the analysis below as the starting point.", + "", + `Hypothesis: ${task.title}`, + "", + task.description, + ]; + + if 
(task.impact) { + sections.push("", "Primary metric:", task.impact); + } + + if (task.recommendation) { + sections.push("", "Proposed variants:", task.recommendation); + } + + if (task.file) { + const location = task.lineHint + ? `${task.file}:${task.lineHint}` + : task.file; + sections.push("", `Surface: ${location}`); + } + + return sections.join("\n"); +} export function buildDiscoveredTaskPrompt(task: DiscoveredTask): string { + if (task.category === "experiment") { + return buildExperimentTaskPrompt(task); + } + const sections: string[] = [ "Investigate this issue and implement the fix. Open a PR if appropriate.", "", diff --git a/apps/code/src/renderer/features/setup/utils/categoryConfig.ts b/apps/code/src/renderer/features/setup/utils/categoryConfig.ts index b60d96c8b..44807eee5 100644 --- a/apps/code/src/renderer/features/setup/utils/categoryConfig.ts +++ b/apps/code/src/renderer/features/setup/utils/categoryConfig.ts @@ -5,6 +5,7 @@ import { ChartLine, Copy, Flag, + Flask, Funnel, Lightning, Lock, @@ -35,6 +36,7 @@ export const CATEGORY_CONFIG: Record< error_tracking: { icon: Warning, color: "orange", label: "Error tracking" }, event_tracking: { icon: ChartLine, color: "blue", label: "Event tracking" }, funnel: { icon: Funnel, color: "violet", label: "Funnel" }, + experiment: { icon: Flask, color: "violet", label: "Experiment" }, }; // Fallback when a `DiscoveredTask.category` somehow doesn't match the map diff --git a/apps/code/src/shared/constants.ts b/apps/code/src/shared/constants.ts index 2c2a27339..48869ff8f 100644 --- a/apps/code/src/shared/constants.ts +++ b/apps/code/src/shared/constants.ts @@ -1,5 +1,7 @@ export const BILLING_FLAG = "posthog-code-billing"; export const INBOX_GATED_DUE_TO_SCALE_FLAG = "inbox-gated-due-to-scale"; +export const EXPERIMENT_SUGGESTIONS_FLAG = + "posthog-code-experiment-suggestions"; export const BRANCH_PREFIX = "posthog-code/"; export const DATA_DIR = ".posthog-code"; export const WORKTREES_DIR = 
".posthog-code/worktrees"; diff --git a/apps/code/src/shared/types/analytics.ts b/apps/code/src/shared/types/analytics.ts index 589bfd99c..500c75460 100644 --- a/apps/code/src/shared/types/analytics.ts +++ b/apps/code/src/shared/types/analytics.ts @@ -279,7 +279,8 @@ type SetupDiscoveredTaskCategory = | "stale_feature_flag" | "error_tracking" | "event_tracking" - | "funnel"; + | "funnel" + | "experiment"; export interface SetupViewedProperties { discovery_status: "idle" | "running" | "done" | "error";