From 7691830d67b3d02f3eefe3bd374e6ecfb6f695c2 Mon Sep 17 00:00:00 2001 From: nullhack Date: Fri, 17 Apr 2026 15:13:08 -0400 Subject: [PATCH 01/12] docs(scope): refactor Step 1 to 3-session discovery structure - AGENTS.md: rewrite Step 1 phases with 3-session model (Ph1+Ph2), move decomposition check to after Ph2 Session 3, add INVEST gate to Phase 3, remove final pre-mortem from Phase 4, update TODO.md reference, fix Step 2 description (remove 'get PO approval') - docs/workflow.md: replace Step 1 diagram with full 3-session flow including gap-finding labels (CIT/Laddering/CI Perspective), template enforcement gates, update Feature File Structure example - scope/SKILL.md (v4.0): add gap-finding techniques section, add active listening protocol (3 levels), rewrite Phase 1+2 around 3 sessions each with template gates and pre-mortems at correct positions, update feature and project-level discovery formats - docs/features/discovery.md: expand to full 3-session template - scope/discovery-template.md: expand to full 3-session template - product-owner.md: update skill description to 3-session model --- .opencode/agents/product-owner.md | 2 +- .opencode/skills/scope/SKILL.md | 241 +++++++++++++------ .opencode/skills/scope/discovery-template.md | 18 +- AGENTS.md | 45 ++-- docs/features/discovery.md | 35 ++- docs/workflow.md | 100 ++++++-- 6 files changed, 317 insertions(+), 124 deletions(-) diff --git a/.opencode/agents/product-owner.md b/.opencode/agents/product-owner.md index d83d128..ed8a8a2 100644 --- a/.opencode/agents/product-owner.md +++ b/.opencode/agents/product-owner.md @@ -58,4 +58,4 @@ When a gap is reported (by developer or reviewer): ## Available Skills - `session-workflow` — session start/end protocol -- `scope` — Step 1: full 4-phase discovery, stories, and criteria protocol +- `scope` — Step 1: 3-session discovery (Phase 1 + 2), stories (Phase 3), and criteria (Phase 4) diff --git a/.opencode/skills/scope/SKILL.md b/.opencode/skills/scope/SKILL.md index 
9464f90..91bb8e2 100644 --- a/.opencode/skills/scope/SKILL.md +++ b/.opencode/skills/scope/SKILL.md @@ -1,7 +1,7 @@ --- name: scope description: Step 1 — discover requirements through stakeholder interviews and write Gherkin acceptance criteria -version: "3.0" +version: "4.0" author: product-owner audience: product-owner workflow: feature-lifecycle @@ -26,19 +26,59 @@ Step 1 has 4 phases: | 3. Stories | PO alone | `Rule:` blocks in the `.feature` file (no Examples) | | 4. Criteria | PO alone | `Example:` blocks with `@id` tags under each `Rule:` | +Each phase produces a template-gated deliverable. A section must be complete and confirmed before the next section unlocks. Template enforcement is the process discipline — not a "baseline" command. + --- -## Phase 1 — Project Discovery +## Gap-Finding Techniques + +Three techniques are applied across all interview sessions to surface what stakeholders have not yet said. Use them during every session, not just at the end. + +### Critical Incident Technique (CIT) — Flanagan 1954 +Ask about a specific past event rather than a general description. Schema-based recall ("usually we...") hides edge cases and workarounds. A concrete incident forces actual memory. + +- "Tell me about a specific time when [X] worked exactly as you needed." +- "Tell me about a specific time when [X] broke down or frustrated you." +- Probe each incident: "What task were you doing? What happened next? What made it effective / ineffective?" + +### Laddering / Means-End Chain — Reynolds & Gutman 1988 +Climb from surface feature to underlying consequence to terminal value. The first answer is rarely the real constraint. + +- "Why is that important to you?" +- "What does that enable?" +- "What would break if that were not available?" +- Stop when the stakeholder reaches a value they cannot explain further. -**When**: Once per project, before any features are scoped. 
+### CI Perspective Change — Fisher & Geiselman 1987 +Ask the stakeholder to describe the same situation from another actor's point of view. Peripheral details and cross-role concerns surface that the primary perspective conceals. -### 1.1 Create Project Discovery Document +- "What do you think the end user experiences in that situation?" +- "What would your team lead's concern be here?" +- "From the perspective of someone encountering this for the first time, what would they need to know?" -Create `docs/features/discovery.md` with Status + Questions only (no Entities table). See the format in the "Discovery Document Formats" section below. +--- + +## Active Listening Protocol + +Three levels of active listening apply throughout every interview session: + +- **Level 1 — Per answer**: immediately paraphrase each answer before moving to the next question. "So if I understand correctly, you're saying that X happens when Y?" Catches misunderstanding in the moment. +- **Level 2 — Per cluster**: brief synthesis when transitioning between topic clusters. "We've covered [area A] and [area B]. Before I ask about [area C], here is what I understood so far: [summary]. Does that capture it?" Confirms completeness, gives stakeholder a recovery point. +- **Level 3 — End of session**: full synthesis of everything discussed. Present to stakeholder for approval. This is the accuracy gate, the baseline signal, and the input to domain modeling. -### 1.2 Ask the 7 Standard Questions +Do not introduce topic labels or categories during active listening. The summary must reflect what the stakeholder said, not new framing that prompts reactions to things they haven't considered. -Present all questions to the stakeholder at once: +--- + +## Phase 1 — Project Discovery + +**When**: Once per project, before any features are scoped. **Skip entirely if `discovery.md` Status is `BASELINED`.** Adding features to an existing project: append new questions to Session 1 and re-fill from there. 
+ +### Session 1 — Individual Scope Elicitation + +**Before the session**: Create `docs/features/discovery.md` using the project-level discovery template. Open to the Session 1 section. + +**Ask the 7 standard questions** (present all at once): 1. **Who** are the users of this product? 2. **What** does the product do at a high level? @@ -48,23 +88,40 @@ Present all questions to the stakeholder at once: 6. **Failure** — what does failure look like? What must never happen? 7. **Out-of-scope** — what are we explicitly not building? -### 1.3 Silent Pre-mortem +**During the session**: Apply Level 1 active listening (paraphrase each answer). Apply CIT, Laddering, and CI Perspective Change per answer to surface gaps. Add new questions to the Questions table as they arise — do not defer to a later session. + +**After the session**: + +1. Write the **Session 1 Synthesis** in `discovery.md`: a 3–5 sentence summary of who the users are, what the product does, why it exists, its success/failure conditions, and explicit out-of-scope boundaries. +2. Present the synthesis to the stakeholder: "Here is my understanding of what you told me — please correct anything that is missing or wrong." +3. Stakeholder confirms or corrects. PO refines until approved. +4. Run a **silent pre-mortem** on the confirmed synthesis: "Imagine we build exactly what was described, ship it, and it fails. What was missing?" Add any discoveries as new questions to the Questions table. +5. Mark `Template §1: CONFIRMED` in `discovery.md`. This unlocks Session 2. + +### Session 2 — Cluster / Big Picture -After receiving answers, run this internally (do not show the stakeholder): +**Before the session**: Review the confirmed Session 1 synthesis. Identify topic clusters (cross-cutting concerns, system-wide constraints, integration points, lifecycle questions). Prepare cluster-level questions. -> "Imagine we build exactly what the stakeholder described, ship it, and it fails. What was missing from their answers?" 
+**During the session**: Apply Level 1 active listening per answer. Apply Level 2 active listening when transitioning between clusters. Apply CIT, Laddering, and CI Perspective Change per cluster. Add new questions in the moment. -Generate targeted follow-up questions from this analysis. Add them to the Questions table in `discovery.md`. +**After the session**: -### 1.4 Follow Up +1. For each cluster, write a **Cluster Summary** in `discovery.md`. +2. Mark `Template §2: CONFIRMED` in `discovery.md`. This unlocks Session 3. -Present all follow-up questions at once. Continue until all questions have status `ANSWERED`. +### Session 3 — Synthesis Approval + Feature Derivation -### 1.5 Baseline +**Before the session**: Produce a **Full Synthesis** across all clusters from Sessions 1 and 2. Write it to `discovery.md`. -When all questions are answered, autonomously set `Status: BASELINED` in `docs/features/discovery.md`. +**During the session**: Present the full synthesis to the stakeholder. "This is my understanding of the full scope. Please correct anything that is missing or wrong." Stakeholder approves or corrects. PO refines until the stakeholder explicitly approves. -From the answers, identify the feature list. For each feature, create `docs/features/backlog/.feature` using the feature file template (discovery section only — no Rules yet). +**After the session** (PO alone): + +1. Domain analysis: extract all nouns (candidate entities) and verbs (candidate operations) from the approved synthesis. +2. Group nouns into subject areas (Bounded Contexts: where the same word means different things, a new context begins). +3. Name each subject area as a feature using FDD "Action object" triples: "Calculate the total of a sale", "Validate the password of a user", "Enroll a student in a seminar". +4. For each feature: create `docs/features/backlog/.feature` using the feature file template (discovery section only — no Rules yet). +5. 
Write `Status: BASELINED (YYYY-MM-DD)` to `discovery.md`. Commit: `feat(discovery): baseline project discovery` @@ -72,56 +129,72 @@ Commit: `feat(discovery): baseline project discovery` ## Phase 2 — Feature Discovery -**When**: Per feature, after project discovery is baselined. +**When**: Per feature, after project discovery is baselined. Each `.feature` file has its own 3-session discovery template in its description. -### 2.1 Derive Questions from Feature Entities +### Session 1 — Individual Entity Elicitation -Open `docs/features/backlog/.feature`. This step happens **before** any stakeholder interaction. +**Before the session**: Open `docs/features/backlog/.feature`. -1. **Populate the Entities table**: Extract nouns (candidate classes/models) and verbs (candidate methods/features) from project discovery answers relevant to this feature. Mark each as in-scope or not. -2. **Generate questions from entities**: For each in-scope entity, ask: +1. **Populate the Entities table**: extract nouns (candidate classes) and verbs (candidate methods) from the project discovery synthesis that are relevant to this feature. Mark each as in-scope or not. +2. **Generate questions from entity gaps**: for each in-scope entity, ask internally: - What are its boundaries and edge cases? - - What happens when it's missing, invalid, or at its limits? - - How does it interact with other entities? -3. **Add questions from gaps**: Questions from areas not covered by project discovery, ambiguities specific to this feature, and boundary conditions. -4. **Silent pre-mortem** (before the first interview round): + - What happens when it is missing, invalid, or at its limits? + - How does it interact with other in-scope entities? +3. Add questions to the Session 1 Questions table. +4. Run a **silent pre-mortem**: "Imagine the developer builds this feature exactly as described, all tests pass, but the feature doesn't work for the user. What would be missing?" 
Add any discoveries as new questions. + +**During the session**: Apply Level 1 active listening per answer. Apply CIT, Laddering, and CI Perspective Change per answer. Add new questions in the moment. + +**After the session**: -> "Imagine the developer builds this feature exactly as described, all tests pass, but the feature doesn't work for the user. What would be missing?" +1. Write the **Session 1 Synthesis** in the `.feature` file: summarize the key entities, their relationships, and the constraints that emerged. +2. Present the synthesis to the stakeholder. Stakeholder confirms or corrects. PO refines until approved. +3. Run a **silent pre-mortem** on the confirmed synthesis. +4. Mark `Template §1: CONFIRMED`. This unlocks Session 2. -Add any discoveries as new questions to the Questions table. +### Session 2 — Cluster / Big Picture for This Feature -### 2.2 Interview +**Before the session**: Review the confirmed Session 1 synthesis. Identify clusters of behavior within this feature (happy paths, error paths, edge cases, lifecycle events, integration points). -Present **all** questions to the stakeholder at once. After receiving answers: +**During the session**: Apply Level 1 active listening per answer. Apply Level 2 active listening when transitioning between clusters. Apply CIT, Laddering, and CI Perspective Change per cluster. -1. Mark answered questions as `ANSWERED` in the Questions table -2. Run a silent pre-mortem on the new answers — generate follow-up questions -3. Present follow-up questions to the stakeholder -4. Repeat until the stakeholder says **"baseline"** to freeze discovery +**After the session**: -### 2.3 Feature Decomposition Check +1. Write **Cluster Summaries** in the `.feature` file. Name each cluster — these names become candidate `Rule:` titles. +2. Mark `Template §2: CONFIRMED`. This unlocks Session 3. -Before moving to Phase 3, check: does this feature span **>2 distinct concerns** OR have **>8 candidate Examples**? 
If yes: +### Session 3 — Feature Synthesis Approval + Story Derivation -1. Split into separate `.feature` files in `backlog/` — each addressing a single cohesive concern -2. Populate the discovery section for each split feature -3. Re-run Phase 2 for any split feature that needs its own discovery +**Before the session**: Produce a **Full Synthesis** of the feature scope, covering all clusters from Sessions 1 and 2. -### 2.4 Baseline +**During the session**: Present the full synthesis to the stakeholder. Stakeholder approves or corrects. PO refines until explicitly approved. -When the stakeholder says "baseline" (and decomposition check passes), set `Status: BASELINED (YYYY-MM-DD)` in the feature file's discovery section. +**After the session** (PO alone): + +1. Map each named cluster from Session 2 to a candidate user story (Rule). +2. Write `Status: BASELINED (YYYY-MM-DD)` to the `.feature` file's discovery section. +3. Mark `Template §3: CONFIRMED`. Commit: `feat(discovery): baseline feature discovery` +### Decomposition Check + +After Session 3, before moving to Phase 3: + +Does this feature span **>2 distinct concerns** OR have **>8 candidate Examples**? + +- **YES** → split into separate `.feature` files in `backlog/`, each addressing a single cohesive concern. Re-run Phase 2 for any split feature that needs its own discovery. +- **NO** → proceed to Phase 3. + --- ## Phase 3 — Stories -**When**: After feature discovery is baselined. PO works alone. +**When**: After feature discovery is baselined and decomposition check passes. PO works alone. ### 3.1 Write Rule Blocks -Add one `Rule:` block per user story to the `.feature` file, after the discovery section. +Clusters from Phase 2 Session 2 → one `Rule:` block per user story. Add after the discovery section in the `.feature` file. 
Each `Rule:` block contains: - The rule title (2-4 words, kebab-friendly) @@ -174,18 +247,15 @@ Commit: `feat(stories): write user stories for ` ### 4.1 Silent Pre-mortem Per Rule -For each `Rule:` block, ask internally: +For each `Rule:` block, ask internally before writing any Examples: > "What observable behaviors must we prove for this Rule to be complete?" +All Rules must have their pre-mortems completed before any Examples are written. + ### 4.2 Write Example Blocks -Add `Example:` blocks under each `Rule:`. Each Example gets an `@id:<8-char-hex>` tag. - -**ID generation**: -```bash -uv run task gen-id -``` +Add `Example:` blocks under each `Rule:`. Each Example gets an `@id:<8-char-hex>` tag (generated with `uv run task gen-id`). **Format** (mandatory): @@ -221,14 +291,12 @@ uv run task gen-id | When I click the Login button | When Bob logs in | | Then I see "Welcome, Bob" on the dashboard | Then Bob sees a personalized welcome | -Write Examples that describe *what happens*, not *how the user clicks through the UI*. Imperative steps couple tests to specific UI layouts and break when the UI changes. - -**MoSCoW triage**: When a Rule spans multiple concerns or has many candidate Examples, ask for each one: is this a **Must** (required for the Rule to be correct), a **Should** (high value but deferrable), or a **Could** (nice-to-have edge case)? If the Rule spans >2 concerns or Musts alone exceed 8, the Rule needs splitting. +**MoSCoW triage**: For each candidate Example, classify as Must (required for the Rule to be correct), Should (high value but deferrable), or Could (nice-to-have edge case). If Musts alone exceed 8 or the Rule spans >2 concerns, split the Rule. 
**Common mistakes to avoid**: -- "Then: It works correctly" (not measurable) -- "Then: The system updates the database and sends an email" (split into two Examples) -- Multiple behaviors in one Example (split them) +- "Then: It works correctly" — not measurable +- "Then: The system updates the database and sends an email" — split into two Examples +- Multiple behaviors in one Example — split them - Examples that test implementation details ("Then: the Strategy pattern is used") - Imperative UI steps instead of declarative behavior descriptions @@ -236,7 +304,7 @@ Write Examples that describe *what happens*, not *how the user clicks through th Before committing: - [ ] Every `Rule:` block has at least one Example -- [ ] Every `@id` is unique within this feature (check: `grep "@id:" docs/features/backlog/.feature`) +- [ ] Every `@id` is unique within this feature - [ ] Every Example has `Given/When/Then` - [ ] Every `Then` is a single, observable, measurable outcome - [ ] No Example tests implementation details @@ -244,15 +312,7 @@ Before committing: - [ ] Each Example is observably distinct from every other - [ ] No single feature file spans multiple unrelated concerns -### 4.4 Final Pre-mortem - -Before committing, one last check: - -> "Imagine the developer builds exactly what these Examples say, all automated tests pass, but the feature doesn't work for the user. What would be missing?" - -Add any discoveries as new Examples. 
- -### 4.5 Commit and Freeze +### 4.4 Commit and Freeze ```bash git add docs/features/backlog/.feature @@ -280,8 +340,8 @@ Feature: Entities: | Type | Name | Candidate Class/Method | In Scope | |------|------|----------------------|----------| - | Noun | Ball | Ball | Yes | - | Verb | Bounce | Ball.bounce() | Yes | + | Noun | Ball | Ball | Yes | + | Verb | Bounce | Ball.bounce() | Yes | Rules (Business): - @@ -289,12 +349,24 @@ Feature: Constraints: - - Questions: + Session 1 — Individual Entity Elicitation: | ID | Question | Answer | Status | |----|----------|--------|--------| | Q1 | ... | ... | OPEN / ANSWERED | + Template §1: CONFIRMED + Synthesis: + + Session 2 — Cluster / Big Picture: + | ID | Question | Answer | Status | + |----|----------|--------|--------| + | Q2 | ... | ... | OPEN / ANSWERED | + Template §2: CONFIRMED + Clusters: + - : - All questions answered. Discovery frozen. + Session 3 — Feature Synthesis: + Synthesis: + Template §3: CONFIRMED — stakeholder approved YYYY-MM-DD Rule: As a @@ -314,22 +386,45 @@ Feature: Then ... ``` -The **Rules (Business)** section captures the business-rule layer: each rule may generate multiple Examples, and identifying rules first prevents redundant or contradictory Examples. +The **Rules (Business)** section captures business rules that hold across multiple Examples. Identifying rules first prevents redundant or contradictory Examples. The **Constraints** section captures non-functional requirements. Testable constraints should become `Example:` blocks with `@id` tags. -### Project-Level Discovery (`docs/features/discovery.md`) +--- + +## Project-Level Discovery (`docs/features/discovery.md`) ```markdown # Discovery: ## State -Status: ELICITING | BASELINED +Status: ELICITING | BASELINED (YYYY-MM-DD) + +## Session 1 — Individual Scope Elicitation -## Questions | ID | Question | Answer | Status | |----|----------|--------|--------| | Q1 | Who are the users? | ... 
| OPEN / ANSWERED | + +Template §1: CONFIRMED +Synthesis: +Pre-mortem: + +## Session 2 — Cluster / Big Picture + +| ID | Question | Answer | Status | +|----|----------|--------|--------| +| Q2 | ... | ... | OPEN / ANSWERED | + +Template §2: CONFIRMED +Clusters: +- : + +## Session 3 — Full Synthesis + +<3–6 paragraph synthesis of all scope, clusters, and boundaries> + +Template §3: CONFIRMED — stakeholder approved YYYY-MM-DD ``` No Entities table at project level. diff --git a/.opencode/skills/scope/discovery-template.md b/.opencode/skills/scope/discovery-template.md index 04329fa..c8e8c90 100644 --- a/.opencode/skills/scope/discovery-template.md +++ b/.opencode/skills/scope/discovery-template.md @@ -12,6 +12,22 @@ Feature: Constraints: - Questions: + Session 1 — Individual Entity Elicitation: | ID | Question | Answer | Status | |----|----------|--------|--------| + + Template §1: PENDING + Synthesis: (fill after stakeholder confirms) + Pre-mortem: (fill after synthesis is confirmed) + + Session 2 — Cluster / Big Picture: + | ID | Question | Answer | Status | + |----|----------|--------|--------| + + Template §2: PENDING + Clusters: + - (fill after all cluster questions are answered) + + Session 3 — Feature Synthesis: + (fill after Sessions 1 and 2 are complete) + Template §3: PENDING diff --git a/AGENTS.md b/AGENTS.md index 15a13d2..f3d493b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -11,7 +11,7 @@ Features flow through 6 steps with a WIP limit of 1 feature at a time. 
The files ``` STEP 1: SCOPE (product-owner) → discovery + Gherkin stories + criteria -STEP 2: ARCH (developer) → design module structure, get PO approval +STEP 2: ARCH (developer) → read all backlog features, design module structure STEP 3: TEST FIRST (developer) → sync stubs, write failing tests STEP 4: IMPLEMENT (developer) → Red-Green-Refactor, commit per green test STEP 5: VERIFY (reviewer) → run all commands, review code @@ -25,7 +25,7 @@ STEP 6: ACCEPT (product-owner) → demo, validate, move folder to compl ## Roles - **Product Owner (PO)** — AI agent. Interviews the stakeholder, writes discovery docs, Gherkin features, and acceptance criteria. Accepts or rejects deliveries. -- **Stakeholder** — Human. Answers PO's questions, provides domain knowledge, says "baseline" when discovery is complete. +- **Stakeholder** — Human. Answers PO's questions, provides domain knowledge, approves PO syntheses to confirm discovery is complete. - **Developer** — AI agent. Architecture, test bodies, implementation, git. Never edits `.feature` files. Escalates spec gaps to PO. - **Reviewer** — AI agent. Adversarial verification. Reports spec gaps to PO. @@ -55,21 +55,28 @@ STEP 6: ACCEPT (product-owner) → demo, validate, move folder to compl ## Step 1 — SCOPE (4 Phases) ### Phase 1 — Project Discovery (once per project) -PO creates `docs/features/discovery.md`. Asks stakeholder 7 standard questions (Who/What/Why/When/Success/Failure/Out-of-scope). Silent pre-mortem generates follow-up questions. All questions presented at once. Autonomous baseline when all questions are answered. PO identifies feature list and creates one `backlog/.feature` file per feature (discovery section only). +PO creates `docs/features/discovery.md` using the 3-session template. **Skip Phase 1 entirely if `discovery.md` Status is BASELINED.** To add features to an existing project: append new questions to Session 1 and re-fill from there. 
+ +- **Session 1** — Individual scope elicitation: 5Ws + Success + Failure + Out-of-scope. Gap-finding per answer using CIT, Laddering, and CI Perspective Change. PO writes synthesis; stakeholder confirms or corrects. PO runs silent pre-mortem on confirmed synthesis. Template §1 must be confirmed before Session 2. +- **Session 2** — Cluster / big picture: questions target clusters and cross-cutting concerns. Gap-finding per cluster. Level 2 synthesis when transitioning between clusters. Template §2 must be complete before Session 3. +- **Session 3** — Synthesis approval + feature derivation: PO produces full synthesis of all clusters; stakeholder approves or corrects (PO refines until approved). Domain analysis: nouns/verbs → subject areas → FDD "Action object" feature names. Create `backlog/.feature` stubs. Write `Status: BASELINED` to `discovery.md`. ### Phase 2 — Feature Discovery (per feature) -PO derives targeted questions from feature entities: extract nouns/verbs from project discovery, populate the Entities table in the feature file description, then generate questions from gaps, ambiguities, and boundary conditions. Silent pre-mortem before the first interview round. Present all questions to the stakeholder at once; iterate with follow-up rounds (pre-mortem after each) until stakeholder says "baseline" to freeze discovery. +Each `.feature` file has its own 3-session discovery template in its description. **Sessions are enforced by the template: each section must be filled before proceeding to the next.** + +- **Session 1** — Individual entity elicitation: populate Entities table from project discovery; generate questions from entity gaps using CIT, Laddering, CI Perspective Change. PO writes synthesis; stakeholder confirms. Silent pre-mortem on confirmed synthesis. +- **Session 2** — Cluster / big picture: questions target clusters of behavior within this feature. Gap-finding per cluster. Level 2 cluster transition summaries. 
+- **Session 3** — Feature synthesis approval + story derivation: PO produces synthesis of feature scope and clusters; stakeholder approves or corrects (PO refines until approved). Clusters become candidate user stories (Rules). Write `Status: BASELINED` to `.feature` discovery section. + +**Decomposition check**: after Session 3, does this feature span >2 distinct concerns OR have >8 candidate Examples? YES → split into separate `.feature` files, re-run Phase 2. NO → proceed. ### Phase 3 — Stories (PO alone) -One `Rule:` block per user story within the feature's `.feature` file. Each `Rule:` has the user story header (`As a / I want / So that`) as its description — no `Example:` blocks yet. Commit: `feat(stories): write user stories for ` +Clusters from Phase 2 Session 2 → one `Rule:` block per user story. Each `Rule:` has the user story header (`As a / I want / So that`) as its description — no `Example:` blocks yet. INVEST gate: all 6 letters must pass. Commit: `feat(stories): write user stories for ` ### Phase 4 — Criteria (PO alone) -Silent pre-mortem per Rule. Write `Example:` blocks with `@id:<8-char-hex>` tags under each `Rule:`. Each Example must be observably distinct. If a single feature spans **>2 distinct concerns** OR has **>8 candidate Examples**, split into separate `.feature` files in `backlog/` before writing Rules. Commit: `feat(criteria): write acceptance criteria for ` - -### Feature Decomposition Threshold -Before moving to Phase 3, check: does this feature span **>2 distinct concerns** OR have **>8 candidate Examples**? If yes, split into separate `.feature` files in `backlog/` before writing Rules. Each feature file should address a single cohesive concern. +Pre-mortem per Rule (all Rules must be checked before writing Examples). Write `Example:` blocks — declarative Given/When/Then, MoSCoW triage (Must/Should/Could) per Example. Review checklist (4.3). 
Commit: `feat(criteria): write acceptance criteria for ` -**Baseline is frozen**: no `.feature` changes after criteria are written. Change = `@deprecated` tag + new Example. +**Criteria are frozen**: no `Example:` changes after commit. Change = `@deprecated` tag + new Example with new `@id`. ## Filesystem Structure @@ -287,23 +294,7 @@ Use `@developer /skill git-release` for the full release process. Every session: load `skill session-workflow`. Read `TODO.md` first, update it at the end. -`TODO.md` is a 15-line bookmark — not a project journal: -```markdown -# Current Work - -Feature: -Step: <1-6> () -Source: docs/features/in-progress/.feature - -## Progress -- [x] `<@id:hex>`: ← done -- [~] `<@id:hex>`: ← in progress -- [ ] `<@id:hex>`: ← next -- [-] `<@id:hex>`: ← cancelled - -## Next - -``` +`TODO.md` is a session bookmark — not a project journal. See `docs/workflow.md` for the full structure including the Cycle State and Self-Declaration blocks used during Step 4. ## Setup diff --git a/docs/features/discovery.md b/docs/features/discovery.md index 16ef362..d7d7c3c 100644 --- a/docs/features/discovery.md +++ b/docs/features/discovery.md @@ -3,6 +3,39 @@ ## State Status: ELICITING -## Questions +--- + +## Session 1 — Individual Scope Elicitation + | ID | Question | Answer | Status | |----|----------|--------|--------| +| Q1 | Who are the users of this product? | | OPEN | +| Q2 | What does the product do at a high level? | | OPEN | +| Q3 | Why does it exist — what problem does it solve? | | OPEN | +| Q4 | When and where is it used (environment, platform, context)? | | OPEN | +| Q5 | How do we know it works? What does "done" look like? | | OPEN | +| Q6 | What does failure look like? What must never happen? | | OPEN | +| Q7 | What are we explicitly not building? 
| | OPEN | + +Template §1: PENDING +Synthesis: (fill after stakeholder confirms answers) +Pre-mortem: (fill after synthesis is confirmed) + +--- + +## Session 2 — Cluster / Big Picture + +| ID | Question | Answer | Status | +|----|----------|--------|--------| + +Template §2: PENDING +Clusters: +- (fill after all cluster questions are answered) + +--- + +## Session 3 — Full Synthesis + +(fill after Sessions 1 and 2 are complete) + +Template §3: PENDING diff --git a/docs/workflow.md b/docs/workflow.md index 6618155..e678593 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -33,28 +33,77 @@ Each step has a designated agent and a specific deliverable. No step is skipped. │ STEP 1 — SCOPE agent: product-owner │ ├─────────────────────────────────────────────────────────────────────┤ │ │ -│ Phase 1 — Project Discovery (once per project) │ -│ PO asks stakeholder 7 questions → silent pre-mortem │ -│ → paraphrase + clarify + summarize → stakeholder confirms │ -│ → baseline docs/features/discovery.md │ -│ → create backlog/.feature stubs (discovery section only) │ +│ Phase 1 — Project Discovery │ +│ [runs ONCE; skip if discovery.md BASELINED] │ +│ [adding features later: append new Qs to Session 1, re-fill] │ │ │ -│ Phase 2 — Feature Discovery (per feature) │ -│ PO populates Entities table in .feature file description │ -│ → generates questions from gaps, ambiguities, boundaries │ -│ → interview rounds → after each round: │ -│ paraphrase + clarify + summarize → stakeholder confirms │ -│ → stakeholder says "baseline" to freeze discovery │ -│ → decomposition check (>2 concerns or >8 examples → split) │ -│ → Status: BASELINED written into .feature file description │ +│ Session 1 — Individual Scope Elicitation │ +│ 5Ws + Success + Failure + Out-of-scope │ +│ Gap-finding per answer: CIT · Laddering · CI Perspective │ +│ [new questions from elucidation added in the moment] │ +│ Level 1: paraphrase each answer on the spot │ +│ → PO writes synthesis → stakeholder confirms or 
corrects │ +│ → PO runs silent pre-mortem on confirmed synthesis │ +│ [template §1: synthesis confirmed → unlocks Session 2] │ +│ │ +│ Session 2 — Cluster / Big Picture │ +│ Questions target clusters and cross-cutting concerns │ +│ Gap-finding per cluster: CIT · Laddering · CI Perspective │ +│ [new questions from elucidation added in the moment] │ +│ Level 1: paraphrase each answer │ +│ Level 2: synthesis when transitioning between clusters │ +│ [template §2: all clusters answered → unlocks Session 3] │ +│ │ +│ Session 3 — Synthesis Approval + Feature Derivation │ +│ PO produces full synthesis across all clustered areas │ +│ → stakeholder approves or corrects; PO refines until approved │ +│ [template §3: approval → unlocks domain analysis] │ +│ Domain analysis: nouns/verbs → subject areas │ +│ Name features (FDD "Action object" / Affinity clusters) │ +│ Create backlog/.feature stubs │ +│ Status: BASELINED written to discovery.md │ +│ │ +│ Phase 2 — Feature Discovery (repeats per feature) │ +│ [each .feature has its own 3-session discovery template] │ +│ │ +│ Session 1 — Individual Entity Elicitation │ +│ Populate Entities table from project discovery │ +│ Gap-finding per answer: CIT · Laddering · CI Perspective │ +│ [new questions from elucidation added in the moment] │ +│ Level 1: paraphrase each answer │ +│ → PO writes synthesis → stakeholder confirms or corrects │ +│ → PO runs silent pre-mortem on confirmed synthesis │ +│ [template §1: synthesis confirmed → unlocks Session 2] │ +│ │ +│ Session 2 — Cluster / Big Picture for this Feature │ +│ Questions target clusters of behavior within this feature │ +│ Gap-finding per cluster: CIT · Laddering · CI Perspective │ +│ [new questions from elucidation added in the moment] │ +│ Level 1: paraphrase · Level 2: cluster transition summaries │ +│ [template §2: all clusters answered → unlocks Session 3] │ +│ │ +│ Session 3 — Feature Synthesis Approval + Story Derivation │ +│ PO produces synthesis of feature scope and 
clusters │ +│ → stakeholder approves or corrects; PO refines until approved │ +│ Clusters → candidate user stories (Rules) │ +│ Status: BASELINED written to .feature discovery section │ +│ [template §3: approval + stories → unlocks decomp check] │ +│ │ +│ DECOMPOSITION CHECK │ +│ >2 distinct concerns OR >8 candidate Examples? │ +│ YES → split into separate .feature files, re-run Phase 2 │ +│ NO → proceed │ │ │ │ Phase 3 — Stories (PO alone) │ -│ Write Rule: blocks with user story headers (no Examples yet) │ +│ Clusters from Phase 2 Session 2 → one Rule: block per story │ +│ INVEST gate: all 6 letters must pass before committing │ │ commit: feat(stories): write user stories for │ │ │ │ Phase 4 — Criteria (PO alone) │ -│ Silent pre-mortem per Rule │ -│ Write @id-tagged Example: blocks under each Rule: │ +│ 4.1 Pre-mortem per Rule (all Rules checked before Examples) │ +│ 4.2 Write @id-tagged Examples (Given/When/Then, declarative) │ +│ MoSCoW triage: Must / Should / Could per Example │ +│ 4.3 Review checklist │ │ commit: feat(criteria): write acceptance criteria for │ │ ★ FROZEN — changes require @deprecated + new Example │ │ │ @@ -175,7 +224,7 @@ Feature: Discovery: - Status: BASELINED (YYYY-MM-DD) + Status: ELICITING | BASELINED (YYYY-MM-DD) Entities: | Type | Name | Candidate Class/Method | In Scope | @@ -186,8 +235,17 @@ Feature: <title> Constraints: - <non-functional requirement> - Questions: + Session 1 — Individual Entity Elicitation: + | ID | Question | Answer | Status | ← OPEN / ANSWERED + Synthesis: <PO synthesis — confirmed by stakeholder> + + Session 2 — Cluster / Big Picture: | ID | Question | Answer | Status | + Clusters: <named topic clusters derived from answers> + + Session 3 — Feature Synthesis: + Synthesis: <full synthesis across clusters> + Approved: YES / NO Architecture: ← added at Step 2 by developer @@ -212,8 +270,8 @@ Feature: <title> ``` Two discovery sources: -- `docs/features/discovery.md` — project-level (Who/What/Why/When, once per 
project) -- Feature file description — per-feature discovery, entities, questions, architecture +- `docs/features/discovery.md` — project-level 3-session discovery (once per project; additive for new features) +- Feature file description — per-feature 3-session discovery, entities, clusters, architecture --- @@ -301,7 +359,7 @@ Phase: RED | GREEN | REFACTOR | SELF-DECLARE | REVIEWER(code-design) | COMMITTED | Role | Type | Responsibilities | |---|---|---| -| **Stakeholder** | Human | Answers questions, provides domain knowledge, says "baseline" | +| **Stakeholder** | Human | Answers questions, provides domain knowledge, approves syntheses | | **Product Owner** | AI agent | Interviews stakeholder, writes `.feature` files, picks features, accepts deliveries | | **Developer** | AI agent | Architecture, tests, code, git, releases | | **Reviewer** | AI agent | Adversarial verification — defaults to REJECTED until proven correct | From 4209dae6888a34d06711a1fa5ddc122544e5ad98 Mon Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Fri, 17 Apr 2026 16:00:22 -0400 Subject: [PATCH 02/12] feat(arch): add design-patterns skill and update Step 2 architecture diagram MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add .opencode/skills/design-patterns/SKILL.md with GoF pattern catalogue, smell-triggered patterns (8 with Python before/after), SOLID, Object Calisthenics, Python Zen, Law of Demeter, CQS, Tell Don't Ask - Update docs/workflow.md Step 2 box: prerequisites, domain analysis, pre-mortem, typed signatures in module structure, informative signature note, architecture smell check as hard gate (9 items incl. 
pattern smells) - Update docs/workflow.md Step 4 SELF-DECLARE: 24-item first-person format - Update docs/workflow.md TODO.md structure: first-person declaration format - Update implementation/SKILL.md: self-declaration format changed from checkboxes to 'As a developer I declare [rule] — YES | file:line', add 3 design pattern smell items, update reviewer template to match - Update AGENTS.md: add design-patterns skill row (developer, steps 2+4) - Update academic_research.md: entry 42 Hexagonal Architecture (Cockburn 2005), renumber old 42-49 to 43-50, add bibliography entries 65-66 --- .opencode/skills/design-patterns/SKILL.md | 572 ++++++++++++++++++++++ .opencode/skills/implementation/SKILL.md | 88 ++-- AGENTS.md | 1 + docs/academic_research.md | 379 ++++++++++++++ docs/workflow.md | 124 +++-- 5 files changed, 1098 insertions(+), 66 deletions(-) create mode 100644 .opencode/skills/design-patterns/SKILL.md diff --git a/.opencode/skills/design-patterns/SKILL.md b/.opencode/skills/design-patterns/SKILL.md new file mode 100644 index 0000000..917a83a --- /dev/null +++ b/.opencode/skills/design-patterns/SKILL.md @@ -0,0 +1,572 @@ +--- +name: design-patterns +description: Reference skill for GoF design patterns, SOLID, Object Calisthenics, Python Zen, and other SE principles — with smell triggers and Python before/after examples +version: "1.0" +author: developer +audience: developer +workflow: feature-lifecycle +--- + +# Design Patterns Reference + +Load this skill when: +- Running the architecture smell check in Step 2 and a smell is detected +- Refactoring in Step 4 and a pattern smell appears in the self-declaration + +--- + +## How to Use This Skill + +1. **Identify the smell** from the checklist in your self-declaration or architecture check +2. **Find the smell category** below (Creational / Structural / Behavioral) +3. **Read the trigger and the before/after example** +4. 
**Apply the pattern** and update the Architecture section (Step 2) or the refactored code (Step 4) + +--- + +## GoF Pattern Catalogue — One-Liner Reference + +### Creational +| Pattern | Intent | +|---|---| +| **Factory Method** | Delegate object creation to a subclass or factory function | +| **Abstract Factory** | Create families of related objects without specifying concrete classes | +| **Builder** | Construct complex objects step-by-step, separating construction from representation | +| **Prototype** | Clone existing objects instead of creating new ones from scratch | +| **Singleton** | Ensure a class has only one instance (use sparingly — prefer dependency injection) | + +### Structural +| Pattern | Intent | +|---|---| +| **Adapter** | Wrap an incompatible interface to match an expected interface | +| **Bridge** | Separate abstraction from implementation so both can vary independently | +| **Composite** | Treat individual objects and compositions uniformly via a shared interface | +| **Decorator** | Add responsibilities to an object dynamically without subclassing | +| **Facade** | Provide a simplified interface to a complex subsystem | +| **Flyweight** | Share fine-grained objects to reduce memory when many similar instances are needed | +| **Proxy** | Control access to an object via a surrogate (lazy init, access control, logging) | + +### Behavioral +| Pattern | Intent | +|---|---| +| **Chain of Responsibility** | Pass a request along a chain of handlers until one handles it | +| **Command** | Encapsulate a request as an object, enabling undo/redo and queuing | +| **Interpreter** | Define a grammar and an interpreter for a language | +| **Iterator** | Provide sequential access to elements without exposing the underlying structure | +| **Mediator** | Centralize complex communication between objects through a mediator object | +| **Memento** | Capture and restore object state without violating encapsulation | +| **Observer** | Define a one-to-many dependency 
so dependents are notified automatically | +| **State** | Allow an object to alter its behavior when its internal state changes | +| **Strategy** | Define a family of algorithms, encapsulate each, and make them interchangeable | +| **Template Method** | Define the skeleton of an algorithm; let subclasses fill in specific steps | +| **Visitor** | Separate an algorithm from the object structure it operates on | + +--- + +## Smell-Triggered Patterns — Python Examples + +### Creational Smells + +--- + +#### Smell: Scattered Object Construction +**Signal**: The same object is constructed in 3+ places with slightly different arguments, or construction logic is duplicated across callers. + +**Pattern**: Factory Method or Factory Function + +```python +# BEFORE — scattered construction +# in order_service.py +order = Order(id=uuid4(), status="pending", created_at=datetime.now()) + +# in test_order.py +order = Order(id=UUID("abc..."), status="pending", created_at=datetime(2026, 1, 1)) + +# in import_service.py +order = Order(id=uuid4(), status="pending", created_at=datetime.now()) +``` + +```python +# AFTER — factory function owns construction +def make_order( + *, + order_id: OrderId | None = None, + clock: Callable[[], datetime] = datetime.now, +) -> Order: + return Order( + id=order_id or OrderId(uuid4()), + status=OrderStatus.PENDING, + created_at=clock(), + ) +``` + +--- + +#### Smell: Multi-Step Construction with Optional Parts +**Signal**: An object requires several setup calls before it is valid. Callers must remember the correct sequence. 
+ +**Pattern**: Builder + +```python +# BEFORE — callers must know the correct build sequence +report = Report() +report.set_title("Q4 Sales") +report.add_section(summary) +report.add_section(detail) +report.set_footer("Confidential") +# easy to forget a step or get the order wrong +``` + +```python +# AFTER — builder enforces sequence and provides defaults +report = ( + ReportBuilder("Q4 Sales") + .with_section(summary) + .with_section(detail) + .with_footer("Confidential") + .build() +) +``` + +--- + +### Structural Smells + +--- + +#### Smell: Type-Switching (if/elif on type or status) +**Signal**: A function or method contains `if isinstance(x, A): ... elif isinstance(x, B): ...` or `if x.type == "a": ... elif x.type == "b": ...`. Adding a new type requires editing this function. + +**Pattern**: Strategy (behavior varies) or Visitor (operation varies over a fixed structure) + +```python +# BEFORE — type switch must be updated for every new discount type +def apply_discount(order: Order, discount_type: str) -> Money: + if discount_type == "percentage": + return order.total * (1 - order.rate) + elif discount_type == "fixed": + return order.total - order.amount + elif discount_type == "bogo": + return order.total - (order.total / 2) + else: + raise ValueError(discount_type) +``` + +```python +# AFTER — Strategy: each discount is a callable, closed to modification +class DiscountStrategy(Protocol): + def apply(self, order: Order) -> Money: ... + +@dataclass +class PercentageDiscount: + rate: Decimal + def apply(self, order: Order) -> Money: + return order.total * (1 - self.rate) + +@dataclass +class FixedDiscount: + amount: Money + def apply(self, order: Order) -> Money: + return order.total - self.amount + +def apply_discount(order: Order, strategy: DiscountStrategy) -> Money: + return strategy.apply(order) +``` + +--- + +#### Smell: Feature Envy +**Signal**: A method in class A uses data from class B more than its own data. The method "envies" class B. 
+ +**Pattern**: Move Method to the envied class (not a GoF pattern — a Fowler refactoring that often precedes Strategy or Command) + +```python +# BEFORE — OrderPrinter knows too much about Order internals +class OrderPrinter: + def format_total(self, order: Order) -> str: + subtotal = sum(item.price * item.quantity for item in order.items) + tax = subtotal * order.tax_rate + return f"{subtotal + tax:.2f}" +``` + +```python +# AFTER — total belongs on Order +@dataclass +class Order: + items: list[LineItem] + tax_rate: Decimal + + def total(self) -> Money: + subtotal = sum(item.subtotal() for item in self.items) + return subtotal * (1 + self.tax_rate) + +class OrderPrinter: + def format_total(self, order: Order) -> str: + return f"{order.total():.2f}" +``` + +--- + +#### Smell: Parallel Inheritance Hierarchies +**Signal**: Every time you add a subclass to hierarchy A, you must also add a corresponding subclass to hierarchy B. The two trees grow in lockstep. + +**Pattern**: Bridge + +```python +# BEFORE — adding a new Shape requires a new renderer subclass too +class Shape: ... +class Circle(Shape): ... +class Square(Shape): ... + +class SVGCircle(Circle): ... +class SVGSquare(Square): ... +class PNGCircle(Circle): ... +class PNGSquare(Square): ... +``` + +```python +# AFTER — Bridge separates shape from renderer +class Renderer(Protocol): + def render_circle(self, radius: float) -> None: ... + def render_square(self, side: float) -> None: ... + +@dataclass +class Circle: + radius: float + renderer: Renderer + def draw(self) -> None: + self.renderer.render_circle(self.radius) + +@dataclass +class Square: + side: float + renderer: Renderer + def draw(self) -> None: + self.renderer.render_square(self.side) +``` + +--- + +### Behavioral Smells + +--- + +#### Smell: Large State Machine in One Class +**Signal**: A class has a `status` or `state` field, and many methods begin with `if self.state == X: ... elif self.state == Y: ...`. 
Adding a new state requires editing all these methods. + +**Pattern**: State + +```python +# BEFORE — Order methods all branch on status +class Order: + def confirm(self) -> None: + if self.status == "pending": + self.status = "confirmed" + else: + raise InvalidTransition(self.status, "confirm") + + def ship(self) -> None: + if self.status == "confirmed": + self.status = "shipped" + else: + raise InvalidTransition(self.status, "ship") +``` + +```python +# AFTER — each state owns its own transitions +class OrderState(Protocol): + def confirm(self, order: Order) -> None: ... + def ship(self, order: Order) -> None: ... + +class PendingState: + def confirm(self, order: Order) -> None: + order.state = ConfirmedState() + def ship(self, order: Order) -> None: + raise InvalidTransition("pending", "ship") + +class ConfirmedState: + def confirm(self, order: Order) -> None: + raise InvalidTransition("confirmed", "confirm") + def ship(self, order: Order) -> None: + order.state = ShippedState() + +@dataclass +class Order: + state: OrderState = field(default_factory=PendingState) + def confirm(self) -> None: self.state.confirm(self) + def ship(self) -> None: self.state.ship(self) +``` + +--- + +#### Smell: Scattered Notification / Event Fan-Out +**Signal**: When something happens in class A, it directly calls methods on classes B, C, and D. Adding a new listener requires modifying class A. + +**Pattern**: Observer + +```python +# BEFORE — Order directly notifies every downstream system +class Order: + def confirm(self) -> None: + self.status = "confirmed" + EmailService().send_confirmation(self) # direct coupling + InventoryService().reserve(self) # direct coupling + AnalyticsService().record_conversion(self) # direct coupling +``` + +```python +# AFTER — Order emits an event; listeners register independently +class OrderConfirmedListener(Protocol): + def on_order_confirmed(self, order: Order) -> None: ... 
+ +@dataclass +class Order: + _listeners: list[OrderConfirmedListener] = field(default_factory=list) + + def add_listener(self, listener: OrderConfirmedListener) -> None: + self._listeners.append(listener) + + def confirm(self) -> None: + self.status = OrderStatus.CONFIRMED + for listener in self._listeners: + listener.on_order_confirmed(self) +``` + +--- + +#### Smell: Repeated Algorithm Skeleton +**Signal**: Two or more functions share the same high-level structure (setup → process → teardown) but differ only in one or two steps. The structure is copied rather than shared. + +**Pattern**: Template Method + +```python +# BEFORE — CSV and JSON importers duplicate the pipeline structure +def import_csv(path: Path) -> list[Record]: + raw = path.read_text() + rows = parse_csv(raw) # varies + records = [validate(r) for r in rows] + save_all(records) + return records + +def import_json(path: Path) -> list[Record]: + raw = path.read_text() + rows = parse_json(raw) # varies + records = [validate(r) for r in rows] + save_all(records) + return records +``` + +```python +# AFTER — Template Method: skeleton in base, varying step overridden +class Importer(ABC): + def run(self, path: Path) -> list[Record]: + raw = path.read_text() + rows = self.parse(raw) # hook + records = [validate(r) for r in rows] + save_all(records) + return records + + @abstractmethod + def parse(self, raw: str) -> list[dict]: ... + +class CsvImporter(Importer): + def parse(self, raw: str) -> list[dict]: + return parse_csv(raw) + +class JsonImporter(Importer): + def parse(self, raw: str) -> list[dict]: + return parse_json(raw) +``` + +--- + +## SOLID — Python Examples + +### S — Single Responsibility +One class, one reason to change. + +```python +# WRONG — Report handles both data and formatting +class Report: + def generate(self) -> dict: ... + def to_pdf(self) -> bytes: ... # separate concern + def to_csv(self) -> str: ... 
# separate concern + +# RIGHT — split concerns +class Report: + def generate(self) -> ReportData: ... + +class PdfRenderer: + def render(self, data: ReportData) -> bytes: ... +``` + +### O — Open/Closed +Open for extension, closed for modification. + +```python +# WRONG — must edit this function to add a new format +def export(data: ReportData, fmt: str) -> bytes: + if fmt == "pdf": ... + elif fmt == "csv": ... + +# RIGHT — new formats extend without touching existing code +class Exporter(Protocol): + def export(self, data: ReportData) -> bytes: ... + +class PdfExporter: + def export(self, data: ReportData) -> bytes: ... +``` + +### L — Liskov Substitution +Subtypes must be fully substitutable for their base type. + +```python +# WRONG — ReadOnlyFile violates the contract of File +class File: + def write(self, content: str) -> None: ... + +class ReadOnlyFile(File): + def write(self, content: str) -> None: + raise PermissionError # narrows the contract — LSP violation + +# RIGHT — separate interfaces for readable and writable +class ReadableFile(Protocol): + def read(self) -> str: ... + +class WritableFile(Protocol): + def write(self, content: str) -> None: ... +``` + +### I — Interface Segregation +No implementor should be forced to implement methods it doesn't use. + +```python +# WRONG — Printer is forced to implement scan() and fax() +class Machine(Protocol): + def print(self, doc: Document) -> None: ... + def scan(self, doc: Document) -> None: ... + def fax(self, doc: Document) -> None: ... + +# RIGHT — each capability is its own Protocol +class Printer(Protocol): + def print(self, doc: Document) -> None: ... + +class Scanner(Protocol): + def scan(self, doc: Document) -> None: ... +``` + +### D — Dependency Inversion +Domain depends on abstractions (Protocols), not on concrete I/O or frameworks. 
+ +```python +# WRONG — domain imports infrastructure directly +from app.db import PostgresConnection + +class OrderRepository: + def __init__(self) -> None: + self.db = PostgresConnection() # domain imports infra + +# RIGHT — domain defines the Protocol; infra implements it +class OrderRepository(Protocol): + def find(self, order_id: OrderId) -> Order: ... + def save(self, order: Order) -> None: ... + +class PostgresOrderRepository: # in adapters/ + def find(self, order_id: OrderId) -> Order: ... + def save(self, order: Order) -> None: ... +``` + +--- + +## Object Calisthenics — Python Rules + +Jeff Bay's 9 rules for object-oriented discipline. Each has a Python signal. + +| Rule | Constraint | Python Signal of Violation | +|---|---|---| +| **OC-1** | One indent level per method | `for` inside `if` inside a method body | +| **OC-2** | No `else` after `return` | `if cond: return x \n else: return y` | +| **OC-3** | Wrap all primitives that have domain meaning | `def process(user_id: int)` instead of `def process(user_id: UserId)` | +| **OC-4** | Wrap all collections that have domain meaning | `list[Order]` passed around instead of `OrderCollection` | +| **OC-5** | One dot per line | `obj.repo.find(id).name` | +| **OC-6** | No abbreviations | `usr`, `mgr`, `cfg`, `val`, `tmp` | +| **OC-7** | Keep classes small (≤50 lines) and methods short (≤20 lines) | Any method requiring scrolling | +| **OC-8** | No class with more than 2 instance variables | `__init__` with 3+ `self.x =` assignments | +| **OC-9** | No getters/setters | `def get_name(self)` / `def set_name(self, v)` | + +--- + +## Python Zen — Mapped to Code Practices + +The relevant items from PEP 20 (`import this`) with concrete code implications: + +| Zen Item | Code Practice | +|---|---| +| Beautiful is better than ugly | Name things clearly; prefer named types over bare primitives | +| Explicit is better than implicit | Explicit return types; explicit Protocol dependencies; no magic | +| Simple is better 
than complex | KISS — one function, one job; prefer a plain function over a class | +| Complex is better than complicated | A well-designed abstraction is acceptable; an accidental tangle is not | +| Flat is better than nested | OC-1 — one indent level; early returns | +| Sparse is better than dense | One statement per line; no semicolons; no lambda chains | +| Readability counts | OC-6 — no abbreviations; docstrings on every public function | +| Special cases aren't special enough to break the rules | Do not add `if isinstance` branches to avoid refactoring | +| Errors should never pass silently | No bare `except:`; no `except Exception: pass` | +| In the face of ambiguity, refuse the temptation to guess | Raise on invalid input; never silently return a default | +| There should be one obvious way to do it | DRY — every shared concept in exactly one place | +| If the implementation is hard to explain, it's a bad idea | KISS — if you can't describe the function in one sentence, split it | + +--- + +## Other Principles + +### Law of Demeter (Tell, Don't Ask) +A method should only call methods on: +- `self` +- Objects passed as parameters +- Objects it creates +- Direct component objects (`self.x`) + +**Violation signal**: `a.b.c()` — two dots. Assign `b = a.b` and call `b.c()`, or better: ask `a` to do what you need (`a.do_thing()`). + +### Command-Query Separation (CQS) +A method either **changes state** (command) or **returns a value** (query) — never both. + +```python +# WRONG — pop() both returns and mutates +value = stack.pop() + +# RIGHT (CQS strict) +value = stack.peek() # query — no mutation +stack.remove_top() # command — no return value +``` + +Note: Python's standard library violates CQS in places (`list.pop()`, `dict.update()`). Apply CQS to your domain objects; do not fight the stdlib. + +### Tell, Don't Ask +Instead of querying an object's state and acting on it externally, tell the object to do the work itself. 
+ +```python +# WRONG — ask state, decide externally +if order.status == OrderStatus.PENDING: + order.status = OrderStatus.CONFIRMED + +# RIGHT — tell the object +order.confirm() # Order decides if the transition is valid +``` + +--- + +## Quick Smell → Pattern Lookup + +| Smell | Pattern | +|---|---| +| Same object constructed in 3+ places | Factory Method / Factory Function | +| Multi-step setup before object is valid | Builder | +| `if type == X: ... elif type == Y:` | Strategy | +| Method uses another class's data more than its own | Move Method (Fowler) | +| Two class hierarchies that grow in lockstep | Bridge | +| `if self.state == X:` in multiple methods | State | +| Class directly calls B, C, D on state change | Observer | +| Two functions share the same skeleton, differ in one step | Template Method | +| Subsystem is complex and callers need a simple entry point | Facade | +| Object needs logging/caching without changing its class | Decorator / Proxy | diff --git a/.opencode/skills/implementation/SKILL.md b/.opencode/skills/implementation/SKILL.md index 34bbb2e..60a4c92 100644 --- a/.opencode/skills/implementation/SKILL.md +++ b/.opencode/skills/implementation/SKILL.md @@ -203,34 +203,41 @@ Update `## Cycle State` Phase: `REFACTOR` After refactor is complete and `test-fast` passes, write the self-declaration **into `TODO.md`** under a `## Self-Declaration` block (replacing any prior one), then request the reviewer check. The reviewer will read `TODO.md` directly — do not paste the checklist into a separate message. +If a pattern smell was detected during refactor, load `skill design-patterns` to identify and apply the correct pattern before writing this block. 
+ **Write this block into `TODO.md` now, filling in every item before requesting review:** ```markdown ## Self-Declaration (@id:<hex>) -- [ ] YAGNI-1: No abstractions beyond current AC — `file:line` -- [ ] YAGNI-2: No speculative parameters or flags for hypothetical future use — `file:line` -- [ ] KISS-1: Every function has one job, describable in one sentence without "and" — `file:line` -- [ ] KISS-2: No unnecessary indirection, wrapper layers, or complexity — `file:line` -- [ ] DRY-1: No logic block duplicated across two or more locations — `file:line` -- [ ] DRY-2: Every shared concept extracted to exactly one place — `file:line` -- [ ] SOLID-S: Each class/function has one reason to change — `file:line` -- [ ] SOLID-O: New behavior added by extension, no existing class body edited — `file:line` or N/A -- [ ] SOLID-L: Every subtype fully substitutable; no narrowed contract or surprise raise — `file:line` or N/A -- [ ] SOLID-I: No Protocol/ABC forces an implementor to leave a method as `...` or raise — `file:line` or N/A -- [ ] SOLID-D: Domain classes depend on Protocols, not on I/O or framework imports directly — `file:line` -- [ ] OC-1: Max one indent level per method; inner blocks extracted to named helpers — deepest: `file:line` -- [ ] OC-2: No `else` after `return`; all branches return early and the happy path is flat — `file:line` or N/A -- [ ] OC-3: No bare `int`/`str`/`float` as domain concepts in public signatures; each wrapped in a named type — `file:line` or N/A -- [ ] OC-4: No bare `list[X]`/`set[X]` as domain values; each wrapped in a named collection class — `file:line` or N/A -- [ ] OC-5: No `a.b.c()` chains; each dot navigation step assigned to a named local — `file:line` or N/A -- [ ] OC-6: No abbreviations anywhere; every name is a full word readable without context — `file:line` or N/A -- [ ] OC-7: Every function ≤ 20 lines, every class ≤ 50 lines — longest: `file:line` -- [ ] OC-8: Every class has ≤ 2 `self.x` in `__init__`; if > 2 before this 
cycle, name the new value object extracted and cite `file:line` per class -- [ ] OC-9: No `get_x()`/`set_x()` pairs; state changes via commands, queries return values — `file:line` or N/A -- [ ] Semantic: test Given/When/Then operates at the same abstraction level as the AC — `file:line` +As a developer I declare this code follows YAGNI-1 (no abstractions beyond current AC) — YES | `file:line` +As a developer I declare this code follows YAGNI-2 (no speculative parameters or flags) — YES | `file:line` +As a developer I declare this code follows KISS-1 (every function has one job) — YES | `file:line` +As a developer I declare this code follows KISS-2 (no unnecessary indirection) — YES | `file:line` +As a developer I declare this code follows DRY-1 (no duplicated logic) — YES | `file:line` +As a developer I declare this code follows DRY-2 (every shared concept in one place) — YES | `file:line` +As a developer I declare this code follows SOLID-S (one reason to change) — YES | `file:line` +As a developer I declare this code follows SOLID-O (extension not modification) — YES | `file:line` or N/A | reason +As a developer I declare this code follows SOLID-L (subtypes fully substitutable) — YES | `file:line` or N/A | reason +As a developer I declare this code follows SOLID-I (no forced stub methods) — YES | `file:line` or N/A | reason +As a developer I declare this code follows SOLID-D (domain depends on Protocols) — YES | `file:line` +As a developer I declare this code follows OC-1 (max one indent level per method) — YES | deepest: `file:line` +As a developer I declare this code follows OC-2 (no else after return) — YES | `file:line` or N/A | reason +As a developer I declare this code follows OC-3 (no bare primitives as domain concepts) — YES | `file:line` or N/A | reason +As a developer I declare this code follows OC-4 (no bare collections as domain values) — YES | `file:line` or N/A | reason +As a developer I declare this code follows OC-5 (no chained dot navigation) — 
YES | `file:line` or N/A | reason +As a developer I declare this code follows OC-6 (no abbreviations) — YES | `file:line` or N/A | reason +As a developer I declare this code follows OC-7 (functions ≤20 lines, classes ≤50 lines) — YES | longest: `file:line` +As a developer I declare this code follows OC-8 (≤2 instance variables per class) — YES | `file:line` +As a developer I declare this code follows OC-9 (no getters/setters) — YES | `file:line` or N/A | reason +As a developer I declare this code has no missing Creational pattern (no smell: repeated construction or scattered instantiation) — YES | `file:line` or N/A | reason +As a developer I declare this code has no missing Structural pattern (no smell: feature envy or parallel conditionals on type) — YES | `file:line` or N/A | reason +As a developer I declare this code has no missing Behavioral pattern (no smell: large state machine, scattered notification, or repeated algorithm skeleton) — YES | `file:line` or N/A | reason +As a developer I declare test abstraction matches AC level (semantic alignment) — YES | `file:line` ``` -*For every item: check the box AND cite `file:line` evidence, or write `N/A` with a one-line reason. An unchecked box or missing evidence is an automatic REJECTED.* +**A `NO` answer is not an automatic rejection** — it is a flag. State the reason inline and fix before requesting review. Do not submit a self-declaration with a `NO` item unresolved. + +*For every item: provide `file:line` evidence, or write `N/A` with a one-line reason. A missing answer or missing evidence is an automatic REJECTED.* Update `## Cycle State` Phase: `SELF-DECLARE` @@ -245,7 +252,7 @@ The reviewer is scoped to **code design only** (not full Step 5): **What the reviewer receives**: The developer's completed `## Self-Declaration` block in `TODO.md`, with `file:line` evidence for each rule. -**What the reviewer does**: Independently inspects the actual code for each rule the developer claimed compliant. 
The self-declaration is an audit target — the reviewer verifies claims, not just reads them. +**What the reviewer does**: Independently inspects the actual code for each rule the developer claimed `YES`. The self-declaration is an audit target — the reviewer verifies YES claims, not just reads them. The reviewer does NOT re-audit items the developer already flagged as N/A with a reason. **What the reviewer does NOT check** (deferred to Step 5): - Lint compliance @@ -260,17 +267,30 @@ The reviewer responds using this template: | Rule | Developer Claims | Reviewer Verdict | Evidence | |------|-----------------|------------------|----------| -| YAGNI | <summary> | PASS / FAIL | `file:line` or N/A | -| KISS | <summary> | PASS / FAIL | `file:line` or N/A | -| DRY | <summary> | PASS / FAIL | `file:line` or N/A | -| SOLID-S | <summary> | PASS / FAIL | `file:line` or N/A | -| SOLID-O | <summary> | PASS / FAIL | `file:line` or N/A | -| SOLID-L | <summary> | PASS / FAIL | `file:line` or N/A | -| SOLID-I | <summary> | PASS / FAIL | `file:line` or N/A | -| SOLID-D | <summary> | PASS / FAIL | `file:line` or N/A | -| OC-1 thru OC-9 | <summary> | PASS / FAIL | `file:line` or N/A | -| Design patterns | <summary> | PASS / FAIL | `file:line` or N/A | -| Semantic alignment | <summary> | PASS / FAIL | `file:line` or N/A | +| YAGNI-1 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| YAGNI-2 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| KISS-1 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| KISS-2 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| DRY-1 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| DRY-2 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| SOLID-S | YES / N/A | PASS / FAIL | `file:line` or N/A | +| SOLID-O | YES / N/A | PASS / FAIL | `file:line` or N/A | +| SOLID-L | YES / N/A | PASS / FAIL | `file:line` or N/A | +| SOLID-I | YES / N/A | PASS / FAIL | `file:line` or N/A | +| SOLID-D | YES / N/A | PASS / FAIL | `file:line` or N/A | +| OC-1 | YES / N/A | 
PASS / FAIL | `file:line` or N/A | +| OC-2 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| OC-3 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| OC-4 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| OC-5 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| OC-6 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| OC-7 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| OC-8 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| OC-9 | YES / N/A | PASS / FAIL | `file:line` or N/A | +| Creational pattern | YES / N/A | PASS / FAIL | `file:line` or N/A | +| Structural pattern | YES / N/A | PASS / FAIL | `file:line` or N/A | +| Behavioral pattern | YES / N/A | PASS / FAIL | `file:line` or N/A | +| Semantic alignment | YES / N/A | PASS / FAIL | `file:line` or N/A | Decision: APPROVED / REJECTED ``` diff --git a/AGENTS.md b/AGENTS.md index f3d493b..e4f1399 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -44,6 +44,7 @@ STEP 6: ACCEPT (product-owner) → demo, validate, move folder to compl | `scope` | product-owner | 1 | | `tdd` | developer | 3 | | `implementation` | developer | 4 | +| `design-patterns` | developer | 2 (on-demand, if smell detected), 4 (refactor) | | `verify` | reviewer | 5 | | `code-quality` | developer | pre-handoff (redirects to `verify`) | | `pr-management` | developer | 6 | diff --git a/docs/academic_research.md b/docs/academic_research.md index 5117907..b0c3591 100644 --- a/docs/academic_research.md +++ b/docs/academic_research.md @@ -432,6 +432,352 @@ This document explains the cognitive and social-science mechanisms that justify --- +### 28a. Active Listening — Three-Level Structure and Level 3 Uses (Synthesis) + +| | | +|---|---| +| **Source** | Synthesis of: Nielsen, J. (2010). *Interviewing Users*. Nielsen Norman Group. + Farrell, S. (2017). UX Research Cheat Sheet. NN/G. + Ambler, S. W. (2002). *Agile Modeling*. Wiley (agilemodeling.com). + Wynne, M. (2015). Introducing Example Mapping. Cucumber Blog. 
| +| **Date** | 2010–2017 (synthesis) | +| **URL** | https://www.nngroup.com/articles/interviewing-users/ ; https://www.agilemodeling.com/essays/fdd.htm ; https://cucumber.io/blog/bdd/example-mapping-introduction/ | +| **Alternative** | — | +| **Status** | Synthesized rule of thumb — each component individually confirmed; the three-level structure is a practitioner synthesis | +| **Core finding** | Active listening in requirements interviews operates at three granularities simultaneously, not as a single end-of-interview act: **Level 1** (per answer) — immediate paraphrase to catch misunderstanding on the spot; **Level 2** (per topic cluster) — transition summary before moving to the next area, acting as a recovery point; **Level 3** (end of interview) — full synthesis, which serves four distinct downstream purposes. | +| **Mechanism** | Each level addresses a different failure mode. Level 1 prevents individual answer misreads from propagating. Level 2 prevents topic-cluster drift and allows mid-interview correction. Level 3 crystallizes scope and triggers the formal baseline. Without the level structure, practitioners collapse all three into a single end-of-interview summary, which is too late for Level 1 and 2 misunderstandings to be caught cheaply. | +| **Level 3 — four uses** | 1. **Accuracy gate** (NN/G): stakeholder confirms or corrects the summary before it is used downstream — prevents misread requirements from being frozen. 2. **Scope crystallization** (Ambler/FDD): the summary answers "what problems must this system solve?" and becomes the initial requirements stack. 3. **Input to domain modeling** (Ambler/FDD): nouns and verbs extracted from the Level 3 summary are the raw material for the Entities table — domain analysis cannot begin before this summary exists. 4. **Baseline trigger** (Wynne/Cucumber Example Mapping): when the stakeholder says "yes, that's right" to the summary, discovery is considered complete and frozen. 
| +| **Where used** | Phase 1 and Phase 2 of `scope/SKILL.md`: PO applies Level 1 during each exchange, Level 2 when transitioning between topic areas, and Level 3 at the end of each interview phase before proceeding to feature stubs (Phase 1) or user stories (Phase 2). | + +--- + +### 29. The Kipling Method — Five Ws and One H + +| | | +|---|---| +| **Source** | Kipling, R. (1902). *Just So Stories*. Macmillan. | +| **Date** | 1902 | +| **URL** | — | +| **Alternative** | Hermagoras of Temnos (2nd century BCE) — seven circumstances of rhetoric; Thomas Wilson (1560) — "The Arte of Rhetoric"; Aristotle's Nicomachean Ethics | +| **Status** | Practitioner synthesis — journalism, business analysis, and investigative methodology | +| **Core finding** | The six interrogative questions (Who, What, When, Where, Why, How) form a complete framework for gathering all essential facts about any event or situation. No single question can be answered with a simple yes/no. Together they ensure completeness and prevent gaps in understanding. | +| **Mechanism** | The framework originated in ancient Greek rhetoric (Aristotle's "elements of circumstance"), was formalized in 16th-century English rhetoric (Wilson), popularized by Kipling's 1902 poem calling them "six honest serving-men," and became standard in journalism by 1917. The BA community adapted it to requirements gathering by adding "How" as the sixth question, creating the 5W1H framework used in business analysis today. | +| **Where used** | Phase 1 project discovery: the initial seven questions (Who, What, Why, When, Where, Success, Failure, Out-of-scope) are an adaptation of the 5W1H framework. "Success" maps to "Why" (purpose), "Failure" maps to constraints, "Out-of-scope" defines project boundaries. | + +--- + +### 30. BA Requirements Question Framework + +| | | +|---|---| +| **Source** | Brandenburg, L. (2025). *Requirements Discovery Checklist Pack*. TechCanvass. 
| +| **Date** | 2025 | +| **URL** | https://businessanalyst.techcanvass.com/requirements-gathering-questions-for-ba/ | +| **Alternative** | Sherwen (2025). "10 Questions to Consider During Requirements Gathering."; Practical Analyst (2024). "Requirements Elicitation: Most Valuable Questions." | +| **Status** | Practitioner synthesis — consolidated BA methodology, not peer-reviewed | +| **Core finding** | Ten questions consistently make the most difference in requirements elicitation: (1) What problem are we solving? (2) What happens if we do nothing? (3) Who uses this? (4) What does success look like? (5) Walk me through how this works today (6) Where does this usually break? (7) What decisions will this help? (8) What should definitely not happen? (9) What happens if input is wrong? (10) What assumptions are we making? | +| **Mechanism** | The first four questions define scope and purpose. Questions 5-6 probe current state and pain points. Questions 7-8 identify business value and constraints. Questions 9-10 surface edge cases and hidden assumptions. This sequence ensures negative requirements (what should NOT happen) are captured, which often contain the most important business rules. | +| **Where used** | Phase 1 project discovery: the "Success" question maps to "What does success look like?" (question 4), "Failure" maps to "What should definitely not happen?" (question 8), "Out-of-scope" maps to boundary-setting from the 10-question framework. | + +--- + +### 31. Domain-Driven Design — Bounded Contexts and Feature Identification + +| | | +|---|---| +| **Source** | Evans, E. (2003). *Domain-Driven Design: Tackling Complexity in the Heart of Software*. Addison-Wesley. | +| **Date** | 2003 | +| **URL** | — | +| **Alternative** | Context Mapper (2025). Rapid Object-Oriented Analysis and Design. 
https://contextmapper.org/docs/rapid-ooad | +| **Status** | Confirmed — foundational DDD literature | +| **Core finding** | A Bounded Context is a boundary within which a particular ubiquitous language is consistent. Features are identified by grouping related user stories that share the same language. Features can be decomposed into subdomains, and subdomains can be grouped into Bounded Contexts. The decomposition criterion is "single responsibility per context" + "consistency of language." | +| **Mechanism** | In DDD: (1) Extract ubiquitous language from requirements → (2) Group by language consistency → (3) Each group is a candidate bounded context → (4) Each bounded context maps to a feature. Context Mapper automates this: User Stories → Subdomains (via noun/verb extraction) → Bounded Contexts of type FEATURE. | +| **Where used** | Phase 1: after feature list identification, verify each feature has consistent language. Phase 2: noun/verb extraction from project discovery answers populates the Entities table, which is the DDD candidate model. The "Rules (Business)" section captures the ubiquitous language rules that govern each feature. | + +--- + +### 32. Object Calisthenics — Nine Rules + +| | | +|---|---| +| **Source** | Bay, J. "Object Calisthenics." *The Thoughtworks Anthology* (PragProg, 2008). Original in IEEE Software/DevX, ~2005. | +| **Date** | ~2005 | +| **URL** | https://www.bennadel.com/resources/uploads/2012/objectcalisthenics.pdf | +| **Alternative** | — | +| **Status** | Practitioner synthesis | +| **Core finding** | 9 rules to internalize OOP: (1) One level indentation per method, (2) No ELSE, (3) Wrap primitives/Strings, (4) First class collections, (5) One dot per line, (6) No abbreviations, (7) Classes ≤50 lines, (8) ≤2 instance variables, (9) No getters/setters. 7 of 9 enforce data encapsulation; 1 drives polymorphism; 1 drives naming. | +| **Mechanism** | Restrictions force decomposition. 
When you cannot use getters, behavior must move into the object. When you cannot use ELSE, you use polymorphism. When classes must be ≤2 ivars, you discover missing abstractions. | +| **Where used** | Refactor phase in `implementation/SKILL.md`: rule checklist with PASS/FAIL per rule. | + +--- + +### 33. Refactoring + +| | | +|---|---| +| **Source** | Fowler, M. (1999/2018). *Refactoring: Improving the Design of Existing Code* (2nd ed.). Addison-Wesley. | +| **Date** | 1999, 2018 | +| **URL** | https://martinfowler.com/books/refactoring.html | +| **Alternative** | — | +| **Status** | Confirmed — foundational | +| **Core finding** | Refactoring = behavior-preserving transformations. 68 catalogued refactorings, each small enough to do safely but cumulative effect significant. Code smells (duplicate code, long methods, feature envy) indicate refactoring opportunities. | +| **Mechanism** | Small steps reduce risk. Each refactoring is reversible. Test suite validates behavior unchanged. | +| **Where used** | Refactor phase in `implementation/SKILL.md`: smell detection triggers refactoring. | + +--- + +### 34. Design Patterns + +| | | +|---|---| +| **Source** | Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1995). *Design Patterns: Elements of Reusable Object-Oriented Software*. Addison-Wesley. | +| **Date** | 1995 | +| **URL** | — | +| **Alternative** | — | +| **Status** | Confirmed — foundational | +| **Core finding** | 23 patterns catalogued in 3 categories: Creational (5), Structural (7), Behavioral (11). Key principles: "Favor composition over inheritance," "Program to an interface, not an implementation." | +| **Mechanism** | Patterns are recurring solutions to common problems. Named and catalogued so developers don't rediscover them. | +| **Where used** | Refactor phase: when ObjCal rules fail, patterns provide alternative structure. | + +--- + +### 35. SOLID Principles + +| | | +|---|---| +| **Source** | Martin, R. C. (2000). "Principles of OOD." 
*ButUncleBob.com*. Acronym coined by Michael Feathers (2004). | +| **Date** | 2000 | +| **URL** | https://blog.interface-solv.com/wp-content/uploads/2020/07/Principles-Of-OOD.pdf | +| **Alternative** | — | +| **Status** | Confirmed | +| **Core finding** | S: One reason to change. O: Open extension, closed modification. L: Subtypes substitutable. I: No forced stub methods. D: Depend on abstractions, not concretes. | +| **Mechanism** | Each principle targets a specific coupling failure mode. Together they produce low coupling, high cohesion. | +| **Where used** | Refactor self-check table in `implementation/SKILL.md`: 5-row SOLID table with PASS/FAIL. | + +--- + +### 36. QDIR — Bad-Smells + OO Metrics Prioritization + +| | | +|---|---| +| **Source** | Malhotra, R., Singh, P. (2020). "Exploiting bad-smells and object-oriented characteristics to prioritize classes for refactoring." *Int. J. Syst. Assur. Eng. Manag.* 11(Suppl 2), 133–144. Springer. | +| **Date** | 2020 | +| **URL** | https://doi.org/10.1007/s13198-020-01001-x | +| **Alternative** | — | +| **Status** | Confirmed — empirical | +| **Core finding** | QDIR (Quality Depreciation Index Rule) combines bad-smell severity with OO metrics (LOC, WMC, CBO, RFC, DIT) to prioritize classes for refactoring. Validated on 8 open-source Java systems. | +| **Mechanism** | Classes with high smell severity AND high OO metrics are prioritized. QDIR = weighted sum. | +| **Where used** | Refactor prioritization in Step 4: when smell detected, check OO metrics to prioritize. | + +--- + +### 37. Smells + Architectural Refactoring + +| | | +|---|---| +| **Source** | Silva, C. et al. (2020). "When Are Smells Indicators of Architectural Refactoring Opportunities." *Proc. 28th Int. Conf. on Program Comprehension*. ACM. | +| **Date** | 2020 | +| **URL** | https://doi.org/10.1145/3387904.3389276 | +| **Alternative** | — | +| **Status** | Confirmed — empirical | +| **Core finding** | Study of 50 projects, 52,667 refactored elements. 
67.53% of smells co-occur. Smells that co-occur are indicators of architectural refactoring in 88.53% of cases. | +| **Mechanism** | Single smells are often code-level; co-occurring smells indicate architectural problems. Pattern catalog for smells→specific architectural refactorings. | +| **Where used** | Smell detection triggers architectural analysis when co-occurrence patterns detected. | + +--- + +### 38. SPIRIT Tool — Code Smell Prioritization + +| | | +|---|---| +| **Source** | Vidal, S. A., Marcos, C., Díaz-Pace, J. A. (2014). "An Approach to Prioritize Code Smells for Refactoring." *Automated Software Engineering*, 23(3), 501–532. Carleton University/Springer. | +| **Date** | 2014 | +| **URL** | https://doi.org/10.1007/s10515-014-0175-x | +| **Alternative** | — | +| **Status** | Confirmed — tool | +| **Core finding** | SPIRIT (Smart Identification of Refactoring opportunITies) prioritizes smells by 3 criteria: (1) component stability, (2) impact on modifiability scenarios, (3) smell relevance. Top-ranked smells correlate with expert developer judgment. | +| **Mechanism** | Semi-automated ranking. Combines version history (stable vs. unstable), impact analysis, and smell type. | +| **Where used** | Refactor prioritization: stability = has the class changed recently? Unstable + smelly = prioritize. | + +--- + +### 39. Bad Engineering Properties of OOP + +| | | +|---|---| +| **Source** | Cardelli, L. (1996). "Bad Engineering Properties of Object-Oriented Languages." *ACM Computing Surveys*, 28(4), 150. 
| +| **Date** | 1996 | +| **URL** | https://www.microsoft.com/en-us/research/publication/bad-engineering-properties-of-object-oriented-languages/ | +| **Alternative** | — | +| **Status** | Confirmed — foundational critique | +| **Core finding** | OOP has 5 "economy" problems: (1) Execution (virtual methods prevent inlining), (2) Compilation (no code/interface separation), (3) Small-scale dev (expressive type systems missing), (4) Large-scale dev (poor class extension/modification), (5) Language features (baroque complexity). | +| **Mechanism** | OOP is not universally superior. Trade-offs exist. Knowing these helps avoid over-engineering. | +| **Where used** | Anti-pre-pattern: know when OOP adds complexity vs. value. Feedback item 2 rationale. | + +--- + +### 40. Code Complexity Model of OOP + +| | | +|---|---| +| **Source** | Aluthwaththage, J. H., Thathsarani, H. A. N. N. (2024). "A Novel OO-Based Code Complexity Metric." *Proc. Future Technologies Conference (FTC)*, 616–628. Springer/IEEE. | +| **Date** | 2024 | +| **URL** | https://link.springer.com/chapter/10.1007/978-3-031-73125-9_39 | +| **Alternative** | Misra et al. (2024). "A Suite of Object Oriented Cognitive Complexity Metrics." IEEE. | +| **Status** | Partially confirmed — recent | +| **Core finding** | CWC (Combined Weighted Complexity) measures OOP complexity at statement level, considering 8 factors: nesting depth, control types, compound conditions, try-catch, threads, pointers, references, dynamic memory. Addresses gap in existing metrics ignoring cognitive load. | +| **Mechanism** | Granular complexity scoring. Higher scores indicate more cognitively demanding code. | +| **Where used** | Complexity measurement in Step 4 refactor: when function >20 lines, compute CWC-style granular score. | + +--- + +### 41. Metric Thresholds for Smell Detection + +| | | +|---|---| +| **Source** | Bigonha, M. A. S., et al. (2019). 
"The usefulness of software metric thresholds for detection of bad smells and fault prediction." *Information and Software Technology*, 115, 79–92. | +| **Date** | 2019 | +| **URL** | https://doi.org/10.1016/j.infsof.2019.08.005 | +| **Alternative** | Catal et al. (2018). "Software metrics thresholds calculation techniques." Info. Softw. Technol. | +| **Status** | Confirmed | +| **Core finding** | Metric thresholds (e.g., LOC > 600) used for smell detection are unreliable. Study on 92 open-source systems found precision too low for practical use. Neither heuristic-based (DECOR) nor ML approaches achieve acceptable accuracy. ROC Curves best of 3 threshold techniques but still insufficient alone. | +| **Mechanism** | Fixed thresholds are context-dependent. Thresholds should be project-specific, not universal. | +| **Where used** | Anti-pre-pattern: do not rely on fixed thresholds. Use co-occurrence patterns (Entry 37) instead. | + +--- + +### 42. Hexagonal Architecture — Ports and Adapters + +| | | +|---|---| +| **Source** | Cockburn, A. (2005). "Hexagonal Architecture." *alistair.cockburn.us*. https://alistair.cockburn.us/hexagonal-architecture/ | +| **Date** | 2005 | +| **URL** | https://alistair.cockburn.us/hexagonal-architecture/ | +| **Alternative** | Freeman, S., & Pryce, N. (2009). *Growing Object-Oriented Software, Guided by Tests*. Addison-Wesley. (Chapter 7: "Ports and Adapters") | +| **Status** | Confirmed — foundational; widely adopted as Clean Architecture, Onion Architecture | +| **Core finding** | The application domain should have no knowledge of external systems (databases, filesystems, network, UI). All contact between the domain and the outside world passes through a **port** (an interface / Protocol) and an **adapter** (a concrete implementation of that port). This makes the domain independently testable without any infrastructure. 
The key structural rule: dependency arrows point inward — domain code never imports from adapters; adapters import from domain. | +| **Mechanism** | Two distinct sides of any application: the "driving side" (actors who initiate action — tests, UI, CLI) and the "driven side" (actors the application drives — databases, filesystems, external services). Each driven-side dependency is hidden behind a port. Tests supply a test adapter; production supplies a real adapter. Substituting adapters requires no domain code changes. This is what SOLID-D ("depend on abstractions") looks like at the architectural layer — not just at the class level. | +| **Where used** | Step 2 (Architecture): every external dependency identified during domain analysis must be assigned a port (Protocol) and a concrete adapter. Module structure always includes `<package>/adapters/<dep>.py` alongside `<package>/domain/`. The `adapters/` layer is decided at Step 2, not discovered during Step 4 refactoring. | + +--- + +### 43. Feature-Driven Development — Domain Modeling to Feature List + +| | | +|---|---| +| **Source** | Ambler, S. W. (2002). *Agile Modeling: Effective Practices for eXtreme Programming and the Unified Process*. Wiley. Supplemented by: agilemodeling.com — "Feature Driven Development and Agile Modeling." | +| **Date** | 2002 | +| **URL** | https://www.agilemodeling.com/essays/fdd.htm | +| **Alternative** | Palmer, S. R., & Felsing, J. M. (2002). *A Practical Guide to Feature-Driven Development*. Prentice Hall. | +| **Status** | Confirmed | +| **Core finding** | FDD requires domain modeling *before* feature naming. Features are expressed as "Action result object" triples (e.g., "Enroll a student in a seminar"). Features group into Feature Sets (shared domain object), which group into Subject Areas. 78% of organisations doing Agile also do initial high-level agile requirements modeling; 85% find it worthwhile. 
| +| **Mechanism** | Domain modeling extracts the vocabulary (nouns = candidate classes, verbs = candidate methods). Feature identification then asks: "what verbs act on each noun?" This produces a list of small, deliverable units that are coherent with the domain rather than reflecting technical or organisational boundaries. | +| **Where used** | Phase 1 of `scope/SKILL.md`: after the interview summary is confirmed, PO performs domain analysis (nouns/verbs → subject areas → FDD "Action object" feature names) before creating `.feature` file stubs. | + +--- + +### 44. Affinity Mapping / KJ Method — Bottom-Up Feature Identification + +| | | +|---|---| +| **Source** | Krause, R., & Pernice, K. (2024). Affinity Diagramming for Collaboratively Sorting UX Findings and Design Ideas. *Nielsen Norman Group*. https://www.nngroup.com/articles/affinity-diagram/ | +| **Date** | 2024 (method origin: Kawakita, J., 1960s) | +| **URL** | https://www.nngroup.com/articles/affinity-diagram/ | +| **Alternative** | Kawakita, J. (1967). *Abduction*. Chuokoronsha (KJ Method original). | +| **Status** | Confirmed | +| **Core finding** | Affinity diagramming (KJ Method) groups raw observations/requirements into clusters by bottom-up similarity — no categories are named until grouping is complete. This prevents confirmation bias from top-down pre-labelling. Each named cluster becomes a candidate feature. Dot voting on clusters produces a prioritized feature list. Small clusters must not be discarded — they often represent minority concerns or genuinely novel features. | +| **Mechanism** | Bottom-up category emergence: when categories are not imposed in advance, the grouping reflects actual similarity in the data rather than the analyst's prior mental model. NN/G: "the journey is more important than the destination — the discussions that occurred while building the diagram are more impactful than the final format." 
| +| **Where used** | Phase 1 of `scope/SKILL.md` (alternative to FDD domain modeling): PO uses affinity mapping on interview answers to derive feature clusters before creating `.feature` stubs. Best suited when working from interview transcripts solo rather than with a cross-silo team. | + +--- + +### 45. Event Storming — Domain Events to Functional Areas + +| | | +|---|---| +| **Source** | Brandolini, A. (2013–present). *EventStorming*. Leanpub / eventstorming.com. https://eventstorming.com | +| **Date** | 2013 | +| **URL** | https://eventstorming.com; Bourgau, P. (2017). Detailed Agenda of a DDD Big Picture Event Storming. https://philippe.bourgau.net/detailed-agenda-of-a-ddd-big-picture-event-storming-part-1/ | +| **Alternative** | Brandolini, A. (2021). *Introducing EventStorming*. Leanpub. | +| **Status** | Confirmed | +| **Core finding** | Event Storming is a collaborative workshop where domain experts place past-tense domain events on a timeline. Sorting the events creates natural Functional Area clusters — these are candidate feature groups / Subject Areas. The workshop also produces Ubiquitous Language (shared vocabulary), a Problem Inventory (open questions), and Actor roles (for user story "As a [role]" parts). It does NOT produce Gherkin directly; its output feeds into Example Mapping per story. | +| **Mechanism** | Temporal sequencing of domain events forces resolution of conflicting mental models across organisational silos. Clusters emerge from shared vocabulary and causal proximity — not from the facilitator's prior structure. Bourgau: "Although nobody understands Bounded Context from the start, everyone gets Functional Area." | +| **Where used** | Optional alternative in Phase 1 of `scope/SKILL.md` for cross-silo discovery. Best suited when multiple stakeholders from different departments need to build shared understanding. 
Outputs (Functional Areas + Ubiquitous Language) map directly to Subject Areas (feature groups) and the Entities table in `.feature` file discovery sections. | + +--- + +### 46. Critical Incident Technique — Gap-Finding via Past Events + +| | | +|---|---| +| **Source** | Flanagan, J. C. (1954). "The critical incident technique." *Psychological Bulletin*, 51(4), 327–357. | +| **Date** | 1954 | +| **URL** | https://doi.org/10.1037/h0061470 | +| **Alternative** | Rosala, M. (2020). The Critical Incident Technique in UX. *Nielsen Norman Group*. https://www.nngroup.com/articles/critical-incident-technique/ | +| **Status** | Confirmed — foundational; ~200 follow-on empirical studies in marketing alone (Gremler 2004) | +| **Core finding** | Anchoring an interview on a specific past incident ("Tell me about a time when X broke down") breaks schema-based recall. Stakeholders describing actual past events report real workarounds, edge cases, and failure modes that never surface when asked "how does this usually work?" The technique explicitly requires both positive and negative incidents — positive first to establish rapport, negative second to surface failures. | +| **Mechanism** | Direct questions ("how does the system work?") elicit the stakeholder's mental schema — a sanitized, normalized, gap-free description of how things *should* work. Incidents bypass the schema because episodic memory is anchored to specific sensory and emotional detail that the schema lacks. Flanagan: "a critical incident must occur in a situation where the purpose or intent of the act seems fairly clear to the observer and where its consequences are sufficiently definite to leave little doubt." | +| **Where used** | Session 2 (gap-finding) of Phase 1 and Phase 2 in `scope/SKILL.md`. CIT prompts: "Tell me about a specific time this worked well / broke down." Follow up: "What were you trying to do? What made it difficult? What did you do instead?" | + +--- + +### 47. 
Cognitive Interview — Memory-Enhancing Elicitation Technique + +| | | +|---|---| +| **Source** | Fisher, R. P., & Geiselman, R. E. (1992). *Memory-Enhancing Techniques for Investigative Interviewing: The Cognitive Interview*. Charles C. Thomas. | +| **Date** | 1984 (original); 1987 (enhanced CI); 1992 (manual) | +| **URL** | DOI: 10.1037/0021-9010.74.5.722 (1989 field study) | +| **Alternative** | Moody, W., Will, R. P., & Blanton, J. E. (1996). "Enhancing knowledge elicitation using the cognitive interview." *Expert Systems with Applications*, 10(1), 127–133. DOI: 10.1016/0957-4174(95)00039-9 | +| **Status** | Confirmed — meta-analysis: Köhnken, Milne, Memon & Bull (1999), *Psychology, Crime & Law*, 5(1-2), 3–27. DOI: 10.1080/10683169908414991 | +| **Core finding** | The enhanced CI elicits ~35% more correct information than standard interviews with equal accuracy rates (85% vs. 82%). Moody et al. (1996) directly applied CI to knowledge elicitation from domain experts, finding it superior for capturing episodic knowledge that standard structured interviews miss. | +| **Mechanism** | Four retrieval mnemonics: (1) **Mental reinstatement of context** — stakeholder mentally returns to a specific past situation; (2) **Report everything** — all details including seemingly minor ones; (3) **Temporal reversal** — narrate events from a different starting point to disrupt schema-based reconstruction; (4) **Perspective change** — describe the situation from another actor's viewpoint. Each mnemonic opens a different memory access route, collectively surfacing what direct questions cannot. | +| **Where used** | Session 2 (gap-finding) of Phase 1 and Phase 2 in `scope/SKILL.md`. CI perspective change prompt: "What do you think the end user experiences in that situation?" CI reversal prompt: "Walk me through that scenario starting from when it went wrong." | + +--- + +### 48. 
Laddering / Means-End Chain — Surfacing Unstated Motivations + +| | | +|---|---| +| **Source** | Reynolds, T. J., & Gutman, J. (1988). "Laddering theory, method, analysis, and interpretation." *Journal of Advertising Research*, 28(1), 11–31. | +| **Date** | 1988 (method origin: Kelly, G. (1955). *The Psychology of Personal Constructs*. Norton.) | +| **URL** | https://en.wikipedia.org/wiki/Repertory_grid | +| **Alternative** | Hunter, M. G., & Beck, J. E. (2000). "Using repertory grids to conduct cross-cultural information systems research." *Information Systems Research*, 11(1), 93–101. DOI: 10.1287/isre.11.1.93.11786 | +| **Status** | Confirmed — operationalised in IS research (Hunter & Beck 2000); embedded in NNG interview protocols (Rosala 2021) | +| **Core finding** | Repeatedly asking "Why is that important to you?" climbs a means-end chain from concrete attribute → functional consequence → psychosocial consequence → terminal value. The stakeholder's first answer is rarely the real constraint — it is the socially expected, conscious-level response. The real motivation (and the actual constraint that requirements must satisfy) emerges two or three levels up the ladder. | +| **Mechanism** | The Gherkin "So that [benefit]" clause is structurally a single-rung means-end ladder. Full laddering reveals the value conflicts between stakeholders whose surface requirements look identical but whose ladders diverge at the consequence level. Without laddering, requirements that satisfy the stated attribute may fail the underlying goal. | +| **Where used** | Session 2 (gap-finding) of Phase 1 and Phase 2 in `scope/SKILL.md`. Laddering probe: "Why is that important to you?", "What does that enable for you?", "What would break if that weren't available?" Climb until the stakeholder reaches a terminal value they cannot explain further. | + +--- + +### 49. Funnel Technique — Question Ordering to Prevent Priming + +| | | +|---|---| +| **Source** | Rosala, M., & Moran, K. (2022). 
The Funnel Technique in Qualitative User Research. *Nielsen Norman Group*. https://www.nngroup.com/articles/the-funnel-technique-in-qualitative-user-research/ | +| **Date** | 2022 | +| **URL** | https://www.nngroup.com/articles/the-funnel-technique-in-qualitative-user-research/ | +| **Alternative** | Christel, M. G., & Kang, K. C. (1992). *Issues in Requirements Elicitation*. CMU/SEI-92-TR-012. https://www.sei.cmu.edu/library/abstracts/reports/92tr012.cfm | +| **Status** | Confirmed — standard NNG qualitative research protocol | +| **Core finding** | Starting with broad open-ended questions before narrowing to specifics prevents the interviewer from priming the interviewee's responses. Once a category label is introduced, the interviewee interprets subsequent questions through that frame and under-reports items that don't fit it. Broad-to-narrow sequencing within each topic cluster is the evidence-based default for discovery interviews. | +| **Mechanism** | Priming bias is structural: human memory is associative, so any category name the interviewer introduces activates a schema that filters what the interviewee considers worth reporting. The funnel sequences questions so the interviewee's own categories emerge first, before the interviewer's categories are introduced. | +| **Where used** | Within each session of Phase 1 and Phase 2 in `scope/SKILL.md`. Within each topic cluster: start with "Tell me about..." before asking specific follow-up probes. Applies alongside CIT, CI, and Laddering — all of which are also open-ended by design. | + +--- + +### 50. Issues in Requirements Elicitation — Why Direct Questions Fail + +| | | +|---|---| +| **Source** | Christel, M. G., & Kang, K. C. (1992). *Issues in Requirements Elicitation*. CMU/SEI-92-TR-012. Software Engineering Institute, Carnegie Mellon University. | +| **Date** | 1992 | +| **URL** | https://www.sei.cmu.edu/library/abstracts/reports/92tr012.cfm | +| **Alternative** | Sommerville, I., & Sawyer, P. (1997). 
*Requirements Engineering: A Good Practice Guide*. Wiley. | +| **Status** | Confirmed — foundational SEI technical report; widely cited in RE literature | +| **Core finding** | Stakeholders have three structural problems that make direct questioning insufficient: (1) they omit information that is "obvious" to them but unknown to the analyst; (2) they have trouble communicating needs they have never had to articulate; (3) they may not know what they want until they see what they don't want. These are not stakeholder failures — they are structural properties of tacit knowledge. | +| **Mechanism** | Expert knowledge is largely procedural and tacit. When asked "how does the system work?", experts describe what they believe happens, not what actually happens. This sanitized account is internally consistent but incomplete. Gap-finding techniques (CIT, CI, Laddering) are required because they bypass the expert's mental schema and access the episodic memory layer where real complexity lives. | +| **Where used** | Theoretical justification for the 3-session interview structure and the use of CIT, CI, and Laddering in `scope/SKILL.md`. Answers the question: "why not just ask the stakeholder directly what they need?" | + +--- + ## Bibliography 1. Cialdini, R. B. (2001). *Influence: The Psychology of Persuasion* (rev. ed.). HarperBusiness. @@ -467,3 +813,36 @@ This document explains the cognitive and social-science mechanisms that justify 31. Sharma, A., & Henley, A. (2026). Modular Prompt Optimization. arXiv:2601.04055. https://arxiv.org/abs/2601.04055 32. Rogers, C. R., & Farson, R. E. (1957). *Active Listening*. Industrial Relations Center, University of Chicago. 33. McNaughton, D., Hamlin, D., McCarthy, J., Head-Reeves, D., & Schreiner, M. (2008). Learning to Listen: Teaching an Active Listening Strategy to Preservice Education Professionals. *Topics in Early Childhood Special Education*, 27(4), 223–231. +34. Kipling, R. (1902). *Just So Stories*. Macmillan. +35. 
Brandenburg, L. (2025). *Requirements Discovery Checklist Pack*. TechCanvass. https://www.businessanalyststoolkit.com/requirements-elicitation-questions/ +36. Sherwen. (2025). "10 Questions to Consider During Requirements Gathering." https://www.sherwen.com/insights/10-questions-you-must-ask-during-requirements-gathering +37. Evans, E. (2003). *Domain-Driven Design: Tackling Complexity in the Heart of Software*. Addison-Wesley. +38. Context Mapper. (2025). Rapid Object-Oriented Analysis and Design. https://contextmapper.org/docs/rapid-ooad +39. Bay, J. (2005). "Object Calisthenics." *IEEE Software/DevX*. https://www.bennadel.com/resources/uploads/2012/objectcalisthenics.pdf +40. Fowler, M. (1999/2018). *Refactoring: Improving the Design of Existing Code*. Addison-Wesley. https://martinfowler.com/books/refactoring.html +41. Gamma, E., Helm, R., Johnson, R., Vlissides, J. (1995). *Design Patterns: Elements of Reusable Object-Oriented Software*. Addison-Wesley. +42. Martin, R. C. (2000). "Principles of OOD." *ButUncleBob.com*. https://blog.interface-solv.com/wp-content/uploads/2020/07/Principles-Of-OOD.pdf +43. Malhotra, R., & Singh, P. (2020). Exploiting bad-smells and object-oriented characteristics to prioritize classes for refactoring. *Int. J. Syst. Assur. Eng. Manag.*, 11(Suppl 2), 133–144. https://doi.org/10.1007/s13198-020-01001-x +44. Silva, C. et al. (2020). When Are Smells Indicators of Architectural Refactoring Opportunities. *Proc. 28th Int. Conf. on Program Comprehension*. ACM. https://doi.org/10.1145/3387904.3389276 +45. Vidal, S. A., Marcos, C., & Díaz-Pace, J. A. (2014). An Approach to Prioritize Code Smells for Refactoring. *Automated Software Engineering*, 23(3), 501–532. https://doi.org/10.1007/s10515-014-0175-x +46. Cardelli, L. (1996). Bad Engineering Properties of Object-Oriented Languages. *ACM Computing Surveys*, 28(4), 150. https://www.microsoft.com/en-us/research/publication/bad-engineering-properties-of-object-oriented-languages/ +47. 
Aluthwaththage, J. H., & Thathsarani, H. A. N. N. (2024). A Novel OO-Based Code Complexity Metric. *Proc. Future Technologies Conference (FTC)*, 616–628. https://link.springer.com/chapter/10.1007/978-3-031-73125-9_39 +48. Bigonha, M. A. S., et al. (2019). The usefulness of software metric thresholds for detection of bad smells and fault prediction. *Information and Software Technology*, 115, 79–92. https://doi.org/10.1016/j.infsof.2019.08.005 +49. Ambler, S. W. (2002). *Agile Modeling: Effective Practices for eXtreme Programming and the Unified Process*. Wiley. https://www.agilemodeling.com/essays/fdd.htm +50. Palmer, S. R., & Felsing, J. M. (2002). *A Practical Guide to Feature-Driven Development*. Prentice Hall. +51. Krause, R., & Pernice, K. (2024). Affinity Diagramming for Collaboratively Sorting UX Findings and Design Ideas. *Nielsen Norman Group*. https://www.nngroup.com/articles/affinity-diagram/ +52. Brandolini, A. (2013–present). *EventStorming*. Leanpub / eventstorming.com. https://eventstorming.com +53. Bourgau, P. (2017). Detailed Agenda of a DDD Big Picture Event Storming. https://philippe.bourgau.net/detailed-agenda-of-a-ddd-big-picture-event-storming-part-1/ +54. Nielsen, J. (2010). Interviewing Users. *Nielsen Norman Group*. https://www.nngroup.com/articles/interviewing-users/ +55. Farrell, S. (2017). UX Research Cheat Sheet. *Nielsen Norman Group*. https://www.nngroup.com/articles/ux-research-cheat-sheet/ +56. Flanagan, J. C. (1954). The critical incident technique. *Psychological Bulletin*, 51(4), 327–357. https://doi.org/10.1037/h0061470 +57. Fisher, R. P., & Geiselman, R. E. (1992). *Memory-Enhancing Techniques for Investigative Interviewing: The Cognitive Interview*. Charles C. Thomas. +58. Fisher, R. P., Geiselman, R. E., & Amador, M. (1989). Field test of the cognitive interview: Enhancing the recollection of actual victims and witnesses of crime. *Journal of Applied Psychology*, 74(5), 722–727. https://doi.org/10.1037/0021-9010.74.5.722 +59. 
Köhnken, G., Milne, R., Memon, A., & Bull, R. (1999). The cognitive interview: A meta-analysis. *Psychology, Crime & Law*, 5(1-2), 3–27. https://doi.org/10.1080/10683169908414991 +60. Moody, W., Will, R. P., & Blanton, J. E. (1996). Enhancing knowledge elicitation using the cognitive interview. *Expert Systems with Applications*, 10(1), 127–133. https://doi.org/10.1016/0957-4174(95)00039-9 +61. Reynolds, T. J., & Gutman, J. (1988). Laddering theory, method, analysis, and interpretation. *Journal of Advertising Research*, 28(1), 11–31. +62. Christel, M. G., & Kang, K. C. (1992). *Issues in Requirements Elicitation*. CMU/SEI-92-TR-012. Software Engineering Institute, Carnegie Mellon University. https://www.sei.cmu.edu/library/abstracts/reports/92tr012.cfm +63. Rosala, M. (2020). The Critical Incident Technique in UX. *Nielsen Norman Group*. https://www.nngroup.com/articles/critical-incident-technique/ +64. Rosala, M., & Moran, K. (2022). The Funnel Technique in Qualitative User Research. *Nielsen Norman Group*. https://www.nngroup.com/articles/the-funnel-technique-in-qualitative-user-research/ +65. Cockburn, A. (2005). Hexagonal Architecture. *alistair.cockburn.us*. https://alistair.cockburn.us/hexagonal-architecture/ +66. Freeman, S., & Pryce, N. (2009). *Growing Object-Oriented Software, Guided by Tests*. Addison-Wesley. diff --git a/docs/workflow.md b/docs/workflow.md index e678593..f55e0b2 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -113,15 +113,71 @@ Each step has a designated agent and a specific deliverable. No step is skipped. 
│ STEP 2 — ARCHITECTURE agent: developer │ ├─────────────────────────────────────────────────────────────────────┤ │ │ +│ PREREQUISITES (stop if any fail — escalate to PO) │ +│ [ ] in-progress/ has no .feature file (WIP = 1) │ +│ [ ] feature Status: BASELINED │ +│ [ ] feature has Rule: + Example: + @id tags │ +│ [ ] package name confirmed (pyproject.toml → directory exists) │ +│ │ │ mv backlog/<name>.feature → in-progress/<name>.feature │ -│ Read docs/features/discovery.md (project-level) │ -│ Read ALL backlog .feature files (discovery + entities sections) │ -│ Read in-progress .feature file (full) │ -│ Identify cross-feature entities, shared interfaces, extension pts │ -│ Silent pre-mortem (YAGNI/KISS/DRY/SOLID/OC/patterns) │ -│ Append Architecture section to in-progress .feature description │ -│ (Module Structure + ADRs + Build Changes) │ -│ Architecture contradiction check — resolve with PO if needed │ +│ │ +│ READ (all before writing anything) │ +│ docs/features/discovery.md (project-level) │ +│ ALL backlog .feature files (discovery + entities sections) │ +│ in-progress .feature file (full: Rules + Examples + @id) │ +│ │ +│ DOMAIN ANALYSIS │ +│ From Entities table + Rules (Business) in .feature file: │ +│ Nouns → named classes, value objects, aggregates │ +│ Verbs → method names with typed signatures │ +│ Datasets → named types (not bare dict/list) │ +│ Bounded Context check: same word, different meaning across │ +│ features? → module boundary goes there │ +│ Cross-feature entities → candidate shared domain layer │ +│ │ +│ SILENT PRE-MORTEM (before writing anything) │ +│ "In 6 months this design is a mess. What mistakes did we make?" │ +│ For each candidate class: >2 ivars? >1 reason to change? │ +│ For each external dep: is it behind a Protocol? │ +│ Any noun serving double duty across modules? │ +│ Any structure missing a named design pattern? 
│ +│ → If pattern smell detected: load skill design-patterns │ +│ │ +│ Write Architecture section in in-progress .feature file │ +│ ### Module Structure │ +│ <package>/domain/<noun>.py │ +│ class <Noun>: ← named class + responsibilities │ +│ field: Type │ +│ def <verb>(<Noun>) -> <Type>: ... ← typed signatures │ +│ class <DepName>(Protocol): ... ← external dep contract │ +│ <package>/domain/service.py ← cross-entity operations │ +│ <package>/adapters/<dep>.py ← Protocol impl │ +│ ### Key Decisions │ +│ ADR-NNN: <title> │ +│ Decision: <what> │ +│ Reason: <why in one sentence> │ +│ Alternatives considered: <what was rejected and why> │ +│ ### Build Changes (new runtime deps — requires PO approval) │ +│ │ +│ NOTE: signatures are informative — tests/implementation may │ +│ refine them; record significant changes as ADR updates │ +│ │ +│ ARCHITECTURE SMELL CHECK — hard gate (fix before commit) │ +│ [ ] No planned class with >2 responsibilities (SOLID-S) │ +│ [ ] No planned class with >2 instance variables (OC-8) │ +│ [ ] All external deps assigned a Protocol/Adapter (SOLID-D + │ +│ Hexagonal Architecture) │ +│ [ ] No noun with different meaning across planned modules │ +│ (DDD Bounded Context) │ +│ [ ] No missing Creational pattern: repeated construction │ +│ without Factory/Builder │ +│ [ ] No missing Structural pattern: type-switching logic │ +│ without Strategy/Visitor │ +│ [ ] No missing Behavioral pattern: state machine or scattered │ +│ notification without State/Observer │ +│ [ ] Each ADR consistent with each @id AC — no contradictions │ +│ [ ] Technically infeasible story → escalate to PO │ +│ │ │ commit: feat(<name>): add architecture │ │ │ └─────────────────────────────────────────────────────────────────────┘ @@ -158,9 +214,10 @@ Each step has a designated agent and a specific deliverable. No step is skipped. 
│ REFACTOR: DRY → SOLID → Object Calisthenics (9 rules) │ │ → type hints → docstrings │ │ SELF-DECLARE: write ## Self-Declaration block in TODO.md │ -│ 21-item checklist (YAGNI×2, KISS×2, DRY×2, │ -│ SOLID×5, OC×9, Semantic×1) with file:line evidence │ -│ each item: checked box + evidence, or N/A + reason │ +│ 24 first-person declarations (YAGNI×2, KISS×2, │ +│ DRY×2, SOLID×5, OC×9, Patterns×3, Semantic×1) │ +│ "As a developer I declare [rule] — YES | file:line" │ +│ or N/A | reason; load design-patterns if smell found │ │ REVIEWER: code-design check only (no lint/pyright/coverage) │ │ reviewer independently verifies YES claims │ │ reviewer does NOT re-audit self-declared failures │ @@ -320,27 +377,30 @@ Test: @id:<hex> — <description> Phase: RED | GREEN | REFACTOR | SELF-DECLARE | REVIEWER(code-design) | COMMITTED ## Self-Declaration (@id:<hex>) -- [x] YAGNI-1: No abstractions beyond current AC — `file:line` -- [x] YAGNI-2: No speculative parameters or flags — `file:line` -- [x] KISS-1: Every function has one job — `file:line` -- [x] KISS-2: No unnecessary indirection — `file:line` -- [x] DRY-1: No duplicated logic — `file:line` -- [x] DRY-2: Every shared concept in one place — `file:line` -- [x] SOLID-S: One reason to change — `file:line` -- [x] SOLID-O: Extension, not modification — `file:line` or N/A -- [x] SOLID-L: Subtypes fully substitutable — `file:line` or N/A -- [x] SOLID-I: No forced stub methods — `file:line` or N/A -- [x] SOLID-D: Domain depends on Protocols — `file:line` -- [x] OC-1: One indent level per method — `file:line` -- [x] OC-2: No else after return — `file:line` or N/A -- [x] OC-3: No bare primitives as domain concepts — `file:line` or N/A -- [x] OC-4: No bare collections as domain values — `file:line` or N/A -- [x] OC-5: No chained dot navigation — `file:line` or N/A -- [x] OC-6: No abbreviations — `file:line` or N/A -- [x] OC-7: Functions ≤ 20 lines, classes ≤ 50 lines — `file:line` -- [x] OC-8: ≤ 2 instance variables per class — 
`file:line` -- [x] OC-9: No getters/setters — `file:line` or N/A -- [x] Semantic: test abstraction matches AC level — `file:line` +As a developer I declare this code follows YAGNI-1 (no abstractions beyond current AC) — YES | `file:line` +As a developer I declare this code follows YAGNI-2 (no speculative parameters or flags) — YES | `file:line` +As a developer I declare this code follows KISS-1 (every function has one job) — YES | `file:line` +As a developer I declare this code follows KISS-2 (no unnecessary indirection) — YES | `file:line` +As a developer I declare this code follows DRY-1 (no duplicated logic) — YES | `file:line` +As a developer I declare this code follows DRY-2 (every shared concept in one place) — YES | `file:line` +As a developer I declare this code follows SOLID-S (one reason to change) — YES | `file:line` +As a developer I declare this code follows SOLID-O (extension not modification) — YES | `file:line` or N/A | reason +As a developer I declare this code follows SOLID-L (subtypes fully substitutable) — YES | `file:line` or N/A | reason +As a developer I declare this code follows SOLID-I (no forced stub methods) — YES | `file:line` or N/A | reason +As a developer I declare this code follows SOLID-D (domain depends on Protocols) — YES | `file:line` +As a developer I declare this code follows OC-1 (max one indent level per method) — YES | deepest: `file:line` +As a developer I declare this code follows OC-2 (no else after return) — YES | `file:line` or N/A | reason +As a developer I declare this code follows OC-3 (no bare primitives as domain concepts) — YES | `file:line` or N/A | reason +As a developer I declare this code follows OC-4 (no bare collections as domain values) — YES | `file:line` or N/A | reason +As a developer I declare this code follows OC-5 (no chained dot navigation) — YES | `file:line` or N/A | reason +As a developer I declare this code follows OC-6 (no abbreviations) — YES | `file:line` or N/A | reason +As a developer I 
declare this code follows OC-7 (functions ≤20 lines, classes ≤50 lines) — YES | longest: `file:line` +As a developer I declare this code follows OC-8 (≤2 instance variables per class) — YES | `file:line` +As a developer I declare this code follows OC-9 (no getters/setters) — YES | `file:line` or N/A | reason +As a developer I declare this code has no missing Creational pattern (no smell: repeated construction or scattered instantiation) — YES | `file:line` or N/A | reason +As a developer I declare this code has no missing Structural pattern (no smell: feature envy or parallel conditionals on type) — YES | `file:line` or N/A | reason +As a developer I declare this code has no missing Behavioral pattern (no smell: large state machine, scattered notification, or repeated algorithm skeleton) — YES | `file:line` or N/A | reason +As a developer I declare test abstraction matches AC level (semantic alignment) — YES | `file:line` ## Progress - [x] @id:<hex>: <done> — reviewer(code-design) APPROVED From d692eeadd1b699f62b534ecac94776e21eb9e124 Mon Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Fri, 17 Apr 2026 17:35:18 -0400 Subject: [PATCH 03/12] refactor: align workflow to new 5-step diagram MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename developer to software-engineer - Steps 3+4 merged into TDD Loop (Steps 2-3) - Steps 5→4 Verify, 6→5 Accept - Remove gen-tests/gen-id/deprecated/tdd skill - Self-declaration: AGREE/DISAGREE | file:line, 24 items, once per feature - Per-test reviewer gate removed - Add software-engineer.md agent - Update all skills and agents to match --- .opencode/agents/product-owner.md | 20 +- .opencode/agents/reviewer.md | 18 +- .../{developer.md => software-engineer.md} | 21 +- .opencode/skills/implementation/SKILL.md | 499 +++++++++--------- .opencode/skills/session-workflow/SKILL.md | 54 +- .opencode/skills/tdd/SKILL.md | 201 ------- .opencode/skills/verify/SKILL.md | 
251 +++++---- AGENTS.md | 134 +---- docs/academic_research.md | 64 +++ docs/workflow.md | 281 ++++++---- feedback.md | 54 ++ 11 files changed, 756 insertions(+), 841 deletions(-) rename .opencode/agents/{developer.md => software-engineer.md} (65%) delete mode 100644 .opencode/skills/tdd/SKILL.md diff --git a/.opencode/agents/product-owner.md b/.opencode/agents/product-owner.md index ed8a8a2..081653d 100644 --- a/.opencode/agents/product-owner.md +++ b/.opencode/agents/product-owner.md @@ -26,36 +26,36 @@ Load `skill session-workflow` first — it reads TODO.md, orients you to the cur | Step | Action | |---|---| | **Step 1 — SCOPE** | Load `skill scope` — contains the full 4-phase discovery and criteria protocol | -| **Step 6 — ACCEPT** | See acceptance protocol below | +| **Step 5 — ACCEPT** | See acceptance protocol below | ## Ownership Rules - You are the **sole owner** of `.feature` files and `docs/features/discovery.md` - No other agent may edit these files -- Developer escalates spec gaps to you; you decide whether to extend criteria -- **You pick** the next feature from backlog — the developer never self-selects +- Software-engineer escalates spec gaps to you; you decide whether to extend criteria +- **You pick** the next feature from backlog — the software-engineer never self-selects -## Step 6 — Accept +## Step 5 — Accept -After the reviewer approves (Step 5): +After the reviewer approves (Step 4): 1. Run or observe the feature yourself. If user interaction is involved, interact with it. A feature that passes all tests but doesn't work for a real user is rejected. 2. Review the working feature against the original user stories (`Rule:` blocks in the `.feature` file). -3. **If accepted**: move `docs/features/in-progress/<name>.feature` → `docs/features/completed/<name>.feature`; update TODO.md; ask the developer to create a PR and tag a release. +3. 
**If accepted**: move `docs/features/in-progress/<name>.feature` → `docs/features/completed/<name>.feature`; update TODO.md; notify the stakeholder. The stakeholder decides when to trigger PR and release — the software-engineer creates PR/tag only when the stakeholder requests. 4. **If rejected**: write specific feedback in TODO.md, send back to the relevant step. ## Handling Gaps -When a gap is reported (by developer or reviewer): +When a gap is reported (by software-engineer or reviewer): | Situation | Action | |---|---| -| Edge case within current user stories | Add a new Example with a new `@id` to the relevant `.feature` file. Run `uv run task gen-tests`. | +| Edge case within current user stories | Add a new Example with a new `@id` to the relevant `.feature` file. | | New behavior beyond current stories | Add to backlog as a new feature. Do not extend the current feature. | -| Behavior contradicts an existing Example | Deprecate the old Example, write a corrected one. | +| Behavior contradicts an existing Example | Write a new Example with a new `@id`. | | Post-merge defect | Move the `.feature` file back to `in-progress/`, add new Example with `@id`, resume at Step 3. 
| ## Available Skills - `session-workflow` — session start/end protocol -- `scope` — Step 1: 3-session discovery (Phase 1 + 2), stories (Phase 3), and criteria (Phase 4) +- `scope` — Step 1: 3-session discovery (Phase 1 + 2), stories (Phase 3), and criteria (Phase 4) \ No newline at end of file diff --git a/.opencode/agents/reviewer.md b/.opencode/agents/reviewer.md index 2e6b8ef..415d07f 100644 --- a/.opencode/agents/reviewer.md +++ b/.opencode/agents/reviewer.md @@ -1,5 +1,5 @@ --- -description: Reviewer responsible for Step 5 verification — runs all commands and checks code quality +description: Reviewer responsible for Step 4 verification — runs all commands and checks code quality mode: subagent temperature: 0.3 tools: @@ -33,17 +33,12 @@ You verify that work is done correctly by running commands and reading code. You ## Session Start -Load `skill session-workflow` first. Then load the skill for the review type requested: - -| Review type | Skill to load | -|---|---| -| **Step 5 — full verification** | Load `skill verify` | -| **Step 4 — per-test code-design check** | Load `skill implementation` (use the REVIEWER CHECK section) | +Load `skill session-workflow` first. Then load `skill verify` for Step 4 verification. ## Zero-Tolerance Rules -- **Never approve without running commands** (Step 5 only — Step 4 code-design checks have no commands). -- **Never skip a check.** If a command fails, report it. Do not work around it. +- **Never approve without running commands**. +- **Never skip a check.** If a command fails, report it. - **Never suggest `noqa`, `type: ignore`, or `pytest.skip` as a fix.** These are bypasses, not solutions. - **Report specific locations.** "`physics/engine.py:47`: unreachable return" not "there is dead code." - **Every PASS/FAIL cell must have evidence.** Empty evidence = UNCHECKED = REJECTED. 
@@ -56,12 +51,11 @@ If you discover an observable behavior with no acceptance criterion: |---|---| | Edge case within current user stories | Report to PO with suggested Example text. PO decides. | | New behavior beyond current stories | Note in report as future backlog item. Do not add criteria. | -| Behavior contradicts an existing Example | REJECTED — report contradiction to developer and PO. | +| Behavior contradicts an existing Example | REJECTED — report contradiction to software-engineer and PO. | You never edit `.feature` files or add Examples yourself. ## Available Skills - `session-workflow` — session start/end protocol -- `verify` — Step 5: full verification protocol with all tables, gates, and report template -- `implementation` — Step 4: REVIEWER CHECK section for per-test code-design checks +- `verify` — Step 4: full verification protocol with all tables, gates, and report template diff --git a/.opencode/agents/developer.md b/.opencode/agents/software-engineer.md similarity index 65% rename from .opencode/agents/developer.md rename to .opencode/agents/software-engineer.md index 9b8a2bf..d7a8bff 100644 --- a/.opencode/agents/developer.md +++ b/.opencode/agents/software-engineer.md @@ -1,5 +1,5 @@ --- -description: Developer responsible for Steps 2–4 — architecture, tests, implementation, git, and releases +description: Software Engineer responsible for Steps 2-3 — architecture, TDD loop, git, and releases mode: subagent temperature: 0.3 tools: @@ -25,7 +25,7 @@ permissions: allow: ask --- -# Developer +# Software Engineer You build everything: architecture, tests, code, and releases. You own technical decisions entirely. The product owner defines what to build; you decide how. 
@@ -37,10 +37,9 @@ Load `skill session-workflow` first — it reads TODO.md, orients you to the cur | Step | Action | |---|---| -| **Step 2 — ARCH** | Load `skill implementation` — contains full Step 2 architecture protocol | -| **Step 3 — TEST FIRST** | Load `skill tdd` — contains full Step 3 test-writing protocol | -| **Step 4 — IMPLEMENT** | Load `skill implementation` — contains full Step 4 Red-Green-Refactor cycle | -| **Step 6 — after PO accepts** | Load `skill pr-management` and `skill git-release` as needed | +| **Step 2 — ARCH** | Load `skill implementation` — contains Step 2 architecture protocol | +| **Step 3 — TDD LOOP** | Load `skill implementation` — contains Step 3 TDD Loop | +| **Step 5 — after PO accepts** | Load `skill pr-management` and `skill git-release` as needed | ## Ownership Rules @@ -57,8 +56,8 @@ If during implementation you discover behavior not covered by existing acceptanc ## Available Skills - `session-workflow` — session start/end protocol -- `tdd` — Step 3: failing tests with `@id` traceability -- `implementation` — Step 2: architecture + Step 4: Red-Green-Refactor cycle -- `pr-management` — Step 6: PRs with conventional commits -- `git-release` — Step 6: calver versioning and themed release naming -- `create-skill` — meta: create new skills when needed +- `implementation` — Steps 2-3: architecture + TDD loop +- `design-patterns` — on-demand when smell detected during refactor +- `pr-management` — Step 5: PRs with conventional commits +- `git-release` — Step 5: calver versioning and themed release naming +- `create-skill` — meta: create new skills when needed \ No newline at end of file diff --git a/.opencode/skills/implementation/SKILL.md b/.opencode/skills/implementation/SKILL.md index 60a4c92..30fc2a2 100644 --- a/.opencode/skills/implementation/SKILL.md +++ b/.opencode/skills/implementation/SKILL.md @@ -1,83 +1,94 @@ --- name: implementation -description: Step 4 — Red-Green-Refactor cycle, one test at a time, with commit per green 
test -version: "2.2" -author: developer -audience: developer +description: Steps 2-3 — Architecture + TDD Loop, one @id at a time +version: "3.0" +author: software-engineer +audience: software-engineer workflow: feature-lifecycle --- # Implementation -Make the failing tests pass one at a time. Each green test gets its own commit after reviewer approval. Refactor only after tests are green. +Steps 2 (Architecture) and 3 (TDD Loop) combined into a single skill. The software-engineer owns both. ## Developer Quality Gate Priority Order -During Step 4, correctness priorities are (in order): +During implementation, correctness priorities are (in order): 1. **Design correctness** — YAGNI > KISS > DRY > SOLID > Object Calisthenics > appropriate design patterns -2. **One test green** — the specific test under work passes, plus `test-fast` still passes -3. **Reviewer code-design check** — reviewer verifies design + semantic alignment (no lint/pyright/coverage) -4. **Commit** — only after reviewer APPROVED -5. **Quality tooling** — `lint`, `static-check`, full `test` with coverage run only at developer handoff (before Step 5) +2. **One @id green** — the specific test under work passes, plus `test-fast` still passes +3. **Commit** — when a meaningful increment is green +4. **Quality tooling** — `lint`, `static-check`, full `test` with coverage run at end-of-feature handoff -Design correctness is far more important than lint/pyright/coverage compliance. Never run lint, static-check, or coverage during the Red-Green-Refactor cycle — those are handoff-only checks. +Design correctness is far more important than lint/pyright/coverage compliance. Never run lint, static-check, or coverage during the TDD loop — those are handoff-only checks. 
-## The Cycle +--- -``` -Pick one failing test - → RED: confirm it fails - → GREEN: write the minimum code to make it pass - → REFACTOR: clean up, apply design principles - → SELF-DECLARE: complete the Design Self-Declaration checklist - ─── STOP ─── do not proceed until reviewer checks ─── - → REVIEWER CHECK: reviewer audits self-declaration against actual code - ─── WAIT for APPROVED ─── - → COMMIT (only after reviewer APPROVED) - → Update TODO.md: mark @id [x], update Cycle State to next test - → pick next failing test +## Step 2 — Architecture + +### Prerequisites (stop if any fail — escalate to PO) + +1. `docs/features/in-progress/` contains only `.gitkeep` (no `.feature` files). If another `.feature` file exists, **STOP** — another feature is already in progress. +2. The feature file's discovery section has `Status: BASELINED`. If not, escalate to PO — Step 1 is incomplete. +3. The feature file contains `Rule:` blocks with `Example:` blocks and `@id` tags. If not, escalate to PO — criteria have not been written. +4. Package name confirmed: read `pyproject.toml` → locate `[tool.setuptools]` → confirm directory exists on disk. + +### Package Verification (mandatory — before writing any code) + +1. Read `pyproject.toml` → locate `[tool.setuptools]` → record `packages = ["<name>"]` +2. Confirm directory exists: `ls <name>/` +3. All new source files go under `<name>/` — never under a template placeholder. + +### Move Feature File + +```bash +mv docs/features/backlog/<name>.feature docs/features/in-progress/<name>.feature ``` -**Hard gates**: The cycle has two hard gates — you must STOP before the reviewer check, and WAIT for APPROVED before committing. Never batch multiple tests before a reviewer interaction. Never commit without reviewer approval. +Update `TODO.md` Source path from `backlog/` to `in-progress/`. -Never write production code before picking a specific failing test. Never refactor while tests are red. 
+### Read Phase (all before writing anything) -**TODO.md Cycle State is mandatory.** Update `## Cycle State` at every phase transition (RED → GREEN → REFACTOR → SELF-DECLARE → REVIEWER → COMMITTED). If the Cycle State block is missing, add it before proceeding. +1. Read `docs/features/discovery.md` (project-level) +2. Read **ALL** `.feature` files in `docs/features/backlog/` (discovery + entities sections) +3. Read in-progress `.feature` file (full: Rules + Examples + @id) -## Step 2 — Architecture (do this first) +### Domain Analysis -### Package Verification (mandatory — before writing any code) +From Entities table + Rules (Business) in `.feature` file: +- **Nouns** → named classes, value objects, aggregates +- **Verbs** → method names with typed signatures +- **Datasets** → named types (not bare dict/list) +- **Bounded Context check**: same word, different meaning across features? → module boundary +- **Cross-feature entities** → candidate shared domain layer -1. Read `pyproject.toml` → locate `[tool.setuptools]` → record the value of `packages = ["<name>"]` -2. Confirm that directory exists on disk: `ls <name>/` -3. Write the correct package name at the top of your working notes for this session -4. All new source files go under `<name>/` — never under a template placeholder or any other directory +### Silent Pre-mortem (before writing anything) -If `packages` is missing or the directory does not exist, stop and resolve with the stakeholder before writing any code. +> "In 6 months this design is a mess. What mistakes did we make?" -**Prerequisites — verify before starting:** -1. `docs/features/in-progress/` contains only `.gitkeep` (no `.feature` files). If another `.feature` file exists, **STOP** — another feature is already in progress. -2. The feature file's discovery section has `Status: BASELINED`. If not, escalate to the PO — Step 1 is incomplete. -3. The feature file contains `Rule:` blocks with `Example:` blocks and `@id` tags. 
If not, escalate to PO — criteria have not been written. +For each candidate class: +- >2 ivars? → split +- >1 reason to change? → isolate + +For each external dep: +- Is it behind a Protocol? → if not, add + +For each noun: +- Serving double duty across modules? → isolate + +If pattern smell detected, load `skill design-patterns`. -**Steps:** +### Write Architecture Section -1. Move the feature file from `backlog/` to `in-progress/`: - ```bash - mv docs/features/backlog/<name>.feature docs/features/in-progress/<name>.feature - ``` -2. Update `TODO.md` Source path from `backlog/` to `in-progress/`. -3. Read both `docs/features/discovery.md` (project-level) and the feature file's discovery section -4. Run a silent pre-mortem: YAGNI, KISS, DRY, SOLID, Object Calisthenics, design patterns -5. Add the Architecture section to `docs/features/in-progress/<name>.feature` (append to the feature description, before the first `Rule:`): +Append to `docs/features/in-progress/<name>.feature` (before first `Rule:`): ```gherkin Architecture: ### Module Structure - - `<package>/domain/entity.py` — data classes and value objects - - `<package>/domain/service.py` — business logic + - `<package>/domain/<noun>.py` — named class + responsibilities + - `<package>/domain/service.py` — cross-entity operations + - `<package>/adapters/<dep>.py` — Protocol implementation ### Key Decisions ADR-001: <title> @@ -89,247 +100,251 @@ If `packages` is missing or the directory does not exist, stop and resolve with - New runtime dependency: <name> — reason: <why> ``` -6. **Architecture contradiction check**: Compare each ADR against each AC. If any architectural decision contradicts or circumvents an acceptance criterion, flag it and resolve with the PO before writing any production code. -7. **PO domain acknowledgement**: Share the Architecture section with the PO for domain model acknowledgement before Step 3 begins. A one-line response ("no contradictions") is sufficient. -8. 
If a user story is not technically feasible, escalate to the PO. -9. If any build changes need PO approval, stop and ask before proceeding. +Signatures are informative — tests/implementation may refine them. Record significant changes as ADR updates. -Commit: `feat(<feature-name>): add architecture` - -**After committing:** Run `uv run task gen-tests -- --check` to verify stub sync. If changes are shown, run `uv run task gen-tests` to apply them. +### Architecture Smell Check (hard gate) -## Implementation Order +- [ ] No planned class with >2 responsibilities (SOLID-S) +- [ ] No planned class with >2 instance variables (OC-8) +- [ ] All external deps assigned a Protocol/Adapter (SOLID-D + Hexagonal) +- [ ] No noun with different meaning across planned modules (DDD BC) +- [ ] No missing Creational pattern +- [ ] No missing Structural pattern +- [ ] No missing Behavioral pattern +- [ ] Each ADR consistent with each @id AC — no contradictions -1. Start with the simplest test: data classes, value objects, pure functions -2. Work outward: state machines, I/O, orchestration -3. Follow the order of acceptance criteria in the `.feature` files +If any check fails: fix before committing. -## RED — Confirm the Test Fails +Commit: `feat(<feature-name>): add architecture` -```bash -uv run pytest tests/features/<name>/<story>_test.py::test_<func> -v -``` +--- -Expected: `FAILED` or `ERROR`. If it passes before you've written code, the test is wrong — fix it. +## Step 3 — TDD Loop -Update `## Cycle State` in TODO.md: -``` -Test: `@id:<hex>` — <description> -Phase: RED -``` +### Prerequisites -## GREEN — Minimum Implementation +- [ ] Architecture section present in in-progress `.feature` file +- [ ] All tests written in `tests/features/<feature-name>/` -Write the least code that makes **this one test** pass. "Green" means the specific test under work passes — not the full suite. 
+### Build TODO.md Test List -Apply during GREEN: -- **YAGNI**: if the test doesn't require it, don't write it -- **KISS**: the simplest code that passes +1. List all `@id` tags from in-progress `.feature` file +2. Order: fewest dependencies first; most impactful within that set +3. Each `@id` = one TODO item, status: `pending` -Do NOT apply during GREEN: DRY, SOLID, Object Calisthenics — those come in refactor. +### Outer Loop — One @id at a time -```bash -uv run pytest tests/features/<name>/<story>_test.py::test_<func> -v # this test must PASS -uv run task test-fast # no regressions -``` +**WIP limit**: exactly one `in_progress` at all times. -Update `## Cycle State` Phase: `GREEN` - -## REFACTOR — Apply Principles (in priority order) - -1. **DRY**: extract duplication -2. **SOLID**: split classes that have grown beyond one responsibility -3. **Object Calisthenics** (enforce all 9 rules): - 1. One level of indentation per method — extract inner blocks to named helpers - 2. No `else` after `return` — return early, flatten the happy path - 3. Wrap all primitives — `EmailAddress(str)` not raw `str` for domain concepts - 4. First-class collections — wrap `list[User]` in a `UserList` class - 5. One dot per line — `user.address` then `address.city`, never `user.address.city` - 6. No abbreviations — `calculate` not `calc`, `manager` not `mgr` - 7. Small entities — functions ≤ 20 lines, classes ≤ 50 lines - 8. ≤ 2 instance variables — if a class has 3+ `self.x` in `__init__`, group related - fields into a new named value object (Rule 3) or collection class (Rule 4). The fix - must produce a **new named class** — hardcoding constants, inlining literals, - using class-level variables, or moving fields to a parent class are all invalid - workarounds and remain FAIL. - 9. No getters/setters — use commands (`activate()`) and queries (`is_active()`) -4. **Type hints**: add/fix type annotations on all public functions and classes -5. 
**Docstrings**: Google-style on all public functions and classes - -### Design Pattern Decision Table - -Use when a pattern solves a structural problem you already have: - -| If your code has... | Consider... | Why | -|---|---|---| -| Multiple `if/elif` branches on type/state | State or Strategy pattern | Eliminates conditional complexity | -| Constructor that does complex setup | Factory or Builder | Separates construction from use | -| Multiple components that must work together | Facade | Single entry point reduces coupling | -| External dependency (I/O, DB, network) | Repository/Adapter pattern | Enables testing via Protocol | -| Event-driven flow | Observer or pub/sub | Decouples producers from consumers | +For each pending `@id`: -### Doctest Check +``` +INNER LOOP +├── RED +│ ├── Write test body (Given/When/Then → Arrange/Act/Assert) +│ ├── uv run task test-fast +│ └── EXIT: this @id FAILS +│ (if it passes: test is wrong — fix it first) +│ +├── GREEN +│ ├── Write minimum code — YAGNI + KISS only +│ │ (no DRY, SOLID, OC here — those belong in REFACTOR) +│ ├── uv run task test-fast +│ └── EXIT: this @id passes AND all prior tests pass +│ (fix implementation only; do not advance to next @id) +│ +└── REFACTOR + ├── Apply: DRY → SOLID → OC → patterns + ├── Load design-patterns skill if smell detected + ├── Add type hints and docstrings + ├── uv run task test-fast after each change + └── EXIT: test-fast passes; no smells remain + +Mark @id completed in TODO.md +Commit when a meaningful increment is green +``` -If you added or modified a `Examples:` block in a Google-style docstring, verify it passes: +### Quality Gate (all @id green) ```bash -uv run pytest --doctest-modules <module_path> +uv run task lint +uv run task static-check +uv run task test # coverage must be 100% +timeout 10s uv run task run ``` -> **Note**: `uv run task test` runs `--doctest-modules`. Keep `Examples:` blocks in Google-style docstrings valid and executable. 
+If coverage < 100%: add test in `tests/unit/` for uncovered branch (do NOT add @id tests for coverage). -### Refactor Self-Check Gates +All must pass before Self-Declaration. -After refactor, before moving to self-declaration: +### Self-Declaration (once, after all quality gates pass) -| If you see... | Then you must... | Before proceeding | -|---|---|---| -| Function > 20 lines | Extract helper | Verify line count | -| Nesting > 2 levels | Extract to function | Verify max depth | -| Bare `int`/`str` as domain concept | Wrap in value object | Verify no raw primitives in signatures | -| > 4 positional parameters | Group into dataclass | Verify parameter count | -| `list[X]` as domain collection | Wrap in collection class | Verify no bare lists | -| Class with 3+ `self.x` in `__init__` | Group related fields into a new named value object (OC-3) or collection class (OC-4) — **not** a dict, tuple, class variable, constant, or parent class | Count `self.` assignments again; each fix must produce a new named class | +Write into `TODO.md` under a `## Self-Declaration` block: -```bash -uv run task test-fast # must still pass — the ONLY check during refactor +```markdown +## Self-Declaration +As a software-engineer I declare: +* YAGNI: no code without a failing test — AGREE/DISAGREE | file:line +* YAGNI: no speculative abstractions — AGREE/DISAGREE | file:line +* KISS: simplest solution that passes — AGREE/DISAGREE | file:line +* KISS: no premature optimization — AGREE/DISAGREE | file:line +* DRY: no duplication — AGREE/DISAGREE | file:line +* DRY: no redundant comments — AGREE/DISAGREE | file:line +* SOLID-S: one reason to change per class — AGREE/DISAGREE | file:line +* SOLID-O: open for extension, closed for modification — AGREE/DISAGREE | file:line +* SOLID-L: subtypes substitutable — AGREE/DISAGREE | file:line +* SOLID-I: no forced unused deps — AGREE/DISAGREE | file:line +* SOLID-D: depend on abstractions, not concretions — AGREE/DISAGREE | file:line +* OC-1: one 
level of indentation per method — AGREE/DISAGREE | deepest: file:line +* OC-2: no else after return — AGREE/DISAGREE | file:line +* OC-3: primitive types wrapped — AGREE/DISAGREE | file:line +* OC-4: first-class collections — AGREE/DISAGREE | file:line +* OC-5: one dot per line — AGREE/DISAGREE | file:line +* OC-6: no abbreviations — AGREE/DISAGREE | file:line +* OC-7: ≤20 lines per function, ≤50 per class — AGREE/DISAGREE | longest: file:line +* OC-8: ≤2 instance variables per class — AGREE/DISAGREE | file:line +* OC-9: no getters/setters — AGREE/DISAGREE | file:line +* Patterns: no creational smell — AGREE/DISAGREE | file:line +* Patterns: no structural smell — AGREE/DISAGREE | file:line +* Patterns: no behavioral smell — AGREE/DISAGREE | file:line +* Semantic: tests operate at same abstraction as AC — AGREE/DISAGREE | file:line ``` -Do NOT run `uv run task lint` or `uv run task static-check` during the cycle. Those are handoff-only checks (before Step 5). +A `DISAGREE` answer is not automatic rejection — state the reason inline and fix before handing off. -Update `## Cycle State` Phase: `REFACTOR` +### Hand off to Step 4 (Verify) -### Design Self-Declaration +Signal completion to the reviewer. Provide: +- Feature file path +- Self-Declaration from TODO.md +- Summary of what was implemented -After refactor is complete and `test-fast` passes, write the self-declaration **into `TODO.md`** under a `## Self-Declaration` block (replacing any prior one), then request the reviewer check. The reviewer will read `TODO.md` directly — do not paste the checklist into a separate message. +--- -If a pattern smell was detected during refactor, load `skill design-patterns` to identify and apply the correct pattern before writing this block. 
+## Test Writing Conventions -**Write this block into `TODO.md` now, filling in every item before requesting review:** +### Test File Layout -```markdown -## Self-Declaration (@id:<hex>) -As a developer I declare this code follows YAGNI-1 (no abstractions beyond current AC) — YES | `file:line` -As a developer I declare this code follows YAGNI-2 (no speculative parameters or flags) — YES | `file:line` -As a developer I declare this code follows KISS-1 (every function has one job) — YES | `file:line` -As a developer I declare this code follows KISS-2 (no unnecessary indirection) — YES | `file:line` -As a developer I declare this code follows DRY-1 (no duplicated logic) — YES | `file:line` -As a developer I declare this code follows DRY-2 (every shared concept in one place) — YES | `file:line` -As a developer I declare this code follows SOLID-S (one reason to change) — YES | `file:line` -As a developer I declare this code follows SOLID-O (extension not modification) — YES | `file:line` or N/A | reason -As a developer I declare this code follows SOLID-L (subtypes fully substitutable) — YES | `file:line` or N/A | reason -As a developer I declare this code follows SOLID-I (no forced stub methods) — YES | `file:line` or N/A | reason -As a developer I declare this code follows SOLID-D (domain depends on Protocols) — YES | `file:line` -As a developer I declare this code follows OC-1 (max one indent level per method) — YES | deepest: `file:line` -As a developer I declare this code follows OC-2 (no else after return) — YES | `file:line` or N/A | reason -As a developer I declare this code follows OC-3 (no bare primitives as domain concepts) — YES | `file:line` or N/A | reason -As a developer I declare this code follows OC-4 (no bare collections as domain values) — YES | `file:line` or N/A | reason -As a developer I declare this code follows OC-5 (no chained dot navigation) — YES | `file:line` or N/A | reason -As a developer I declare this code follows OC-6 (no abbreviations) — 
YES | `file:line` or N/A | reason -As a developer I declare this code follows OC-7 (functions ≤20 lines, classes ≤50 lines) — YES | longest: `file:line` -As a developer I declare this code follows OC-8 (≤2 instance variables per class) — YES | `file:line` -As a developer I declare this code follows OC-9 (no getters/setters) — YES | `file:line` or N/A | reason -As a developer I declare this code has no missing Creational pattern (no smell: repeated construction or scattered instantiation) — YES | `file:line` or N/A | reason -As a developer I declare this code has no missing Structural pattern (no smell: feature envy or parallel conditionals on type) — YES | `file:line` or N/A | reason -As a developer I declare this code has no missing Behavioral pattern (no smell: large state machine, scattered notification, or repeated algorithm skeleton) — YES | `file:line` or N/A | reason -As a developer I declare test abstraction matches AC level (semantic alignment) — YES | `file:line` +``` +tests/features/<feature-name>/<rule-slug>_test.py ``` -**A `NO` answer is not an automatic rejection** — it is a flag. State the reason inline and fix before requesting review. Do not submit a self-declaration with a `NO` item unresolved. +- `<feature-name>` = the `.feature` file stem +- `<rule-slug>` = the `Rule:` title slugified -*For every item: provide `file:line` evidence, or write `N/A` with a one-line reason. 
A missing answer or missing evidence is an automatic REJECTED.* +### Function Naming -Update `## Cycle State` Phase: `SELF-DECLARE` +```python +def test_<rule_slug>_<8char_hex>() -> None: +``` + +- `rule_slug` = the `Rule:` title with spaces/hyphens replaced by underscores, lowercase +- `8char_hex` = the `@id` from the `Example:` block -## REVIEWER CHECK — Code Design Only +### Docstring Format (mandatory) + +```python +@pytest.mark.unit +def test_wall_bounce_a3f2b1c4() -> None: + """ + Given: A ball moving upward reaches y=0 + When: The physics engine processes the next frame + Then: The ball velocity y-component becomes positive + """ + # Given + # When + # Then +``` -After each test goes green + refactor + self-declaration, **STOP** and request a reviewer check. The reviewer will read the `## Self-Declaration` block from `TODO.md` directly — point them to it. +**Rules**: +- Docstring contains `Given:/When:/Then:` on separate indented lines +- No extra metadata in docstring — traceability comes from function name `@id` suffix -**STOP — request a reviewer check of code design and semantic alignment.** -**WAIT for APPROVED before committing.** +### Markers -The reviewer is scoped to **code design only** (not full Step 5): +Every test gets exactly one of: +- `@pytest.mark.unit` — isolated, no external state +- `@pytest.mark.integration` — multiple components, external state -**What the reviewer receives**: The developer's completed `## Self-Declaration` block in `TODO.md`, with `file:line` evidence for each rule. +Additionally: +- `@pytest.mark.slow` — takes > 50ms (Hypothesis, DB, network, terminal I/O) -**What the reviewer does**: Independently inspects the actual code for each rule the developer claimed `YES`. The self-declaration is an audit target — the reviewer verifies YES claims, not just reads them. The reviewer does NOT re-audit items the developer already flagged as N/A with a reason. 
+```python +@pytest.mark.unit +def test_wall_bounce_a3f2b1c4() -> None: + ... + +@pytest.mark.integration +@pytest.mark.slow +def test_checkout_flow_b2c3d4e5() -> None: + ... +``` -**What the reviewer does NOT check** (deferred to Step 5): -- Lint compliance -- Pyright/type checking -- Coverage metrics -- Full test suite +### Hypothesis Tests -The reviewer responds using this template: +When using `@given` in `tests/unit/`: -```markdown -## Code-Design Check — @id:<hex> - -| Rule | Developer Claims | Reviewer Verdict | Evidence | -|------|-----------------|------------------|----------| -| YAGNI-1 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| YAGNI-2 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| KISS-1 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| KISS-2 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| DRY-1 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| DRY-2 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| SOLID-S | YES / N/A | PASS / FAIL | `file:line` or N/A | -| SOLID-O | YES / N/A | PASS / FAIL | `file:line` or N/A | -| SOLID-L | YES / N/A | PASS / FAIL | `file:line` or N/A | -| SOLID-I | YES / N/A | PASS / FAIL | `file:line` or N/A | -| SOLID-D | YES / N/A | PASS / FAIL | `file:line` or N/A | -| OC-1 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| OC-2 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| OC-3 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| OC-4 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| OC-5 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| OC-6 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| OC-7 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| OC-8 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| OC-9 | YES / N/A | PASS / FAIL | `file:line` or N/A | -| Creational pattern | YES / N/A | PASS / FAIL | `file:line` or N/A | -| Structural pattern | YES / N/A | PASS / FAIL | `file:line` or N/A | -| Behavioral pattern | YES / N/A | PASS / FAIL | `file:line` or N/A | -| Semantic alignment | 
YES / N/A | PASS / FAIL | `file:line` or N/A | - -Decision: APPROVED / REJECTED +```python +@pytest.mark.unit +@pytest.mark.slow +@given(x=st.floats(min_value=-100, max_value=100, allow_nan=False)) +@example(x=0.0) +def test_wall_bounce_c4d5e6f7(x: float) -> None: + """ + Given: Any floating point input value + When: compute_distance is called + Then: The result is >= 0 + """ + assume(x != 0.0) + result = compute_distance(x) + assert result >= 0 ``` -Any row where Reviewer Verdict = FAIL is a rejection. The reviewer must cite `file:line` evidence for every FAIL. +**Rules**: +- `@pytest.mark.slow` is mandatory on every `@given`-decorated test +- `@example(...)` is optional but encouraged +- Never use Hypothesis for: I/O, side effects, network calls, database writes -If REJECTED: -- Mark the `@id` row as `[~]` in TODO.md (do not downgrade to `[ ]`) -- Update `## Cycle State` Phase to `REVIEWER(code-design)` -- Fix the specific issues raised -- Do not commit -- Request re-review after fix +### Semantic Alignment Rule -This is a **hard gate** — do not commit until APPROVED. +The test's Given/When/Then must operate at the **same abstraction level** as the AC's Given/When/Then. -Update `## Cycle State` Phase: `REVIEWER(code-design)` +| AC says | Test must do | +|---|---| +| "When the user presses W" | Send `"W"` through the actual input mechanism | +| "When `update_player` receives 'W'" | Call `update_player("W")` directly | -## COMMIT (after reviewer approval) +If testing through the real entry point is infeasible, escalate to PO to adjust the AC boundary. 
-```bash -git add -A -git commit -m "feat(<feature-name>): implement <what this test covers>" -``` +### Quality Rules + +- Write every test as if you cannot see the production code — test what a caller observes +- No `isinstance()`, `type()`, or internal attribute (`_x`) checks in assertions +- One assertion concept per test (multiple `assert` ok if they verify the same thing) +- No `pytest.skip` or `pytest.mark.xfail` without written justification +- Test data embedded directly in the test, not loaded from external files + +### Test Tool Decision -Update TODO.md: -- Mark the `@id` row `[x]` with ` — reviewer(code-design) APPROVED` -- Update `## Cycle State` Phase to `COMMITTED` -- Update `## Next` to the next failing test +| Situation | Location | Tool | +|---|---|---| +| Deterministic scenario from a `.feature` `@id` | `tests/features/` | Plain pytest | +| Property holding across many input values | `tests/unit/` | Hypothesis `@given` | +| Specific behavior or single edge case | `tests/unit/` | Plain pytest | +| Stateful system with sequences of operations | `tests/unit/` | Hypothesis stateful testing | -Then move to the next failing test. +--- ## Handling Spec Gaps If during implementation you discover a behavior not covered by existing acceptance criteria: -- **Do not extend criteria yourself** — escalate to the PO +- **Do not extend criteria yourself** — escalate to PO - Note the gap in TODO.md under `## Next` - The PO will decide whether to add a new Example to the `.feature` file -Extra tests in `tests/unit/` are allowed freely (coverage, edge cases, etc.) — these do not need `@id` traceability. Use Hypothesis (`@given`) for properties that hold across many inputs; use plain pytest for specific behaviors or single edge cases. `@pytest.mark.slow` is mandatory on every `@given`-decorated test. +Extra tests in `tests/unit/` are allowed freely (coverage, edge cases, etc.) — these do not need `@id` traceability. 
+ +--- ## Signature Design @@ -346,7 +361,6 @@ class EmailAddress: value: str def __post_init__(self) -> None: - """Validate the email format on creation.""" if "@" not in self.value: raise ValueError(f"Invalid email: {self.value!r}") @@ -355,25 +369,4 @@ class UserRepository(Protocol): def save(self, user: "User") -> None: ... def find_by_email(self, email: EmailAddress) -> "User | None": ... -``` - -## Self-Verification Before Handoff - -After all tests are green and every per-test cycle has been committed with reviewer approval, complete these final checks before handing off to the reviewer for full Step 5 verification. - -**Manual verification**: Run the app and verify it does what the AC says, not just what the tests check. - -**Production-grade check**: If you change an input, does the output change accordingly? If any output is static regardless of input, the implementation is not complete. - -**Developer pre-mortem**: In 2-3 sentences, answer: "If this feature shipped but was broken for the user, what would be the most likely reason?" Include this in the handoff message. - -**Quality tooling** — run all four, all must pass: - -```bash -uv run task lint -uv run task static-check -uv run task test -timeout 10s uv run task run -``` - -Do not hand off broken work. These are the only commands that run at handoff — the Design Self-Declaration was already completed and verified per-test during each REFACTOR cycle. 
+``` \ No newline at end of file diff --git a/.opencode/skills/session-workflow/SKILL.md b/.opencode/skills/session-workflow/SKILL.md index 95562a4..f653458 100644 --- a/.opencode/skills/session-workflow/SKILL.md +++ b/.opencode/skills/session-workflow/SKILL.md @@ -1,8 +1,8 @@ --- name: session-workflow description: Session start and end protocol — read TODO.md, continue from checkpoint, update and commit -version: "2.1" -author: developer +version: "3.0" +author: software-engineer audience: all-agents workflow: session-management --- @@ -14,14 +14,20 @@ Every session starts by reading state. Every session ends by writing state. This ## Session Start 1. Read `TODO.md` — find current feature, current step, and the "Next" line. - - If `TODO.md` does not exist, run `uv run task gen-todo` to create it, then read the result. + - If `TODO.md` does not exist, create a basic one: + ```markdown + # Current Work + + No feature in progress. + Next: PO picks feature from docs/features/backlog/ and moves it to docs/features/in-progress/. + ``` 2. If a feature is active, read: - `docs/features/in-progress/<name>.feature` — feature file (discovery + architecture + Rules + Examples) - `docs/features/discovery.md` — project-level discovery (for context) 3. Run `git status` — understand what is committed vs. what is not 4. Confirm scope: you are working on exactly one step of one feature -If TODO.md says "No feature in progress", report to the PO that backlog features are waiting. **The developer never self-selects a feature from the backlog — only the PO picks.** +If TODO.md says "No feature in progress", report to the PO that backlog features are waiting. 
**The software-engineer never self-selects a feature from the backlog — only the PO picks.** ## Session End @@ -55,7 +61,7 @@ When a step completes within a session: # Current Work Feature: <name> -Step: <1-6> (<step name>) +Step: <1-5> (<step name>) Source: docs/features/in-progress/<name>.feature ## Progress @@ -69,8 +75,8 @@ Source: docs/features/in-progress/<name>.feature **Source path by step:** - Step 1: `Source: docs/features/backlog/<name>.feature` -- Steps 2–5: `Source: docs/features/in-progress/<name>.feature` -- Step 6: `Source: docs/features/completed/<name>.feature` +- Steps 2–4: `Source: docs/features/in-progress/<name>.feature` +- Step 5: `Source: docs/features/completed/<name>.feature` Status markers: - `[ ]` — not started @@ -86,34 +92,28 @@ No feature in progress. Next: PO picks feature from docs/features/backlog/ and moves it to docs/features/in-progress/. ``` -## Step 4 Cycle-Aware TODO Format - -During Step 4 (Implementation), TODO.md **must** include a `## Cycle State` block to track Red-Green-Refactor-Review progress. This block is **mandatory** — missing it means the cycle is unverifiable. +## Step 3 (TDD Loop) Cycle-Aware TODO Format -When `Phase: SELF-DECLARE` or later, a `## Self-Declaration` block is also **mandatory**. The reviewer reads it directly from TODO.md. A missing or incomplete self-declaration (unchecked boxes, missing `file:line`) = automatic REJECTED. +During Step 3 (TDD Loop), TODO.md **must** include a `## Cycle State` block to track Red-Green-Refactor progress. -For the full Self-Declaration checklist template (21 items), see `implementation/SKILL.md` — the "Design Self-Declaration" section under REFACTOR. +When `Phase: REFACTOR` is complete, a `## Self-Declaration` block is also **mandatory** before handing off to Step 4. 
```markdown # Current Work Feature: <name> -Step: 4 (implement) +Step: 3 (TDD Loop) Source: docs/features/in-progress/<name>.feature ## Cycle State Test: `@id:<hex>` — <description> -Phase: RED | GREEN | REFACTOR | SELF-DECLARE | REVIEWER(code-design) | COMMITTED +Phase: RED | GREEN | REFACTOR -## Self-Declaration (@id:<hex>) -- [x] YAGNI-1: … — `file:line` -- [x] YAGNI-2: … — `file:line` -- [x] KISS-1: … — `file:line` - … (full checklist from implementation/SKILL.md) -- [x] Semantic: test abstraction matches AC abstraction — `file:line` +## Self-Declaration +As a software-engineer I declare this code follows YAGNI-1 ... (full checklist in implementation/SKILL.md) ## Progress -- [x] `@id:<hex>`: <description> — reviewer(code-design) APPROVED +- [x] `@id:<hex>`: <description> - [~] `@id:<hex>`: <description> ← in progress (see Cycle State) - [ ] `@id:<hex>`: <description> ← next @@ -121,11 +121,11 @@ Phase: RED | GREEN | REFACTOR | SELF-DECLARE | REVIEWER(code-design) | COMMITTED <One actionable sentence> ``` -### Reviewer Scope Legend +### Phase Transitions -When referencing reviewer interactions in TODO.md: -- `reviewer(code-design)` — per-test design check during Step 4 (YAGNI/KISS/DRY/SOLID/ObjCal/patterns + semantic alignment only) -- `reviewer(full-verify)` — Step 5 full verification (lint, pyright, coverage, semantic review, adversarial testing) +- Move from `RED` → `GREEN` when the test fails with a real assertion +- Move from `GREEN` → `REFACTOR` when the test passes +- Move from `REFACTOR` → mark `@id` complete in `## Progress` when test-fast passes ## gen-todo Script @@ -151,6 +151,6 @@ Run `gen-todo` at session start (after reading TODO.md) and at session end (befo 3. Never leave uncommitted changes — commit as WIP if needed 4. One step per session where possible; do not start Step N+1 in the same session as Step N 5. The "Next" line must be actionable enough that a fresh AI can execute it without asking questions -6. 
During Step 4, always update `## Cycle State` when transitioning between RED/GREEN/REFACTOR/SELF-DECLARE/REVIEWER phases
+6. During Step 3, always update `## Cycle State` when transitioning between RED/GREEN/REFACTOR phases
 7. When a step completes, update TODO.md and commit **before** any further work
-8. During Step 4, write the `## Self-Declaration (@id:<hex>)` block into TODO.md at SELF-DECLARE phase — every checkbox must be checked with a `file:line` or `N/A` before requesting reviewer(code-design)
+8. During Step 3, write the `## Self-Declaration` block into TODO.md after all quality gates pass — every claim must have AGREE/DISAGREE with `file:line` evidence
diff --git a/.opencode/skills/tdd/SKILL.md b/.opencode/skills/tdd/SKILL.md
deleted file mode 100644
index fb4d5a8..0000000
--- a/.opencode/skills/tdd/SKILL.md
+++ /dev/null
@@ -1,201 +0,0 @@
----
-name: tdd
-description: Step 3 — write failing tests mapped 1:1 to @id acceptance criteria with proper markers and docstrings
-version: "2.1"
-author: developer
-audience: developer
-workflow: feature-lifecycle
----
-
-# TDD — Test First
-
-Write tests before writing any production code. Every test must fail when first run. Every test maps to exactly one `@id` acceptance criterion from a `.feature` file.
-
-## Step 3 Workflow
-
-1. Run `uv run task gen-tests -- --check` to preview what will be created/updated.
-2. Run `uv run task gen-tests` to generate/sync test stubs from `.feature` files.
-3. Run a silent pre-mortem: does the architecture fit? Is this the minimal solution?
-4. Write failing test bodies (real assertions, not `raise NotImplementedError`)
-5. Run `pytest` — confirm every new test fails with `ImportError` or `AttributeError`
-6. **STOP — request a reviewer check of test design and semantic alignment. WAIT for APPROVED before committing or implementing.**
-7. 
Commit: `test(<feature-name>): write failing tests` - -## Test Stub Generation - -```bash -uv run task gen-tests -- --check # dry run — review what would change -uv run task gen-tests # apply: sync all features -uv run task gen-tests -- --orphans # list orphaned tests -``` - -Always run `--check` first to review planned changes before applying them. - -The script reads `.feature` files from `docs/features/{backlog,in-progress,completed}/` and creates/updates test files in `tests/features/<feature-name>/`. - -For each feature file, the script iterates over `Rule:` blocks. Each Rule maps to one test file named `<rule-slug>_test.py`. Examples within a Rule map to test functions in that file. - -| `.feature` state | Script action | -|---|---| -| New `@id` Example | Create stub with `raise NotImplementedError` | -| Example title/Given/When/Then changed | Update docstring + rename function | -| `@deprecated` tag added | Add `@pytest.mark.deprecated` decorator | -| `@deprecated` tag removed | Remove `@pytest.mark.deprecated` decorator | -| Test `@id` matches no Example | Mark orphan: `@pytest.mark.skip(reason="orphan: ...")` | -| completed features | Only toggle `@deprecated` (no docstring changes) | -| Never | Touch function body | - -## Test File Structure - -``` -tests/features/<feature-name>/<rule-slug>_test.py ← one per Rule: block -tests/unit/<anything>_test.py ← developer-authored extras -``` - -- `<feature-name>` = the `.feature` file stem (kebab-case folder name) -- `<rule-slug>` = the `Rule:` title slugified (hyphens replaced by underscores, lowercase) - -## Test Function Naming - -Generated by `gen-tests`: - -```python -def test_<rule_slug>_<8char_hex>() -> None: -``` - -- `rule_slug` = the `Rule:` title with spaces and hyphens replaced by underscores, lowercased -- `8char_hex` = the `@id` from the `Example:` block - -## Docstring Format (mandatory) - -```python -@pytest.mark.unit -def test_wall_bounce_a3f2b1c4() -> None: - """ - Given: A ball moving upward 
reaches y=0 - When: The physics engine processes the next frame - Then: The ball velocity y-component becomes positive - """ - # Given - ball = Ball(x=5, y=0, vy=-1) - # When - result = physics.update(ball) - # Then - assert result.vy > 0 -``` - -**Rules**: -- Docstring contains `Given:/When:/Then:` on separate indented lines -- `# Given`, `# When`, `# Then` comments in the test body mirror the docstring -- No extra metadata in docstring — traceability comes from the function name `@id` suffix - -**A test that looks correct but is wrong:** - -```python -def test_bounce_physics_a3f2b1c4() -> None: - """...""" - ball = Ball(x=5, y=0, vy=-1) - physics.update(ball) - assert ball._velocity_y > 0 # WRONG: tests internal attribute, not observable behavior -``` - -The correct test asserts on the return value. The wrong test breaks if you rename an internal field. - -## Test Tool Decision - -Tests in `tests/features/` are generated from `@id` criteria — use plain pytest there. - -Tests in `tests/unit/` cover gaps not represented by any acceptance criterion. Any test style is valid — plain `assert` or Hypothesis `@given`. Use Hypothesis when the test covers a **property** that holds across many inputs (mathematical invariants, parsing contracts, value object constraints). Use plain pytest for specific behaviors or single edge cases discovered during refactoring. - -| Situation | Location | Tool | -|---|---|---| -| Deterministic scenario from a `.feature` `@id` | `tests/features/` | Plain pytest (generated) | -| Property holding across many input values | `tests/unit/` | Hypothesis `@given` | -| Specific behavior or single edge case | `tests/unit/` | Plain pytest | -| Stateful system with sequences of operations | `tests/unit/` | Hypothesis stateful testing | - -**Never use Hypothesis for**: I/O, side effects, network calls, database writes. 
- -### `tests/unit/` Rules - -- `@pytest.mark.slow` is **mandatory** on every `@given`-decorated test (Hypothesis is genuinely slow) -- `@example(...)` is optional but encouraged when using `@given` to document known corner cases -- `@pytest.mark.unit` or `@pytest.mark.integration` still required (one each) - -## Markers (4 total) - -Every test gets exactly one of: -- `@pytest.mark.unit` — isolated, no external state -- `@pytest.mark.integration` — multiple components, external state - -Additionally: -- `@pytest.mark.slow` — takes > 50ms (DB, network, Hypothesis, terminal I/O) -- `@pytest.mark.deprecated` — auto-skipped by conftest hook; added by `gen-tests` - -```python -@pytest.mark.unit -def test_wall_bounce_a3f2b1c4() -> None: - ... - -@pytest.mark.integration -@pytest.mark.slow -def test_checkout_flow_b2c3d4e5() -> None: - ... -``` - -When in doubt, start with `unit`. Upgrade to `integration` if the implementation requires external state. - -## Hypothesis Tests - -When using `@given` in `tests/unit/`, the required decorator order is: - -```python -@pytest.mark.unit # required: exactly one of unit or integration -@pytest.mark.slow # required: mandatory on all @given tests -@given(x=st.floats(min_value=-100, max_value=100, allow_nan=False)) -@example(x=0.0) # optional: document known corner cases -@settings(max_examples=200) -def test_wall_bounce_c4d5e6f7(x: float) -> None: - """ - Given: Any floating point input value - When: compute_distance is called - Then: The result is >= 0 - """ - assume(x != 0.0) - result = compute_distance(x) - assert result >= 0 -``` - -A `@given`-decorated test missing `@pytest.mark.slow` is a FAIL at Step 5 review. - -### Meaningful vs. 
Tautological Property Tests - -| Tautological (useless) | Meaningful (tests the contract) | -|---|---| -| `assert Score(x).value == x` | `assert Score(x).value >= 0` | -| `assert sorted(list) == sorted(list)` | `assert sorted(list) == sorted(list, key=...)` | -| `assert EmailAddress(valid).value == valid` | `assert "@" in EmailAddress(valid).value` | - -## Integration Test Requirement - -For any feature with multiple components or user interaction, at least one `@pytest.mark.integration` test must exercise the public entry point with realistic input. This test must NOT call internal helpers directly — it must go through the same path a real user would. - -## Semantic Alignment Rule - -The test's Given/When/Then must operate at the **same abstraction level** as the AC's Given/When/Then. - -| AC says | Test must do | -|---|---| -| "When the user presses W" | Send `"W"` through the actual input mechanism | -| "When `update_player` receives 'W'" | Call `update_player("W")` directly — the boundary is explicit | - -If testing through the real entry point is infeasible, escalate to the PO to adjust the AC boundary. 
**Never silently shift abstraction levels.** - -## Quality Rules - -- Write every test as if you cannot see the production code — test what a caller observes -- No `isinstance()`, `type()`, or internal attribute (`_x`) checks in assertions -- One assertion concept per test (multiple `assert` ok if they verify the same thing) -- No `pytest.skip` or `pytest.mark.xfail` without written justification in the docstring -- Never use `noqa` — fix the underlying issue instead -- Test data embedded directly in the test, not loaded from external files diff --git a/.opencode/skills/verify/SKILL.md b/.opencode/skills/verify/SKILL.md index a78ec81..a508da5 100644 --- a/.opencode/skills/verify/SKILL.md +++ b/.opencode/skills/verify/SKILL.md @@ -1,7 +1,7 @@ --- name: verify -description: Step 5 — run all verification commands, review code quality, and produce a written report -version: "2.2" +description: Step 4 — run all verification commands, review code quality, and produce a written report +version: "3.0" author: reviewer audience: reviewer workflow: feature-lifecycle @@ -9,26 +9,15 @@ workflow: feature-lifecycle # Verify -This skill guides the reviewer through Step 5: independent verification that the feature works correctly and meets quality standards. The output is a written report with a clear APPROVED or REJECTED decision. +This skill guides the reviewer through Step 4: independent verification that the feature works correctly and meets quality standards. The output is a written report with a clear APPROVED or REJECTED decision. **Your default hypothesis is that the code is broken despite passing automated checks. Your job is to find the failure mode. If you cannot find one after thorough investigation, APPROVE. If you find one, REJECTED.** **Every PASS/FAIL cell must have evidence.** Empty evidence = UNCHECKED = REJECTED. -## Scope Guard — Step 4 vs. 
Step 5 +## When to Use (Step 4) -If you are invoked for a **per-test code-design check during Step 4** (not a full Step 5 review): -- The developer's completed **Design Self-Declaration** is in the `## Self-Declaration` block of `TODO.md`. Read it first. -- **Independently verify each claim** against the actual code using sections 4a–4e (YAGNI, KISS, DRY, SOLID, Object Calisthenics, Design Patterns) and the semantic alignment check. -- If any item in the `## Self-Declaration` block is unchecked or has no `file:line` evidence, reject immediately — the developer has not completed the self-declaration. -- Do **NOT** run any commands (no lint, no static-check, no test suite). -- Respond using the verification table template in `implementation/SKILL.md` — compare developer claims vs. your independent findings for each rule. - -This full skill applies only when the developer signals Step 4 is complete and hands off for Step 5. - -## When to Use (Step 5) - -After the developer signals Step 4 is complete and all self-verification checks pass. Do not start verification until the developer has committed all work. +After the software-engineer signals Step 3 is complete and all self-verification checks pass. Do not start verification until the software-engineer has committed all work and written the Self-Declaration. ## Step-by-Step @@ -37,10 +26,18 @@ After the developer signals Step 4 is complete and all self-verification checks Read `docs/features/in-progress/<name>.feature`. Extract: - All `@id` tags and their Example titles from `Rule:` blocks - The interaction model (if the feature involves user interaction) -- The developer's pre-mortem (if present in the Architecture section of the feature description) -- The Rules (Business) and Constraints sections from the feature description +- The Architecture section (module structure, ADRs) +- The software-engineer's Self-Declaration from `TODO.md` -### 2. Check Commit History +### 2. 
pyproject.toml Gate + +```bash +git diff main -- pyproject.toml +``` + +Any change → REJECT immediately. The software-engineer must revert and get stakeholder approval. + +### 3. Check Commit History ```bash git log --oneline -20 @@ -48,159 +45,179 @@ git status ``` Verify: -- There is a commit per green test (not one giant commit at the end) -- Every step has a commit (architecture, failing tests, per-feature-name commits) +- Commits follow conventional commit format +- No "fix tests", "wip", "temp" commits - No uncommitted changes: `git status` should be clean -### 3. Production-Grade Gate +### 4. Production-Grade Gate Run before code review. If any row is FAIL, stop immediately with REJECTED. | Check | How to check | PASS | FAIL | Fix | |---|---|---|---|---| -| Developer declared production-grade | Read feature doc pre-mortem or handoff message | Explicit statement present | Absent or says "demo" or "incomplete" | Developer must complete the implementation | -| App exits cleanly | `timeout 10s uv run task run` | Exit 0 or non-124 | Exit 124 (timeout/hang) | Developer must fix the hang | -| Output changes when input changes | Run app, change an input or condition, observe output | Output changes accordingly | Output is static regardless of input | Developer must implement real logic | +| App exits cleanly | `timeout 10s uv run task run` | Exit 0 or non-124 | Exit 124 (timeout/hang) | Fix the hang | +| Output changes when input changes | Run app, change an input or condition, observe output | Output changes accordingly | Output is static | Implement real logic | -### 4. Code Review +### 5. Code Review Read the source files changed in this feature. **Do this before running lint/static-check/test** — if code review finds a design problem, commands will need to re-run after the fix anyway. -**Stop on first failure category — do not accumulate issues.** When a category FAILs, stop code review, write the report, and send REJECTED. 
In the report, mark all skipped sections as `NOT CHECKED (stopped at <category>)` — this is valid evidence of a deliberate stop, not an unchecked cell. +**Stop on first failure category — do not accumulate issues.** -#### 4a. Correctness — any FAIL → REJECTED +#### 5a. Correctness — any FAIL → REJECTED | Check | How to check | PASS | FAIL | Fix | |---|---|---|---|---| -| No dead code | Read for unreachable statements, unused variables, impossible branches | None found | Any found | Remove or fix the unreachable path | +| No dead code | Read for unreachable statements, unused variables, impossible branches | None found | Any found | Remove or fix | | No duplicate logic (DRY) | Search for repeated blocks doing the same thing | None found | Duplication found | Extract to shared function | -| No over-engineering (YAGNI) | Check for abstractions with no current use | None found | Unused abstraction or premature generalization | Remove unused code | +| No over-engineering (YAGNI) | Check for abstractions with no current use | None found | Unused abstraction | Remove unused code | -#### 4b. Simplicity (KISS) — any FAIL → REJECTED +#### 5b. Simplicity (KISS) — any FAIL → REJECTED | Check | How to check | PASS | FAIL | Fix | |---|---|---|---|---| | Functions do one thing | Read each function; can you describe it without `and`? | Yes | No | Split into focused functions | -| Nesting ≤ 2 levels | Count indent levels in each function | ≤ 2 | > 2 | Extract inner block to helper | +| Nesting ≤ 2 levels | Count indent levels in each function | ≤ 2 | > 2 | Extract inner block | | Functions ≤ 20 lines | Count lines | ≤ 20 | > 20 | Extract helper | | Classes ≤ 50 lines | Count lines | ≤ 50 | > 50 | Split class | -#### 4c. 
SOLID — any FAIL → REJECTED - -| Principle | Why it matters | What to check | How to check | PASS/FAIL | Evidence (`file:line`) | -|---|---|---|---|---|---| -| SRP | Multiple change-reasons accumulate bugs at every change site | Each class/function has one reason to change | Count distinct concerns; each `and` in its description = warning sign | | | -| OCP | Modifying existing code for new behavior invalidates existing tests | New behavior via extension, not modification | Check if adding the new case required editing existing class bodies | | | -| LSP | Substitution failures cause silent runtime errors tests miss | Subtypes behave identically to base type at all call sites | Check if any subtype narrows a contract or raises where base does not | | | -| ISP | Fat interfaces force implementors to have methods they cannot meaningfully implement | No Protocol/ABC forces unused method implementations | Check if any implementor raises `NotImplementedError` or passes on inherited methods | | | -| DIP | Depending on concrete I/O makes unit testing impossible | High-level modules depend on abstractions (Protocols) | Check if any domain class imports from I/O, DB, or framework layers directly | | | - -#### 4d. 
Object Calisthenics — any FAIL → REJECTED - -| # | Rule | Why it matters | How to check | PASS/FAIL | Evidence (`file:line`) | -|---|---|---|---|---|---| -| 1 | One indent level per method | Reduces cognitive load per function | Count max nesting in source | | | -| 2 | No `else` after `return` | Eliminates hidden control flow paths | Search for `else` inside functions with early returns | | | -| 3 | Primitives wrapped | Prevents primitive obsession; enables validation at construction | Bare `int`/`str` in domain signatures = FAIL | | | -| 4 | Collections wrapped in classes | Encapsulates iteration and filtering logic | `list[X]` as domain value = FAIL | | | -| 5 | One dot per line | Reduces coupling to transitive dependencies | `a.b.c()` chains = FAIL | | | -| 6 | No abbreviations | Names are documentation; abbreviations lose meaning | `mgr`, `tmp`, `calc` = FAIL | | | -| 7 | Small entities | Smaller units are easier to test, read, and replace | Functions > 20 lines or classes > 50 lines = FAIL | | | -| 8 | ≤ 2 instance variables | Forces responsibility splitting by making it structurally impossible to hold too much state in one class | For EVERY class: count `self.x` in `__init__`. If > 2: FAIL immediately. The only valid fix is a new named value object (OC-3) or collection class (OC-4). Invalid workarounds = FAIL: hardcoded constants, inlined literals, class-level variables, moving fields to a parent class, or merging into a dict/tuple. | | | -| 9 | No getters/setters | Enforces tell-don't-ask; behavior lives with data | `get_x()`/`set_x()` pairs = FAIL | | | - -#### 4e. 
Design Patterns — any FAIL → REJECTED - -| Code smell | Pattern missed | Why it matters | How to check | PASS/FAIL | Evidence (`file:line`) | -|---|---|---|---|---|---| -| Multiple if/elif on type/state | State or Strategy | Eliminates conditional complexity | Search for chains of `isinstance` or string-based dispatch | | | -| Complex `__init__` with side effects | Factory or Builder | Separates construction from use | Check `__init__` line count and side effects | | | -| Callers must know multiple internal components | Facade | Single entry point reduces coupling | Check how callers interact with the subsystem | | | -| External dep without Protocol | Repository/Adapter | Enables testing without real I/O; enforces DIP | Check if the dep is injected via abstraction | | | -| 0 domain classes, many functions | Missing domain model | Procedural code has no encapsulation boundary | Count classes vs functions in domain code | | | - -#### 4f. Tests — any FAIL → REJECTED - -| Check | How to check | PASS | FAIL | Fix | -|---|---|---|---|---| -| Docstring format | Read each test docstring | Given/When/Then lines only (no UUID) | Extra metadata or missing G/W/T | Fix docstring to match canonical format | -| Contract test | Would this test survive a full internal rewrite? 
| Yes | No | Rewrite assertion to test observable output, not internals | -| No internal attribute access | Search for `_x` in assertions | None found | `_x`, `isinstance`, `type()` found | Replace with public API assertion | -| Every `@id` has a mapped test | Match `@id` tags in `.feature` files to test functions | All mapped | Missing test | Write the missing test | -| No `@id` used by two functions | Check for duplicate `@id` hex in test function names | None | Duplicate found | Consolidate into Hypothesis `@given` + `@example` or escalate to PO | -| Function naming | Test names match `test_<rule_slug>_<8char_hex>` | All match | Mismatch | Rename function | -| All Hypothesis tests have `@pytest.mark.slow` | Read every `@given`-decorated test for the `@slow` marker | All present | Any missing | Add `@pytest.mark.slow` | +#### 5c. SOLID — any FAIL → REJECTED -#### 4g. Code Quality — any FAIL → REJECTED +| Principle | Why it matters | What to check | How to check | +|---|---|---|---| +| SRP | Multiple change-reasons accumulate bugs | Each class/function has one reason to change | Count distinct concerns | +| OCP | Modifying existing code invalidates tests | New behavior via extension, not modification | Check if adding new case required editing existing class | +| LSP | Substitution failures cause silent errors | Subtypes behave identically to base | Check for narrowed contracts | +| ISP | Fat interfaces force unused methods | No Protocol forces stub implementations | Check for NotImplementedError | +| DIP | Concrete I/O makes unit testing impossible | High-level depends on abstractions | Check domain imports no I/O/DB | -| Check | How to check | PASS | FAIL | Fix | -|---|---|---|---|---| -| No `noqa` comments | `grep -r "noqa" <package>/` | None found | Any found | Fix the underlying issue | -| No `type: ignore` comments | `grep -r "type: ignore" <package>/` | None found | Any found | Fix the underlying type error | -| All public functions have type hints | Read 
signatures | All annotated | Missing hints | Add type annotations | -| All public functions have docstrings | Read source | Google-style present | Missing docstring | Add docstring | -| Coverage target matches package | Check `--cov=<package>` in test config matches `[tool.setuptools] packages` in `pyproject.toml` | Matches | Wrong package name | Fix the `--cov` argument | -| All declared packages exist on disk | Check `[tool.setuptools] packages` in `pyproject.toml` against filesystem | All directories present | Missing directory | Add directory or remove declaration | -| Imports use correct package name | Search production code and tests for import statements; confirm they match `[tool.setuptools] packages`, not a template placeholder | All match | Any import from wrong package | Fix imports and move misplaced source files | +#### 5d. Object Calisthenics — any FAIL → REJECTED -### 5. Run Verification Commands (in order, stop on first failure) +| # | Rule | How to check | +|---|---|---| +| 1 | One indent level per method | Count max nesting | +| 2 | No `else` after `return` | Search for `else` after early returns | +| 3 | Primitives wrapped | Bare `int`/`str` in domain signatures = FAIL | +| 4 | Collections wrapped | `list[X]` as domain value = FAIL | +| 5 | One dot per line | `a.b.c()` chains = FAIL | +| 6 | No abbreviations | `mgr`, `tmp`, `calc` = FAIL | +| 7 | Small entities | Functions > 20 lines or classes > 50 lines = FAIL | +| 8 | ≤ 2 instance variables | Count `self.x` in `__init__` — >2 = FAIL | +| 9 | No getters/setters | `get_x()`/`set_x()` = FAIL | + +#### 5e. 
Design Patterns — any FAIL → REJECTED + +| Code smell | Pattern missed | How to check | +|---|---|---| +| Multiple if/elif on type/state | State or Strategy | Search for `isinstance` chains | +| Complex `__init__` | Factory or Builder | Check line count and side effects | +| Callers know multiple components | Facade | Check caller coupling | +| External dep without Protocol | Repository/Adapter | Check dep injection | +| 0 domain classes, many functions | Missing domain model | Count classes vs functions | + +#### 5f. Tests — any FAIL → REJECTED + +| Check | How to check | PASS | FAIL | +|---|---|---|---| +| Docstring format | Read each test docstring | Given/When/Then only | Extra metadata | +| Contract test | Would test survive internal rewrite? | Yes | No | +| No internal attribute access | Search for `_x` in assertions | None found | `_x`, `isinstance`, `type()` | +| Every `@id` has a mapped test | Match `@id` to test functions | All mapped | Missing test | +| Function naming | Matches `test_<rule_slug>_<8char_hex>` | All match | Mismatch | +| Hypothesis tests have `@slow` | Read every `@given` for `@slow` marker | All present | Any missing | + +#### 5g. Code Quality — any FAIL → REJECTED + +| Check | How to check | PASS | FAIL | +|---|---|---|---| +| No `noqa` comments | `grep -r "noqa" <package>/` | None found | Any found | +| No `type: ignore` | `grep -r "type: ignore" <package>/` | None found | Any found | +| Public functions have type hints | Read signatures | All annotated | Missing | +| Public functions have docstrings | Read source | Google-style | Missing | + +### 6. Run Verification Commands ```bash -uv run task gen-tests -- --orphans # any output = FAIL uv run task lint uv run task static-check uv run task test ``` -Expected for each: exit 0, no output/errors. Record exact output on failure. +Expected for each: exit 0, no errors. Record exact output on failure. If a command fails, stop and REJECT immediately. Do not run subsequent commands. -### 6. 
Interactive Verification +### 7. Interactive Verification -If the feature involves user interaction: run the app, provide real input, verify the output changes in response. An app that produces the same output regardless of input is NOT verified. +If the feature involves user interaction: run the app, provide real input, verify output changes. Record what input was given and what output was observed. -### 7. Write the Report +### 8. Self-Declaration Audit + +Read the software-engineer's Self-Declaration from `TODO.md`. + +For every **YES** claim: +- Find the `file:line` — does it hold? + +For every **NO** claim: +- Is the deviation justified? + +Undeclared violations → REJECT. + +### 9. Write the Report ```markdown -## Step 5 Verification Report — <feature-name> +## Step 4 Verification Report — <feature-name> + +### pyproject.toml Gate +| Check | Result | Notes | +|---|---|---| +| No changes from main | PASS / FAIL | | ### Production-Grade Gate | Check | Result | Notes | |---|---|---| -| Developer declared production-grade | PASS / FAIL | | | App exits cleanly | PASS / FAIL / TIMEOUT | | -| Output driven by real logic | PASS / FAIL | | +| Output driven by input | PASS / FAIL | | ### Commands | Command | Result | Notes | |---------|--------|-------| -| uv run task gen-tests -- --orphans | PASS / FAIL | <orphans listed if fail> | -| uv run task lint | PASS / FAIL | <details if fail> | -| uv run task static-check | PASS / FAIL | <errors if fail> | -| uv run task test | PASS / FAIL | <failures or coverage% if fail> | -| Interactive run (if user interaction involved) | PASS / SKIP (no UI) / FAIL | <what was tested> | - -### @id Traceability -| @id | Example Title | Test | Status | -|-----|---------------|------|--------| -| `@id:a3f2b1c4` | <title> | `tests/features/<name>/<rule>_test.py::test_<rule_slug>_a3f2b1c4` | COVERED / NOT COVERED | - -### Code Review Findings -- PASS: <aspect> -- FAIL: `<file>:<line>` — <specific issue> -- NOT CHECKED (stopped at <category>): 
<sections skipped> - -### Gap Report (if any) -- `<suggested Example text>` — reported to PO for decision +| uv run task lint | PASS / FAIL | | +| uv run task static-check | PASS / FAIL | | +| uv run task test | PASS / FAIL | | + +### Self-Declaration Audit +| Claim | Software-Engineer Claims | Reviewer Verdict | Evidence | +|------|-------------------------|------------------|----------| +| YAGNI | AGREE/DISAGREE | PASS/FAIL | | +| KISS | AGREE/DISAGREE | PASS/FAIL | | +| DRY | AGREE/DISAGREE | PASS/FAIL | | +| SOLID-S | AGREE/DISAGREE | PASS/FAIL | | +| SOLID-O | AGREE/DISAGREE | PASS/FAIL | | +| SOLID-L | AGREE/DISAGREE | PASS/FAIL | | +| SOLID-I | AGREE/DISAGREE | PASS/FAIL | | +| SOLID-D | AGREE/DISAGREE | PASS/FAIL | | +| OC-1 | AGREE/DISAGREE | PASS/FAIL | | +| OC-2 | AGREE/DISAGREE | PASS/FAIL | | +| OC-3 | AGREE/DISAGREE | PASS/FAIL | | +| OC-4 | AGREE/DISAGREE | PASS/FAIL | | +| OC-5 | AGREE/DISAGREE | PASS/FAIL | | +| OC-6 | AGREE/DISAGREE | PASS/FAIL | | +| OC-7 | AGREE/DISAGREE | PASS/FAIL | | +| OC-8 | AGREE/DISAGREE | PASS/FAIL | | +| OC-9 | AGREE/DISAGREE | PASS/FAIL | | +| Patterns Creational | AGREE/DISAGREE | PASS/FAIL | | +| Patterns Structural | AGREE/DISAGREE | PASS/FAIL | | +| Patterns Behavioral | AGREE/DISAGREE | PASS/FAIL | | +| Semantic | AGREE/DISAGREE | PASS/FAIL | | ### Decision -**APPROVED** — work meets all standards. Developer may proceed to Step 6. +**APPROVED** — all gates passed, no undeclared violations OR -**REJECTED** — fix the following before resubmitting: -1. `<file>:<line>` — <specific, actionable fix required> +**REJECTED** — fix the following: +1. `<file>:<line>` — <specific, actionable fix> ``` ## Standards Summary diff --git a/AGENTS.md b/AGENTS.md index e4f1399..e7a4158 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -11,11 +11,10 @@ Features flow through 6 steps with a WIP limit of 1 feature at a time. 
The files ``` STEP 1: SCOPE (product-owner) → discovery + Gherkin stories + criteria -STEP 2: ARCH (developer) → read all backlog features, design module structure -STEP 3: TEST FIRST (developer) → sync stubs, write failing tests -STEP 4: IMPLEMENT (developer) → Red-Green-Refactor, commit per green test -STEP 5: VERIFY (reviewer) → run all commands, review code -STEP 6: ACCEPT (product-owner) → demo, validate, move folder to completed/ +STEP 2: ARCH (software-engineer) → read all backlog features, design module structure +STEP 3: TDD LOOP (software-engineer) → RED → GREEN → REFACTOR, one @id at a time +STEP 4: VERIFY (reviewer) → run all commands, review code +STEP 5: ACCEPT (product-owner) → demo, validate, move folder to completed/ ``` **PO picks the next feature from backlog. Developer never self-selects.** @@ -26,14 +25,14 @@ STEP 6: ACCEPT (product-owner) → demo, validate, move folder to compl - **Product Owner (PO)** — AI agent. Interviews the stakeholder, writes discovery docs, Gherkin features, and acceptance criteria. Accepts or rejects deliveries. - **Stakeholder** — Human. Answers PO's questions, provides domain knowledge, approves PO syntheses to confirm discovery is complete. -- **Developer** — AI agent. Architecture, test bodies, implementation, git. Never edits `.feature` files. Escalates spec gaps to PO. +- **Software Engineer** — AI agent. Architecture, test bodies, implementation, git. Never edits `.feature` files. Escalates spec gaps to PO. - **Reviewer** — AI agent. Adversarial verification. Reports spec gaps to PO. 
## Agents - **product-owner** — defines scope (4 phases), picks features, accepts deliveries -- **developer** — architecture, tests, code, git, releases (Steps 2-4 + release) -- **reviewer** — runs commands and reviews code at Step 5, produces APPROVED/REJECTED report +- **software-engineer** — architecture, tests, code, git, releases (Steps 2-3 + release) +- **reviewer** — runs commands and reviews code at Step 4, produces APPROVED/REJECTED report - **setup-project** — one-time setup to initialize a new project from this template ## Skills @@ -42,14 +41,13 @@ STEP 6: ACCEPT (product-owner) → demo, validate, move folder to compl |---|---|---| | `session-workflow` | all agents | every session | | `scope` | product-owner | 1 | -| `tdd` | developer | 3 | -| `implementation` | developer | 4 | -| `design-patterns` | developer | 2 (on-demand, if smell detected), 4 (refactor) | -| `verify` | reviewer | 5 | -| `code-quality` | developer | pre-handoff (redirects to `verify`) | -| `pr-management` | developer | 6 | -| `git-release` | developer | 6 | -| `create-skill` | developer | meta | +| `implementation` | software-engineer | 2, 3 | +| `design-patterns` | software-engineer | 2 (on-demand, if smell detected), 3 (refactor) | +| `verify` | reviewer | 4 | +| `code-quality` | software-engineer | pre-handoff (redirects to `verify`) | +| `pr-management` | software-engineer | 5 | +| `git-release` | software-engineer | 5 | +| `create-skill` | software-engineer | meta | **Session protocol**: Every agent loads `skill session-workflow` at session start. Load additional skills as needed for the current step. @@ -77,7 +75,7 @@ Clusters from Phase 2 Session 2 → one `Rule:` block per user story. Each `Rule ### Phase 4 — Criteria (PO alone) Pre-mortem per Rule (all Rules must be checked before writing Examples). Write `Example:` blocks — declarative Given/When/Then, MoSCoW triage (Must/Should/Could) per Example. Review checklist (4.3). 
Commit: `feat(criteria): write acceptance criteria for <name>` -**Criteria are frozen**: no `Example:` changes after commit. Change = `@deprecated` tag + new Example with new `@id`. +**Criteria are frozen**: no `Example:` changes after commit. Adding new Example with new `@id` replaces old. ## Filesystem Structure @@ -86,89 +84,22 @@ docs/features/ discovery.md ← project-level (Status + Questions only) backlog/<feature-name>.feature ← one per feature; discovery + Rules + Examples in-progress/<feature-name>.feature ← file moves here at Step 2 - completed/<feature-name>.feature ← file moves here at Step 6 + completed/<feature-name>.feature ← file moves here at Step 5 tests/ features/<feature-name>/ - <rule-slug>_test.py ← one per Rule: block, stubs from gen-tests + <rule-slug>_test.py ← one per Rule: block, software-engineer-written unit/ - <anything>_test.py ← developer-authored extras (no @id traceability) + <anything>_test.py ← software-engineer-authored extras (no @id traceability) ``` -Tests in `tests/unit/` are developer-authored extras not covered by any `@id` criterion. Any test style is valid — plain `assert` or Hypothesis `@given`. Use Hypothesis when the test covers a **property** that holds across many inputs (mathematical invariants, parsing contracts, value object constraints). Use plain pytest for specific behaviors or single edge cases discovered during refactoring. +Tests in `tests/unit/` are software-engineer-authored extras not covered by any `@id` criterion. Any test style is valid — plain `assert` or Hypothesis `@given`. Use Hypothesis when the test covers a **property** that holds across many inputs (mathematical invariants, parsing contracts, value object constraints). Use plain pytest for specific behaviors or single edge cases discovered during refactoring. 
- `@pytest.mark.slow` is mandatory on every `@given`-decorated test (Hypothesis is genuinely slow) - `@example(...)` is optional but encouraged when using `@given` to document known corner cases -- No `@id` tags — tests with `@id` belong in `tests/features/`, generated by `gen-tests` +- No `@id` tags — tests with `@id` belong in `tests/features/`, written by software-engineer -## Gherkin Format - -```gherkin -Feature: Bounce physics - - Discovery: - - Status: BASELINED (2026-01-10) - - Entities: - | Type | Name | Candidate Class/Method | In Scope | - |------|------|----------------------|----------| - | Noun | Ball | Ball | Yes | - | Verb | Bounce | Ball.bounce() | Yes | - - Rules (Business): - - Ball velocity reverses on wall contact - - Constraints: - - Physics runs at 60fps - - Questions: - | ID | Question | Answer | Status | - |----|----------|--------|--------| - | Q1 | Does gravity apply? | No, constant velocity | ANSWERED | - - All questions answered. Discovery frozen. - - Rule: Wall bounce - As a game engine - I want balls to bounce off walls - So that gameplay feels physical - - @id:a3f2b1c4 - Example: Ball bounces off top wall - Given a ball moving upward reaches y=0 - When the physics engine processes the next frame - Then the ball velocity y-component becomes positive - - @deprecated @id:b5c6d7e8 - Example: Old behavior no longer needed - Given ... - When ... - Then ... 
-``` - -- Each feature is a **single `.feature` file**; user stories are `Rule:` blocks within it -- The feature description (free text before the first `Rule:`) contains all discovery content: Status, Entities, Rules (business), Constraints, Questions, and later Architecture -- `@id:<8-char-hex>` — generated with `uv run task gen-id` -- `@deprecated` — marks superseded criteria; `gen-tests` adds `@pytest.mark.deprecated` to the mapped test -- `Example:` keyword (not `Scenario:`) -- Each Example must be observably distinct from every other - -## Test Conventions - -### Test Stub Generation - -```bash -uv run task gen-tests # sync all features -uv run task gen-tests -- --check # dry run -uv run task gen-tests -- --orphans # list orphaned tests -``` - -- backlog / in-progress: full write (create stubs, update docstrings, rename functions) -- completed: only toggle `@pytest.mark.deprecated` (no docstring changes) -- Orphaned tests (no matching `@id`) get `@pytest.mark.skip(reason="orphan: ...")` - -### Test File Layout +## Test File Layout ``` tests/features/<feature-name>/<rule-slug>_test.py @@ -200,9 +131,12 @@ def test_wall_bounce_a3f2b1c4() -> None: - `@pytest.mark.unit` — isolated, one function/class, no external state - `@pytest.mark.integration` — multiple components, external state - `@pytest.mark.slow` — takes > 50ms; additionally applied alongside `unit` or `integration` -- `@pytest.mark.deprecated` — auto-skipped by conftest hook; added by `gen-tests` +- No `@pytest.mark.deprecated` — deprecation is not supported; the software-engineer writes test bodies directly -Every test gets exactly one of `unit` or `integration`. Slow tests additionally get `slow`. 
+### Markers (available if needed) +- `@pytest.mark.unit` — isolated, one function/class, no external state +- `@pytest.mark.integration` — multiple components, external state +- `@pytest.mark.slow` — takes > 50ms; additionally applied alongside `unit` or `integration` ## Development Commands @@ -231,12 +165,6 @@ uv run task lint # Type checking uv run task static-check -# Generate an 8-char hex ID -uv run task gen-id - -# Sync test stubs from .feature files -uv run task gen-tests - # Serve documentation uv run task doc-serve ``` @@ -262,7 +190,7 @@ During Step 4 (Implementation), correctness priorities are: 2. **One test green** — the specific test under work passes, plus `test-fast` still passes 3. **Reviewer code-design check** — reviewer verifies design + semantic alignment (no lint/pyright/coverage) 4. **Commit** — only after reviewer APPROVED -5. **Quality tooling** — `lint`, `static-check`, full `test` with coverage run only at developer handoff (before Step 5) +5. **Quality tooling** — `lint`, `static-check`, full `test` with coverage run only at software-engineer handoff (before Step 5) Design correctness is far more important than lint/pyright/coverage compliance. A well-designed codebase with minor lint issues is better than a lint-clean codebase with poor design. @@ -275,11 +203,7 @@ Design correctness is far more important than lint/pyright/coverage compliance. ## Deprecation Process -1. PO adds `@deprecated` tag to Example in `.feature` file -2. Run `uv run task gen-tests` — script adds `@pytest.mark.deprecated` to mapped test -3. Deprecated tests auto-skip via conftest hook -4. Feature is done when all non-deprecated tests pass -5. No special folder — features move to `completed/` normally +This template does not support deprecation. Criteria changes are handled by adding new Examples with new `@id` tags. 
## Release Management @@ -289,7 +213,7 @@ Version format: `v{major}.{minor}.{YYYYMMDD}` - Same-day second release: increment minor, keep same date - Each release gets a unique adjective-animal name -Use `@developer /skill git-release` for the full release process. +Use `@software-engineer /skill git-release` for the full release process. ## Session Management diff --git a/docs/academic_research.md b/docs/academic_research.md index b0c3591..129a1e1 100644 --- a/docs/academic_research.md +++ b/docs/academic_research.md @@ -778,6 +778,66 @@ This document explains the cognitive and social-science mechanisms that justify --- +### 51. Canon TDD — Authoritative Red-Green-Refactor Definition + +| | | +|---|---| +| **Source** | Beck, K. (2023). "Canon TDD." *tidyfirst.substack.com*. December 11, 2023. | +| **Date** | 2023 | +| **URL** | https://tidyfirst.substack.com/p/canon-tdd | +| **Alternative** | Fowler, M. (2023). "Test Driven Development." *martinfowler.com*. December 11, 2023. https://martinfowler.com/bliki/TestDrivenDevelopment.html | +| **Status** | Confirmed — canonical source; explicitly authored to stop strawman critiques | +| **Core finding** | The canonical TDD loop is: (1) write a list of test scenarios; (2) convert exactly one item into a runnable test; (3) make it pass; (4) optionally refactor; (5) repeat. Writing all test code before any implementation is an explicit anti-pattern ("Mistake: convert all items on the list into concrete tests, then make them pass"). | +| **Mechanism** | The interleaving of test-writing and implementation is not cosmetic — each test drives interface decisions at the moment they are cheapest to make. Batch-writing tests first forces speculative interface decisions that later require rework when earlier tests reveal structural problems. | +| **Where used** | Justifies merging Step 3 (test bodies) into the implementation loop. Removing the separate "write all tests" phase and replacing it with one-@id-at-a-time interleaved TDD. 
| + +--- + +### 52. Growing Object-Oriented Software, Guided by Tests (GOOS) — Outer/Inner TDD Loop + +| | | +|---|---| +| **Source** | Freeman, S., & Pryce, N. (2009). *Growing Object-Oriented Software, Guided by Tests*. Addison-Wesley. | +| **Date** | 2009 | +| **URL** | — | +| **Alternative** | — | +| **Status** | Confirmed — canonical ATDD/BDD integration model | +| **Core finding** | Acceptance tests and unit tests operate at two separate, nested timescales. The outer loop: write one failing acceptance test (Gherkin/feature-level) before writing any implementation. The inner loop: drive implementation with unit-level Red-Green-Refactor cycles until the acceptance test passes. The acceptance test stays red throughout all inner cycles and goes green only when the feature is complete. | +| **Mechanism** | The outer loop provides direction (what to build); the inner loop provides momentum (how to build it). Running acceptance tests first prevents tunnel vision during unit-level work — the developer always has a red acceptance test as the north star. This is the canonical model for integrating Gherkin acceptance criteria (@id Examples) with unit TDD. | +| **Where used** | Justifies the two-level structure in Step 3 (TDD Loop): outer loop per @id acceptance test, inner loop per unit. Each @id Example is the acceptance test for one outer loop iteration. | + +--- + +### 53. Is TDD Dead? — Anti-Bureaucracy Evidence + +| | | +|---|---| +| **Source** | Beck, K., Fowler, M., & Hansson, D. H. (2014). "Is TDD Dead?" Video series, *martinfowler.com*. May–June 2014. 
https://martinfowler.com/articles/is-tdd-dead/ | +| **Date** | 2014 | +| **URL** | https://martinfowler.com/articles/is-tdd-dead/ | +| **Alternative** | — | +| **Status** | Confirmed — primary evidence for what TDD practitioners reject as overhead | +| **Core finding** | Per-cycle human reviewer gates, per-cycle checklists, and tests that provide zero delta coverage are all explicitly identified as harmful overhead in TDD workflows. The green bar is the quality gate — not a checklist. DHH: "Many people used to think that documentation was more important than code. Now he's concerned that people think tests are more important than functional code." Beck: "Tests with zero delta coverage should be deleted unless they provide some kind of communication purpose." | +| **Mechanism** | Administrative overhead added to TDD workflows increases the cost per cycle without increasing coverage or catching defects. The optimal TDD loop is as lean as productive — ceremony that does not eliminate a failure mode should be eliminated. Fowler: "The sign of too much testing is whenever you change the code you think you expend more effort changing the tests than changing the code." | +| **Where used** | Justifies removing per-test reviewer gates and per-test 24-item self-declaration from the TDD loop. Self-declaration moves to end-of-feature (once), preserving Cialdini+Tetlock accountability at feature granularity without interrupting cycle momentum. | + +--- + +### 54. Introducing BDD — Behavioural-Driven Development Origin + +| | | +|---|---| +| **Source** | North, D. (2006). "Introducing BDD." *Better Software Magazine*, March 2006. https://dannorth.net/introducing-bdd/ | +| **Date** | 2006 | +| **URL** | https://dannorth.net/introducing-bdd/ | +| **Alternative** | Fowler, M. (2013). "Given When Then." *martinfowler.com*. 
https://martinfowler.com/bliki/GivenWhenThen.html | +| **Status** | Confirmed — primary BDD source | +| **Core finding** | BDD evolved directly from TDD to address persistent practitioner confusion: where to start, what to test, how much to test in one go, and what to call tests. BDD reframes TDD vocabulary around observable behavior: scenarios instead of tests, Given-When-Then (G/W/T) instead of Arrange-Act-Assert (AAA). The underlying mechanics are identical — G/W/T is AAA with shared-vocabulary semantics for collaboration between technical and non-technical stakeholders. | +| **Mechanism** | The "Given" clause captures preconditions (Arrange), "When" captures the triggering event (Act), and "Then" captures the observable outcome (Assert). Translating from AAA to G/W/T shifts the focus from implementation mechanics to user-observable behavior, making acceptance criteria verifiable by non-technical stakeholders and executable by the test suite simultaneously. | +| **Where used** | Theoretical link between Gherkin @id Examples (Step 1 output) and the TDD inner loop (Step 3). Each @id Example is a G/W/T specification that maps directly to a test function. The outer GOOS loop is an acceptance test written in BDD vocabulary; the inner loop is unit TDD. | + +--- + ## Bibliography 1. Cialdini, R. B. (2001). *Influence: The Psychology of Persuasion* (rev. ed.). HarperBusiness. @@ -846,3 +906,7 @@ This document explains the cognitive and social-science mechanisms that justify 64. Rosala, M., & Moran, K. (2022). The Funnel Technique in Qualitative User Research. *Nielsen Norman Group*. https://www.nngroup.com/articles/the-funnel-technique-in-qualitative-user-research/ 65. Cockburn, A. (2005). Hexagonal Architecture. *alistair.cockburn.us*. https://alistair.cockburn.us/hexagonal-architecture/ 66. Freeman, S., & Pryce, N. (2009). *Growing Object-Oriented Software, Guided by Tests*. Addison-Wesley. +67. Beck, K. (2023). "Canon TDD." *tidyfirst.substack.com*. 
https://tidyfirst.substack.com/p/canon-tdd +68. Beck, K., Fowler, M., & Hansson, D. H. (2014). "Is TDD Dead?" Video series. *martinfowler.com*. https://martinfowler.com/articles/is-tdd-dead/ +69. Fowler, M. (2014). "Self Testing Code." *martinfowler.com*. https://martinfowler.com/bliki/SelfTestingCode.html +70. North, D. (2006). "Introducing BDD." *Better Software Magazine*. https://dannorth.net/introducing-bdd/ diff --git a/docs/workflow.md b/docs/workflow.md index f55e0b2..98839cb 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -110,7 +110,7 @@ Each step has a designated agent and a specific deliverable. No step is skipped. └─────────────────────────────────────────────────────────────────────┘ ↓ PO picks feature from backlog ┌─────────────────────────────────────────────────────────────────────┐ -│ STEP 2 — ARCHITECTURE agent: developer │ +│ STEP 2 — ARCHITECTURE agent: software-engineer │ ├─────────────────────────────────────────────────────────────────────┤ │ │ │ PREREQUISITES (stop if any fail — escalate to PO) │ @@ -183,78 +183,152 @@ Each step has a designated agent and a specific deliverable. No step is skipped. 
└─────────────────────────────────────────────────────────────────────┘ ↓ ┌─────────────────────────────────────────────────────────────────────┐ -│ STEP 3 — TEST FIRST agent: developer │ +│ STEP 3 — TDD LOOP agent: software-engineer │ ├─────────────────────────────────────────────────────────────────────┤ │ │ -│ uv run task gen-tests → creates tests/features/<name>/ │ -│ one <rule-slug>_test.py per Rule: │ -│ test_<rule_slug>_<hex>() per Example │ -│ Write test bodies (real assertions, not raise NotImplementedError) │ -│ Confirm every test FAILS (ImportError / AssertionError) │ -│ ★ STOP — reviewer checks test design + semantic alignment │ -│ ★ WAIT for APPROVED │ -│ commit: test(<name>): write failing tests │ -│ │ -└─────────────────────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────────────────────┐ -│ STEP 4 — IMPLEMENT agent: developer │ -├─────────────────────────────────────────────────────────────────────┤ -│ │ -│ For each failing test (one at a time): │ -│ │ -│ RED → GREEN → REFACTOR → SELF-DECLARE ─STOP─ REVIEWER ─WAIT─ │ -│ ↓ APPROVED │ -│ COMMIT │ -│ ↓ │ -│ next test │ -│ │ -│ RED: confirm test fails │ -│ GREEN: minimum code to pass (YAGNI + KISS only) │ -│ REFACTOR: DRY → SOLID → Object Calisthenics (9 rules) │ -│ → type hints → docstrings │ -│ SELF-DECLARE: write ## Self-Declaration block in TODO.md │ -│ 24 first-person declarations (YAGNI×2, KISS×2, │ -│ DRY×2, SOLID×5, OC×9, Patterns×3, Semantic×1) │ -│ "As a developer I declare [rule] — YES | file:line" │ -│ or N/A | reason; load design-patterns if smell found │ -│ REVIEWER: code-design check only (no lint/pyright/coverage) │ -│ reviewer independently verifies YES claims │ -│ reviewer does NOT re-audit self-declared failures │ -│ COMMIT: feat(<name>): implement <what> │ -│ │ -│ After all tests green: │ -│ lint + static-check + test + timeout run (all must pass) │ -│ developer pre-mortem (2-3 sentences) │ -│ │ +│ PREREQUISITES (stop if 
any fail — escalate to PO) │ +│ [ ] Architecture section present in in-progress .feature file │ +│ [ ] All tests written in tests/features/<feature>/ │ +│ │ +│ Build TODO.md test list │ +│ List all @id tags from in-progress .feature file │ +│ Order: fewest dependencies first; most impactful within that │ +│ Each @id = one TODO item, status: pending │ +│ │ +│ OUTER LOOP — one @id at a time │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ Pick next pending @id → mark in_progress in TODO.md │ │ +│ │ (WIP limit: exactly one in_progress at all times) │ │ +│ │ │ │ +│ │ INNER LOOP │ │ +│ │ ┌───────────────────────────────────────────────────────┐ │ │ +│ │ │ RED │ │ │ +│ │ │ Write test body (Given/When/Then → Arrange/Act/Assert) │ │ +│ │ │ uv run task test-fast │ │ │ +│ │ │ EXIT: this @id FAILS │ │ │ +│ │ │ (if it passes: test is wrong — fix it first) │ │ │ +│ │ ├───────────────────────────────────────────────────────┤ │ │ +│ │ │ GREEN │ │ │ +│ │ │ Write minimum code — YAGNI + KISS only │ │ │ +│ │ │ (no DRY, SOLID, OC here — those belong in REFACTOR)│ │ │ +│ │ │ uv run task test-fast │ │ │ +│ │ │ EXIT: this @id passes AND all prior tests pass │ │ │ +│ │ │ (fix implementation only; do not advance @id) │ │ │ +│ │ ├───────────────────────────────────────────────────────┤ │ │ +│ │ │ REFACTOR │ │ │ +│ │ │ Apply: DRY → SOLID → OC → patterns │ │ │ +│ │ │ Load design-patterns skill if smell detected │ │ │ +│ │ │ Add type hints and docstrings │ │ │ +│ │ │ uv run task test-fast after each change │ │ │ +│ │ │ EXIT: test-fast passes; no smells remain │ │ │ +│ │ └───────────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ │ Mark @id completed in TODO.md │ │ +│ │ Commit when a meaningful increment is green │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ Repeat until all @id items completed │ +│ │ +│ QUALITY GATE (all @id green) │ +│ uv run task lint │ +│ uv run task static-check │ +│ uv run task test (coverage must be 
100%) │ +│ timeout 10s uv run task run │ +│ coverage < 100%: add test in tests/unit/ for uncovered branch │ +│ (do NOT add @id tests for coverage — @id tests are AC only) │ +│ All must pass before Self-Declaration │ +│ │ +│ SELF-DECLARATION (once, after all quality gates pass) │ +│ As a software-engineer I declare: │ +│ * YAGNI: no code without a failing test — YES/NO | file:line │ +│ * YAGNI: no speculative abstractions — YES/NO | file:line │ +│ * KISS: simplest solution that passes — YES/NO | file:line │ +│ * KISS: no premature optimization — YES/NO | file:line │ +│ * DRY: no duplication — YES/NO | file:line │ +│ * DRY: no redundant comments — YES/NO | file:line │ +│ * SOLID-S: one reason to change per class — YES/NO | file:line│ +│ * SOLID-O: open for extension, closed for modification │ +│ — YES/NO | file:line │ +│ * SOLID-L: subtypes substitutable — YES/NO | file:line │ +│ * SOLID-I: no forced unused deps — YES/NO | file:line │ +│ * SOLID-D: depend on abstractions, not concretions │ +│ — YES/NO | file:line │ +│ * OC-1: one level of indentation per method — YES/NO | file:line│ +│ * OC-2: no else after return — YES/NO | file:line │ +│ * OC-3: primitive types wrapped — YES/NO | file:line │ +│ * OC-4: first-class collections — YES/NO | file:line │ +│ * OC-5: one dot per line — YES/NO | file:line │ +│ * OC-6: no abbreviations — YES/NO | file:line │ +│ * OC-7: ≤20 lines per function — YES/NO | file:line │ +│ * OC-8: ≤2 instance variables per class — YES/NO | file:line │ +│ * OC-9: no getters/setters — YES/NO | file:line │ +│ * Patterns: no creational smell — YES/NO | file:line │ +│ * Patterns: no structural smell — YES/NO | file:line │ +│ * Patterns: no behavioral smell — YES/NO | file:line │ +│ * Semantic: tests operate at same abstraction as AC │ +│ — YES/NO | file:line │ +│ │ +│ → Hand off to Step 4 (Verify) │ └─────────────────────────────────────────────────────────────────────┘ ↓ ┌─────────────────────────────────────────────────────────────────────┐ -│ STEP 5 
— VERIFY agent: reviewer │ +│ STEP 4 — VERIFY agent: reviewer │ ├─────────────────────────────────────────────────────────────────────┤ │ │ -│ Default hypothesis: broken despite green checks │ -│ │ -│ 1. Read feature file — all @id Examples, interaction model │ -│ 2. Check commit history — one commit per test, clean status │ -│ 3. Production-grade gate: │ -│ app exits cleanly + output changes with input │ -│ 4. Code review (stop on first failure): │ -│ 4a Correctness (dead code, DRY, YAGNI) │ -│ 4b KISS (one thing, nesting, size) │ -│ 4c SOLID (5-row table) │ -│ 4d Object Calisthenics (9-row table) │ -│ 4e Design Patterns (5 smells) │ -│ 4f Tests (docstrings, contracts, @id coverage, naming) │ -│ 4g Code Quality (noqa, type hints, docstrings, coverage) │ -│ 5. Run: gen-tests --orphans → lint → static-check → test │ -│ 6. Interactive verification (if UI involved) │ -│ 7. Written report: APPROVED or REJECTED │ -│ │ +│ Default hypothesis: BROKEN. Prove otherwise or REJECT. │ +│ │ +│ 4a. READ │ +│ in-progress .feature file (Rules + Examples + @id) │ +│ Self-Declaration from software-engineer │ +│ │ +│ 4b. pyproject.toml GATE │ +│ git diff main -- pyproject.toml │ +│ Any change → REJECT immediately │ +│ software-engineer must revert + get stakeholder approval │ +│ │ +│ 4c. COMMIT HISTORY │ +│ git log --oneline main..HEAD │ +│ All commits follow conventional commit format? │ +│ No "fix tests", "wip", "temp" commits? │ +│ │ +│ 4d. COMMANDS │ +│ uv run task lint (must exit 0) │ +│ uv run task static-check (must exit 0) │ +│ uv run task test (must exit 0, coverage 100%) │ +│ timeout 10s uv run task run (exit 124 = hung = REJECT) │ +│ │ +│ 4e. PRODUCTION GATE │ +│ Does the application behave as described in the feature file? │ +│ Run manually or via integration test — not just green CI │ +│ Input → output check for each Rule: block │ +│ │ +│ 4f. 
CODE REVIEW (semantic — not covered by tooling) │ +│ [ ] Tests operate at same abstraction level as AC │ +│ [ ] No test asserts implementation details │ +│ [ ] Each @id test covers exactly one Example │ +│ [ ] No logic in tests (no if/for/while) │ +│ [ ] Module structure matches Architecture section │ +│ [ ] No external dependency outside adapters/ │ +│ [ ] Docstrings explain why, not what │ +│ │ +│ 4g. SELF-DECLARATION AUDIT │ +│ For every YES claim: find the file:line — does it hold? │ +│ For every NO claim: is the deviation justified? │ +│ Undeclared violations → REJECT │ +│ │ +│ 4h. INTERACTIVE (if any doubt remains) │ +│ Ask software-engineer one targeted question per ambiguity │ +│ Do not proceed to report if question is unanswered │ +│ │ +│ 4i. REPORT │ +│ APPROVED — all gates passed, no undeclared violations │ +│ REJECTED — list each failure with file:line and required fix │ +│ │ +│ On APPROVED → notify PO │ +│ On REJECTED → return to software-engineer (Step 3 quality gate) │ └─────────────────────────────────────────────────────────────────────┘ - ↓ APPROVED + ↓ APPROVED ┌─────────────────────────────────────────────────────────────────────┐ -│ STEP 6 — ACCEPT agent: product-owner │ +│ STEP 5 — ACCEPT agent: product-owner │ ├─────────────────────────────────────────────────────────────────────┤ │ │ │ PO runs/observes the feature (real user interaction) │ @@ -336,16 +410,12 @@ Two discovery sources: | Command | When | Purpose | |---|---|---| -| `uv run task gen-tests` | Step 3, Step 4 | Reads `.feature` files → creates/syncs test stubs in `tests/features/` | -| `uv run task gen-tests -- --check` | Before gen-tests | Dry run — preview what would change | -| `uv run task gen-tests -- --orphans` | Step 5 | List tests with no matching `@id` — already validated by gen-tests | | `uv run task gen-todo` | Every session | Reads in-progress `.feature` → syncs `TODO.md` | -| `uv run task gen-id` | Step 1 Phase 4 | Generate 8-char hex `@id` for a new Example | -| `uv 
run task test-fast` | Step 4 cycle | Fast test run (no coverage) — used during Red-Green-Refactor | -| `uv run task test` | Handoff, Step 5 | Full suite with coverage — must reach 100% | -| `uv run task lint` | Handoff, Step 5 | ruff — must exit 0 | -| `uv run task static-check` | Handoff, Step 5 | pyright — must exit 0, 0 errors | -| `timeout 10s uv run task run` | Handoff, Step 5 | App must exit cleanly (exit 124 = hang = fix it) | +| `uv run task test-fast` | Step 3 cycle | Fast test run (no coverage) — used during Red-Green-Refactor | +| `uv run task test` | Handoff, Step 4 | Full suite with coverage — must reach 100% | +| `uv run task lint` | Handoff, Step 4 | ruff — must exit 0 | +| `uv run task static-check` | Handoff, Step 4 | pyright — must exit 0, 0 errors | +| `timeout 10s uv run task run` | Handoff, Step 4 | App must exit cleanly (exit 124 = hang = fix it) | --- @@ -354,11 +424,11 @@ Two discovery sources: ``` tests/ features/<feature-name>/ - <rule-slug>_test.py ← generated by gen-tests, one per Rule: block + <rule-slug>_test.py ← developer-written, one per Rule: block function: test_<rule_slug>_<8char_hex>() unit/ <anything>_test.py ← developer-authored extras, no @id traceability - plain pytest or Hypothesis @given (developer's choice) + plain pytest or Hypothesis @given (developer choice) ``` --- @@ -369,41 +439,42 @@ tests/ # Current Work Feature: <name> -Step: <1-6> (<step name>) +Step: <1-5> (<step name>) Source: docs/features/in-progress/<name>.feature ## Cycle State Test: @id:<hex> — <description> -Phase: RED | GREEN | REFACTOR | SELF-DECLARE | REVIEWER(code-design) | COMMITTED - -## Self-Declaration (@id:<hex>) -As a developer I declare this code follows YAGNI-1 (no abstractions beyond current AC) — YES | `file:line` -As a developer I declare this code follows YAGNI-2 (no speculative parameters or flags) — YES | `file:line` -As a developer I declare this code follows KISS-1 (every function has one job) — YES | `file:line` -As a developer I 
declare this code follows KISS-2 (no unnecessary indirection) — YES | `file:line` -As a developer I declare this code follows DRY-1 (no duplicated logic) — YES | `file:line` -As a developer I declare this code follows DRY-2 (every shared concept in one place) — YES | `file:line` -As a developer I declare this code follows SOLID-S (one reason to change) — YES | `file:line` -As a developer I declare this code follows SOLID-O (extension not modification) — YES | `file:line` or N/A | reason -As a developer I declare this code follows SOLID-L (subtypes fully substitutable) — YES | `file:line` or N/A | reason -As a developer I declare this code follows SOLID-I (no forced stub methods) — YES | `file:line` or N/A | reason -As a developer I declare this code follows SOLID-D (domain depends on Protocols) — YES | `file:line` -As a developer I declare this code follows OC-1 (max one indent level per method) — YES | deepest: `file:line` -As a developer I declare this code follows OC-2 (no else after return) — YES | `file:line` or N/A | reason -As a developer I declare this code follows OC-3 (no bare primitives as domain concepts) — YES | `file:line` or N/A | reason -As a developer I declare this code follows OC-4 (no bare collections as domain values) — YES | `file:line` or N/A | reason -As a developer I declare this code follows OC-5 (no chained dot navigation) — YES | `file:line` or N/A | reason -As a developer I declare this code follows OC-6 (no abbreviations) — YES | `file:line` or N/A | reason -As a developer I declare this code follows OC-7 (functions ≤20 lines, classes ≤50 lines) — YES | longest: `file:line` -As a developer I declare this code follows OC-8 (≤2 instance variables per class) — YES | `file:line` -As a developer I declare this code follows OC-9 (no getters/setters) — YES | `file:line` or N/A | reason -As a developer I declare this code has no missing Creational pattern (no smell: repeated construction or scattered instantiation) — YES | `file:line` or N/A | 
reason -As a developer I declare this code has no missing Structural pattern (no smell: feature envy or parallel conditionals on type) — YES | `file:line` or N/A | reason -As a developer I declare this code has no missing Behavioral pattern (no smell: large state machine, scattered notification, or repeated algorithm skeleton) — YES | `file:line` or N/A | reason -As a developer I declare test abstraction matches AC level (semantic alignment) — YES | `file:line` +Phase: RED | GREEN | REFACTOR + +## Self-Declaration +As a software-engineer I declare: +* YAGNI: no code without a failing test — AGREE/DISAGREE | file:line +* YAGNI: no speculative abstractions — AGREE/DISAGREE | file:line +* KISS: simplest solution that passes — AGREE/DISAGREE | file:line +* KISS: no premature optimization — AGREE/DISAGREE | file:line +* DRY: no duplication — AGREE/DISAGREE | file:line +* DRY: no redundant comments — AGREE/DISAGREE | file:line +* SOLID-S: one reason to change per class — AGREE/DISAGREE | file:line +* SOLID-O: open for extension, closed for modification — AGREE/DISAGREE | file:line +* SOLID-L: subtypes substitutable — AGREE/DISAGREE | file:line +* SOLID-I: no forced unused deps — AGREE/DISAGREE | file:line +* SOLID-D: depend on abstractions, not concretions — AGREE/DISAGREE | file:line +* OC-1: one level of indentation per method — AGREE/DISAGREE | deepest: file:line +* OC-2: no else after return — AGREE/DISAGREE | file:line +* OC-3: primitive types wrapped — AGREE/DISAGREE | file:line +* OC-4: first-class collections — AGREE/DISAGREE | file:line +* OC-5: one dot per line — AGREE/DISAGREE | file:line +* OC-6: no abbreviations — AGREE/DISAGREE | file:line +* OC-7: ≤20 lines per function, ≤50 per class — AGREE/DISAGREE | longest: file:line +* OC-8: ≤2 instance variables per class — AGREE/DISAGREE | file:line +* OC-9: no getters/setters — AGREE/DISAGREE | file:line +* Patterns: no creational smell — AGREE/DISAGREE | file:line +* Patterns: no structural smell — AGREE/DISAGREE 
| file:line +* Patterns: no behavioral smell — AGREE/DISAGREE | file:line +* Semantic: tests operate at same abstraction as AC — AGREE/DISAGREE | file:line ## Progress -- [x] @id:<hex>: <done> — reviewer(code-design) APPROVED +- [x] @id:<hex>: <done> - [~] @id:<hex>: <in progress> - [ ] @id:<hex>: <next> @@ -411,7 +482,7 @@ As a developer I declare test abstraction matches AC level (semantic alignment) <one actionable sentence> ``` -`## Cycle State` is updated at every phase transition. `## Self-Declaration` is replaced per-test cycle. Both sections are present only during Step 4; omit when in other steps. +`## Cycle State` is updated at every phase transition. `## Self-Declaration` is written once after all quality gates pass in Step 3. Both sections are present only during Step 3; omit when in other steps. --- diff --git a/feedback.md b/feedback.md index ca1d5fd..87b57c2 100644 --- a/feedback.md +++ b/feedback.md @@ -256,3 +256,57 @@ The session-workflow skill should enforce reading and updating this section at s 4. Self-declare: "I have read all backlog features and this architecture accounts for the full known feature set" This is distinct from Item 4 (hollow PO approval) — the fix here is about the developer's reading obligation before making architectural decisions. + +--- + +## 19. Workflow Diagram — Redundancies and Late Error Detection + +### Redundancies + +**19a. Step 3 reviewer gate is a subset of Step 4's per-test reviewer gate** + +Step 3 stops for reviewer approval of test design and semantic alignment before any implementation starts. Step 4 then repeats the same semantic alignment check per-test cycle. The Step 3 check reviews all tests at once before any code exists — but semantic alignment is best verified when both the test and the implementation can be seen side by side. The Step 3 review is premature and likely re-done anyway during Step 4. + +**19b. 
Step 5 code review overlaps heavily with Step 4 self-declaration + per-test reviewer** + +Step 5 checks Correctness, KISS, SOLID, ObjCal, Design Patterns, Tests, Code Quality (4a–4g). All of these except tooling (lint/coverage) were already covered by the 21-item self-declaration and per-test reviewer in Step 4. Step 5 implies a full re-audit of already-reviewed work, rather than a targeted spot-check of what is novel or risky. + +**19c. `gen-tests --check` listed as a separate pre-step that nothing uses** + +The `--check` dry-run appears in the tools table as "Before gen-tests" but is never referenced in the actual workflow steps. Either make it a mandatory gate or remove it. + +**19d. Step 2 architecture commit and Step 3 gen-tests commit are always consecutive** + +These two commits are always paired and never independently useful. Step 2 commits architecture, Step 3 immediately runs `gen-tests` and commits stubs. Combining them into one step would reduce overhead without losing traceability. + +### Late Error Detection + +**19e. Architecture locked before test bodies reveal structural problems** + +Test bodies are written in Step 3 after the architecture is committed in Step 2. If a test body reveals an architectural flaw (wrong abstraction, missing entity), the developer must return to Step 2 — but the diagram has no explicit back-arrow from Step 3 to Step 2. The diagram implies Step 3 is always forward. + +**19f. Decomposition check happens at the end of Phase 2, after all discovery is done** + +If a feature is too large (>2 concerns, >8 examples), the split happens after discovery questions are already answered. The check should happen earlier — at Phase 1 when the feature list is identified, or at the start of Phase 2 before generating questions. + +**19g. `lint + static-check` run only at handoff (end of Step 4)** + +A type error or lint violation introduced in cycle 3 is not caught until all cycles are complete. 
Running these tools only at handoff means multiple commits may need to be unwound. + +**19h. Production-grade input→output check first appears in Step 5** + +Step 5 verifies that "output changes with input". This basic correctness property is not checked by the developer until the reviewer finds it. The developer's pre-mortem at end of Step 4 exists but is vague — it does not mandate the input→output check explicitly. + +### Proposed Improvements + +| # | Issue | Proposed change | +|---|---|---| +| A | Step 3 reviewer gate redundant with Step 4 | Merge Step 3 into Step 2: after architecture commit, run `gen-tests` to create stubs. Test body writing becomes the first action of Step 4 (write test → RED → GREEN → REFACTOR → SELF-DECLARE → REVIEWER → COMMIT). Removes one full reviewer interaction. | +| B | Step 5 is a full re-audit of already-reviewed work | Reframe Step 5 as a spot-check + tooling run: skip re-checking items covered by per-test reviewers; focus on (a) tooling — lint, static-check, coverage, orphans, (b) integration/system behavior, (c) semantic alignment of the feature as a whole. | +| C | Decomposition check too late | Move to Phase 1 (when feature stubs are created) and add a lightweight re-check at the start of Phase 2 (before generating questions). | +| D | `lint + static-check` run only at handoff | Run `lint + static-check` (not coverage) after each Step 4 commit as a fast sanity check. Keep full `test` with coverage at handoff only. | +| E | Step 2 + Step 3 always consecutive | Merge into one step: architecture + `gen-tests` stubs in one commit. Test bodies are the opening move of Step 4. | +| F | No back-arrow from Step 3 to Step 2 | Add explicit "if test body reveals arch flaw → back to Step 2" path in the diagram. | +| G | Input→output check first found by reviewer | Make it explicit in the developer's Step 4 self-verification (before handoff): run with two different inputs, confirm output differs. 
| + +**Highest-value change: A + E combined.** Collapsing Steps 2+3 removes a full reviewer interaction. Test body writing as the opening move of Step 4 means architectural flaws are discovered immediately when the developer cannot make the test fail for the right reason. From 64b9230ad2dd1482ae0c6aac4d7b63d143732ddb Mon Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Fri, 17 Apr 2026 17:47:41 -0400 Subject: [PATCH 04/12] fix: align all files to 5-step workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix YES/NO → AGREE/DISAGREE in self-declaration - Fix developer → software-engineer role names - Fix 6 steps → 5 steps - Remove gen-tests, gen-id references - Update scope frozen-criteria section - Fix code-quality redirect to verify - Fix design-patterns load trigger step 3 - Fix create-skill available skills table - Delete tdd/ skill (obsolete) - Fix AGENTS.md: step count, role name, duplicate markers, quality gate heading --- .opencode/skills/code-quality/SKILL.md | 10 +- .opencode/skills/create-skill/SKILL.md | 18 +- .opencode/skills/design-patterns/SKILL.md | 8 +- .opencode/skills/git-release/SKILL.md | 6 +- .opencode/skills/pr-management/SKILL.md | 4 +- .opencode/skills/scope/SKILL.md | 3 +- .opencode/skills/session-workflow/SKILL.md | 2 +- .../skills/tdd/scripts/gen_test_stubs.py | 698 ------------------ .opencode/skills/verify/SKILL.md | 6 +- AGENTS.md | 16 +- docs/workflow.md | 58 +- 11 files changed, 63 insertions(+), 766 deletions(-) delete mode 100644 .opencode/skills/tdd/scripts/gen_test_stubs.py diff --git a/.opencode/skills/code-quality/SKILL.md b/.opencode/skills/code-quality/SKILL.md index f09d76e..40f0294 100644 --- a/.opencode/skills/code-quality/SKILL.md +++ b/.opencode/skills/code-quality/SKILL.md @@ -2,16 +2,18 @@ name: code-quality description: Enforce code quality using ruff, pytest coverage, and static type checking version: "2.0" -author: developer -audience: 
developer, reviewer +author: software-engineer +audience: software-engineer, reviewer workflow: feature-lifecycle --- # Code Quality -Run these four commands before handing off to the reviewer (Step 5). All must pass. +Run these four commands before handing off to the reviewer (Step 4). All must pass. -## Developer Self-Check +**This is a quick reference. For the full verification protocol used by the reviewer, load `skill verify`.** + +## Software-Engineer Self-Check Before handing off to reviewer: diff --git a/.opencode/skills/create-skill/SKILL.md b/.opencode/skills/create-skill/SKILL.md index 6480d30..7df4acc 100644 --- a/.opencode/skills/create-skill/SKILL.md +++ b/.opencode/skills/create-skill/SKILL.md @@ -2,8 +2,8 @@ name: create-skill description: Create new OpenCode skills following the skill definition standard version: "1.0" -author: developer -audience: developer +author: software-engineer +audience: software-engineer workflow: opencode --- @@ -83,10 +83,10 @@ Add the skill name to the agent's "Available Skills" section so the agent knows |---|---|---| | `session-workflow` | all agents | Session start/end protocol | | `scope` | product-owner | Step 1: define acceptance criteria | -| `tdd` | developer | Step 3: write failing tests | -| `implementation` | developer | Step 4: Red-Green-Refactor | -| `verify` | reviewer | Step 5: run commands and review code | -| `code-quality` | developer | Quick reference: four handoff commands before Step 5 | -| `pr-management` | developer | Create PRs with proper format | -| `git-release` | developer | Calver versioning and release naming | -| `create-skill` | developer | Create new skills | +| `implementation` | software-engineer | Steps 2-3: architecture + TDD loop | +| `design-patterns` | software-engineer | Steps 2, 3: refactor when smell detected | +| `verify` | reviewer | Step 4: adversarial verification | +| `code-quality` | software-engineer | Quick reference — redirects to verify | +| `pr-management` | 
software-engineer | Step 5: create PR with squash merge | +| `git-release` | software-engineer | Step 5: calver versioning and release | +| `create-skill` | software-engineer | Create new skills | diff --git a/.opencode/skills/design-patterns/SKILL.md b/.opencode/skills/design-patterns/SKILL.md index 917a83a..591ab6d 100644 --- a/.opencode/skills/design-patterns/SKILL.md +++ b/.opencode/skills/design-patterns/SKILL.md @@ -2,8 +2,8 @@ name: design-patterns description: Reference skill for GoF design patterns, SOLID, Object Calisthenics, Python Zen, and other SE principles — with smell triggers and Python before/after examples version: "1.0" -author: developer -audience: developer +author: software-engineer +audience: software-engineer workflow: feature-lifecycle --- @@ -11,7 +11,7 @@ workflow: feature-lifecycle Load this skill when: - Running the architecture smell check in Step 2 and a smell is detected -- Refactoring in Step 4 and a pattern smell appears in the self-declaration +- Refactoring in Step 3 and a pattern smell appears in the self-declaration --- @@ -20,7 +20,7 @@ Load this skill when: 1. **Identify the smell** from the checklist in your self-declaration or architecture check 2. **Find the smell category** below (Creational / Structural / Behavioral) 3. **Read the trigger and the before/after example** -4. **Apply the pattern** and update the Architecture section (Step 2) or the refactored code (Step 4) +4. 
**Apply the pattern** and update the Architecture section (Step 2) or the refactored code (Step 3) --- diff --git a/.opencode/skills/git-release/SKILL.md b/.opencode/skills/git-release/SKILL.md index afe5763..34bd11f 100644 --- a/.opencode/skills/git-release/SKILL.md +++ b/.opencode/skills/git-release/SKILL.md @@ -2,14 +2,14 @@ name: git-release description: Create releases with hybrid major.minor.calver versioning and AI-generated adjective-animal naming version: "1.0" -author: developer -audience: developer +author: software-engineer +audience: software-engineer workflow: release-management --- # Git Release -Create a tagged GitHub release after the PO accepts the feature (Step 6). +Create a tagged GitHub release after the PO accepts the feature (Step 5). ## Version Format diff --git a/.opencode/skills/pr-management/SKILL.md b/.opencode/skills/pr-management/SKILL.md index 172ebd5..f10605c 100644 --- a/.opencode/skills/pr-management/SKILL.md +++ b/.opencode/skills/pr-management/SKILL.md @@ -2,8 +2,8 @@ name: pr-management description: Create pull requests with conventional commits, proper formatting, and branch workflow version: "1.0" -author: developer -audience: developer +author: software-engineer +audience: software-engineer workflow: git-management --- diff --git a/.opencode/skills/scope/SKILL.md b/.opencode/skills/scope/SKILL.md index 91bb8e2..7581ab5 100644 --- a/.opencode/skills/scope/SKILL.md +++ b/.opencode/skills/scope/SKILL.md @@ -255,7 +255,7 @@ All Rules must have their pre-mortems completed before any Examples are written. ### 4.2 Write Example Blocks -Add `Example:` blocks under each `Rule:`. Each Example gets an `@id:<8-char-hex>` tag (generated with `uv run task gen-id`). +Add `Example:` blocks under each `Rule:`. Each Example gets an `@id:<8-char-hex>` tag. 
**Format** (mandatory): @@ -322,7 +322,6 @@ git commit -m "feat(criteria): write acceptance criteria for <name>" **After this commit, the `Example:` blocks are frozen.** Any change requires: 1. Add `@deprecated` tag to the old Example 2. Write a new Example with a new `@id` -3. Run `uv run task gen-tests` to sync test stubs --- diff --git a/.opencode/skills/session-workflow/SKILL.md b/.opencode/skills/session-workflow/SKILL.md index f653458..fc370a8 100644 --- a/.opencode/skills/session-workflow/SKILL.md +++ b/.opencode/skills/session-workflow/SKILL.md @@ -153,4 +153,4 @@ Run `gen-todo` at session start (after reading TODO.md) and at session end (befo 5. The "Next" line must be actionable enough that a fresh AI can execute it without asking questions 6. During Step 3, always update `## Cycle State` when transitioning between RED/GREEN/REFACTOR phases 7. When a step completes, update TODO.md and commit **before** any further work -8. During Step 3, write the `## Self-Declaration` block into TODO.md after all quality gates pass — every claim must have YES/NO with `file:line` evidence +8. During Step 3, write the `## Self-Declaration` block into TODO.md after all quality gates pass — every claim must have AGREE/DISAGREE with `file:line` evidence diff --git a/.opencode/skills/tdd/scripts/gen_test_stubs.py b/.opencode/skills/tdd/scripts/gen_test_stubs.py deleted file mode 100644 index ae2c09c..0000000 --- a/.opencode/skills/tdd/scripts/gen_test_stubs.py +++ /dev/null @@ -1,698 +0,0 @@ -"""Generate and sync pytest test stubs from Gherkin .feature files. - -Scans all .feature files under docs/features/{backlog,in-progress,completed}/ -and creates or updates test stubs in tests/features/<feature-name>/. 
- -Each Rule: block in a .feature file maps to one test file: - tests/features/<feature-name>/<rule-slug>_test.py - -Test function naming: - test_<rule_slug>_<8char_hex>() - -Modes: - uv run task gen-tests Sync all features (default) - uv run task gen-tests -- --check Dry run — report what would change - uv run task gen-tests -- --orphans List orphaned tests (no matching @id) - -Safety rules: - - backlog / in-progress: full write (create stubs, update docstrings, rename) - - completed: only toggle @pytest.mark.deprecated (no docstring changes) - - Never touches function bodies (code between # Given and end of function) -""" - -from __future__ import annotations - -import re -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -from gherkin import Parser as GherkinParser - -PROJECT_ROOT = Path(__file__).resolve().parents[4] -FEATURES_DIR = PROJECT_ROOT / "docs" / "features" -TESTS_DIR = PROJECT_ROOT / "tests" / "features" - -FEATURE_STAGES = ("backlog", "in-progress", "completed") - -ID_TAG_RE = re.compile(r"@id:([a-f0-9]{8})") - -TEST_FUNC_RE = re.compile(r"^def (test_\w+)\(.*\)") -TEST_ID_RE = re.compile(r"test_\w+_([a-f0-9]{8})\b") -DEPRECATED_MARKER_RE = re.compile(r"^@pytest\.mark\.deprecated$", re.MULTILINE) -ORPHAN_MARKER_RE = re.compile( - r'^@pytest\.mark\.skip\(reason="orphan: no matching @id in \.feature files"\)$', - re.MULTILINE, -) - - -@dataclass(frozen=True, slots=True) -class GherkinExample: - """A single Example block parsed from a .feature file.""" - - id_hex: str - title: str - given: str - when: str - then: str - deprecated: bool - source_file: str - - -@dataclass(frozen=True, slots=True) -class RuleBlock: - """A Rule: block with its examples, mapped to one test file.""" - - rule_title: str - rule_slug: str - examples: list[GherkinExample] - - -@dataclass(frozen=True, slots=True) -class FeatureFile: - """A parsed .feature file with its Rule blocks.""" - - path: Path - feature_name: str - 
feature_slug: str - rules: list[RuleBlock] - - -def slugify(name: str) -> str: - """Convert a name to a Python-safe slug. - - Args: - name: Kebab-case or space-separated name. - - Returns: - Underscore-separated lowercase string. - """ - return re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_") - - -def parse_feature_file(path: Path) -> FeatureFile | None: - """Parse a .feature file into structured data with Rule blocks. - - Args: - path: Path to the .feature file. - - Returns: - FeatureFile if valid, None if no Feature: line found. - """ - text = path.read_text(encoding="utf-8") - doc = GherkinParser().parse(text) - feature: dict[str, Any] | None = doc.get("feature") - if not feature or not feature.get("name"): - return None - - feature_slug = slugify(path.stem) - rules = _extract_rules(feature, str(path)) - return FeatureFile( - path=path, - feature_name=feature["name"], - feature_slug=feature_slug, - rules=rules, - ) - - -def _extract_rules(feature: dict[str, Any], source_file: str) -> list[RuleBlock]: - """Extract Rule blocks from a parsed Gherkin feature AST. - - Each Rule: block becomes one RuleBlock with its examples. - Examples not under any Rule are grouped into a synthetic rule - using the feature name as the slug. - - Args: - feature: The 'feature' dict from gherkin-official Parser output. - source_file: Path string for provenance tracking. - - Returns: - List of RuleBlock objects. 
- """ - rules: list[RuleBlock] = [] - orphan_examples: list[GherkinExample] = [] - - for child in feature.get("children", []): - rule_node: dict[str, Any] | None = child.get("rule") - scenario_node: dict[str, Any] | None = child.get("scenario") - - if rule_node is not None: - rule_title = rule_node.get("name", "") - rule_slug = slugify(rule_title) - examples = _extract_examples_from_rule(rule_node, source_file) - if examples: - rules.append( - RuleBlock( - rule_title=rule_title, - rule_slug=rule_slug, - examples=examples, - ) - ) - elif scenario_node is not None: - example = _scenario_to_example(scenario_node, source_file) - if example is not None: - orphan_examples.append(example) - - if orphan_examples: - feature_slug = slugify(feature.get("name", "feature")) - rules.append( - RuleBlock( - rule_title=feature.get("name", ""), - rule_slug=feature_slug, - examples=orphan_examples, - ) - ) - - return rules - - -def _extract_examples_from_rule( - rule_node: dict[str, Any], source_file: str -) -> list[GherkinExample]: - """Extract Example blocks from a Rule node. - - Args: - rule_node: The 'rule' dict from the Gherkin AST. - source_file: Path string for provenance tracking. - - Returns: - List of parsed GherkinExample objects. - """ - examples: list[GherkinExample] = [] - for child in rule_node.get("children", []): - scenario: dict[str, Any] | None = child.get("scenario") - if scenario is None: - continue - example = _scenario_to_example(scenario, source_file) - if example is not None: - examples.append(example) - return examples - - -def _scenario_to_example( - scenario: dict[str, Any], source_file: str -) -> GherkinExample | None: - """Convert a single parsed scenario dict to a GherkinExample. - - Skips scenarios without an @id tag. - - Args: - scenario: A scenario dict from the Gherkin AST. - source_file: Path string for provenance tracking. - - Returns: - GherkinExample if the scenario has an @id tag, None otherwise. 
- """ - tags = scenario.get("tags", []) - id_hex = _extract_id_tag(tags) - if id_hex is None: - return None - - deprecated = any(t["name"] == "@deprecated" for t in tags) - given, when, then = _extract_steps(scenario.get("steps", [])) - return GherkinExample( - id_hex=id_hex, - title=scenario.get("name", ""), - given=given, - when=when, - then=then, - deprecated=deprecated, - source_file=source_file, - ) - - -def _extract_id_tag(tags: list[dict[str, Any]]) -> str | None: - """Find the @id:<hex> tag value from a list of AST tags. - - Args: - tags: List of tag dicts from the Gherkin AST. - - Returns: - The 8-char hex ID, or None if no @id tag is present. - """ - for tag in tags: - m = ID_TAG_RE.search(tag.get("name", "")) - if m: - return m.group(1) - return None - - -def _extract_steps(steps: list[dict[str, Any]]) -> tuple[str, str, str]: - """Extract Given/When/Then text from parsed Gherkin steps. - - Args: - steps: List of step dicts from the Gherkin AST. - - Returns: - Tuple of (given, when, then) step text strings. - """ - given = when = then = "" - for step in steps: - keyword_type = step.get("keywordType", "") - text = step.get("text", "") - if keyword_type == "Context": - given = text - elif keyword_type == "Action": - when = text - elif keyword_type == "Outcome": - then = text - return given, when, then - - -def generate_stub(rule_slug: str, example: GherkinExample) -> str: - """Generate a single test stub function. - - Args: - rule_slug: Underscored rule title (used as function prefix). - example: The parsed Gherkin example. - - Returns: - Complete test function source code as a string. 
- """ - func_name = f"test_{rule_slug}_{example.id_hex}" - markers = ["@pytest.mark.unit"] - if example.deprecated: - markers.append("@pytest.mark.deprecated") - - marker_lines = "\n".join(markers) - docstring = _build_docstring(example) - - lines = [ - marker_lines, - f"def {func_name}() -> None:", - *docstring, - " # Given", - "", - " # When", - "", - " # Then", - " raise NotImplementedError", - ] - return "\n".join(lines) + "\n" - - -def _build_docstring(example: GherkinExample) -> list[str]: - """Build properly indented docstring lines for a test stub. - - Args: - example: The parsed Gherkin example. - - Returns: - List of indented lines (each with 4-space prefix) including triple quotes. - """ - return [ - ' """', - f" Given: {example.given}", - f" When: {example.when}", - f" Then: {example.then}", - ' """', - ] - - -def generate_test_file(rule_slug: str, examples: list[GherkinExample]) -> str: - """Generate a complete test file for one Rule: block. - - Args: - rule_slug: Underscored rule title (file name stem + function prefix). - examples: All examples from that Rule block. - - Returns: - Complete test module source code. - """ - header = ( - f'"""Tests for {rule_slug.replace("_", " ")} rule."""\n\nimport pytest\n\n\n' - ) - stubs = "\n\n".join(generate_stub(rule_slug, ex) for ex in examples) - return header + stubs + "\n" - - -def find_feature_files() -> list[tuple[Path, str, str]]: - """Find all .feature files across all stages. - - Returns: - List of (feature_file_path, feature_name, stage) tuples. - feature_name is the .feature file stem (e.g. 'display-version'). 
- """ - results: list[tuple[Path, str, str]] = [] - for stage in FEATURE_STAGES: - stage_dir = FEATURES_DIR / stage - if not stage_dir.exists(): - continue - for feature_file in sorted(stage_dir.glob("*.feature")): - results.append((feature_file, feature_file.stem, stage)) - return results - - -def read_existing_test_ids(test_file: Path) -> set[str]: - """Extract @id hex values from existing test function names. - - Args: - test_file: Path to existing test file. - - Returns: - Set of 8-char hex IDs found in test function names. - """ - if not test_file.exists(): - return set() - text = test_file.read_text(encoding="utf-8") - return set(TEST_ID_RE.findall(text)) - - -def sync_test_file( - rule_slug: str, - examples: list[GherkinExample], - test_file: Path, - stage: str, - *, - check_only: bool = False, -) -> list[str]: - """Sync a single test file with its Rule: block examples. - - Args: - rule_slug: Underscored rule title. - examples: Parsed examples from the Rule block. - test_file: Path to the test file to create/update. - stage: Feature stage (backlog, in-progress, completed). - check_only: If True, report changes without writing. - - Returns: - List of action descriptions taken/planned. 
- """ - actions: list[str] = [] - example_ids = {ex.id_hex for ex in examples} - - if not test_file.exists(): - if stage == "completed": - return actions - content = generate_test_file(rule_slug, examples) - actions.append(f"CREATE {test_file} ({len(examples)} stubs)") - if not check_only: - test_file.parent.mkdir(parents=True, exist_ok=True) - test_file.write_text(content, encoding="utf-8") - return actions - - text = test_file.read_text(encoding="utf-8") - existing_ids = set(TEST_ID_RE.findall(text)) - - if stage == "completed": - actions.extend(_sync_deprecated_markers(examples, test_file, text, check_only)) - return actions - - actions.extend( - _sync_full( - rule_slug, - examples, - example_ids, - existing_ids, - test_file, - text, - check_only, - ) - ) - return actions - - -def _sync_deprecated_markers( - examples: list[GherkinExample], - test_file: Path, - text: str, - check_only: bool, -) -> list[str]: - """For completed features, only toggle @deprecated markers. - - Args: - examples: Parsed examples from the .feature file. - test_file: Path to the test file. - text: Current content of the test file. - check_only: If True, report without writing. - - Returns: - List of action descriptions. 
- """ - actions: list[str] = [] - modified = text - for ex in examples: - func_pattern = re.compile( - rf"((?:@pytest\.mark\.\w+(?:\(.*?\))?\n)*)def test_\w+_{ex.id_hex}\b" - ) - match = func_pattern.search(modified) - if not match: - continue - decorators = match.group(1) - has_deprecated = "@pytest.mark.deprecated" in decorators - if ex.deprecated and not has_deprecated: - new_decorators = "@pytest.mark.deprecated\n" + decorators - modified = ( - modified[: match.start()] - + new_decorators - + match.group()[len(decorators) :] - + modified[match.end() :] - ) - actions.append(f"ADD @deprecated to test for {ex.id_hex}") - elif not ex.deprecated and has_deprecated: - new_decorators = decorators.replace("@pytest.mark.deprecated\n", "") - modified = ( - modified[: match.start()] - + new_decorators - + match.group()[len(decorators) :] - + modified[match.end() :] - ) - actions.append(f"REMOVE @deprecated from test for {ex.id_hex}") - if modified != text and not check_only: - test_file.write_text(modified, encoding="utf-8") - return actions - - -def _sync_full( - rule_slug: str, - examples: list[GherkinExample], - example_ids: set[str], - existing_ids: set[str], - test_file: Path, - text: str, - check_only: bool, -) -> list[str]: - """Full sync for backlog/in-progress features. - - Args: - rule_slug: Underscored rule title. - examples: Parsed examples. - example_ids: Set of IDs from .feature file. - existing_ids: Set of IDs found in existing test file. - test_file: Path to test file. - text: Current file content. - check_only: Dry run flag. - - Returns: - List of action descriptions. 
- """ - actions: list[str] = [] - modified = text - - new_ids = example_ids - existing_ids - orphan_ids = existing_ids - example_ids - - for ex in examples: - if ex.id_hex in new_ids: - stub = "\n\n" + generate_stub(rule_slug, ex) - modified += stub - actions.append(f"ADD stub for @id:{ex.id_hex}") - elif ex.id_hex in existing_ids: - modified, doc_actions = _update_docstring(modified, rule_slug, ex) - actions.extend(doc_actions) - - for oid in orphan_ids: - orphan_marker = ( - '@pytest.mark.skip(reason="orphan: no matching @id in .feature files")' - ) - func_pattern = re.compile( - rf"((?:@pytest\.mark\.\w+(?:\(.*?\))?\n)*)def test_\w+_{oid}\b" - ) - match = func_pattern.search(modified) - if match and orphan_marker not in match.group(1): - decorators = match.group(1) - new_decorators = orphan_marker + "\n" + decorators - modified = ( - modified[: match.start()] - + new_decorators - + match.group()[len(decorators) :] - + modified[match.end() :] - ) - actions.append(f"MARK orphan: test with @id:{oid}") - - if modified != text and not check_only: - test_file.write_text(modified, encoding="utf-8") - return actions - - -def _update_docstring( - text: str, rule_slug: str, example: GherkinExample -) -> tuple[str, list[str]]: - """Update the docstring of an existing test to match the .feature file. - - Args: - text: Full test file content. - rule_slug: Underscored rule title. - example: The Gherkin example to match. - - Returns: - Tuple of (modified_text, list_of_actions). 
- """ - actions: list[str] = [] - func_re = re.compile( - rf'(def test_\w+_{example.id_hex}\(.*?\).*?:\n\s+""")' - rf"(.*?)" - rf'(""")', - re.DOTALL, - ) - match = func_re.search(text) - if not match: - return text, actions - - new_docstring = ( - f"\n Given: {example.given}\n" - f" When: {example.when}\n" - f" Then: {example.then}\n " - ) - old_docstring = match.group(2) - if old_docstring.strip() != new_docstring.strip(): - text = text[: match.start(2)] + new_docstring + text[match.end(2) :] - actions.append(f"UPDATE docstring for @id:{example.id_hex}") - - old_func = re.search(rf"def (test_\w+_{example.id_hex})\b", text) - if old_func: - expected_name = f"test_{rule_slug}_{example.id_hex}" - if old_func.group(1) != expected_name: - text = text.replace(old_func.group(1), expected_name) - actions.append(f"RENAME {old_func.group(1)} -> {expected_name}") - return text, actions - - -def find_duplicate_ids() -> list[str]: - """Find @id hex values that appear in more than one distinct feature file. - - A feature that appears in multiple stage directories (backlog, in-progress, - completed) with the same stem is counted only once — that is expected during - migrations. Duplicates are only flagged when the same @id appears in two - different feature files (different stems). - - Returns: - List of warning strings describing each duplicate @id. 
- """ - id_sources: dict[str, set[str]] = {} - for fpath, feature_name, _stage in find_feature_files(): - parsed = parse_feature_file(fpath) - if not parsed: - continue - for rule in parsed.rules: - for ex in rule.examples: - id_sources.setdefault(ex.id_hex, set()).add( - f"{feature_name}/{rule.rule_slug}" - ) - - warnings: list[str] = [] - for id_hex, sources in sorted(id_sources.items()): - if len(sources) > 1: - locations = ", ".join(sorted(sources)) - warnings.append(f"@id:{id_hex} appears in multiple locations: {locations}") - return warnings - - -def find_orphaned_tests() -> list[str]: - """Find all test files with IDs that don't match any .feature file. - - Returns: - List of orphan descriptions. - """ - all_feature_ids: set[str] = set() - for fpath, _name, _stage in find_feature_files(): - parsed = parse_feature_file(fpath) - if parsed: - for rule in parsed.rules: - all_feature_ids.update(ex.id_hex for ex in rule.examples) - - orphans: list[str] = [] - if not TESTS_DIR.exists(): - return orphans - for test_file in TESTS_DIR.rglob("*_test.py"): - ids = read_existing_test_ids(test_file) - for tid in ids: - if tid not in all_feature_ids: - orphans.append(f"{test_file}: @id:{tid}") - return orphans - - -def _sync_all_features( - feature_files: list[tuple[Path, str, str]], *, check_only: bool -) -> int: - """Sync test stubs for all feature files. - - Args: - feature_files: List of (fpath, feature_name, stage) tuples. - check_only: If True, report actions without writing files. - - Returns: - Exit code: 0 = success, 1 = changes needed in check mode. 
- """ - duplicates = find_duplicate_ids() - for warning in duplicates: - print(f"WARNING: {warning}") - - all_actions: list[str] = [] - for fpath, feature_name, stage in sorted(feature_files): - parsed = parse_feature_file(fpath) - if not parsed: - print(f"SKIP {fpath} — no Feature: line found") - continue - for rule in parsed.rules: - test_file = TESTS_DIR / feature_name / f"{rule.rule_slug}_test.py" - actions = sync_test_file( - rule.rule_slug, - rule.examples, - test_file, - stage, - check_only=check_only, - ) - all_actions.extend(actions) - - if all_actions: - mode = "Would" if check_only else "Did" - print(f"{mode} perform {len(all_actions)} action(s):") - for a in all_actions: - print(f" {a}") - return 1 if check_only else 0 - - print("All test stubs are in sync.") - return 0 - - -def main() -> int: - """Entry point for the gen-tests command. - - Returns: - Exit code (0 = success, 1 = changes needed in check mode). - """ - check_only = "--check" in sys.argv - orphans_only = "--orphans" in sys.argv - - if orphans_only: - orphans = find_orphaned_tests() - if orphans: - print("Orphaned tests (no matching @id in .feature files):") - for o in orphans: - print(f" {o}") - return 1 - print("No orphaned tests found.") - return 0 - - feature_files = find_feature_files() - if not feature_files: - print("No .feature files found.") - return 0 - - return _sync_all_features(feature_files, check_only=check_only) - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/.opencode/skills/verify/SKILL.md b/.opencode/skills/verify/SKILL.md index a508da5..9a738a7 100644 --- a/.opencode/skills/verify/SKILL.md +++ b/.opencode/skills/verify/SKILL.md @@ -157,11 +157,11 @@ Record what input was given and what output was observed. Read the software-engineer's Self-Declaration from `TODO.md`. -For every **YES** claim: +For every **AGREE** claim: - Find the `file:line` — does it hold? -For every **NO** claim: -- Is the deviation justified? 
+For every **DISAGREE** claim: +- REJECT — the software-engineer must fix before requesting review again. Undeclared violations → REJECT. diff --git a/AGENTS.md b/AGENTS.md index e7a4158..89fe0e3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,7 +4,7 @@ A Python template to quickstart any project with a production-ready workflow, qu ## Workflow Overview -Features flow through 6 steps with a WIP limit of 1 feature at a time. The filesystem enforces WIP: +Features flow through 5 steps with a WIP limit of 1 feature at a time. The filesystem enforces WIP: - `docs/features/backlog/<feature-name>.feature` — features waiting to be worked on - `docs/features/in-progress/<feature-name>.feature` — exactly one feature being built right now - `docs/features/completed/<feature-name>.feature` — accepted and shipped features @@ -17,7 +17,7 @@ STEP 4: VERIFY (reviewer) → run all commands, review code STEP 5: ACCEPT (product-owner) → demo, validate, move folder to completed/ ``` -**PO picks the next feature from backlog. Developer never self-selects.** +**PO picks the next feature from backlog. Software-engineer never self-selects.** **Verification is adversarial.** The reviewer's job is to try to break the feature, not to confirm it works. The default hypothesis is "it might be broken despite green checks; prove otherwise." 
@@ -127,13 +127,7 @@ def test_wall_bounce_a3f2b1c4() -> None: raise NotImplementedError ``` -### Markers (4 total) -- `@pytest.mark.unit` — isolated, one function/class, no external state -- `@pytest.mark.integration` — multiple components, external state -- `@pytest.mark.slow` — takes > 50ms; additionally applied alongside `unit` or `integration` -- Tests do not use markers — software-engineer writes test bodies directly - -### Markers (available if needed) +### Markers (3 total) - `@pytest.mark.unit` — isolated, one function/class, no external state - `@pytest.mark.integration` — multiple components, external state - `@pytest.mark.slow` — takes > 50ms; additionally applied alongside `unit` or `integration` @@ -182,9 +176,9 @@ uv run task doc-serve - **Semantic alignment**: tests must operate at the same abstraction level as the acceptance criteria they cover - **Integration tests**: multi-component features require at least one `@pytest.mark.integration` test exercising the public entry point -### Developer Quality Gate Priority Order +### Software-Engineer Quality Gate Priority Order -During Step 4 (Implementation), correctness priorities are: +During Step 3 (TDD Loop), correctness priorities are: 1. **Design correctness** — YAGNI > KISS > DRY > SOLID > Object Calisthenics > appropriate design patterns 2. **One test green** — the specific test under work passes, plus `test-fast` still passes diff --git a/docs/workflow.md b/docs/workflow.md index 98839cb..370a7a4 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -6,7 +6,7 @@ This document describes the complete feature lifecycle used to develop software ## Overview -Features flow through 6 steps with a WIP limit of 1 feature at a time. The filesystem enforces the limit: +Features flow through 5 steps with a WIP limit of 1 feature at a time. The filesystem enforces the limit: ``` docs/features/backlog/<name>.feature ← waiting @@ -239,33 +239,33 @@ Each step has a designated agent and a specific deliverable. 
No step is skipped. │ │ │ SELF-DECLARATION (once, after all quality gates pass) │ │ As a software-engineer I declare: │ -│ * YAGNI: no code without a failing test — YES/NO | file:line │ -│ * YAGNI: no speculative abstractions — YES/NO | file:line │ -│ * KISS: simplest solution that passes — YES/NO | file:line │ -│ * KISS: no premature optimization — YES/NO | file:line │ -│ * DRY: no duplication — YES/NO | file:line │ -│ * DRY: no redundant comments — YES/NO | file:line │ -│ * SOLID-S: one reason to change per class — YES/NO | file:line│ +│ * YAGNI: no code without a failing test — AGREE/DISAGREE | file:line │ +│ * YAGNI: no speculative abstractions — AGREE/DISAGREE | file:line │ +│ * KISS: simplest solution that passes — AGREE/DISAGREE | file:line │ +│ * KISS: no premature optimization — AGREE/DISAGREE | file:line │ +│ * DRY: no duplication — AGREE/DISAGREE | file:line │ +│ * DRY: no redundant comments — AGREE/DISAGREE | file:line │ +│ * SOLID-S: one reason to change per class — AGREE/DISAGREE | file:line│ │ * SOLID-O: open for extension, closed for modification │ -│ — YES/NO | file:line │ -│ * SOLID-L: subtypes substitutable — YES/NO | file:line │ -│ * SOLID-I: no forced unused deps — YES/NO | file:line │ +│ — AGREE/DISAGREE | file:line │ +│ * SOLID-L: subtypes substitutable — AGREE/DISAGREE | file:line │ +│ * SOLID-I: no forced unused deps — AGREE/DISAGREE | file:line │ │ * SOLID-D: depend on abstractions, not concretions │ -│ — YES/NO | file:line │ -│ * OC-1: one level of indentation per method — YES/NO | file:line│ -│ * OC-2: no else after return — YES/NO | file:line │ -│ * OC-3: primitive types wrapped — YES/NO | file:line │ -│ * OC-4: first-class collections — YES/NO | file:line │ -│ * OC-5: one dot per line — YES/NO | file:line │ -│ * OC-6: no abbreviations — YES/NO | file:line │ -│ * OC-7: ≤20 lines per function — YES/NO | file:line │ -│ * OC-8: ≤2 instance variables per class — YES/NO | file:line │ -│ * OC-9: no getters/setters — YES/NO | file:line │ -│ * 
Patterns: no creational smell — YES/NO | file:line │ -│ * Patterns: no structural smell — YES/NO | file:line │ -│ * Patterns: no behavioral smell — YES/NO | file:line │ +│ — AGREE/DISAGREE | file:line │ +│ * OC-1: one level of indentation per method — AGREE/DISAGREE | file:line│ +│ * OC-2: no else after return — AGREE/DISAGREE | file:line │ +│ * OC-3: primitive types wrapped — AGREE/DISAGREE | file:line │ +│ * OC-4: first-class collections — AGREE/DISAGREE | file:line │ +│ * OC-5: one dot per line — AGREE/DISAGREE | file:line │ +│ * OC-6: no abbreviations — AGREE/DISAGREE | file:line │ +│ * OC-7: ≤20 lines per function — AGREE/DISAGREE | file:line │ +│ * OC-8: ≤2 instance variables per class — AGREE/DISAGREE | file:line │ +│ * OC-9: no getters/setters — AGREE/DISAGREE | file:line │ +│ * Patterns: no creational smell — AGREE/DISAGREE | file:line │ +│ * Patterns: no structural smell — AGREE/DISAGREE | file:line │ +│ * Patterns: no behavioral smell — AGREE/DISAGREE | file:line │ │ * Semantic: tests operate at same abstraction as AC │ -│ — YES/NO | file:line │ +│ — AGREE/DISAGREE | file:line │ │ │ │ → Hand off to Step 4 (Verify) │ └─────────────────────────────────────────────────────────────────────┘ @@ -336,7 +336,7 @@ Each step has a designated agent and a specific deliverable. No step is skipped. │ │ │ ACCEPTED: │ │ mv in-progress/<name>.feature → completed/<name>.feature │ -│ developer creates PR (squash merge) + tags release │ +│ software-engineer creates PR (squash merge) + tags release │ │ │ │ REJECTED: │ │ feedback in TODO.md → back to relevant step │ @@ -378,7 +378,7 @@ Feature: <title> Synthesis: <full synthesis across clusters> Approved: YES / NO - Architecture: ← added at Step 2 by developer + Architecture: ← added at Step 2 by software-engineer ### Module Structure - <package>/domain/entity.py — ... 
@@ -424,10 +424,10 @@ Two discovery sources: ``` tests/ features/<feature-name>/ - <rule-slug>_test.py ← developer-written, one per Rule: block + <rule-slug>_test.py ← software-engineer-written, one per Rule: block function: test_<rule_slug>_<8char_hex>() unit/ - <anything>_test.py ← developer-authored extras, no @id traceability + <anything>_test.py ← software-engineer-authored extras, no @id traceability plain pytest or Hypothesis @given (developer choice) ``` From 624a56a90b17db9bf014b33f9cfb38fe976396fc Mon Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Fri, 17 Apr 2026 17:52:04 -0400 Subject: [PATCH 05/12] feat: add create-agent skill with research-backed design patterns - Updated create-skill to v2.0 with research basis (Anthropic, OpenAI) - Added new create-agent skill following OpenAI/Anthropic best practices - Added create-agent to AGENTS.md skills table - create-agent is invoked by human-user (not software-engineer) --- .opencode/skills/create-agent/SKILL.md | 201 +++++++++++++++++++++++++ .opencode/skills/create-skill/SKILL.md | 50 +++++- AGENTS.md | 1 + 3 files changed, 245 insertions(+), 7 deletions(-) create mode 100644 .opencode/skills/create-agent/SKILL.md diff --git a/.opencode/skills/create-agent/SKILL.md b/.opencode/skills/create-agent/SKILL.md new file mode 100644 index 0000000..300e17c --- /dev/null +++ b/.opencode/skills/create-agent/SKILL.md @@ -0,0 +1,201 @@ +--- +name: create-agent +description: Create new OpenCode agents with research-backed design patterns and industry standards +version: "1.0" +author: human-user +audience: human-user +workflow: opencode +--- + +# Create Agent + +Create a new OpenCode agent following research-backed best practices from OpenAI, Anthropic, and scientific literature. + +## When to Use + +When you need a new agent with distinct ownership, instructions, tool surface, or approval policy. Not for simple routing — only when the task requires a separate domain of responsibility. 
+ +## Research Basis + +### Agent Design Principles (OpenAI, Anthropic, 2024-2026) + +**Core principle**: "Define the smallest agent that can own a clear task. Add more agents only when you need separate ownership, different instructions, different tool surfaces, or different approval policies." — OpenAI Agents SDK (Entry 318 in `docs/academic_research.md`) + +**Split criterion is ownership boundary, not instruction volume.** + +### Multi-Agent Architecture Patterns + +| Pattern | When to Use | Example | +|---|---|---| +| **Single-agent** | Most tasks; incrementally add tools | `software-engineer` handles Steps 2-3 | +| **Hierarchical (triage + specialist)** | Multiple distinct task types requiring different expertise | `product-owner` → `software-engineer` → `reviewer` | +| **Evaluator-optimizer** | Tasks requiring iteration with quality checks | Review workflow | + +### Agent Definition Components + +From OpenAI's practical guide: + +1. **Model** — LLM for reasoning/decision-making +2. **Instructions** — System prompt defining behavior +3. **Tools** — Actions the agent can take +4. **Guardrails** — Safety boundaries + +## How to Create an Agent + +### 1. Create the agent file + +```bash +mkdir -p .opencode/agents/ +``` + +Create `.opencode/agents/<agent-name>.md`: + +```markdown +--- +name: <agent-name> +description: <1-sentence description of what this agent does> +role: <product-owner | software-engineer | reviewer | setup-project | human-user> +steps: <step numbers this agent owns, e.g., "2, 3"> +--- + +# <Agent Name> + +[Brief description of the agent's purpose and when it's invoked.] + +## Role + +<What this agent does in the workflow.> + +## Available Skills + +| Skill | When to Load | Purpose | +|---|---|---| +| `session-workflow` | Every session | Session start/end protocol | +| `<skill-name>` | When needed | <What the skill provides> | + +## Instructions + +<Detailed instructions for this agent. 
Include:> + +- When to invoke this agent (trigger conditions) +- What steps it owns +- How to use tools +- When to escalate or hand off +``` + +### 2. Follow the structural rules + +From `academic_research.md` Entry 410: + +| File | When Loaded | Content | Avoid | +|---|---|---|---| +| `AGENTS.md` | Always | Shared conventions, commands | Workflow details | +| `.opencode/agents/*.md` | When role invoked | Role identity, step ownership, skill loads, tool permissions | Duplication | +| `.opencode/skills/*.md` | On demand | Full procedural instructions | Duplication | + +### 3. Define clear ownership boundaries + +**Split criteria** (Anthropic/OpenAI): +- Separate ownership (different domain responsibility) +- Different instructions (not just more detail) +- Different tool surface (distinct actions) +- Different approval policy (escalation rules) + +**Anti-pattern**: Creating agents just to organize instructions. A single agent with more tools is usually better than multiple agents. + +### 4. Write effective instructions + +From Anthropic's agent design patterns: + +- **Specific triggers**: "Load this skill when X" not "use judgment" +- **Clear actions**: Every step corresponds to a specific output +- **Concrete examples**: Include before/after code where helpful +- **Verification criteria**: How does the agent know it's done? + +### 5. Define tool permissions + +From Anthropic's tool design principles: + +- **Start with bash** for breadth +- **Promote to dedicated tools** when you need to: + - Gate security-sensitive actions + - Render structured output + - Audit usage patterns + - Serialize vs. parallelize + +### 6. 
Add to AGENTS.md + +Register the agent in the workflow section of `AGENTS.md`: + +```markdown +## Agents + +| Agent | Role | Steps | Skills | +|-------|------|-------|--------| +| <name> | <role> | <steps> | <skills> | +``` + +## Agent Template + +```markdown +--- +name: <agent-name> +description: <what this agent does, 1 sentence> +role: <product-owner | software-engineer | reviewer | setup-project | human-user> +steps: <owned steps, e.g., "2-3"> +--- + +# <Agent Title> + +<2-3 paragraphs: what this agent does, when invoked, what it delivers.> + +## Context + +<What this agent knows/has access to> + +## Available Skills + +- `session-workflow` — always +- `<skill>` — when <trigger> + +## Instructions + +### Step <N>: <Step Name> + +1. <Specific action> +2. <Specific action> +3. <Verification> + +### Hand-off + +When to transfer to <other agent>: <condition> + +## Tool Permissions + +- Read files: <scope> +- Write files: <scope> +- Execute commands: <scope> +- Network access: <yes/no> + +## Escalation + +When to escalate to human: <conditions> +``` + +## Existing Agents in This Project + +| Agent | Role | Steps | Purpose | +|---|---|---|---| +| `product-owner` | product-owner | 1, 5 | Scope discovery, acceptance | +| `software-engineer` | software-engineer | 2, 3, 5 | Architecture, TDD, releases | +| `reviewer` | reviewer | 4 | Adversarial verification | +| `setup-project` | setup-project | meta | Initialize new projects | + +## Best Practices Summary + +1. **Start with a single agent** — add more only when ownership boundaries are clear +2. **Define ownership, not volume** — separate domains, not instruction sets +3. **Keep instructions specific** — concrete triggers, not vague guidance +4. **Match tools to security needs** — bash for flexibility, dedicated tools for gating +5. **Test with real usage** — iterate based on failures +6. 
**Reference, don't duplicate** — link to skills and AGENTS.md, don't copy content \ No newline at end of file diff --git a/.opencode/skills/create-skill/SKILL.md b/.opencode/skills/create-skill/SKILL.md index 7df4acc..4132176 100644 --- a/.opencode/skills/create-skill/SKILL.md +++ b/.opencode/skills/create-skill/SKILL.md @@ -1,7 +1,7 @@ --- name: create-skill description: Create new OpenCode skills following the skill definition standard -version: "1.0" +version: "2.0" author: software-engineer audience: software-engineer workflow: opencode @@ -9,12 +9,18 @@ workflow: opencode # Create Skill -Create a new reusable skill for OpenCode agents. +Create a new reusable skill for OpenCode agents, following research-backed best practices. ## When to Use When you need to codify a repeatable workflow that multiple agents or sessions will follow. Skills are loaded on demand; they don't run automatically. +## Research Basis + +- **Lazy loading principle** (Anthropic, 2024): Skills should be loaded on demand, not in every session. This preserves the primary context budget and prevents important instructions from being pushed beyond effective attention range (Entry 347 in `docs/academic_research.md`). +- **Concise is key** (Anthropic skill authoring best practices): Every token in a skill competes with conversation context. Keep SKILL.md under 500 lines for optimal performance. +- **Tool abstraction** (OpenAI Agents SDK): Skills should define clear actions that correspond to specific outputs, not abstract guidance. + ## How to Create a Skill ### 1. Create the directory @@ -29,7 +35,7 @@ Naming rules: - Cannot start or end with hyphen, no consecutive hyphens - Must match the directory name exactly -### 2. Create SKILL.md +### 2. Create SKILL.md with frontmatter ```markdown --- @@ -62,20 +68,49 @@ workflow: <workflow-category> - [ ] <Verification item> ``` -### 3. 
Keep it lean +**Frontmatter requirements:** +- `name`: Max 64 chars, lowercase letters/numbers/hyphens only +- `description`: 1 sentence, 10-100 chars, include key terms and triggers +- `author`/`audience`: Use role names from AGENTS.md +- `workflow`: Category like `feature-lifecycle`, `opencode`, `release-management` + +### 3. Write body content + +Follow these research-backed patterns: + +**Structure:** +1. **When to Use** — specific trigger conditions, not vague guidance +2. **Step-by-Step** — clear sequential steps with specific actions +3. **Checklist** — verification items the agent can self-check + +**Formatting rules:** +- Use imperative voice ("Write the test" not "You should write") +- One step per line item in checklists +- Include concrete examples (one is enough, not exhaustive) +- Use tables for multi-column data (tool options, decision criteria) +- Link to reference docs instead of duplicating them + +**Tone:** Write in third person. The description is injected into the system prompt. + +### 4. Keep it lean Skills are loaded into context. Long skills consume tokens. Target: - < 150 lines for focused workflow skills - < 250 lines for complex multi-phase skills +- < 500 lines absolute maximum (Anthropic recommendation) -Cut: +**Cut:** - Exhaustive examples when one is enough - Reference documentation (link to it instead) - Boilerplate CI/CD YAML (it belongs in `.github/`, not skills) -### 4. Reference from agents +### 5. Test with real usage + +The most effective skill development process involves using the skill in real tasks and iterating based on failures. + +### 6. Reference from agents -Add the skill name to the agent's "Available Skills" section so the agent knows to load it. +Add the skill name to the agent's "Available Skills" section so the agent knows to load it. Update AGENTS.md skills table. 
## Available Skills in This Project @@ -90,3 +125,4 @@ Add the skill name to the agent's "Available Skills" section so the agent knows | `pr-management` | software-engineer | Step 5: create PR with squash merge | | `git-release` | software-engineer | Step 5: calver versioning and release | | `create-skill` | software-engineer | Create new skills | +| `create-agent` | human-user | Create new agents with research-backed design | \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index 89fe0e3..c4ab039 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -48,6 +48,7 @@ STEP 5: ACCEPT (product-owner) → demo, validate, move folder to compl | `pr-management` | software-engineer | 5 | | `git-release` | software-engineer | 5 | | `create-skill` | software-engineer | meta | +| `create-agent` | human-user | meta | **Session protocol**: Every agent loads `skill session-workflow` at session start. Load additional skills as needed for the current step. From 83d0cc050142e8ffcd07b4bcc117c59348ce063a Mon Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Fri, 17 Apr 2026 17:59:08 -0400 Subject: [PATCH 06/12] fix: add mandatory research step to create-skill and create-agent skills - create-skill: Step 0 is now research domain, search best practices, synthesize conclusions, embed as guidance - create-agent: Same mandatory research step, scoped to agent's domain + domain methodology - Removed academic citations, embed conclusions as direct actionable guidance --- .opencode/skills/create-agent/SKILL.md | 49 ++++++++++++-------------- .opencode/skills/create-skill/SKILL.md | 26 +++++++++++--- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/.opencode/skills/create-agent/SKILL.md b/.opencode/skills/create-agent/SKILL.md index 300e17c..a482246 100644 --- a/.opencode/skills/create-agent/SKILL.md +++ b/.opencode/skills/create-agent/SKILL.md @@ -15,32 +15,27 @@ Create a new OpenCode agent following research-backed best practices from OpenAI When you 
need a new agent with distinct ownership, instructions, tool surface, or approval policy. Not for simple routing — only when the task requires a separate domain of responsibility. -## Research Basis - -### Agent Design Principles (OpenAI, Anthropic, 2024-2026) - -**Core principle**: "Define the smallest agent that can own a clear task. Add more agents only when you need separate ownership, different instructions, different tool surfaces, or different approval policies." — OpenAI Agents SDK (Entry 318 in `docs/academic_research.md`) - -**Split criterion is ownership boundary, not instruction volume.** - -### Multi-Agent Architecture Patterns - -| Pattern | When to Use | Example | -|---|---|---| -| **Single-agent** | Most tasks; incrementally add tools | `software-engineer` handles Steps 2-3 | -| **Hierarchical (triage + specialist)** | Multiple distinct task types requiring different expertise | `product-owner` → `software-engineer` → `reviewer` | -| **Evaluator-optimizer** | Tasks requiring iteration with quality checks | Review workflow | +## How to Create an Agent -### Agent Definition Components +### 0. Research (mandatory — do this first) -From OpenAI's practical guide: +Before writing any agent, research the domain to ground the agent design in industry standards and scientifically-backed evidence: -1. **Model** — LLM for reasoning/decision-making -2. **Instructions** — System prompt defining behavior -3. **Tools** — Actions the agent can take -4. **Guardrails** — Safety boundaries +1. **Identify the agent's domain**: What role, responsibility, and domain will this agent own? +2. **Search for domain-specific best practices**: + - For agent architecture: OpenAI Agents SDK, Anthropic Claude Agent SDK, Google Agents SDK + - For domain methodology: Academic papers, vendor guides, established standards (e.g., OWASP for security, IEEE for software engineering) + - For known failure modes: Post-mortems, case studies, industry reports +3. 
**Synthesize conclusions**: What ownership boundaries work? What tool design patterns? What escalation rules? +4. **Embed as design decisions**: Write the agent's ownership definition, instruction patterns, tool surface, and escalation rules based on those conclusions — not as citations but as direct guidance -## How to Create an Agent +**Example research synthesis:** +``` +Agent domain: Security reviewer agent +Research: OWASP Testing Guide, NIST security controls, Anthropic's adversarial verification patterns +Conclusion: Security agents should assume breach by default, escalate on any critical finding, use defense-in-depth checklist. +→ Agent design: "role: reviewer", "escalation: any critical = human", "tool: security-scan + vuln-check" +``` ### 1. Create the agent file @@ -85,7 +80,7 @@ steps: <step numbers this agent owns, e.g., "2, 3"> ### 2. Follow the structural rules -From `academic_research.md` Entry 410: +Apply the research conclusions about file organization: | File | When Loaded | Content | Avoid | |---|---|---|---| @@ -93,9 +88,11 @@ From `academic_research.md` Entry 410: | `.opencode/agents/*.md` | When role invoked | Role identity, step ownership, skill loads, tool permissions | Duplication | | `.opencode/skills/*.md` | On demand | Full procedural instructions | Duplication | +**Why**: Keeping always-loaded files lean preserves attention budget for the task at hand. + ### 3. Define clear ownership boundaries -**Split criteria** (Anthropic/OpenAI): +**Split criteria**: - Separate ownership (different domain responsibility) - Different instructions (not just more detail) - Different tool surface (distinct actions) @@ -105,7 +102,7 @@ From `academic_research.md` Entry 410: ### 4. 
Write effective instructions -From Anthropic's agent design patterns: +Write instructions that work in practice: - **Specific triggers**: "Load this skill when X" not "use judgment" - **Clear actions**: Every step corresponds to a specific output @@ -114,7 +111,7 @@ From Anthropic's agent design patterns: ### 5. Define tool permissions -From Anthropic's tool design principles: +Design the tool surface based on what the agent needs to accomplish: - **Start with bash** for breadth - **Promote to dedicated tools** when you need to: diff --git a/.opencode/skills/create-skill/SKILL.md b/.opencode/skills/create-skill/SKILL.md index 4132176..8d94116 100644 --- a/.opencode/skills/create-skill/SKILL.md +++ b/.opencode/skills/create-skill/SKILL.md @@ -15,13 +15,29 @@ Create a new reusable skill for OpenCode agents, following research-backed best When you need to codify a repeatable workflow that multiple agents or sessions will follow. Skills are loaded on demand; they don't run automatically. -## Research Basis +## How to Create a Skill -- **Lazy loading principle** (Anthropic, 2024): Skills should be loaded on demand, not in every session. This preserves the primary context budget and prevents important instructions from being pushed beyond effective attention range (Entry 347 in `docs/academic_research.md`). -- **Concise is key** (Anthropic skill authoring best practices): Every token in a skill competes with conversation context. Keep SKILL.md under 500 lines for optimal performance. -- **Tool abstraction** (OpenAI Agents SDK): Skills should define clear actions that correspond to specific outputs, not abstract guidance. +### 0. Research (mandatory — do this first) -## How to Create a Skill +Before writing any skill, research the domain to ground the skill in industry standards and scientifically-backed evidence: + +1. **Identify the domain**: What workflow or methodology will this skill codify? +2. 
**Search for best practices**: + - Academic sources (Google Scholar, IEEE, ACM) + - Vendor documentation (OpenAI, Anthropic, Google, Microsoft) + - Industry standards (ISO, NIST, OMG) + - Established methodologies (e.g., FDD, Scrum, Kanban for process skills) +3. **Read existing research**: Check `docs/academic_research.md` for related entries +4. **Synthesize conclusions**: Extract actionable conclusions — what works, why, and when to apply it +5. **Embed as guidance**: Write the skill's steps, checklists, and decision rules based on those conclusions — not as academic citations but as direct guidance ("Use X because it produces Y outcome") + +**Example research synthesis:** +``` +Research question: How to structure a security review skill? +Sources found: OWASP Testing Guide, NIST SP 800-53, Anthropic's agent design patterns +Conclusion: Security reviews should be adversarial (assume breakage), use defense-in-depth checklist, escalate on first critical finding. +→ Skill step: "3. Run adversarial checks — assume breach, verify every control" +``` ### 1. Create the directory From 1d99b8eb83cc592803a469944b22bea6945031bf Mon Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Fri, 17 Apr 2026 18:02:11 -0400 Subject: [PATCH 07/12] =?UTF-8?q?fix:=20remaining=20inconsistencies=20-=20?= =?UTF-8?q?README.md=206=20steps=20=E2=86=92=205=20steps,=20developer=20?= =?UTF-8?q?=E2=86=92=20software-engineer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 11 +++++------ docs/workflow.md | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 955fd5f..2defb3a 100644 --- a/README.md +++ b/README.md @@ -44,16 +44,15 @@ docs/features/in-progress/ ← exactly one feature being built docs/features/completed/ ← accepted and shipped features ``` -**3 roles, 6 steps:** +**4 roles, 5 steps:** | Step | Role | What happens | |------|------|-------------| | 1. 
SCOPE | Product Owner | Discovery + Gherkin stories + `@id` criteria | -| 2. ARCH | Developer | Design module structure, get PO approval | -| 3. TEST FIRST | Developer | Sync stubs, write failing tests mapped to `@id` | -| 4. IMPLEMENT | Developer | Red→Green→Refactor, commit per green test | -| 5. VERIFY | Reviewer | Run all commands, code review, `@id` traceability | -| 6. ACCEPT | Product Owner | Demo, validate, move folder to completed/ | +| 2. ARCH | Software Engineer | Design module structure, read all backlog features | +| 3. TDD LOOP | Software Engineer | RED→GREEN→REFACTOR, one `@id` at a time | +| 4. VERIFY | Reviewer | Run all commands, adversarial code review | +| 5. ACCEPT | Product Owner | Demo, validate, move folder to completed/ | ### AI Agents diff --git a/docs/workflow.md b/docs/workflow.md index 370a7a4..90f2477 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -428,7 +428,7 @@ tests/ function: test_<rule_slug>_<8char_hex>() unit/ <anything>_test.py ← software-engineer-authored extras, no @id traceability - plain pytest or Hypothesis @given (developer choice) + plain pytest or Hypothesis @given (software-engineer choice) ``` --- From e7a5ed09312cc8e7eb75d7b9cfb41ef694cb713e Mon Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Sat, 18 Apr 2026 01:24:34 -0400 Subject: [PATCH 08/12] fix: final consistency updates - README.md and docs/workflow.md role names, step counts, skills list --- README.md | 29 ++++++++++++----------------- docs/workflow.md | 2 +- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 2defb3a..e217a35 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ uv run task test && uv run task lint && uv run task static-check && timeout 10s ### Development Workflow -A **6-step Kanban workflow** with WIP=1 (one feature at a time), enforced by the filesystem: +A **5-step Kanban workflow** with WIP=1 (one feature at a time), enforced by the filesystem: ``` 
docs/features/backlog/ ← features waiting to be worked on @@ -58,7 +58,7 @@ docs/features/completed/ ← accepted and shipped features ```bash @product-owner # Defines features, picks from backlog, accepts deliveries -@developer # Architecture, tests, code, git, releases +@software-engineer # Architecture, tests, code, git, releases @reviewer # Runs commands, reviews code — read+bash only @setup-project # One-time template initialization ``` @@ -68,30 +68,25 @@ docs/features/completed/ ← accepted and shipped features ```bash /skill session-workflow # Read TODO.md, continue, hand off cleanly /skill scope # Write user stories + acceptance criteria -/skill tdd # TDD: file naming, docstring format, markers -/skill implementation # Red-Green-Refactor, architecture, ADRs -/skill code-quality # redirects to verify (quick reference) -/skill verify # Step 5 verification checklist +/skill implementation # Steps 2-3: architecture + TDD loop +/skill design-patterns # Refactor with patterns when smell detected +/skill code-quality # Redirects to verify (quick reference) +/skill verify # Step 4 verification checklist /skill pr-management # Branch naming, PR template, squash merge /skill git-release # Hybrid calver versioning, themed naming /skill create-skill # Add new skills to the system +/skill create-agent # Add new agents (human-user only) ``` ## Development Commands ```bash uv run task run # Run the application (humans) -timeout 10s uv run task run # Run with timeout (agents — exit 124 = hung = FAIL) -uv run task test # Full test suite with coverage report -uv run task test-fast # Tests without coverage (faster iteration) -uv run task test-slow # Only slow tests -uv run task lint # ruff check + format -uv run task static-check # pyright type checking -uv run task gen-id # Generate an 8-char hex ID for @id tags -uv run task gen-tests # Sync test stubs from .feature files -uv run task doc-build # Generate API docs + coverage + test reports -uv run task doc-publish # Publish 
unified docs site to GitHub Pages -uv run task doc-serve # Live API doc server at localhost:8080 +timeout 10s uv run task run # Run with timeout (agents — exit 124 = hung = FAIL) +uv run task test # Full test suite with 100% coverage (Step 4 handoff) +uv run task test-fast # Fast tests no coverage (Step 3 Red-Green-Refactor cycle) +uv run task lint # ruff check + format (Step 4 handoff) +uv run task static-check # pyright type checking (Step 4 handoff) ``` ## Code Quality Standards diff --git a/docs/workflow.md b/docs/workflow.md index 90f2477..6052f92 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -492,7 +492,7 @@ As a software-engineer I declare: |---|---|---| | **Stakeholder** | Human | Answers questions, provides domain knowledge, approves syntheses | | **Product Owner** | AI agent | Interviews stakeholder, writes `.feature` files, picks features, accepts deliveries | -| **Developer** | AI agent | Architecture, tests, code, git, releases | +| **Software Engineer** | AI agent | Architecture, tests, code, git, releases | | **Reviewer** | AI agent | Adversarial verification — defaults to REJECTED until proven correct | --- From 458df2fcdb5ee545959ec6f2e1973c66a664e0fb Mon Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Sat, 18 Apr 2026 01:24:46 -0400 Subject: [PATCH 09/12] fix: remove broken gen-tests task from pyproject.toml (tdd skill was deleted) --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6658085..694ea0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -144,7 +144,6 @@ pytest \ doc-publish = "task doc-build && ghp-import -n -p -f docs" static-check = "pyright" gen-id = "python -c \"import uuid; [print(uuid.uuid4().hex[:8]) for _ in range(20)]\"" -gen-tests = "python .opencode/skills/tdd/scripts/gen_test_stubs.py" gen-todo = "python .opencode/skills/session-workflow/scripts/gen_todo.py" [dependency-groups] From 9a2f45ffe01c0800c63c91c86407ea182ba1bac2 Mon 
Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Sat, 18 Apr 2026 02:22:03 -0400 Subject: [PATCH 10/12] refactor: rename cluster to behavior groups for clearer AI focus --- .opencode/skills/scope/SKILL.md | 40 +++++++++--------- .opencode/skills/scope/discovery-template.md | 6 +-- AGENTS.md | 10 ++--- docs/features/discovery.md | 6 +-- docs/workflow.md | 44 ++++++++++---------- 5 files changed, 53 insertions(+), 53 deletions(-) diff --git a/.opencode/skills/scope/SKILL.md b/.opencode/skills/scope/SKILL.md index 7581ab5..14af5f6 100644 --- a/.opencode/skills/scope/SKILL.md +++ b/.opencode/skills/scope/SKILL.md @@ -63,7 +63,7 @@ Ask the stakeholder to describe the same situation from another actor's point of Three levels of active listening apply throughout every interview session: - **Level 1 — Per answer**: immediately paraphrase each answer before moving to the next question. "So if I understand correctly, you're saying that X happens when Y?" Catches misunderstanding in the moment. -- **Level 2 — Per cluster**: brief synthesis when transitioning between topic clusters. "We've covered [area A] and [area B]. Before I ask about [area C], here is what I understood so far: [summary]. Does that capture it?" Confirms completeness, gives stakeholder a recovery point. +- **Level 2 — Per group**: brief synthesis when transitioning between behavior groups. "We've covered [area A] and [area B]. Before I ask about [area C], here is what I understood so far: [summary]. Does that capture it?" Confirms completeness, gives stakeholder a recovery point. - **Level 3 — End of session**: full synthesis of everything discussed. Present to stakeholder for approval. This is the accuracy gate, the baseline signal, and the input to domain modeling. Do not introduce topic labels or categories during active listening. The summary must reflect what the stakeholder said, not new framing that prompts reactions to things they haven't considered. 
@@ -98,20 +98,20 @@ Do not introduce topic labels or categories during active listening. The summary 4. Run a **silent pre-mortem** on the confirmed synthesis: "Imagine we build exactly what was described, ship it, and it fails. What was missing?" Add any discoveries as new questions to the Questions table. 5. Mark `Template §1: CONFIRMED` in `discovery.md`. This unlocks Session 2. -### Session 2 — Cluster / Big Picture +### Session 2 — Behavior Groups / Big Picture -**Before the session**: Review the confirmed Session 1 synthesis. Identify topic clusters (cross-cutting concerns, system-wide constraints, integration points, lifecycle questions). Prepare cluster-level questions. +**Before the session**: Review the confirmed Session 1 synthesis. Identify behavior groups (cross-cutting concerns, system-wide constraints, integration points, lifecycle questions). Prepare group-level questions. -**During the session**: Apply Level 1 active listening per answer. Apply Level 2 active listening when transitioning between clusters. Apply CIT, Laddering, and CI Perspective Change per cluster. Add new questions in the moment. +**During the session**: Apply Level 1 active listening per answer. Apply Level 2 active listening when transitioning between groups. Apply CIT, Laddering, and CI Perspective Change per group. Add new questions in the moment. **After the session**: -1. For each cluster, write a **Cluster Summary** in `discovery.md`. +1. For each group, write a **Group Summary** in `discovery.md`. 2. Mark `Template §2: CONFIRMED` in `discovery.md`. This unlocks Session 3. ### Session 3 — Synthesis Approval + Feature Derivation -**Before the session**: Produce a **Full Synthesis** across all clusters from Sessions 1 and 2. Write it to `discovery.md`. +**Before the session**: Produce a **Full Synthesis** across all behavior groups from Sessions 1 and 2. Write it to `discovery.md`. **During the session**: Present the full synthesis to the stakeholder. 
"This is my understanding of the full scope. Please correct anything that is missing or wrong." Stakeholder approves or corrects. PO refines until the stakeholder explicitly approves. @@ -152,26 +152,26 @@ Commit: `feat(discovery): baseline project discovery` 3. Run a **silent pre-mortem** on the confirmed synthesis. 4. Mark `Template §1: CONFIRMED`. This unlocks Session 2. -### Session 2 — Cluster / Big Picture for This Feature +### Session 2 — Behavior Groups / Big Picture for This Feature -**Before the session**: Review the confirmed Session 1 synthesis. Identify clusters of behavior within this feature (happy paths, error paths, edge cases, lifecycle events, integration points). +**Before the session**: Review the confirmed Session 1 synthesis. Identify behavior groups within this feature (happy paths, error paths, edge cases, lifecycle events, integration points). -**During the session**: Apply Level 1 active listening per answer. Apply Level 2 active listening when transitioning between clusters. Apply CIT, Laddering, and CI Perspective Change per cluster. +**During the session**: Apply Level 1 active listening per answer. Apply Level 2 active listening when transitioning between groups. Apply CIT, Laddering, and CI Perspective Change per group. **After the session**: -1. Write **Cluster Summaries** in the `.feature` file. Name each cluster — these names become candidate `Rule:` titles. +1. Write **Group Summaries** in the `.feature` file. Name each group — these names become candidate `Rule:` titles. 2. Mark `Template §2: CONFIRMED`. This unlocks Session 3. ### Session 3 — Feature Synthesis Approval + Story Derivation -**Before the session**: Produce a **Full Synthesis** of the feature scope, covering all clusters from Sessions 1 and 2. +**Before the session**: Produce a **Full Synthesis** of the feature scope, covering all behavior groups from Sessions 1 and 2. **During the session**: Present the full synthesis to the stakeholder. 
Stakeholder approves or corrects. PO refines until explicitly approved. **After the session** (PO alone): -1. Map each named cluster from Session 2 to a candidate user story (Rule). +1. Map each named group from Session 2 to a candidate user story (Rule). 2. Write `Status: BASELINED (YYYY-MM-DD)` to the `.feature` file's discovery section. 3. Mark `Template §3: CONFIRMED`. @@ -355,16 +355,16 @@ Feature: <Feature title> Template §1: CONFIRMED Synthesis: <PO synthesis — confirmed by stakeholder> - Session 2 — Cluster / Big Picture: + Session 2 — Behavior Groups / Big Picture: | ID | Question | Answer | Status | |----|----------|--------|--------| | Q2 | ... | ... | OPEN / ANSWERED | Template §2: CONFIRMED - Clusters: - - <Cluster name>: <one-sentence summary> + Behavior Groups: + - <Behavior group name>: <one-sentence summary> Session 3 — Feature Synthesis: - Synthesis: <full synthesis across all clusters> + Synthesis: <full synthesis across all behavior groups> Template §3: CONFIRMED — stakeholder approved YYYY-MM-DD Rule: <User story title> @@ -409,19 +409,19 @@ Template §1: CONFIRMED Synthesis: <PO synthesis — confirmed by stakeholder> Pre-mortem: <gaps identified; new questions added above> -## Session 2 — Cluster / Big Picture +## Session 2 — Behavior Groups / Big Picture | ID | Question | Answer | Status | |----|----------|--------|--------| | Q2 | ... | ... 
| OPEN / ANSWERED | Template §2: CONFIRMED -Clusters: -- <Cluster name>: <one-sentence summary> +Behavior Groups: +- <Behavior group name>: <one-sentence summary> ## Session 3 — Full Synthesis -<3–6 paragraph synthesis of all scope, clusters, and boundaries> +<3–6 paragraph synthesis of all scope, behavior groups, and boundaries> Template §3: CONFIRMED — stakeholder approved YYYY-MM-DD ``` diff --git a/.opencode/skills/scope/discovery-template.md b/.opencode/skills/scope/discovery-template.md index c8e8c90..117d025 100644 --- a/.opencode/skills/scope/discovery-template.md +++ b/.opencode/skills/scope/discovery-template.md @@ -20,13 +20,13 @@ Feature: <feature-name> Synthesis: (fill after stakeholder confirms) Pre-mortem: (fill after synthesis is confirmed) - Session 2 — Cluster / Big Picture: + Session 2 — Behavior Groups / Big Picture: | ID | Question | Answer | Status | |----|----------|--------|--------| Template §2: PENDING - Clusters: - - (fill after all cluster questions are answered) + Behavior Groups: + - (fill after all group questions are answered) Session 3 — Feature Synthesis: (fill after Sessions 1 and 2 are complete) diff --git a/AGENTS.md b/AGENTS.md index c4ab039..9d5a93e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -58,20 +58,20 @@ STEP 5: ACCEPT (product-owner) → demo, validate, move folder to compl PO creates `docs/features/discovery.md` using the 3-session template. **Skip Phase 1 entirely if `discovery.md` Status is BASELINED.** To add features to an existing project: append new questions to Session 1 and re-fill from there. - **Session 1** — Individual scope elicitation: 5Ws + Success + Failure + Out-of-scope. Gap-finding per answer using CIT, Laddering, and CI Perspective Change. PO writes synthesis; stakeholder confirms or corrects. PO runs silent pre-mortem on confirmed synthesis. Template §1 must be confirmed before Session 2. -- **Session 2** — Cluster / big picture: questions target clusters and cross-cutting concerns. 
Gap-finding per cluster. Level 2 synthesis when transitioning between clusters. Template §2 must be complete before Session 3. -- **Session 3** — Synthesis approval + feature derivation: PO produces full synthesis of all clusters; stakeholder approves or corrects (PO refines until approved). Domain analysis: nouns/verbs → subject areas → FDD "Action object" feature names. Create `backlog/<name>.feature` stubs. Write `Status: BASELINED` to `discovery.md`. +- **Session 2** — Behavior groups / big picture: questions target behavior groups and cross-cutting concerns. Gap-finding per group. Level 2 synthesis when transitioning between groups. Template §2 must be complete before Session 3. +- **Session 3** — Synthesis approval + feature derivation: PO produces full synthesis of all behavior groups; stakeholder approves or corrects (PO refines until approved). Domain analysis: nouns/verbs → subject areas → FDD "Action object" feature names. Create `backlog/<name>.feature` stubs. Write `Status: BASELINED` to `discovery.md`. ### Phase 2 — Feature Discovery (per feature) Each `.feature` file has its own 3-session discovery template in its description. **Sessions are enforced by the template: each section must be filled before proceeding to the next.** - **Session 1** — Individual entity elicitation: populate Entities table from project discovery; generate questions from entity gaps using CIT, Laddering, CI Perspective Change. PO writes synthesis; stakeholder confirms. Silent pre-mortem on confirmed synthesis. -- **Session 2** — Cluster / big picture: questions target clusters of behavior within this feature. Gap-finding per cluster. Level 2 cluster transition summaries. -- **Session 3** — Feature synthesis approval + story derivation: PO produces synthesis of feature scope and clusters; stakeholder approves or corrects (PO refines until approved). Clusters become candidate user stories (Rules). Write `Status: BASELINED` to `.feature` discovery section. 
+- **Session 2** — Behavior groups / big picture: questions target behavior groups within this feature. Gap-finding per group. Level 2 group transition summaries. +- **Session 3** — Feature synthesis approval + story derivation: PO produces synthesis of feature scope and behavior groups; stakeholder approves or corrects (PO refines until approved). Behavior groups become candidate user stories (Rules). Write `Status: BASELINED` to `.feature` discovery section. **Decomposition check**: after Session 3, does this feature span >2 distinct concerns OR have >8 candidate Examples? YES → split into separate `.feature` files, re-run Phase 2. NO → proceed. ### Phase 3 — Stories (PO alone) -Clusters from Phase 2 Session 2 → one `Rule:` block per user story. Each `Rule:` has the user story header (`As a / I want / So that`) as its description — no `Example:` blocks yet. INVEST gate: all 6 letters must pass. Commit: `feat(stories): write user stories for <name>` +Story candidates from Phase 2 Session 2 → one `Rule:` block per user story. Each `Rule:` has the user story header (`As a / I want / So that`) as its description — no `Example:` blocks yet. INVEST gate: all 6 letters must pass. Commit: `feat(stories): write user stories for <name>` ### Phase 4 — Criteria (PO alone) Pre-mortem per Rule (all Rules must be checked before writing Examples). Write `Example:` blocks — declarative Given/When/Then, MoSCoW triage (Must/Should/Could) per Example. Review checklist (4.3).
Commit: `feat(criteria): write acceptance criteria for <name>` diff --git a/docs/features/discovery.md b/docs/features/discovery.md index d7d7c3c..f764e10 100644 --- a/docs/features/discovery.md +++ b/docs/features/discovery.md @@ -23,14 +23,14 @@ Pre-mortem: (fill after synthesis is confirmed) --- -## Session 2 — Cluster / Big Picture +## Session 2 — Behavior Groups / Big Picture | ID | Question | Answer | Status | |----|----------|--------|--------| Template §2: PENDING -Clusters: -- (fill after all cluster questions are answered) +Behavior Groups: +- (fill after all group questions are answered) --- diff --git a/docs/workflow.md b/docs/workflow.md index 6052f92..20e114b 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -46,20 +46,20 @@ Each step has a designated agent and a specific deliverable. No step is skipped. │ → PO runs silent pre-mortem on confirmed synthesis │ │ [template §1: synthesis confirmed → unlocks Session 2] │ │ │ -│ Session 2 — Cluster / Big Picture │ -│ Questions target clusters and cross-cutting concerns │ -│ Gap-finding per cluster: CIT · Laddering · CI Perspective │ +│ Session 2 — Behavior Groups / Big Picture │ +│ Questions target behavior groups and cross-cutting concerns │ +│ Gap-finding per group: CIT · Laddering · CI Perspective │ │ [new questions from elucidation added in the moment] │ │ Level 1: paraphrase each answer │ -│ Level 2: synthesis when transitioning between clusters │ -│ [template §2: all clusters answered → unlocks Session 3] │ +│ Level 2: synthesis when transitioning between groups │ +│ [template §2: all groups answered → unlocks Session 3] │ │ │ │ Session 3 — Synthesis Approval + Feature Derivation │ -│ PO produces full synthesis across all clustered areas │ +│ PO produces full synthesis across all behavior groups │ │ → stakeholder approves or corrects; PO refines until approved │ │ [template §3: approval → unlocks domain analysis] │ │ Domain analysis: nouns/verbs → subject areas │ -│ Name features (FDD "Action 
object" / Affinity clusters) │ +│ Name features (FDD "Action object" / Affinity groups) │ │ Create backlog/<name>.feature stubs │ │ Status: BASELINED written to discovery.md │ │ │ @@ -75,18 +75,18 @@ Each step has a designated agent and a specific deliverable. No step is skipped. │ → PO runs silent pre-mortem on confirmed synthesis │ │ [template §1: synthesis confirmed → unlocks Session 2] │ │ │ -│ Session 2 — Cluster / Big Picture for this Feature │ -│ Questions target clusters of behavior within this feature │ -│ Gap-finding per cluster: CIT · Laddering · CI Perspective │ +│ Session 2 — Behavior Groups / Big Picture for this Feature │ +│ Questions target behavior groups within this feature │ +│ Gap-finding per group: CIT · Laddering · CI Perspective │ │ [new questions from elucidation added in the moment] │ -│ Level 1: paraphrase · Level 2: cluster transition summaries │ -│ [template §2: all clusters answered → unlocks Session 3] │ -│ │ -│ Session 3 — Feature Synthesis Approval + Story Derivation │ -│ PO produces synthesis of feature scope and clusters │ -│ → stakeholder approves or corrects; PO refines until approved │ -│ Clusters → candidate user stories (Rules) │ -│ Status: BASELINED written to .feature discovery section │ +│ Level 1: paraphrase · Level 2: group transition summaries │ +│ [template §2: all groups answered → unlocks Session 3] │ +│ │ +│ Session 3 — Feature Synthesis Approval + Story Derivation │ +│ PO produces synthesis of feature scope and behavior groups │ +│ → stakeholder approves or corrects; PO refines until approved │ +│ Story candidates → candidate user stories (Rules) │ +│ Status: BASELINED written to .feature discovery section │ │ [template §3: approval + stories → unlocks decomp check] │ │ │ │ DECOMPOSITION CHECK │ @@ -95,7 +95,7 @@ Each step has a designated agent and a specific deliverable. No step is skipped. 
│ NO → proceed │ │ │ │ Phase 3 — Stories (PO alone) │ -│ Clusters from Phase 2 Session 2 → one Rule: block per story │ +│ Story candidates from Phase 2 Session 2 → one Rule: block per story │ │ INVEST gate: all 6 letters must pass before committing │ │ commit: feat(stories): write user stories for <name> │ │ │ @@ -370,12 +370,12 @@ Feature: <title> | ID | Question | Answer | Status | ← OPEN / ANSWERED Synthesis: <PO synthesis — confirmed by stakeholder> - Session 2 — Cluster / Big Picture: + Session 2 — Behavior Groups / Big Picture: | ID | Question | Answer | Status | - Clusters: <named topic clusters derived from answers> + Behavior Groups: <named behavior groups derived from answers> Session 3 — Feature Synthesis: - Synthesis: <full synthesis across clusters> + Synthesis: <full synthesis across behavior groups> Approved: YES / NO Architecture: ← added at Step 2 by software-engineer From 424d9ee51c878d559294491f0e0dff535f5a02d1 Mon Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Sat, 18 Apr 2026 02:57:17 -0400 Subject: [PATCH 11/12] fix: enforce BASELINED gate before PO moves feature to in-progress --- .opencode/agents/product-owner.md | 1 + .opencode/skills/session-workflow/SKILL.md | 4 ++-- TODO.md | 2 +- docs/workflow.md | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.opencode/agents/product-owner.md b/.opencode/agents/product-owner.md index 081653d..40e6d05 100644 --- a/.opencode/agents/product-owner.md +++ b/.opencode/agents/product-owner.md @@ -34,6 +34,7 @@ Load `skill session-workflow` first — it reads TODO.md, orients you to the cur - No other agent may edit these files - Software-engineer escalates spec gaps to you; you decide whether to extend criteria - **You pick** the next feature from backlog — the software-engineer never self-selects +- **NEVER move a feature to `in-progress/` unless its discovery section has `Status: BASELINED`** — if not baselined, complete Step 1 (Phase 2 + 3 + 4) first ## 
Step 5 — Accept diff --git a/.opencode/skills/session-workflow/SKILL.md b/.opencode/skills/session-workflow/SKILL.md index fc370a8..85da1f7 100644 --- a/.opencode/skills/session-workflow/SKILL.md +++ b/.opencode/skills/session-workflow/SKILL.md @@ -27,7 +27,7 @@ Every session starts by reading state. Every session ends by writing state. This 3. Run `git status` — understand what is committed vs. what is not 4. Confirm scope: you are working on exactly one step of one feature -If TODO.md says "No feature in progress", report to the PO that backlog features are waiting. **The software-engineer never self-selects a feature from the backlog — only the PO picks.** +If TODO.md says "No feature in progress", report to the PO that backlog features are waiting. **The software-engineer never self-selects a feature from the backlog — only the PO picks.** The PO must verify the feature has `Status: BASELINED` in its discovery section before moving it to `in-progress/` — if not baselined, the PO must complete Step 1 first. ## Session End @@ -89,7 +89,7 @@ When no feature is active: # Current Work No feature in progress. -Next: PO picks feature from docs/features/backlog/ and moves it to docs/features/in-progress/. +Next: PO picks a feature from docs/features/backlog/ that has Status: BASELINED and moves it to docs/features/in-progress/. ``` ## Step 3 (TDD Loop) Cycle-Aware TODO Format diff --git a/TODO.md b/TODO.md index f0bff67..72e090e 100644 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,4 @@ # Current Work No feature in progress. -Next: PO picks feature from docs/features/backlog/ and moves it to docs/features/in-progress/. +Next: PO picks a feature from docs/features/backlog/ that has Status: BASELINED and moves it to docs/features/in-progress/. diff --git a/docs/workflow.md b/docs/workflow.md index 20e114b..e7d2797 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -108,7 +108,7 @@ Each step has a designated agent and a specific deliverable. No step is skipped. 
│ ★ FROZEN — changes require @deprecated + new Example │ │ │ └─────────────────────────────────────────────────────────────────────┘ - ↓ PO picks feature from backlog + ↓ PO picks feature from backlog — only if Status: BASELINED ┌─────────────────────────────────────────────────────────────────────┐ │ STEP 2 — ARCHITECTURE agent: software-engineer │ ├─────────────────────────────────────────────────────────────────────┤ From e327381aa7536ef0b9006a613c58611e920f67b5 Mon Sep 17 00:00:00 2001 From: nullhack <nullhack@users.noreply.github.com> Date: Sat, 18 Apr 2026 04:08:00 -0400 Subject: [PATCH 12/12] refactor: replace raise NotImplementedError with skip stub, drop unit/integration markers --- .opencode/skills/implementation/SKILL.md | 16 ++++++---------- AGENTS.md | 12 +++++------- README.md | 4 ++-- pyproject.toml | 3 --- 4 files changed, 13 insertions(+), 22 deletions(-) diff --git a/.opencode/skills/implementation/SKILL.md b/.opencode/skills/implementation/SKILL.md index 30fc2a2..27187fc 100644 --- a/.opencode/skills/implementation/SKILL.md +++ b/.opencode/skills/implementation/SKILL.md @@ -243,8 +243,10 @@ def test_<rule_slug>_<8char_hex>() -> None: ### Docstring Format (mandatory) +New tests start as skipped stubs. Remove `@pytest.mark.skip` when implementing in the RED phase. + ```python -@pytest.mark.unit +@pytest.mark.skip(reason="not yet implemented") def test_wall_bounce_a3f2b1c4() -> None: """ Given: A ball moving upward reaches y=0 @@ -262,19 +264,13 @@ def test_wall_bounce_a3f2b1c4() -> None: ### Markers -Every test gets exactly one of: -- `@pytest.mark.unit` — isolated, no external state -- `@pytest.mark.integration` — multiple components, external state - -Additionally: - `@pytest.mark.slow` — takes > 50ms (Hypothesis, DB, network, terminal I/O) +- `@pytest.mark.deprecated` — auto-skipped by conftest; used for superseded Examples ```python -@pytest.mark.unit def test_wall_bounce_a3f2b1c4() -> None: ... 
-@pytest.mark.integration @pytest.mark.slow def test_checkout_flow_b2c3d4e5() -> None: ... @@ -285,7 +281,6 @@ def test_checkout_flow_b2c3d4e5() -> None: When using `@given` in `tests/unit/`: ```python -@pytest.mark.unit @pytest.mark.slow @given(x=st.floats(min_value=-100, max_value=100, allow_nan=False)) @example(x=0.0) @@ -321,7 +316,8 @@ If testing through the real entry point is infeasible, escalate to PO to adjust - Write every test as if you cannot see the production code — test what a caller observes - No `isinstance()`, `type()`, or internal attribute (`_x`) checks in assertions - One assertion concept per test (multiple `assert` ok if they verify the same thing) -- No `pytest.skip` or `pytest.mark.xfail` without written justification +- No `pytest.mark.xfail` without written justification +- `pytest.mark.skip` is only valid on stubs (`reason="not yet implemented"`) — remove it when implementing - Test data embedded directly in the test, not loaded from external files ### Test Tool Decision diff --git a/AGENTS.md b/AGENTS.md index 9d5a93e..ebde9e6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -115,7 +115,7 @@ def test_<rule_slug>_<8char_hex>() -> None: ### Docstring Format (mandatory) ```python -@pytest.mark.unit +@pytest.mark.skip(reason="not yet implemented") def test_wall_bounce_a3f2b1c4() -> None: """ Given: A ball moving upward reaches y=0 @@ -125,13 +125,11 @@ def test_wall_bounce_a3f2b1c4() -> None: # Given # When # Then - raise NotImplementedError ``` -### Markers (3 total) -- `@pytest.mark.unit` — isolated, one function/class, no external state -- `@pytest.mark.integration` — multiple components, external state -- `@pytest.mark.slow` — takes > 50ms; additionally applied alongside `unit` or `integration` +### Markers +- `@pytest.mark.slow` — takes > 50ms; applied to Hypothesis tests and any test with I/O, network, or DB +- `@pytest.mark.deprecated` — auto-skipped by conftest; used for superseded Examples ## Development Commands @@ -175,7 +173,7 @@ uv 
run task doc-serve - **Max nesting**: 2 levels - **Instance variables**: ≤ 2 per class - **Semantic alignment**: tests must operate at the same abstraction level as the acceptance criteria they cover -- **Integration tests**: multi-component features require at least one `@pytest.mark.integration` test exercising the public entry point +- **Integration tests**: multi-component features require at least one test in `tests/features/` that exercises the public entry point end-to-end ### Software-Engineer Quality Gate Priority Order diff --git a/README.md b/README.md index e217a35..06db549 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ uv run task static-check # pyright type checking (Step 4 handoff) ## Test Conventions ```python -@pytest.mark.unit +@pytest.mark.skip(reason="not yet implemented") def test_bounce_physics_a3f2b1c4() -> None: """ Given: A ball moving upward reaches y=0 @@ -119,7 +119,7 @@ def test_bounce_physics_a3f2b1c4() -> None: ... ``` -**Markers**: `@pytest.mark.unit` · `@pytest.mark.integration` · `@pytest.mark.slow` · `@pytest.mark.deprecated` +**Markers**: `@pytest.mark.slow` · `@pytest.mark.deprecated` ## Technology Stack diff --git a/pyproject.toml b/pyproject.toml index 694ea0d..4f67dc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,8 +80,6 @@ pydocstyle.convention = "google" minversion = "6.0" markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", - "unit: isolated tests for a single function or class", - "integration: tests covering multiple components together", "deprecated: marks tests for deprecated AC; automatically skipped (deselect with '-m \"not deprecated\"')", ] addopts = """ @@ -103,7 +101,6 @@ exclude_lines = [ "if self.debug:", "if settings.DEBUG", "raise AssertionError", - "raise NotImplementedError", "if 0:", "if __name__ == .__main__.:", ]