From 758abf76c3df7e02c0c47797a54473b01087406a Mon Sep 17 00:00:00 2001 From: MakiforDevelop Date: Mon, 18 May 2026 17:55:07 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E7=B5=A6=E8=A8=AA=E8=AB=87=20bot=20?= =?UTF-8?q?=E5=85=B1=E7=94=A8=E7=9A=84=20interview=20briefing=EF=BC=88purp?= =?UTF-8?q?ose=20+=20=E7=B4=AF=E7=A9=8D=20context=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根因修法。pilot 連續 4 個 bug(EVASION 誤判 / resume 把控制訊息當題目 / PII 誤 redact / follow-up 咬語氣詞)的共同根因:每個 LLM call 只拿到 當前問題、沒有訪談目的、沒有累積的對話脈絡 —— bot 沒頭沒尾,不知道 為何問、不知道前面聊過什麼、不知道怎麼接話。 Codex + Gemini 雙審設計、Chair 拍板: - 新模組 briefing.py:build_interview_briefing 組裝 purpose(訪談目的, 明示困惑/痛苦是材料不是迴避)+ 進度 + durable summary(anchors/ triples)+ coverage gaps + 最近逐字 - InterviewBriefing dataclass + 4 種 render mode(full / follow_up / classifier / anchor),各 LLM call 拿到適量 context - process_turn 每 turn 組裝一次,注入 evaluate_depth / extract_anchors / _final_reply / generate_follow_up - 全部 briefing 參數 optional(=None),向後相容、零行為改變 Constraint: 在既有 pipeline 上加 context 層;不重寫成 LLM-driven、 不改 DB schema、不擴張 ontology Directive: classifier mode 只給 purpose + 近期逐字 — EVASION 誤判的根因 修法;anchor mode 不給完整舊 anchors,避免 confirmation bias Co-Authored-By: Claude Opus 4.7 (1M context) --- src/virtualme/interview/anchor_extractor.py | 4 + src/virtualme/interview/bot.py | 94 ++++++--- src/virtualme/interview/briefing.py | 203 ++++++++++++++++++++ src/virtualme/interview/depth_evaluator.py | 8 +- src/virtualme/interview/follow_up.py | 9 +- tests/unit/test_anchor_extractor.py | 67 +++++++ tests/unit/test_bot.py | 94 ++++++++- tests/unit/test_briefing.py | 137 +++++++++++++ tests/unit/test_depth_evaluator.py | 22 +++ tests/unit/test_follow_up_cjk.py | 39 ++++ 10 files changed, 649 insertions(+), 28 deletions(-) create mode 100644 src/virtualme/interview/briefing.py create mode 100644 tests/unit/test_anchor_extractor.py create mode 100644 tests/unit/test_briefing.py diff --git a/src/virtualme/interview/anchor_extractor.py b/src/virtualme/interview/anchor_extractor.py index 8d9bc16..b77c7e5 100644 --- a/src/virtualme/interview/anchor_extractor.py +++ b/src/virtualme/interview/anchor_extractor.py @@ -2,6 +2,7 @@ from anthropic import AsyncAnthropic +from virtualme.interview.briefing import InterviewBriefing from virtualme.interview.json_utils import extract_json_payload from virtualme.interview.models import MODEL_STANDARD, create_message from virtualme.storage.db import Anchor, Dimension, Layer, Question, Turn @@ -11,8 +12,11 @@ async def extract_anchors( turn: Turn, current_question: Question, claude: AsyncAnthropic, + briefing: InterviewBriefing | None = None, ) -> list[Anchor]: + briefing_text = f"{briefing.render('anchor')}\n\n" if briefing is not None else "" prompt = f""" +{briefing_text} Extract 1-3 anchors as JSON list. Fields: dimension, layer, content. Use dimensions: {[dimension.value for dimension in Dimension]}. Use layers: fact, pattern, principle. diff --git a/src/virtualme/interview/bot.py b/src/virtualme/interview/bot.py index c305ee8..286b6dd 100644 --- a/src/virtualme/interview/bot.py +++ b/src/virtualme/interview/bot.py @@ -8,6 +8,7 @@ from virtualme.export.auto import auto_export_persona from virtualme.interview import byok from virtualme.interview.anchor_extractor import extract_anchors +from virtualme.interview.briefing import InterviewBriefing, build_interview_briefing from virtualme.interview.commands import ( DIMENSION_LABELS, GenerateProfileRequest, @@ -101,6 +102,7 @@ async def process_turn( else await db.get_current_week(interviewee_id, max_week) ) session = await db.get_or_create_session(interviewee_id, week=week) + briefing = await build_interview_briefing(db, interviewee_id, session, max_week) command = pre_gate_command if is_session_closing(incoming_message): return await _close_session( @@ -124,6 +126,7 @@ async def process_turn( db, selector, settings, + briefing, ) turn_count = await db.count_turns(session.id) if _is_light_greeting(incoming_message): @@ -134,6 +137,7 @@ async def process_turn( active_client, db, selector, + briefing, ) scrub_result = scrub_pii(incoming_message) @@ -150,11 +154,11 @@ async def process_turn( asked_question_ids = await db.load_asked_question_ids(interviewee_id) current_question = await _resolve_current_question(db, selector, session.id, session.week) assessment = await evaluate_depth( - scrub_result.scrubbed_text, current_question.text, active_client + scrub_result.scrubbed_text, current_question.text, active_client, briefing ) if assessment.parse_failed: reply = await _restate_current_question( - interviewee_id, current_question, active_client, db + interviewee_id, current_question, active_client, db, briefing ) await db.save_turn(session.id, "assistant", reply) return reply @@ -171,6 +175,7 @@ async def process_turn( is_meta=True, anchors_by_dimension=anchors_by_dimension, asked_question_ids=asked_question_ids, + briefing=briefing, ) await db.save_turn(session.id, "assistant", reply) return reply @@ -187,6 +192,7 @@ async def process_turn( is_meta=False, anchors_by_dimension=anchors_by_dimension, asked_question_ids=asked_question_ids, + briefing=briefing, ) await db.save_turn(session.id, "assistant", reply) return reply @@ -197,7 +203,9 @@ async def process_turn( rule = select_rule(scrub_result.scrubbed_text, depth, all_anchors) if assessment.kind == TurnKind.SUFFICIENT: - extracted_anchors = await extract_anchors(user_turn, current_question, active_client) + extracted_anchors = await extract_anchors( + user_turn, current_question, active_client, briefing + ) for anchor in extracted_anchors: await db.save_anchor( interviewee_id, @@ -217,7 +225,7 @@ async def process_turn( if should_probe: await db.record_question_probe(interviewee_id, current_question.id, session.week) reply = await generate_follow_up( - rule, scrub_result.scrubbed_text, current_question.text, active_client + rule, scrub_result.scrubbed_text, current_question.text, active_client, briefing ) else: excluded = {current_question.id} if probe_count >= MAX_PROBES_PER_QUESTION else set() @@ -234,7 +242,9 @@ async def process_turn( await db.set_current_question_id(session.id, next_question.id) await db.record_question_asked(interviewee_id, next_question.id, session.week) if next_question is not None: - reply = await _final_reply(interviewee_id, next_question, active_client, db) + reply = await _final_reply( + interviewee_id, next_question, active_client, db, briefing + ) elif settings.use_ppa: from virtualme.interview.ppa import ppa_response from virtualme.interview.reinjection import build_reinjection_anchor, should_reinject @@ -246,7 +256,9 @@ async def process_turn( dialogue_context = f"{anchor}\n\n{dialogue_context}" if anchor else dialogue_context reply = await ppa_response(dialogue_context, triples, active_client, settings) else: - reply = await _final_reply(interviewee_id, DEFAULT_QUESTION, active_client, db) + reply = await _final_reply( + interviewee_id, DEFAULT_QUESTION, active_client, db, briefing + ) await db.save_turn(session.id, "assistant", reply) turns_so_far = await db.load_session_turns(session.id) @@ -316,6 +328,7 @@ async def _handle_non_answer( is_meta: bool, anchors_by_dimension: dict, asked_question_ids: set[str], + briefing: InterviewBriefing | None = None, ) -> str: count = await db.record_question_non_answer( interviewee_id, current_question.id, session.week @@ -325,12 +338,12 @@ async def _handle_non_answer( # 分類器必有誤判, runtime 不該讓一次 EVASION 判斷就停題。 if count < 2: return await _gentle_evasion_bridge( - interviewee_id, current_question, active_client, db + interviewee_id, current_question, active_client, db, briefing ) return _pause_current_question() if count < 2: return await _bridge_to_current_question( - interviewee_id, user_text, current_question, active_client, db + interviewee_id, user_text, current_question, active_client, db, briefing ) next_question = selector.select_next( @@ -344,38 +357,51 @@ async def _handle_non_answer( await db.reset_question_non_answer(interviewee_id, current_question.id) if next_question is None: return await _bridge_to_current_question( - interviewee_id, user_text, current_question, active_client, db + interviewee_id, user_text, current_question, active_client, db, briefing ) await db.set_current_question_id(session.id, next_question.id) await db.record_question_asked(interviewee_id, next_question.id, session.week) - return await _final_reply(interviewee_id, next_question, active_client, db) + return await _final_reply(interviewee_id, next_question, active_client, db, briefing) async def _restate_current_question( - interviewee_id: str, question: Question, claude: AsyncAnthropic, db: DB + interviewee_id: str, + question: Question, + claude: AsyncAnthropic, + db: DB, + briefing: InterviewBriefing | None = None, ) -> str: # Re-ask via _final_reply so the question is rendered in Traditional Chinese # — the English question-pool text must never reach the interviewee. - asked = await _final_reply(interviewee_id, question, claude, db) + asked = await _final_reply(interviewee_id, question, claude, db, briefing) return f"我們先回到剛才這題。\n{asked}" async def _bridge_to_current_question( - interviewee_id: str, user_text: str, question: Question, claude: AsyncAnthropic, db: DB + interviewee_id: str, + user_text: str, + question: Question, + claude: AsyncAnthropic, + db: DB, + briefing: InterviewBriefing | None = None, ) -> str: if _asks_for_traditional_chinese(user_text): prefix = "可以,我們用繁體中文。" # noqa: RUF001 else: prefix = "可以,我先記下這點。" # noqa: RUF001 - asked = await _final_reply(interviewee_id, question, claude, db) + asked = await _final_reply(interviewee_id, question, claude, db, briefing) return f"{prefix}我們回到剛才這題。\n{asked}" async def _gentle_evasion_bridge( - interviewee_id: str, question: Question, claude: AsyncAnthropic, db: DB + interviewee_id: str, + question: Question, + claude: AsyncAnthropic, + db: DB, + briefing: InterviewBriefing | None = None, ) -> str: - asked = await _final_reply(interviewee_id, question, claude, db) + asked = await _final_reply(interviewee_id, question, claude, db, briefing) return f"這題如果不好說, 可以慢慢來 —— 挑一個你想到的小片段講就好。\n{asked}" @@ -423,6 +449,7 @@ async def _handle_light_greeting( active_client: AsyncAnthropic, db: DB, selector: QuestionSelector, + briefing: InterviewBriefing | None = None, ) -> str: """Resume from known progress instead of classifying a greeting as an answer.""" scrub_result = scrub_pii(incoming_message) @@ -447,7 +474,9 @@ async def _handle_light_greeting( or _is_control_message(raw_last_asked) or _has_unresolved_placeholder(last_asked) ): - rendered_question = await _final_reply(interviewee_id, question, active_client, db) + rendered_question = await _final_reply( + interviewee_id, question, active_client, db, briefing + ) reply = ( f"{progress_prefix}\n" f"我們從【{DIMENSION_LABELS[question.dimension]}】開始。\n" @@ -460,7 +489,9 @@ async def _handle_light_greeting( f"剛才問的是:\n{last_asked}" ) else: - rendered_question = await _final_reply(interviewee_id, question, active_client, db) + rendered_question = await _final_reply( + interviewee_id, question, active_client, db, briefing + ) reply = ( f"{progress_prefix}\n" f"我們從【{DIMENSION_LABELS[question.dimension]}】開始。\n" @@ -558,6 +589,7 @@ async def _handle_command( db: DB, selector: QuestionSelector, settings: Settings, + briefing: InterviewBriefing | None = None, ) -> str: """Reply to a meta-command. Saves the turn pair but runs no extraction.""" if isinstance(command, GenerateProfileRequest): @@ -581,7 +613,7 @@ async def _handle_command( user_turn = await db.save_turn(session.id, "user", scrub_result.scrubbed_text) await db.save_redactions(user_turn.id, scrub_result.redactions) reply, new_session = await _handle_restart( - interviewee_id, active_client, db, selector, settings + interviewee_id, active_client, db, selector, settings, briefing ) await db.save_turn(new_session.id, "assistant", reply) return reply @@ -672,6 +704,7 @@ async def _handle_restart( db: DB, selector: QuestionSelector, settings: Settings, + briefing: InterviewBriefing | None = None, ) -> tuple[str, Session]: archive_note = "已先輸出目前的 markdown archive 快照。" try: @@ -686,7 +719,9 @@ async def _handle_restart( first_question = _default_question(selector, 1) await db.set_current_question_id(new_session.id, first_question.id) await db.record_question_asked(interviewee_id, first_question.id, 1) - rendered_question = await _final_reply(interviewee_id, first_question, active_client, db) + rendered_question = await _final_reply( + interviewee_id, first_question, active_client, db, briefing + ) return format_restart_reply(archive_note, archived_counts, rendered_question), new_session @@ -768,17 +803,26 @@ def _all_questions(selector: QuestionSelector) -> list[Question]: async def _final_reply( - interviewee_id: str, question: Question, claude: AsyncAnthropic, db: DB + interviewee_id: str, + question: Question, + claude: AsyncAnthropic, + db: DB, + briefing: InterviewBriefing | None = None, ) -> str: - anchors = await db.load_anchors_summary(interviewee_id) - gaps = await db.compute_coverage_gap(interviewee_id) + briefing_text = f"{briefing.render('full')}\n\n" if briefing is not None else "" + if briefing is None: + anchors = await db.load_anchors_summary(interviewee_id) + gaps = await db.compute_coverage_gap(interviewee_id) + context_lines = f"Accumulated anchors: {anchors}\nCoverage gaps: {gaps}" + else: + context_lines = "" system = f""" +{briefing_text} You are the interview assistant for {interviewee_id}. Ask one question at a time. {INTERVIEW_OUTPUT_LANGUAGE} Translate the source question into natural Traditional Chinese, preserving its exact meaning, depth, and directness. Do not advise, praise, soften, or add commentary. -Accumulated anchors: {anchors} -Coverage gaps: {gaps} +{context_lines} """ response = await create_message( claude, diff --git a/src/virtualme/interview/briefing.py b/src/virtualme/interview/briefing.py new file mode 100644 index 0000000..2e3be86 --- /dev/null +++ b/src/virtualme/interview/briefing.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from virtualme.storage.db import Dimension, Layer + +INTERVIEW_PURPOSE = """INTERVIEW PURPOSE: +This is a long-running, multi-week VirtualMe interview. Its goal is to understand +the interviewee's behavior patterns, decision mechanisms, tradeoffs, boundaries, +and recurring ways of responding under pressure — not to judge, advise, diagnose, +or optimize them. +Treat confusion, hesitation, doubt, emotion, pain, contradiction, and reflection +about difficult topics as meaningful interview material — NOT as evasion. Genuine +evasion is only explicit refusal or deflection of the topic. +Ask one gentle question at a time. Prefer concrete episodes, choices, and +constraints. Do not advise, praise, diagnose, or over-interpret.""" + +MAX_RENDER_CHARS = 8000 +MAX_ANCHORS = 12 +MAX_TRIPLES = 12 +MAX_COVERAGE_GAPS = 8 +MAX_RECENT_TURNS = 8 +MAX_ANCHOR_MODE_TURNS = 4 + + +@dataclass(frozen=True) +class InterviewBriefing: + purpose: str + progress: str + durable_summary: str + coverage_gaps: str + recent_transcript: str + + def render(self, mode: str) -> str: + if mode == "classifier": + sections = [ + self.purpose, + _section("RECENT CONVERSATION", self.recent_transcript), + ] + elif mode == "anchor": + sections = [ + self.purpose, + _section( + "RECENT CONVERSATION", + _last_transcript_lines(self.recent_transcript, MAX_ANCHOR_MODE_TURNS), + ), + ] + elif mode == "follow_up": + sections = [ + self.purpose, + _section("STILL TO COVER", self.coverage_gaps), + _section("RECENT CONVERSATION", self.recent_transcript), + ] + else: + sections = [ + self.purpose, + _section("PROGRESS", self.progress), + _section("WHAT WE KNOW SO FAR", self.durable_summary), + _section("STILL TO COVER", self.coverage_gaps), + _section("RECENT CONVERSATION", self.recent_transcript), + ] + return _fit_to_limit(sections, mode) + + +async def build_interview_briefing( + db: Any, interviewee_id: str, session: Any, max_week: int +) -> InterviewBriefing: + anchors_by_dimension = await db.load_anchors_summary(interviewee_id) + triples = await db.load_triples(interviewee_id) + coverage_gap = await db.compute_coverage_gap(interviewee_id) + recent_turns = await db.load_recent_turns(session.id, MAX_RECENT_TURNS) + + return InterviewBriefing( + purpose=INTERVIEW_PURPOSE, + progress=f"Week {session.week} of {max_week}.", + durable_summary=_format_durable_summary(anchors_by_dimension, triples), + coverage_gaps=_format_coverage_gaps(coverage_gap), + recent_transcript=_format_recent_transcript(recent_turns), + ) + + +def _section(title: str, content: str) -> str: + return f"{title}:\n{content.strip() if content.strip() else 'None yet.'}" + + +def _format_durable_summary( + anchors_by_dimension: dict[Dimension, list[Any]], triples: list[Any] +) -> str: + lines: list[str] = [] + anchors = [ + anchor + for dimension in Dimension + for anchor in anchors_by_dimension.get(dimension, []) + if str(getattr(anchor, "content", "")).strip() + ] + anchors.sort( + key=lambda anchor: ( + 0 if getattr(anchor, "triangulated", False) else 1, + 0 if getattr(anchor, "layer", None) == Layer.PRINCIPLE else 1, + ) + ) + for anchor in anchors[:MAX_ANCHORS]: + dimension = getattr(anchor, "dimension", "") + dimension_text = getattr(dimension, "value", str(dimension)) + layer = getattr(anchor, "layer", "") + layer_text = getattr(layer, "value", str(layer)) + marker = "triangulated " if getattr(anchor, "triangulated", False) else "" + lines.append(f"- Anchor [{dimension_text}/{marker}{layer_text}]: {anchor.content}") + + prioritized_triples = [ + triple + for triple in triples + if str(getattr(triple, "object", "")).strip() + ] + relation_priority = {"red_line": 0, "value_anchor": 1, "skill": 2} + prioritized_triples.sort( + key=lambda triple: relation_priority.get(str(getattr(triple, "relation", "")), 3) + ) + for triple in prioritized_triples[:MAX_TRIPLES]: + lines.append( + "- Triple " + f"[{triple.relation}]: {triple.subject} -> {triple.object} " + f"(confidence {float(getattr(triple, 'confidence', 0.0)):.2f})" + ) + + return "\n".join(lines) if lines else "No durable signal extracted yet." + + +def _format_coverage_gaps(coverage_gap: dict[Dimension, float]) -> str: + if not coverage_gap: + return "No computed coverage gaps yet." + rows = sorted( + coverage_gap.items(), + key=lambda item: item[1], + reverse=True, + ) + lines = [] + for dimension, gap in rows[:MAX_COVERAGE_GAPS]: + dimension_text = getattr(dimension, "value", str(dimension)) + lines.append(f"- {dimension_text}: {float(gap):.2f}") + return "\n".join(lines) + + +def _format_recent_transcript(turns: list[Any]) -> str: + if not turns: + return "No recent conversation yet." + labels = {"user": "受訪者", "assistant": "訪談者"} + lines = [] + for turn in turns[-MAX_RECENT_TURNS:]: + role = labels.get(str(getattr(turn, "role", "")), str(getattr(turn, "role", ""))) + content = str(getattr(turn, "content", "")).strip() + if content: + lines.append(f"{role}: {content}") + return "\n".join(lines) if lines else "No recent conversation yet." + + +def _last_transcript_lines(transcript: str, limit: int) -> str: + lines = [line for line in transcript.splitlines() if line.strip()] + if not lines: + return "No recent conversation yet." + return "\n".join(lines[-limit:]) + + +def _fit_to_limit(sections: list[str], mode: str) -> str: + text = "\n\n".join(sections) + if len(text) <= MAX_RENDER_CHARS: + return text + + if mode == "classifier": + return _truncate_last_section(sections, MAX_RENDER_CHARS) + if mode == "anchor": + return _truncate_last_section(sections, MAX_RENDER_CHARS) + + trimmed = sections[:] + if trimmed: + trimmed[-1] = _truncate_section(trimmed[-1], MAX_RENDER_CHARS // 4) + text = "\n\n".join(trimmed) + if len(text) <= MAX_RENDER_CHARS: + return text + + if len(trimmed) >= 3: + trimmed[2] = _truncate_section(trimmed[2], MAX_RENDER_CHARS // 4) + return _truncate_last_section(trimmed, MAX_RENDER_CHARS) + + +def _truncate_last_section(sections: list[str], limit: int) -> str: + text = "\n\n".join(sections) + if len(text) <= limit: + return text + if not sections: + return "" + prefix = "\n\n".join(sections[:-1]) + separator = "\n\n" if prefix else "" + available = max(0, limit - len(prefix) - len(separator) - len("\n[truncated]")) + sections[-1] = sections[-1][:available].rstrip() + "\n[truncated]" + return "\n\n".join(sections) + + +def _truncate_section(section: str, target: int) -> str: + if len(section) <= target: + return section + return section[: max(0, target - len("\n[truncated]"))].rstrip() + "\n[truncated]" diff --git a/src/virtualme/interview/depth_evaluator.py b/src/virtualme/interview/depth_evaluator.py index c2245f6..57daec8 100644 --- a/src/virtualme/interview/depth_evaluator.py +++ b/src/virtualme/interview/depth_evaluator.py @@ -5,6 +5,7 @@ from anthropic import AsyncAnthropic +from virtualme.interview.briefing import InterviewBriefing from virtualme.interview.json_utils import extract_json_payload from virtualme.interview.models import MODEL_FAST, create_message from virtualme.storage.db import Layer @@ -34,9 +35,14 @@ def value(self) -> str: async def evaluate_depth( - answer: str, current_question: str, claude: AsyncAnthropic + answer: str, + current_question: str, + claude: AsyncAnthropic, + briefing: InterviewBriefing | None = None, ) -> TurnAssessment: + briefing_text = f"{briefing.render('classifier')}\n\n" if briefing is not None else "" prompt = f""" +{briefing_text} Assess this interview turn. Return JSON only, no markdown. Schema: diff --git a/src/virtualme/interview/follow_up.py b/src/virtualme/interview/follow_up.py index a6d2a17..52496ee 100644 --- a/src/virtualme/interview/follow_up.py +++ b/src/virtualme/interview/follow_up.py @@ -2,6 +2,7 @@ from anthropic import AsyncAnthropic +from virtualme.interview.briefing import InterviewBriefing from virtualme.interview.lang import INTERVIEW_OUTPUT_LANGUAGE, length_units, tokens from virtualme.interview.models import MODEL_STANDARD, create_message from virtualme.storage.db import Anchor, Layer @@ -81,12 +82,18 @@ def select_rule( async def generate_follow_up( - rule: FollowUpRule, answer: str, original_question: str, claude: AsyncAnthropic + rule: FollowUpRule, + answer: str, + original_question: str, + claude: AsyncAnthropic, + briefing: InterviewBriefing | None = None, ) -> str: if rule == FollowUpRule.R5_REPEAT_TO_TRIANGULATE: return "這個原則我想我們已經談得夠清楚了。讓我換個角度問。" rule_instruction = FOLLOW_UP_RULE_PROMPTS[rule] + briefing_text = f"{briefing.render('follow_up')}\n\n" if briefing is not None else "" prompt = f""" +{briefing_text} Generate one short follow-up question. Rule: {rule.value} diff --git a/tests/unit/test_anchor_extractor.py b/tests/unit/test_anchor_extractor.py new file mode 100644 index 0000000..25e2c86 --- /dev/null +++ b/tests/unit/test_anchor_extractor.py @@ -0,0 +1,67 @@ +import json + +from virtualme.interview.anchor_extractor import extract_anchors +from virtualme.interview.briefing import INTERVIEW_PURPOSE, InterviewBriefing +from virtualme.storage.db import Dimension, Question, Turn + + +class _Content: + def __init__(self, text: str): + self.text = text + + +class _Messages: + def __init__(self): + self.calls = [] + + async def create(self, **kwargs): + self.calls.append(kwargs) + return type( + "Response", + (), + { + "content": [ + _Content( + json.dumps( + [ + { + "dimension": "STATE", + "layer": "fact", + "content": "works through uncertainty", + } + ] + ) + ) + ] + }, + ) + + +class _Claude: + def __init__(self): + self.messages = _Messages() + + +async def test_extract_anchors_prompt_includes_briefing_when_present(): + claude = _Claude() + briefing = InterviewBriefing( + purpose=INTERVIEW_PURPOSE, + progress="Week 1 of 4.", + durable_summary="- durable", + coverage_gaps="- gap", + recent_transcript="受訪者: one\n訪談者: two\n受訪者: three\n訪談者: four\n受訪者: five", + ) + + await extract_anchors( + Turn(id=1, session_id=1, role="user", content="answer", content_hash="h"), + Question(id="Q1", week=1, dimension=Dimension.STATE, text="Question?"), + claude, + briefing, + ) + + prompt = claude.messages.calls[0]["messages"][0]["content"] + assert "INTERVIEW PURPOSE:" in prompt + assert "RECENT CONVERSATION:" in prompt + assert "WHAT WE KNOW SO FAR:" not in prompt + assert "受訪者: one" not in prompt + assert "訪談者: two" in prompt diff --git a/tests/unit/test_bot.py b/tests/unit/test_bot.py index b4aa4c1..39122a2 100644 --- a/tests/unit/test_bot.py +++ b/tests/unit/test_bot.py @@ -3,14 +3,19 @@ from pydantic import SecretStr from virtualme.config import Settings +from virtualme.interview import bot from virtualme.interview.bot import ( + _final_reply, _handle_light_greeting, _handle_non_answer, _is_control_message, _pause_current_question, _resolve_current_question, + process_turn, ) -from virtualme.storage.db import Dimension, Question, Session +from virtualme.interview.briefing import INTERVIEW_PURPOSE, InterviewBriefing +from virtualme.interview.depth_evaluator import TurnAssessment, TurnKind +from virtualme.storage.db import DB, Dimension, Layer, Question, Session class _Content: @@ -19,7 +24,11 @@ def __init__(self, text: str): class _Messages: + def __init__(self): + self.calls = [] + async def create(self, **kwargs): + self.calls.append(kwargs) text = kwargs["messages"][0]["content"].split("Ask this next: ", 1)[1] return type("Response", (), {"content": [_Content(text)]}) @@ -176,3 +185,86 @@ async def test_resolve_current_question_uses_pool_question_when_last_assistant_t resolved = await _resolve_current_question(db, _selector(question), session_id=1, week=1) assert resolved.text == question.text + + +async def test_final_reply_system_prompt_includes_briefing_when_present(): + question = Question( + id="Q1", + week=1, + dimension=Dimension.STATE, + text="請說說您最近的工作狀況。", + ) + claude = _Claude() + briefing = InterviewBriefing( + purpose=INTERVIEW_PURPOSE, + progress="Week 1 of 4.", + durable_summary="- durable", + coverage_gaps="- gap", + recent_transcript="受訪者: 前一輪回答", + ) + + reply = await _final_reply("u1", question, claude, _DB(count=0), briefing) + + system = claude.messages.calls[0]["system"] + assert reply == question.text + assert "INTERVIEW PURPOSE:" in system + assert "WHAT WE KNOW SO FAR:" in system + assert "Accumulated anchors:" not in system + + +async def test_process_turn_builds_briefing_once_and_passes_downstream(tmp_path, monkeypatch): + db = DB(str(tmp_path / "virtualme.db")) + await db.init() + question = Question( + id="Q1", + week=1, + dimension=Dimension.STATE, + text="How has work been?", + ) + selector = SimpleNamespace( + question_pool={1: [question]}, + select_next=lambda *args, **kwargs: None, + ) + settings = Settings(anthropic_api_key=SecretStr("test"), use_ppa=False) + briefing = InterviewBriefing( + purpose=INTERVIEW_PURPOSE, + progress="Week 1 of 1.", + durable_summary="No durable signal extracted yet.", + coverage_gaps="No computed coverage gaps yet.", + recent_transcript="No recent conversation yet.", + ) + calls = [] + + async def fake_build(db_arg, interviewee_id, session, max_week): + calls.append(("build", interviewee_id, session.id, max_week)) + return briefing + + async def fake_evaluate_depth(answer, current_question, claude, briefing_arg=None): + calls.append(("depth", briefing_arg)) + return TurnAssessment( + kind=TurnKind.SUFFICIENT, + depth=Layer.PRINCIPLE, + needs_follow_up=False, + confidence=0.9, + ) + + async def fake_extract_anchors(turn, current_question, claude, briefing_arg=None): + calls.append(("anchor", briefing_arg)) + return [] + + async def fake_final_reply(interviewee_id, question, claude, db, briefing_arg=None): + calls.append(("final", briefing_arg)) + return "final question" + + monkeypatch.setattr(bot, "build_interview_briefing", fake_build) + monkeypatch.setattr(bot, "evaluate_depth", fake_evaluate_depth) + monkeypatch.setattr(bot, "extract_anchors", fake_extract_anchors) + monkeypatch.setattr(bot, "_final_reply", fake_final_reply) + + reply = await process_turn("u1", "I value direct evidence.", object(), db, selector, settings) + + assert reply == "final question" + assert calls[0] == ("build", "u1", 1, 1) + assert ("depth", briefing) in calls + assert ("anchor", briefing) in calls + assert ("final", briefing) in calls diff --git a/tests/unit/test_briefing.py b/tests/unit/test_briefing.py new file mode 100644 index 0000000..5d47ce4 --- /dev/null +++ b/tests/unit/test_briefing.py @@ -0,0 +1,137 @@ +from types import SimpleNamespace + +from virtualme.interview.briefing import ( + INTERVIEW_PURPOSE, + MAX_RENDER_CHARS, + InterviewBriefing, + build_interview_briefing, +) +from virtualme.interview.triples import PersonaTriple +from virtualme.storage.db import Anchor, Dimension, Layer, Session, Turn + + +class _BriefingDB: + def __init__(self, *, recent_turns: list[Turn] | None = None): + self.recent_turns = recent_turns or [] + + async def load_anchors_summary(self, interviewee_id: str): + anchors = [] + for index in range(15): + anchors.append( + Anchor( + interviewee_id=interviewee_id, + dimension=Dimension.STATE, + layer=Layer.PRINCIPLE if index % 2 == 0 else Layer.FACT, + content=f"anchor {index}", + triangulated=index >= 10, + ) + ) + return {Dimension.STATE: anchors} + + async def load_triples(self, interviewee_id: str): + relations = ["preference"] * 4 + ["skill"] * 4 + ["value_anchor"] * 4 + ["red_line"] * 4 + return [ + PersonaTriple( + interviewee_id=interviewee_id, + subject="interviewee", + relation=relation, + object=f"triple {index}", + source_turn_ids=[index], + ) + for index, relation in enumerate(relations) + ] + + async def compute_coverage_gap(self, interviewee_id: str): + return {dimension: index / 10 for index, dimension in enumerate(Dimension)} + + async def load_recent_turns(self, session_id: int, limit: int): + return self.recent_turns[-limit:] + + +async def test_build_interview_briefing_applies_item_limits(): + turns = [ + Turn(id=index, session_id=1, role="user", content=f"turn {index}", content_hash="h") + for index in range(10) + ] + + briefing = await build_interview_briefing( + _BriefingDB(recent_turns=turns), + "u1", + Session(id=1, interviewee_id="u1", week=2), + max_week=5, + ) + + assert briefing.purpose == INTERVIEW_PURPOSE + assert briefing.progress == "Week 2 of 5." + assert briefing.durable_summary.count("- Anchor") == 12 + assert briefing.durable_summary.count("- Triple") == 12 + assert briefing.coverage_gaps.count("- ") == 8 + assert briefing.recent_transcript.count("受訪者:") == 8 + assert "anchor 14" in briefing.durable_summary + assert "red_line" in briefing.durable_summary + + +def test_render_modes_include_expected_sections(): + briefing = InterviewBriefing( + purpose=INTERVIEW_PURPOSE, + progress="Week 1 of 3.", + durable_summary="- durable", + coverage_gaps="- gap", + recent_transcript="受訪者: one\n訪談者: two\n受訪者: three\n訪談者: four\n受訪者: five", + ) + + full = briefing.render("full") + classifier = briefing.render("classifier") + anchor = briefing.render("anchor") + + assert "WHAT WE KNOW SO FAR:" in full + assert "STILL TO COVER:" in full + assert briefing.render("unknown") == full + assert "WHAT WE KNOW SO FAR:" not in classifier + assert "STILL TO COVER:" not in classifier + assert "RECENT CONVERSATION:" in classifier + assert "受訪者: one" not in anchor + assert "訪談者: two" in anchor + + +def test_render_hard_limit_trims_recent_transcript_first(): + briefing = InterviewBriefing( + purpose=INTERVIEW_PURPOSE, + progress="Week 1 of 3.", + durable_summary="- durable", + coverage_gaps="- gap", + recent_transcript="\n".join(f"受訪者: {'x' * 500}" for _ in range(40)), + ) + + rendered = briefing.render("full") + + assert len(rendered) <= MAX_RENDER_CHARS + assert INTERVIEW_PURPOSE in rendered + assert "PROGRESS:" in rendered + assert "[truncated]" in rendered + + +async def test_build_interview_briefing_empty_state(): + class EmptyDB: + async def load_anchors_summary(self, interviewee_id: str): + return {} + + async def load_triples(self, interviewee_id: str): + return [] + + async def compute_coverage_gap(self, interviewee_id: str): + return {} + + async def load_recent_turns(self, session_id: int, limit: int): + return [] + + briefing = await build_interview_briefing( + EmptyDB(), + "u1", + SimpleNamespace(id=1, week=1), + max_week=4, + ) + + assert briefing.durable_summary == "No durable signal extracted yet." + assert briefing.coverage_gaps == "No computed coverage gaps yet." + assert briefing.recent_transcript == "No recent conversation yet." diff --git a/tests/unit/test_depth_evaluator.py b/tests/unit/test_depth_evaluator.py index d2c25c1..c4fef18 100644 --- a/tests/unit/test_depth_evaluator.py +++ b/tests/unit/test_depth_evaluator.py @@ -1,5 +1,6 @@ import json +from virtualme.interview.briefing import INTERVIEW_PURPOSE, InterviewBriefing from virtualme.interview.depth_evaluator import TurnKind, evaluate_depth from virtualme.storage.db import Layer @@ -12,8 +13,10 @@ def __init__(self, text: str): class _Messages: def __init__(self, text: str): self.text = text + self.calls = [] async def create(self, **kwargs): + self.calls.append(kwargs) return type("Response", (), {"content": [_Content(self.text)]}) @@ -101,3 +104,22 @@ async def test_high_confidence_evasion_stays_evasion(): assert assessment.kind == TurnKind.EVASION assert assessment.needs_follow_up is False + + +async def test_depth_prompt_includes_briefing_when_present(): + claude = _Claude(_assessment()) + briefing = InterviewBriefing( + purpose=INTERVIEW_PURPOSE, + progress="Week 1 of 4.", + durable_summary="- durable", + coverage_gaps="- gap", + recent_transcript="受訪者: 前一輪回答", + ) + + await evaluate_depth("answer", "Question?", claude, briefing) + + prompt = claude.messages.calls[0]["messages"][0]["content"] + assert "INTERVIEW PURPOSE:" in prompt + assert "Treat confusion, hesitation, doubt" in prompt + assert "RECENT CONVERSATION:" in prompt + assert "WHAT WE KNOW SO FAR:" not in prompt diff --git a/tests/unit/test_follow_up_cjk.py b/tests/unit/test_follow_up_cjk.py index d8d6a6e..c39f627 100644 --- a/tests/unit/test_follow_up_cjk.py +++ b/tests/unit/test_follow_up_cjk.py @@ -1,3 +1,4 @@ +from virtualme.interview.briefing import INTERVIEW_PURPOSE, InterviewBriefing from virtualme.interview.follow_up import ( FOLLOW_UP_RULE_PROMPTS, FollowUpRule, @@ -67,6 +68,44 @@ async def fake_create_message(*args, **kwargs): assert answer in prompt +async def test_generate_follow_up_prompt_includes_briefing_when_present(monkeypatch): + captured = {} + + class FakeContent: + def __init__(self, text): + self.text = text + + class FakeResponse: + def __init__(self, text): + self.content = [FakeContent(text)] + + async def fake_create_message(*args, **kwargs): + captured["messages"] = kwargs["messages"] + return FakeResponse("哪個壓力最明顯?") + + monkeypatch.setattr("virtualme.interview.follow_up.create_message", fake_create_message) + briefing = InterviewBriefing( + purpose=INTERVIEW_PURPOSE, + progress="Week 1 of 3.", + durable_summary="- durable", + coverage_gaps="- gap", + recent_transcript="受訪者: 前一輪回答", + ) + + await generate_follow_up( + FollowUpRule.R1_FACT_TO_PATTERN, + "我昨天卡住了", + "最近工作如何?", + claude=None, + briefing=briefing, + ) + + prompt = captured["messages"][0]["content"] + assert "INTERVIEW PURPOSE:" in prompt + assert "STILL TO COVER:" in prompt + assert "RECENT CONVERSATION:" in prompt + + async def test_generate_follow_up_r5_returns_canned_response_without_llm(monkeypatch): async def fake_create_message(*args, **kwargs): raise AssertionError("R5 should not call create_message")