Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
.vscode/launch.json
.vscode/ipch
.log
tmp/
firmware/include/config.h
recordings
._.DS_Store
Expand Down
91 changes: 91 additions & 0 deletions example_apps/record_wakeup_word.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from __future__ import annotations

import asyncio
import logging
import os
import wave
from datetime import UTC, datetime
from logging import getLogger
from pathlib import Path

from dotenv import load_dotenv

from stackchan_server.app import StackChanApp
from stackchan_server.static import LISTEN_AUDIO_FORMAT
from stackchan_server.ws_proxy import EmptyTranscriptError, WsProxy

# Module-level logger for this example app.
logger = getLogger(__name__)
# Log level is configurable via the STACKCHAN_LOG_LEVEL env var (default INFO).
logging.basicConfig(
    level=os.getenv("STACKCHAN_LOG_LEVEL", "INFO"),
    format="%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s",
    datefmt="%H:%M:%S",
)

# Load variables from a local .env file into the process environment.
load_dotenv()


# Application object; the decorators below register handlers on it.
app = StackChanApp()


@app.setup
async def setup(proxy: WsProxy):
    """Setup hook registered via @app.setup; logs that the WebSocket connected."""
    logger.info("WebSocket connected")


@app.talk_session
async def talk_session(proxy: WsProxy):
    """Simple echo session: repeat back everything heard until silence.

    Ends when the proxy raises EmptyTranscriptError or returns a falsy
    transcript.
    """
    while True:
        try:
            heard = await proxy.listen()
        except EmptyTranscriptError:
            break
        if not heard:
            break
        logger.info("Heard: %s", heard)
        await proxy.speak(heard)


@app.webapi("/record_wakeup_word")
async def record_wakeup_word(proxy: WsProxy, args: dict):
    """Record wakeup-word audio clips to tmp/ as WAV files.

    Args (all optional, via the web API ``args`` dict):
        duration_ms: length of each clip in milliseconds (default 2500).
        count: number of clips to record (default 50).

    Returns a summary dict. ``path``/``bytes`` describe the last clip
    (kept for backward compatibility); ``paths`` lists every saved file.
    """
    # Previously args was ignored and both values were hard-coded; the
    # defaults below preserve that behavior when args is empty.
    duration_ms = int(args.get("duration_ms", 2500))
    count = int(args.get("count", 50))
    logger.info("Recording wakeup word duration_ms=%d", duration_ms)
    await proxy.speak(
        "これからウェイクアップワードの録音を開始します。ピッと鳴ったら、ウェイクアップワードを話してください。"
    )
    await proxy.speak(
        f"{count}回録音します。トーンを変えたり、ちょっと遠くから話したりして、いろいろなパターンを録音してください。"
    )

    output_dir = Path("tmp")
    output_dir.mkdir(parents=True, exist_ok=True)

    saved_paths: list[str] = []
    raw_audio = b""
    for i in range(count):
        # Announce the remaining count every 10 recordings.
        if i > 0 and i % 10 == 0:
            await proxy.speak(f"あと{count - i}回")

        # Short beep signals "start speaking now".
        await proxy.tone(2000, 200)
        raw_audio = await proxy.listen_raw(duration=duration_ms)

        # Microsecond-resolution UTC timestamp keeps filenames unique.
        filename = f"wakeup_word_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S_%f')}.wav"
        filepath = output_dir / filename

        with wave.open(str(filepath), "wb") as wav_fp:
            wav_fp.setnchannels(LISTEN_AUDIO_FORMAT.channels)
            wav_fp.setsampwidth(LISTEN_AUDIO_FORMAT.sample_width)
            wav_fp.setframerate(LISTEN_AUDIO_FORMAT.sample_rate_hz)
            wav_fp.writeframes(raw_audio)

        saved_paths.append(str(filepath))
        logger.info("Saved wakeup word recording to %s", filepath)

    await proxy.speak(
        "お疲れ様でした"
    )

    return {
        # Backward compatible: path/bytes describe the LAST recorded clip.
        "path": saved_paths[-1] if saved_paths else None,
        "bytes": len(raw_audio),
        "sample_rate": LISTEN_AUDIO_FORMAT.sample_rate_hz,
        "channels": LISTEN_AUDIO_FORMAT.channels,
        "sample_width": LISTEN_AUDIO_FORMAT.sample_width,
        "duration_ms": duration_ms,
        # New fields: every saved file and the actual number recorded.
        "paths": saved_paths,
        "count": len(saved_paths),
    }
6 changes: 6 additions & 0 deletions firmware/include/listening.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ class Listening
// 無音が所定時間続いているか判定
bool shouldStopForSilence() const;

// 固定時間録音モードを設定(0で通常の無音停止モード)
void setFixedDurationMs(uint32_t durationMs);
bool shouldStopForFixedDuration() const;

private:
void updateLevelStats(const int16_t *samples, size_t sampleCount);
bool sendPacket(stackchan_websocket_v1_MessageType type, const int16_t *samples, size_t sampleCount);
Expand All @@ -60,6 +64,8 @@ class Listening
// 無音判定関連
int32_t last_level_ = 0;
uint32_t silence_since_ms_ = 0;
uint32_t stream_started_ms_ = 0;
uint32_t fixed_duration_ms_ = 0;
static constexpr int32_t kSilenceLevelThreshold = 200; // 平均絶対値がこの値以下を無音とみなす
static constexpr uint32_t kSilenceDurationMs = 3000; // 無音とみなす継続時間
};
5 changes: 0 additions & 5 deletions firmware/include/wake_up_word.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,4 @@ class WakeUpWord
StateMachine &state_;
const int sample_rate_;
std::function<void()> on_wake_word_detected_;

// Idle 時のログ用カウンタ
uint32_t loop_count_ = 0;
uint32_t error_count_ = 0;
uint32_t last_log_time_ = 0;
};
6 changes: 6 additions & 0 deletions firmware/lib/generated_protobuf/websocket-message.pb.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ PB_BIND(stackchan_websocket_v1_ServoCommand, stackchan_websocket_v1_ServoCommand
PB_BIND(stackchan_websocket_v1_ServoDoneEvent, stackchan_websocket_v1_ServoDoneEvent, AUTO)


PB_BIND(stackchan_websocket_v1_ToneCommand, stackchan_websocket_v1_ToneCommand, AUTO)


PB_BIND(stackchan_websocket_v1_ToneDoneEvent, stackchan_websocket_v1_ToneDoneEvent, AUTO)


PB_BIND(stackchan_websocket_v1_FirmwareMetadata, stackchan_websocket_v1_FirmwareMetadata, AUTO)


Expand Down
64 changes: 56 additions & 8 deletions firmware/lib/generated_protobuf/websocket-message.pb.h

Large diffs are not rendered by default.

32 changes: 29 additions & 3 deletions firmware/src/listening.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ void Listening::end()
{
stopStreaming();
M5.Mic.end();
fixed_duration_ms_ = 0;
}

bool Listening::startStreaming()
Expand All @@ -57,6 +58,7 @@ bool Listening::startStreaming()
seq_counter_ = 0;
last_level_ = 0;
silence_since_ms_ = 0;
stream_started_ms_ = millis();
streaming_ = true;
return sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START, nullptr, 0);
}
Expand Down Expand Up @@ -128,10 +130,18 @@ void Listening::loop()
}
}

// 無音が3秒続いたら終了
if (shouldStopForSilence())
// 固定時間録音 or 無音が3秒続いたら終了
if ((fixed_duration_ms_ > 0 && shouldStopForFixedDuration()) ||
(fixed_duration_ms_ == 0 && shouldStopForSilence()))
{
log_i("Auto stop: silence detected (avg=%ld)", static_cast<long>(last_level_));
if (fixed_duration_ms_ > 0)
{
log_i("Auto stop: fixed duration reached (%lu ms)", static_cast<unsigned long>(fixed_duration_ms_));
}
else
{
log_i("Auto stop: silence detected (avg=%ld)", static_cast<long>(last_level_));
}
if (!stopStreaming())
{
log_i("WS send failed (tail/end)");
Expand Down Expand Up @@ -187,6 +197,22 @@ bool Listening::shouldStopForSilence() const
return elapsed >= kSilenceDurationMs;
}

// Enables fixed-duration recording: the stream auto-stops after durationMs
// milliseconds instead of stopping on silence. Pass 0 to restore the normal
// silence-stop mode.
void Listening::setFixedDurationMs(uint32_t durationMs)
{
    fixed_duration_ms_ = durationMs;
}

// True once the configured fixed recording duration has elapsed.
// Always false when fixed-duration mode is off (fixed_duration_ms_ == 0)
// or when streaming has not started yet (stream_started_ms_ == 0).
bool Listening::shouldStopForFixedDuration() const
{
    const bool mode_active = (fixed_duration_ms_ != 0) && (stream_started_ms_ != 0);
    if (!mode_active)
    {
        return false;
    }

    const uint32_t elapsed_ms = millis() - stream_started_ms_;
    return elapsed_ms >= fixed_duration_ms_;
}

bool Listening::sendPacket(stackchan_websocket_v1_MessageType type, const int16_t *samples, size_t sampleCount)
{
if ((WiFi.status() != WL_CONNECTED) || !ws_.isConnected())
Expand Down
99 changes: 99 additions & 0 deletions firmware/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,12 @@ namespace
uint32_t g_uplink_seq = 0;
uint32_t g_last_comm_ms = 0;
constexpr uint32_t kCommTimeoutMs = 60000;
constexpr int kToneChannel = 1;
stackchan_websocket_v1_WebSocketMessage g_tx_message = stackchan_websocket_v1_WebSocketMessage_init_zero;
stackchan_websocket_v1_WebSocketMessage g_rx_message = stackchan_websocket_v1_WebSocketMessage_init_zero;
bool g_tone_playing = false;
bool g_tone_restore_state_pending = false;
StateMachine::State g_tone_restore_state = StateMachine::Idle;

void markCommunicationActive()
{
Expand Down Expand Up @@ -163,20 +167,39 @@ void notifyServoDone()
}
}

void notifyToneDone()
{
auto &message = g_tx_message;
message = stackchan_websocket_v1_WebSocketMessage_init_zero;
message.kind = stackchan_websocket_v1_MessageKind_MESSAGE_KIND_TONE_DONE_EVT;
message.message_type = stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA;
message.seq = g_uplink_seq++;
message.which_body = stackchan_websocket_v1_WebSocketMessage_tone_done_evt_tag;
message.body.tone_done_evt.done = true;
if (!sendUplinkMessage(message))
{
log_w("Failed to send ToneDoneEvt");
}
}

bool applyRemoteStateCommand(const stackchan_websocket_v1_StateCommand &command)
{
switch (command.state)
{
case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE:
listening.setFixedDurationMs(0);
stateMachine.setState(StateMachine::Idle);
return true;
case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING:
listening.setFixedDurationMs(command.listening_duration_ms);
stateMachine.setState(StateMachine::Listening);
return true;
case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING:
listening.setFixedDurationMs(0);
stateMachine.setState(StateMachine::Thinking);
return true;
case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING:
listening.setFixedDurationMs(0);
stateMachine.setState(StateMachine::Speaking);
return true;
default:
Expand Down Expand Up @@ -234,6 +257,70 @@ bool applyServoCommand(const stackchan_websocket_v1_ServoCommandSequence &sequen
}
return true;
}

// Validates and starts a tone playback requested over WebSocket.
// While the tone plays the state machine is forced into Speaking; the
// previous state is remembered so pollTonePlayback() can restore it.
// Returns false if the command is invalid or the speaker rejects it.
bool applyToneCommand(const stackchan_websocket_v1_ToneCommand &command)
{
    // Reject obviously invalid commands up front.
    if (command.frequency <= 0.0f)
    {
        log_w("ToneCmd frequency must be positive");
        return false;
    }
    if (command.duration_ms == 0)
    {
        log_w("ToneCmd duration must be positive");
        return false;
    }

    // Enter Speaking for the duration of the tone, remembering where to
    // return afterwards.
    StateMachine::State previous_state = stateMachine.getState();
    if (previous_state != StateMachine::Speaking)
    {
        g_tone_restore_state = previous_state;
        g_tone_restore_state_pending = true;
        stateMachine.setState(StateMachine::Speaking);
    }
    else
    {
        // Already Speaking — nothing to restore.
        // NOTE(review): this also discards a restore scheduled by an
        // earlier, still-playing tone; confirm that is intended.
        g_tone_restore_state_pending = false;
    }

    if (!M5.Speaker.tone(command.frequency, command.duration_ms, kToneChannel, true))
    {
        // Speaker refused the tone: roll back the state change we made.
        if (g_tone_restore_state_pending)
        {
            stateMachine.setState(g_tone_restore_state);
            g_tone_restore_state_pending = false;
        }
        log_w(
            "Failed to start tone frequency=%.1f duration=%lu",
            command.frequency,
            static_cast<unsigned long>(command.duration_ms));
        return false;
    }

    // pollTonePlayback() watches this flag to detect completion.
    g_tone_playing = true;
    return true;
}

// Polled from loop(): detects when an asynchronously started tone has
// finished, restores the pre-tone state if one was saved, and notifies
// the server with a ToneDoneEvt.
void pollTonePlayback()
{
    // Short-circuit keeps the isPlaying() query from running when no tone
    // was ever started, matching the original early-return behavior.
    if (!g_tone_playing || M5.Speaker.isPlaying(kToneChannel) != 0)
    {
        return;
    }

    g_tone_playing = false;
    if (g_tone_restore_state_pending)
    {
        g_tone_restore_state_pending = false;
        stateMachine.setState(g_tone_restore_state);
    }
    notifyToneDone();
}
} // namespace

void connectWiFi()
Expand Down Expand Up @@ -335,6 +422,17 @@ void handleWsEvent(WStype_t type, uint8_t *payload, size_t length)
log_w("ServoCmd protobuf body mismatch type=%u body=%u", (unsigned)rx.message_type, (unsigned)rx.which_body);
}
break;
case stackchan_websocket_v1_MessageKind_MESSAGE_KIND_TONE_CMD:
if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA &&
rx.which_body == stackchan_websocket_v1_WebSocketMessage_tone_cmd_tag)
{
applyToneCommand(rx.body.tone_cmd);
}
else
{
log_w("ToneCmd protobuf body mismatch type=%u body=%u", (unsigned)rx.message_type, (unsigned)rx.which_body);
}
break;
case stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_METADATA:
if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA &&
rx.which_body == stackchan_websocket_v1_WebSocketMessage_server_metadata_tag)
Expand Down Expand Up @@ -452,6 +550,7 @@ void loop()
wsClient.loop();
handleCommunicationTimeout();
servo.loop();
pollTonePlayback();

StateMachine::State current = stateMachine.getState();
switch (current)
Expand Down
18 changes: 16 additions & 2 deletions firmware/src/speaking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,17 @@ void Speaking::init()
// Prepares the speaker for playback: stops the microphone (mic and speaker
// presumably share hardware resources — the original comment said "stop the
// mic just in case and focus on playback"), then restarts the speaker from a
// clean state while preserving the user-visible volume.
void Speaking::begin()
{
    const uint8_t saved_volume = M5.Speaker.getVolume();
    M5.Mic.end();
    delay(20);
    M5.Speaker.end();
    delay(10);
    if (!M5.Speaker.begin())
    {
        log_w("Failed to initialize speaker");
    }
    // Restore volume regardless of begin() success, matching the original.
    M5.Speaker.setVolume(saved_volume);
}

void Speaking::end()
Expand All @@ -32,7 +42,6 @@ void Speaking::end()
{
M5.Speaker.stop();
}
M5.Speaker.end();
reset();
}

Expand Down Expand Up @@ -106,7 +115,12 @@ void Speaking::handleWavEnd(uint32_t seq)
const int16_t *samples = reinterpret_cast<const int16_t *>(buf.data());
size_t sample_len = buf.size() / sizeof(int16_t);
bool stereo = channels_ > 1;
M5.Speaker.playRaw(samples, sample_len, sample_rate_, stereo, 1, 0);
bool accepted = M5.Speaker.playRaw(samples, sample_len, sample_rate_, stereo, 1, 0);
if (!accepted)
{
log_w("Failed to queue raw audio for playback");
playing_ = false;
}
}
}

Expand Down
Loading
Loading