Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
.vscode/launch.json
.vscode/ipch
.log
tmp/
firmware/include/config.h
recordings
._.DS_Store
Expand Down
91 changes: 91 additions & 0 deletions example_apps/record_wakeup_word.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from __future__ import annotations

import asyncio
import logging
import os
import wave
from datetime import UTC, datetime
from logging import getLogger
from pathlib import Path

from dotenv import load_dotenv

from stackchan_server.app import StackChanApp
from stackchan_server.static import LISTEN_AUDIO_FORMAT
from stackchan_server.ws_proxy import EmptyTranscriptError, WsProxy

# Module-level logger for this example app.
logger = getLogger(__name__)
# Log level is configurable via the STACKCHAN_LOG_LEVEL env var (default INFO).
logging.basicConfig(
    level=os.getenv("STACKCHAN_LOG_LEVEL", "INFO"),
    format="%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s",
    datefmt="%H:%M:%S",
)

# Load variables from a local .env file into the process environment.
load_dotenv()


# Application object; the decorators below register handlers on it.
app = StackChanApp()


@app.setup
async def setup(proxy: WsProxy):
    """Setup hook registered via @app.setup; logs that the WebSocket connected."""
    logger.info("WebSocket connected")


@app.talk_session
async def talk_session(proxy: WsProxy):
    """Simple echo session: repeat back everything heard until silence.

    Ends when the proxy raises EmptyTranscriptError or returns a falsy
    transcript.
    """
    while True:
        try:
            heard = await proxy.listen()
        except EmptyTranscriptError:
            break
        if not heard:
            break
        logger.info("Heard: %s", heard)
        await proxy.speak(heard)


@app.webapi("/record_wakeup_word")
async def record_wakeup_word(proxy: WsProxy, args: dict):
    """Record wakeup-word audio clips to tmp/ as WAV files.

    Args (all optional, via the web API ``args`` dict):
        duration_ms: length of each clip in milliseconds (default 2500).
        count: number of clips to record (default 50).

    Returns a summary dict. ``path``/``bytes`` describe the last clip
    (kept for backward compatibility); ``paths`` lists every saved file.
    """
    # Previously args was ignored and both values were hard-coded; the
    # defaults below preserve that behavior when args is empty.
    duration_ms = int(args.get("duration_ms", 2500))
    count = int(args.get("count", 50))
    logger.info("Recording wakeup word duration_ms=%d", duration_ms)
    await proxy.speak(
        "これからウェイクアップワードの録音を開始します。ピッと鳴ったら、ウェイクアップワードを話してください。"
    )
    await proxy.speak(
        f"{count}回録音します。トーンを変えたり、ちょっと遠くから話したりして、いろいろなパターンを録音してください。"
    )

    output_dir = Path("tmp")
    output_dir.mkdir(parents=True, exist_ok=True)

    saved_paths: list[str] = []
    raw_audio = b""
    for i in range(count):
        # Announce the remaining count every 10 recordings.
        if i > 0 and i % 10 == 0:
            await proxy.speak(f"あと{count - i}回")

        # Short beep signals "start speaking now".
        await proxy.tone(2000, 200)
        raw_audio = await proxy.listen_raw(duration=duration_ms)

        # Microsecond-resolution UTC timestamp keeps filenames unique.
        filename = f"wakeup_word_{datetime.now(UTC).strftime('%Y%m%d_%H%M%S_%f')}.wav"
        filepath = output_dir / filename

        with wave.open(str(filepath), "wb") as wav_fp:
            wav_fp.setnchannels(LISTEN_AUDIO_FORMAT.channels)
            wav_fp.setsampwidth(LISTEN_AUDIO_FORMAT.sample_width)
            wav_fp.setframerate(LISTEN_AUDIO_FORMAT.sample_rate_hz)
            wav_fp.writeframes(raw_audio)

        saved_paths.append(str(filepath))
        logger.info("Saved wakeup word recording to %s", filepath)

    await proxy.speak(
        "お疲れ様でした"
    )

    return {
        # Backward compatible: path/bytes describe the LAST recorded clip.
        "path": saved_paths[-1] if saved_paths else None,
        "bytes": len(raw_audio),
        "sample_rate": LISTEN_AUDIO_FORMAT.sample_rate_hz,
        "channels": LISTEN_AUDIO_FORMAT.channels,
        "sample_width": LISTEN_AUDIO_FORMAT.sample_width,
        "duration_ms": duration_ms,
        # New fields: every saved file and the actual number recorded.
        "paths": saved_paths,
        "count": len(saved_paths),
    }
6 changes: 6 additions & 0 deletions firmware/include/listening.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ class Listening
// 無音が所定時間続いているか判定
bool shouldStopForSilence() const;

// 固定時間録音モードを設定(0で通常の無音停止モード)
void setFixedDurationMs(uint32_t durationMs);
bool shouldStopForFixedDuration() const;

private:
void updateLevelStats(const int16_t *samples, size_t sampleCount);
bool sendPacket(stackchan_websocket_v1_MessageType type, const int16_t *samples, size_t sampleCount);
Expand All @@ -60,6 +64,8 @@ class Listening
// 無音判定関連
int32_t last_level_ = 0;
uint32_t silence_since_ms_ = 0;
uint32_t stream_started_ms_ = 0;
uint32_t fixed_duration_ms_ = 0;
static constexpr int32_t kSilenceLevelThreshold = 200; // 平均絶対値がこの値以下を無音とみなす
static constexpr uint32_t kSilenceDurationMs = 3000; // 無音とみなす継続時間
};
5 changes: 0 additions & 5 deletions firmware/include/wake_up_word.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,4 @@ class WakeUpWord
StateMachine &state_;
const int sample_rate_;
std::function<void()> on_wake_word_detected_;

// Idle 時のログ用カウンタ
uint32_t loop_count_ = 0;
uint32_t error_count_ = 0;
uint32_t last_log_time_ = 0;
};
6 changes: 6 additions & 0 deletions firmware/lib/generated_protobuf/websocket-message.pb.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ PB_BIND(stackchan_websocket_v1_ServoCommand, stackchan_websocket_v1_ServoCommand
PB_BIND(stackchan_websocket_v1_ServoDoneEvent, stackchan_websocket_v1_ServoDoneEvent, AUTO)


PB_BIND(stackchan_websocket_v1_ToneCommand, stackchan_websocket_v1_ToneCommand, AUTO)


PB_BIND(stackchan_websocket_v1_ToneDoneEvent, stackchan_websocket_v1_ToneDoneEvent, AUTO)


PB_BIND(stackchan_websocket_v1_FirmwareMetadata, stackchan_websocket_v1_FirmwareMetadata, AUTO)


Expand Down
64 changes: 56 additions & 8 deletions firmware/lib/generated_protobuf/websocket-message.pb.h

Large diffs are not rendered by default.

32 changes: 29 additions & 3 deletions firmware/src/listening.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ void Listening::end()
{
stopStreaming();
M5.Mic.end();
fixed_duration_ms_ = 0;
}

bool Listening::startStreaming()
Expand All @@ -57,6 +58,7 @@ bool Listening::startStreaming()
seq_counter_ = 0;
last_level_ = 0;
silence_since_ms_ = 0;
stream_started_ms_ = millis();
streaming_ = true;
return sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START, nullptr, 0);
}
Expand Down Expand Up @@ -128,10 +130,18 @@ void Listening::loop()
}
}

// 無音が3秒続いたら終了
if (shouldStopForSilence())
// 固定時間録音 or 無音が3秒続いたら終了
if ((fixed_duration_ms_ > 0 && shouldStopForFixedDuration()) ||
(fixed_duration_ms_ == 0 && shouldStopForSilence()))
{
log_i("Auto stop: silence detected (avg=%ld)", static_cast<long>(last_level_));
if (fixed_duration_ms_ > 0)
{
log_i("Auto stop: fixed duration reached (%lu ms)", static_cast<unsigned long>(fixed_duration_ms_));
}
else
{
log_i("Auto stop: silence detected (avg=%ld)", static_cast<long>(last_level_));
}
if (!stopStreaming())
{
log_i("WS send failed (tail/end)");
Expand Down Expand Up @@ -187,6 +197,22 @@ bool Listening::shouldStopForSilence() const
return elapsed >= kSilenceDurationMs;
}

// Enables fixed-duration recording: the stream auto-stops after durationMs
// milliseconds instead of stopping on silence. Pass 0 to restore the normal
// silence-stop mode.
void Listening::setFixedDurationMs(uint32_t durationMs)
{
    fixed_duration_ms_ = durationMs;
}

// True once the configured fixed recording duration has elapsed.
// Always false when fixed-duration mode is off (fixed_duration_ms_ == 0)
// or when streaming has not started yet (stream_started_ms_ == 0).
bool Listening::shouldStopForFixedDuration() const
{
    const bool mode_active = (fixed_duration_ms_ != 0) && (stream_started_ms_ != 0);
    if (!mode_active)
    {
        return false;
    }

    const uint32_t elapsed_ms = millis() - stream_started_ms_;
    return elapsed_ms >= fixed_duration_ms_;
}

bool Listening::sendPacket(stackchan_websocket_v1_MessageType type, const int16_t *samples, size_t sampleCount)
{
if ((WiFi.status() != WL_CONNECTED) || !ws_.isConnected())
Expand Down
99 changes: 99 additions & 0 deletions firmware/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,12 @@ namespace
uint32_t g_uplink_seq = 0;
uint32_t g_last_comm_ms = 0;
constexpr uint32_t kCommTimeoutMs = 60000;
constexpr int kToneChannel = 1;
stackchan_websocket_v1_WebSocketMessage g_tx_message = stackchan_websocket_v1_WebSocketMessage_init_zero;
stackchan_websocket_v1_WebSocketMessage g_rx_message = stackchan_websocket_v1_WebSocketMessage_init_zero;
bool g_tone_playing = false;
bool g_tone_restore_state_pending = false;
StateMachine::State g_tone_restore_state = StateMachine::Idle;

void markCommunicationActive()
{
Expand Down Expand Up @@ -163,20 +167,39 @@ void notifyServoDone()
}
}

void notifyToneDone()
{
auto &message = g_tx_message;
message = stackchan_websocket_v1_WebSocketMessage_init_zero;
message.kind = stackchan_websocket_v1_MessageKind_MESSAGE_KIND_TONE_DONE_EVT;
message.message_type = stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA;
message.seq = g_uplink_seq++;
message.which_body = stackchan_websocket_v1_WebSocketMessage_tone_done_evt_tag;
message.body.tone_done_evt.done = true;
if (!sendUplinkMessage(message))
{
log_w("Failed to send ToneDoneEvt");
}
}

bool applyRemoteStateCommand(const stackchan_websocket_v1_StateCommand &command)
{
switch (command.state)
{
case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE:
listening.setFixedDurationMs(0);
stateMachine.setState(StateMachine::Idle);
return true;
case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING:
listening.setFixedDurationMs(command.listening_duration_ms);
stateMachine.setState(StateMachine::Listening);
return true;
case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING:
listening.setFixedDurationMs(0);
stateMachine.setState(StateMachine::Thinking);
return true;
case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING:
listening.setFixedDurationMs(0);
stateMachine.setState(StateMachine::Speaking);
return true;
default:
Expand Down Expand Up @@ -234,6 +257,70 @@ bool applyServoCommand(const stackchan_websocket_v1_ServoCommandSequence &sequen
}
return true;
}

// Validates and starts a tone playback requested over WebSocket.
// While the tone plays the state machine is forced into Speaking; the
// previous state is remembered so pollTonePlayback() can restore it.
// Returns false if the command is invalid or the speaker rejects it.
bool applyToneCommand(const stackchan_websocket_v1_ToneCommand &command)
{
    // Reject obviously invalid commands up front.
    if (command.frequency <= 0.0f)
    {
        log_w("ToneCmd frequency must be positive");
        return false;
    }
    if (command.duration_ms == 0)
    {
        log_w("ToneCmd duration must be positive");
        return false;
    }

    // Enter Speaking for the duration of the tone, remembering where to
    // return afterwards.
    StateMachine::State previous_state = stateMachine.getState();
    if (previous_state != StateMachine::Speaking)
    {
        g_tone_restore_state = previous_state;
        g_tone_restore_state_pending = true;
        stateMachine.setState(StateMachine::Speaking);
    }
    else
    {
        // Already Speaking — nothing to restore.
        // NOTE(review): this also discards a restore scheduled by an
        // earlier, still-playing tone; confirm that is intended.
        g_tone_restore_state_pending = false;
    }

    if (!M5.Speaker.tone(command.frequency, command.duration_ms, kToneChannel, true))
    {
        // Speaker refused the tone: roll back the state change we made.
        if (g_tone_restore_state_pending)
        {
            stateMachine.setState(g_tone_restore_state);
            g_tone_restore_state_pending = false;
        }
        log_w(
            "Failed to start tone frequency=%.1f duration=%lu",
            command.frequency,
            static_cast<unsigned long>(command.duration_ms));
        return false;
    }

    // pollTonePlayback() watches this flag to detect completion.
    g_tone_playing = true;
    return true;
}

// Polled from loop(): detects when an asynchronously started tone has
// finished, restores the pre-tone state if one was saved, and notifies
// the server with a ToneDoneEvt.
void pollTonePlayback()
{
    // Short-circuit keeps the isPlaying() query from running when no tone
    // was ever started, matching the original early-return behavior.
    if (!g_tone_playing || M5.Speaker.isPlaying(kToneChannel) != 0)
    {
        return;
    }

    g_tone_playing = false;
    if (g_tone_restore_state_pending)
    {
        g_tone_restore_state_pending = false;
        stateMachine.setState(g_tone_restore_state);
    }
    notifyToneDone();
}
} // namespace

void connectWiFi()
Expand Down Expand Up @@ -335,6 +422,17 @@ void handleWsEvent(WStype_t type, uint8_t *payload, size_t length)
log_w("ServoCmd protobuf body mismatch type=%u body=%u", (unsigned)rx.message_type, (unsigned)rx.which_body);
}
break;
case stackchan_websocket_v1_MessageKind_MESSAGE_KIND_TONE_CMD:
if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA &&
rx.which_body == stackchan_websocket_v1_WebSocketMessage_tone_cmd_tag)
{
applyToneCommand(rx.body.tone_cmd);
}
else
{
log_w("ToneCmd protobuf body mismatch type=%u body=%u", (unsigned)rx.message_type, (unsigned)rx.which_body);
}
break;
case stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVER_METADATA:
if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA &&
rx.which_body == stackchan_websocket_v1_WebSocketMessage_server_metadata_tag)
Expand Down Expand Up @@ -452,6 +550,7 @@ void loop()
wsClient.loop();
handleCommunicationTimeout();
servo.loop();
pollTonePlayback();

StateMachine::State current = stateMachine.getState();
switch (current)
Expand Down
18 changes: 16 additions & 2 deletions firmware/src/speaking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,17 @@ void Speaking::init()
// Prepares the speaker for playback: stops the microphone (mic and speaker
// presumably share hardware resources — the original comment said "stop the
// mic just in case and focus on playback"), then restarts the speaker from a
// clean state while preserving the user-visible volume.
void Speaking::begin()
{
    const uint8_t saved_volume = M5.Speaker.getVolume();
    M5.Mic.end();
    delay(20);
    M5.Speaker.end();
    delay(10);
    if (!M5.Speaker.begin())
    {
        log_w("Failed to initialize speaker");
    }
    // Restore volume regardless of begin() success, matching the original.
    M5.Speaker.setVolume(saved_volume);
}

void Speaking::end()
Expand All @@ -32,7 +42,6 @@ void Speaking::end()
{
M5.Speaker.stop();
}
M5.Speaker.end();
reset();
}

Expand Down Expand Up @@ -106,7 +115,12 @@ void Speaking::handleWavEnd(uint32_t seq)
const int16_t *samples = reinterpret_cast<const int16_t *>(buf.data());
size_t sample_len = buf.size() / sizeof(int16_t);
bool stereo = channels_ > 1;
M5.Speaker.playRaw(samples, sample_len, sample_rate_, stereo, 1, 0);
bool accepted = M5.Speaker.playRaw(samples, sample_len, sample_rate_, stereo, 1, 0);
if (!accepted)
{
log_w("Failed to queue raw audio for playback");
playing_ = false;
}
}
}

Expand Down
Loading
Loading