From 2b32ea883fe72504eb893de2839471e601b93f87 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sun, 19 Apr 2026 14:56:55 +0900
Subject: [PATCH 1/5] feat: Add protobuf support for WebSocket messages and
 refactor WebSocket handling

- Updated `pyproject.toml` to include `grpcio-tools` and `protobuf` dependencies.
- Added generated protobuf files for WebSocket messages in `stackchan_server/generated_protobuf`.
- Implemented `protobuf_ws.py` for encoding and parsing WebSocket messages.
- Refactored `speak.py` to utilize new protobuf message encoding for audio WAV messages.
- Updated `ws_proxy.py` to handle protobuf messages and removed legacy struct-based message handling.
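- Enhanced error handling for invalid protobuf messages in WebSocket communication.
- Added nanopb-generated protobuf sources in `firmware/lib/generated_protobuf` and protobuf conversion/encode/decode helpers declared in `firmware/include/protocols.hpp`.
- Added `protobuf`, `protobuf-python`, `protobuf-firmware`, and `clean-protobuf` Makefile targets to regenerate or remove the generated files.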
---
 Makefile                                      |  26 ++
 firmware/include/listening.hpp                |   2 +-
 firmware/include/protocols.hpp                |  44 ++-
 .../generated_protobuf/websocket-message.pb.c |  56 +++
 .../generated_protobuf/websocket-message.pb.h | 370 ++++++++++++++++++
 firmware/src/listening.cpp                    |  44 ++-
 firmware/src/main.cpp                         | 204 +++++++---
 firmware/src/protocols.cpp                    | 163 ++++++++
 platformio.ini                                |   1 +
 protobuf/websocket-message.options            |   2 +
 protobuf/websocket-message.proto              | 110 ++++++
 pyproject.toml                                |   2 +
 .../generated_protobuf/__init__.py            |   1 +
 .../websocket_message_pb2.py                  |  68 ++++
 stackchan_server/protobuf_ws.py               | 170 ++++++++
 stackchan_server/speak.py                     |  56 +--
 stackchan_server/ws_proxy.py                  | 214 ++++------
 uv.lock                                       |  46 +++
 18 files changed, 1314 insertions(+), 265 deletions(-)
 create mode 100644 firmware/lib/generated_protobuf/websocket-message.pb.c
 create mode 100644 firmware/lib/generated_protobuf/websocket-message.pb.h
 create mode 100644 firmware/src/protocols.cpp
 create mode 100644 protobuf/websocket-message.options
 create mode 100644 protobuf/websocket-message.proto
 create mode 100644 stackchan_server/generated_protobuf/__init__.py
 create mode 100644 stackchan_server/generated_protobuf/websocket_message_pb2.py
 create mode 100644 stackchan_server/protobuf_ws.py

diff --git a/Makefile b/Makefile
index 318a6f4..761a0b1 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,17 @@
+# Tool and path configuration for protobuf code generation.
+# UV, PIO_ENV and NANOPB_GENERATOR are assigned with ?= so they can be
+# overridden from the environment or the make command line.
+UV ?= uv
+PROTO_DIR := protobuf
+PROTO_FILE := $(PROTO_DIR)/websocket-message.proto
+PY_PROTO_OUT_DIR := stackchan_server/generated_protobuf
+FW_PROTO_OUT_DIR := firmware/lib/generated_protobuf
+# PlatformIO environment whose libdeps directory holds the Nanopb package.
+PIO_ENV ?= m5stack-cores3-m5unified
+NANOPB_GENERATOR ?= .pio/libdeps/$(PIO_ENV)/Nanopb/generator/nanopb_generator.py
+
+.PHONY: lint lint-fix protobuf protobuf-python protobuf-firmware clean-protobuf
+
 lint:
 	uv run ruff check stackchan_server example_apps
 	uv run ty check stackchan_server example_apps
@@ -5,3 +19,29 @@ lint:
 lint-fix:
 	uv run ruff check --fix stackchan_server example_apps
 	uv run ty check stackchan_server example_apps
+
+# Regenerate both the Python and the firmware protobuf bindings.
+protobuf: protobuf-python protobuf-firmware
+
+# Generate the Python module into $(PY_PROTO_OUT_DIR), creating the package
+# directory and its __init__.py first if they do not exist yet.
+protobuf-python: $(PROTO_FILE)
+	mkdir -p $(PY_PROTO_OUT_DIR)
+	touch $(PY_PROTO_OUT_DIR)/__init__.py
+	$(UV) run python -m grpc_tools.protoc -I$(PROTO_DIR) --python_out=$(PY_PROTO_OUT_DIR) $(PROTO_FILE)
+
+# Generate the nanopb C sources into $(FW_PROTO_OUT_DIR). The generator script
+# is looked up at $(NANOPB_GENERATOR); fail early with a message on stderr
+# when it is missing.
+protobuf-firmware: $(PROTO_FILE)
+	@test -f $(NANOPB_GENERATOR) || { echo "nanopb generator not found: $(NANOPB_GENERATOR)" >&2; exit 1; }
+	mkdir -p $(FW_PROTO_OUT_DIR)
+	$(UV) run python $(NANOPB_GENERATOR) --proto-path=$(PROTO_DIR) --output-dir=$(FW_PROTO_OUT_DIR) $(PROTO_FILE)
+
+# Remove the generated protobuf outputs.
+# NOTE(review): stackchan_server/generated/ differs from $(PY_PROTO_OUT_DIR);
+# presumably a previous output location -- confirm it is still needed.
+clean-protobuf:
+	rm -f $(PY_PROTO_OUT_DIR)/websocket_message_pb2.py
+	rm -f stackchan_server/generated/websocket_message_pb2.py
+	rm -f $(FW_PROTO_OUT_DIR)/websocket-message.pb.h $(FW_PROTO_OUT_DIR)/websocket-message.pb.c
diff --git a/firmware/include/listening.hpp b/firmware/include/listening.hpp
index f3a45ee..cf00113 100644
--- a/firmware/include/listening.hpp
+++ b/firmware/include/listening.hpp
@@ -53,7 +53,7 @@ class Listening
   size_t ring_read_ = 0;
   size_t ring_available_ = 0;
 
-  uint16_t seq_counter_ = 0;
+  uint32_t seq_counter_ = 0;
   bool streaming_ = false;
   bool events_registered_ = false;
 
diff --git a/firmware/include/protocols.hpp b/firmware/include/protocols.hpp
index b39838d..feea050 100644
--- a/firmware/include/protocols.hpp
+++ b/firmware/include/protocols.hpp
@@ -1,15 +1,14 @@
 // Protocol definitions shared between CoreS3 firmware and other components
 #pragma once
 
+#include <cstddef>
 #include <cstdint>
+#include <vector>
 
-// WebSocket binary protocol (audio + future kinds)
-// Header layout (little-endian, packed):
-//  - kind: uint8_t   (message kind)
-//  - messageType: uint8_t  (START/DATA/END)
-//  - reserved: uint8_t (0, future flags)
-//  - seq: uint16 (sequence number)
-//  - payloadBytes: uint16 (bytes following the header)
+#include "../lib/generated_protobuf/websocket-message.pb.h"
+
+// Internal compatibility metadata for message routing after protobuf decode.
+// This is no longer sent on the wire directly.
 
 enum class MessageKind : uint8_t
 {
@@ -35,8 +34,8 @@ struct __attribute__((packed)) WsHeader
 	uint8_t kind;        // MessageKind
 	uint8_t messageType; // MessageType
 	uint8_t reserved;    // 0 (flags/reserved)
-	uint16_t seq;        // sequence number
-	uint16_t payloadBytes; // bytes following the header
+	uint32_t seq;        // sequence number
+	uint32_t payloadBytes; // bytes following the header
 };
 
 // payload for kind=StateCmd, messageType=DATA
@@ -59,3 +58,30 @@ enum class ServoCommandOp : uint8_t
 	MoveX = 1,
 	MoveY = 2,
 };
+
+constexpr size_t kProtoAudioChunkMaxBytes = 4096; // same value as PB_BYTES_ARRAY_T(4096) used for AudioChunk.pcm_bytes in the generated header
+constexpr size_t kProtoServoCommandMaxCount = 255; // same value as the commands[255] array of the generated ServoCommandSequence
+constexpr size_t kMaxEncodedWebSocketMessageBytes = stackchan_websocket_v1_WebSocketMessage_size; // NOTE(review): presumably nanopb's maximum encoded size macro -- confirm in websocket-message.pb.h
+// Conversions from the internal enums to the generated protobuf enums.
+stackchan_websocket_v1_MessageKind toProtoMessageKind(MessageKind kind);
+stackchan_websocket_v1_MessageType toProtoMessageType(MessageType type);
+stackchan_websocket_v1_StackchanState toProtoState(RemoteState state);
+stackchan_websocket_v1_ServoOperation toProtoServoOperation(ServoCommandOp op);
+// Conversions from the generated protobuf enums back to the internal enums.
+RemoteState fromProtoState(stackchan_websocket_v1_StackchanState state);
+ServoCommandOp fromProtoServoOperation(stackchan_websocket_v1_ServoOperation op);
+// Helpers for writing and reading the byte payload of a generated AudioChunk.
+bool setProtoAudioChunk(
+	stackchan_websocket_v1_AudioChunk &chunk,
+	const uint8_t *data,
+	size_t data_len); // NOTE(review): presumably returns false when data_len exceeds kProtoAudioChunkMaxBytes -- confirm in protocols.cpp
+const uint8_t *getProtoAudioChunkBytes(const stackchan_websocket_v1_AudioChunk &chunk);
+size_t getProtoAudioChunkSize(const stackchan_websocket_v1_AudioChunk &chunk);
+// Encode/decode one WebSocketMessage to/from a byte buffer; the bool result presumably signals success (implementation not visible here).
+bool encodeWebSocketMessage(
+	const stackchan_websocket_v1_WebSocketMessage &message,
+	std::vector<uint8_t> &encoded);
+bool decodeWebSocketMessage(
+	const uint8_t *data,
+	size_t data_len,
+	stackchan_websocket_v1_WebSocketMessage &message);
diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.c b/firmware/lib/generated_protobuf/websocket-message.pb.c
new file mode 100644
index 0000000..e024c81
--- /dev/null
+++ b/firmware/lib/generated_protobuf/websocket-message.pb.c
@@ -0,0 +1,56 @@
+/* Automatically generated nanopb constant definitions */
+/* Generated by nanopb-0.4.9.1 */
+
+#include "websocket-message.pb.h"
+#if PB_PROTO_HEADER_VERSION != 40
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+PB_BIND(stackchan_websocket_v1_WebSocketMessage, stackchan_websocket_v1_WebSocketMessage, 4)
+
+
+PB_BIND(stackchan_websocket_v1_AudioPcmStart, stackchan_websocket_v1_AudioPcmStart, AUTO)
+
+
+PB_BIND(stackchan_websocket_v1_AudioPcmEnd, stackchan_websocket_v1_AudioPcmEnd, AUTO)
+
+
+PB_BIND(stackchan_websocket_v1_AudioWavStart, stackchan_websocket_v1_AudioWavStart, AUTO)
+
+
+PB_BIND(stackchan_websocket_v1_AudioWavEnd, stackchan_websocket_v1_AudioWavEnd, AUTO)
+
+
+PB_BIND(stackchan_websocket_v1_AudioChunk, stackchan_websocket_v1_AudioChunk, 4)
+
+
+PB_BIND(stackchan_websocket_v1_StateCommand, stackchan_websocket_v1_StateCommand, AUTO)
+
+
+PB_BIND(stackchan_websocket_v1_WakeWordEvent, stackchan_websocket_v1_WakeWordEvent, AUTO)
+
+
+PB_BIND(stackchan_websocket_v1_StateEvent, stackchan_websocket_v1_StateEvent, AUTO)
+
+
+PB_BIND(stackchan_websocket_v1_SpeakDoneEvent, stackchan_websocket_v1_SpeakDoneEvent, AUTO)
+
+
+PB_BIND(stackchan_websocket_v1_ServoCommandSequence, stackchan_websocket_v1_ServoCommandSequence, 2)
+
+
+PB_BIND(stackchan_websocket_v1_ServoCommand, stackchan_websocket_v1_ServoCommand, AUTO)
+
+
+PB_BIND(stackchan_websocket_v1_ServoDoneEvent, stackchan_websocket_v1_ServoDoneEvent, AUTO)
+
+
+
+
+
+
+
+
+
+
+
diff --git a/firmware/lib/generated_protobuf/websocket-message.pb.h b/firmware/lib/generated_protobuf/websocket-message.pb.h
new file mode 100644
index 0000000..ce13703
--- /dev/null
+++ b/firmware/lib/generated_protobuf/websocket-message.pb.h
@@ -0,0 +1,370 @@
+/* Automatically generated nanopb header */
+/* Generated by nanopb-0.4.9.1 */
+
+#ifndef PB_STACKCHAN_WEBSOCKET_V1_WEBSOCKET_MESSAGE_PB_H_INCLUDED
+#define PB_STACKCHAN_WEBSOCKET_V1_WEBSOCKET_MESSAGE_PB_H_INCLUDED
+#include <pb.h>
+
+#if PB_PROTO_HEADER_VERSION != 40
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+/* Enum definitions */
+typedef enum _stackchan_websocket_v1_MessageKind {
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_UNSPECIFIED = 0,
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_PCM = 1,
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_WAV = 2,
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_STATE_CMD = 3,
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_WAKE_WORD_EVT = 4,
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_STATE_EVT = 5,
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SPEAK_DONE_EVT = 6,
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_CMD = 7,
+    stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_DONE_EVT = 8
+} stackchan_websocket_v1_MessageKind;
+
+typedef enum _stackchan_websocket_v1_MessageType {
+    stackchan_websocket_v1_MessageType_MESSAGE_TYPE_UNSPECIFIED = 0,
+    stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START = 1,
+    stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA = 2,
+    stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END = 3
+} stackchan_websocket_v1_MessageType;
+
+typedef enum _stackchan_websocket_v1_StackchanState {
+    stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE = 0,
+    stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING = 1,
+    stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING = 2,
+    stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING = 3
+} stackchan_websocket_v1_StackchanState;
+
+typedef enum _stackchan_websocket_v1_ServoOperation {
+    stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP = 0,
+    stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X = 1,
+    stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y = 2
+} stackchan_websocket_v1_ServoOperation;
+
+/* Struct definitions */
+typedef struct _stackchan_websocket_v1_AudioPcmStart {
+    char dummy_field;
+} stackchan_websocket_v1_AudioPcmStart;
+
+typedef struct _stackchan_websocket_v1_AudioPcmEnd {
+    char dummy_field;
+} stackchan_websocket_v1_AudioPcmEnd;
+
+typedef struct _stackchan_websocket_v1_AudioWavStart {
+    uint32_t sample_rate;
+    uint32_t channels;
+} stackchan_websocket_v1_AudioWavStart;
+
+typedef struct _stackchan_websocket_v1_AudioWavEnd {
+    char dummy_field;
+} stackchan_websocket_v1_AudioWavEnd;
+
+typedef PB_BYTES_ARRAY_T(4096) stackchan_websocket_v1_AudioChunk_pcm_bytes_t;
+typedef struct _stackchan_websocket_v1_AudioChunk {
+    stackchan_websocket_v1_AudioChunk_pcm_bytes_t pcm_bytes;
+} stackchan_websocket_v1_AudioChunk;
+
+typedef struct _stackchan_websocket_v1_StateCommand {
+    stackchan_websocket_v1_StackchanState state;
+} stackchan_websocket_v1_StateCommand;
+
+typedef struct _stackchan_websocket_v1_WakeWordEvent {
+    bool detected;
+} stackchan_websocket_v1_WakeWordEvent;
+
+typedef struct _stackchan_websocket_v1_StateEvent {
+    stackchan_websocket_v1_StackchanState state;
+} stackchan_websocket_v1_StateEvent;
+
+typedef struct _stackchan_websocket_v1_SpeakDoneEvent {
+    bool done;
+} stackchan_websocket_v1_SpeakDoneEvent;
+
+typedef struct _stackchan_websocket_v1_ServoCommand {
+    stackchan_websocket_v1_ServoOperation op;
+    int32_t angle; /* used by MOVE_X / MOVE_Y */
+    int32_t duration_ms; /* used by all operations */
+} stackchan_websocket_v1_ServoCommand;
+
+typedef struct _stackchan_websocket_v1_ServoCommandSequence {
+    pb_size_t commands_count;
+    stackchan_websocket_v1_ServoCommand commands[255];
+} stackchan_websocket_v1_ServoCommandSequence;
+
+typedef struct _stackchan_websocket_v1_ServoDoneEvent {
+    bool done;
+} stackchan_websocket_v1_ServoDoneEvent;
+
+/* One WebSocket binary frame carries exactly one WebSocketMessage.
+
+ Instead of concatenating two protobuf messages such as Header + Body,
+ this envelope keeps the routing metadata and the typed body together in a
+ single protobuf message. The `kind` / `message_type` fields preserve the
+ current protocol semantics, while `body` provides strongly typed payloads
+ for Python and firmware implementations. */
+typedef struct _stackchan_websocket_v1_WebSocketMessage {
+    stackchan_websocket_v1_MessageKind kind;
+    stackchan_websocket_v1_MessageType message_type;
+    uint32_t seq; /* current implementation uses uint16, but proto uses uint32 */
+    pb_size_t which_body;
+    union {
+        stackchan_websocket_v1_AudioPcmStart audio_pcm_start;
+        stackchan_websocket_v1_AudioChunk audio_pcm_data;
+        stackchan_websocket_v1_AudioPcmEnd audio_pcm_end;
+        stackchan_websocket_v1_AudioWavStart audio_wav_start;
+        stackchan_websocket_v1_AudioChunk audio_wav_data;
+        stackchan_websocket_v1_AudioWavEnd audio_wav_end;
+        stackchan_websocket_v1_StateCommand state_cmd;
+        stackchan_websocket_v1_WakeWordEvent wake_word_evt;
+        stackchan_websocket_v1_StateEvent state_evt;
+        stackchan_websocket_v1_SpeakDoneEvent speak_done_evt;
+        stackchan_websocket_v1_ServoCommandSequence servo_cmd;
+        stackchan_websocket_v1_ServoDoneEvent servo_done_evt;
+    } body;
+} stackchan_websocket_v1_WebSocketMessage;
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Helper constants for enums */
+#define _stackchan_websocket_v1_MessageKind_MIN stackchan_websocket_v1_MessageKind_MESSAGE_KIND_UNSPECIFIED
+#define _stackchan_websocket_v1_MessageKind_MAX stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_DONE_EVT
+#define _stackchan_websocket_v1_MessageKind_ARRAYSIZE ((stackchan_websocket_v1_MessageKind)(stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_DONE_EVT+1))
+
+#define _stackchan_websocket_v1_MessageType_MIN stackchan_websocket_v1_MessageType_MESSAGE_TYPE_UNSPECIFIED
+#define _stackchan_websocket_v1_MessageType_MAX stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END
+#define _stackchan_websocket_v1_MessageType_ARRAYSIZE ((stackchan_websocket_v1_MessageType)(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END+1))
+
+#define _stackchan_websocket_v1_StackchanState_MIN stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE
+#define _stackchan_websocket_v1_StackchanState_MAX stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING
+#define _stackchan_websocket_v1_StackchanState_ARRAYSIZE ((stackchan_websocket_v1_StackchanState)(stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING+1))
+
+#define _stackchan_websocket_v1_ServoOperation_MIN stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP
+#define _stackchan_websocket_v1_ServoOperation_MAX stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y
+#define _stackchan_websocket_v1_ServoOperation_ARRAYSIZE ((stackchan_websocket_v1_ServoOperation)(stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y+1))
+
+#define stackchan_websocket_v1_WebSocketMessage_kind_ENUMTYPE stackchan_websocket_v1_MessageKind
+#define stackchan_websocket_v1_WebSocketMessage_message_type_ENUMTYPE stackchan_websocket_v1_MessageType
+
+
+
+
+
+
+#define stackchan_websocket_v1_StateCommand_state_ENUMTYPE stackchan_websocket_v1_StackchanState
+
+
+#define stackchan_websocket_v1_StateEvent_state_ENUMTYPE stackchan_websocket_v1_StackchanState
+
+
+
+#define stackchan_websocket_v1_ServoCommand_op_ENUMTYPE stackchan_websocket_v1_ServoOperation
+
+
+
+/* Initializer values for message structs */
+#define stackchan_websocket_v1_WebSocketMessage_init_default {_stackchan_websocket_v1_MessageKind_MIN, _stackchan_websocket_v1_MessageType_MIN, 0, 0, {stackchan_websocket_v1_AudioPcmStart_init_default}}
+#define stackchan_websocket_v1_AudioPcmStart_init_default {0}
+#define stackchan_websocket_v1_AudioPcmEnd_init_default {0}
+#define stackchan_websocket_v1_AudioWavStart_init_default {0, 0}
+#define stackchan_websocket_v1_AudioWavEnd_init_default {0}
+#define stackchan_websocket_v1_AudioChunk_init_default {{0, {0}}}
+#define stackchan_websocket_v1_StateCommand_init_default {_stackchan_websocket_v1_StackchanState_MIN}
+#define stackchan_websocket_v1_WakeWordEvent_init_default {0}
+#define stackchan_websocket_v1_StateEvent_init_default {_stackchan_websocket_v1_StackchanState_MIN}
+#define stackchan_websocket_v1_SpeakDoneEvent_init_default {0}
+#define stackchan_websocket_v1_ServoCommandSequence_init_default {0, {stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, 
stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, 
stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, 
stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, 
stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, 
stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, 
stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default, stackchan_websocket_v1_ServoCommand_init_default}}
+#define stackchan_websocket_v1_ServoCommand_init_default {_stackchan_websocket_v1_ServoOperation_MIN, 0, 0}
+#define stackchan_websocket_v1_ServoDoneEvent_init_default {0}
+#define stackchan_websocket_v1_WebSocketMessage_init_zero {_stackchan_websocket_v1_MessageKind_MIN, _stackchan_websocket_v1_MessageType_MIN, 0, 0, {stackchan_websocket_v1_AudioPcmStart_init_zero}}
+#define stackchan_websocket_v1_AudioPcmStart_init_zero {0}
+#define stackchan_websocket_v1_AudioPcmEnd_init_zero {0}
+#define stackchan_websocket_v1_AudioWavStart_init_zero {0, 0}
+#define stackchan_websocket_v1_AudioWavEnd_init_zero {0}
+#define stackchan_websocket_v1_AudioChunk_init_zero {{0, {0}}}
+#define stackchan_websocket_v1_StateCommand_init_zero {_stackchan_websocket_v1_StackchanState_MIN}
+#define stackchan_websocket_v1_WakeWordEvent_init_zero {0}
+#define stackchan_websocket_v1_StateEvent_init_zero {_stackchan_websocket_v1_StackchanState_MIN}
+#define stackchan_websocket_v1_SpeakDoneEvent_init_zero {0}
+#define stackchan_websocket_v1_ServoCommandSequence_init_zero {0, {stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, 
stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, 
stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, 
stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, 
stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, 
stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, 
stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero, stackchan_websocket_v1_ServoCommand_init_zero}}
+#define stackchan_websocket_v1_ServoCommand_init_zero {_stackchan_websocket_v1_ServoOperation_MIN, 0, 0}
+#define stackchan_websocket_v1_ServoDoneEvent_init_zero {0}
+
+/* Field tags (for use in manual encoding/decoding) */
+#define stackchan_websocket_v1_AudioWavStart_sample_rate_tag 1
+#define stackchan_websocket_v1_AudioWavStart_channels_tag 2
+#define stackchan_websocket_v1_AudioChunk_pcm_bytes_tag 1
+#define stackchan_websocket_v1_StateCommand_state_tag 1
+#define stackchan_websocket_v1_WakeWordEvent_detected_tag 1
+#define stackchan_websocket_v1_StateEvent_state_tag 1
+#define stackchan_websocket_v1_SpeakDoneEvent_done_tag 1
+#define stackchan_websocket_v1_ServoCommand_op_tag 1
+#define stackchan_websocket_v1_ServoCommand_angle_tag 2
+#define stackchan_websocket_v1_ServoCommand_duration_ms_tag 3
+#define stackchan_websocket_v1_ServoCommandSequence_commands_tag 1
+#define stackchan_websocket_v1_ServoDoneEvent_done_tag 1
+#define stackchan_websocket_v1_WebSocketMessage_kind_tag 1
+#define stackchan_websocket_v1_WebSocketMessage_message_type_tag 2
+#define stackchan_websocket_v1_WebSocketMessage_seq_tag 3
+#define stackchan_websocket_v1_WebSocketMessage_audio_pcm_start_tag 10
+#define stackchan_websocket_v1_WebSocketMessage_audio_pcm_data_tag 11
+#define stackchan_websocket_v1_WebSocketMessage_audio_pcm_end_tag 12
+#define stackchan_websocket_v1_WebSocketMessage_audio_wav_start_tag 20
+#define stackchan_websocket_v1_WebSocketMessage_audio_wav_data_tag 21
+#define stackchan_websocket_v1_WebSocketMessage_audio_wav_end_tag 22
+#define stackchan_websocket_v1_WebSocketMessage_state_cmd_tag 30
+#define stackchan_websocket_v1_WebSocketMessage_wake_word_evt_tag 31
+#define stackchan_websocket_v1_WebSocketMessage_state_evt_tag 32
+#define stackchan_websocket_v1_WebSocketMessage_speak_done_evt_tag 33
+#define stackchan_websocket_v1_WebSocketMessage_servo_cmd_tag 34
+#define stackchan_websocket_v1_WebSocketMessage_servo_done_evt_tag 35
+
+/* Struct field encoding specification for nanopb */
+#define stackchan_websocket_v1_WebSocketMessage_FIELDLIST(X, a) \
+X(a, STATIC,   SINGULAR, UENUM,    kind,              1) \
+X(a, STATIC,   SINGULAR, UENUM,    message_type,      2) \
+X(a, STATIC,   SINGULAR, UINT32,   seq,               3) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,audio_pcm_start,body.audio_pcm_start),  10) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,audio_pcm_data,body.audio_pcm_data),  11) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,audio_pcm_end,body.audio_pcm_end),  12) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,audio_wav_start,body.audio_wav_start),  20) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,audio_wav_data,body.audio_wav_data),  21) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,audio_wav_end,body.audio_wav_end),  22) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,state_cmd,body.state_cmd),  30) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,wake_word_evt,body.wake_word_evt),  31) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,state_evt,body.state_evt),  32) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,speak_done_evt,body.speak_done_evt),  33) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,servo_cmd,body.servo_cmd),  34) \
+X(a, STATIC,   ONEOF,    MESSAGE,  (body,servo_done_evt,body.servo_done_evt),  35)
+#define stackchan_websocket_v1_WebSocketMessage_CALLBACK NULL
+#define stackchan_websocket_v1_WebSocketMessage_DEFAULT NULL
+#define stackchan_websocket_v1_WebSocketMessage_body_audio_pcm_start_MSGTYPE stackchan_websocket_v1_AudioPcmStart
+#define stackchan_websocket_v1_WebSocketMessage_body_audio_pcm_data_MSGTYPE stackchan_websocket_v1_AudioChunk
+#define stackchan_websocket_v1_WebSocketMessage_body_audio_pcm_end_MSGTYPE stackchan_websocket_v1_AudioPcmEnd
+#define stackchan_websocket_v1_WebSocketMessage_body_audio_wav_start_MSGTYPE stackchan_websocket_v1_AudioWavStart
+#define stackchan_websocket_v1_WebSocketMessage_body_audio_wav_data_MSGTYPE stackchan_websocket_v1_AudioChunk
+#define stackchan_websocket_v1_WebSocketMessage_body_audio_wav_end_MSGTYPE stackchan_websocket_v1_AudioWavEnd
+#define stackchan_websocket_v1_WebSocketMessage_body_state_cmd_MSGTYPE stackchan_websocket_v1_StateCommand
+#define stackchan_websocket_v1_WebSocketMessage_body_wake_word_evt_MSGTYPE stackchan_websocket_v1_WakeWordEvent
+#define stackchan_websocket_v1_WebSocketMessage_body_state_evt_MSGTYPE stackchan_websocket_v1_StateEvent
+#define stackchan_websocket_v1_WebSocketMessage_body_speak_done_evt_MSGTYPE stackchan_websocket_v1_SpeakDoneEvent
+#define stackchan_websocket_v1_WebSocketMessage_body_servo_cmd_MSGTYPE stackchan_websocket_v1_ServoCommandSequence
+#define stackchan_websocket_v1_WebSocketMessage_body_servo_done_evt_MSGTYPE stackchan_websocket_v1_ServoDoneEvent
+
+#define stackchan_websocket_v1_AudioPcmStart_FIELDLIST(X, a) \
+
+#define stackchan_websocket_v1_AudioPcmStart_CALLBACK NULL
+#define stackchan_websocket_v1_AudioPcmStart_DEFAULT NULL
+
+#define stackchan_websocket_v1_AudioPcmEnd_FIELDLIST(X, a) \
+
+#define stackchan_websocket_v1_AudioPcmEnd_CALLBACK NULL
+#define stackchan_websocket_v1_AudioPcmEnd_DEFAULT NULL
+
+#define stackchan_websocket_v1_AudioWavStart_FIELDLIST(X, a) \
+X(a, STATIC,   SINGULAR, UINT32,   sample_rate,       1) \
+X(a, STATIC,   SINGULAR, UINT32,   channels,          2)
+#define stackchan_websocket_v1_AudioWavStart_CALLBACK NULL
+#define stackchan_websocket_v1_AudioWavStart_DEFAULT NULL
+
+#define stackchan_websocket_v1_AudioWavEnd_FIELDLIST(X, a) \
+
+#define stackchan_websocket_v1_AudioWavEnd_CALLBACK NULL
+#define stackchan_websocket_v1_AudioWavEnd_DEFAULT NULL
+
+#define stackchan_websocket_v1_AudioChunk_FIELDLIST(X, a) \
+X(a, STATIC,   SINGULAR, BYTES,    pcm_bytes,         1)
+#define stackchan_websocket_v1_AudioChunk_CALLBACK NULL
+#define stackchan_websocket_v1_AudioChunk_DEFAULT NULL
+
+#define stackchan_websocket_v1_StateCommand_FIELDLIST(X, a) \
+X(a, STATIC,   SINGULAR, UENUM,    state,             1)
+#define stackchan_websocket_v1_StateCommand_CALLBACK NULL
+#define stackchan_websocket_v1_StateCommand_DEFAULT NULL
+
+#define stackchan_websocket_v1_WakeWordEvent_FIELDLIST(X, a) \
+X(a, STATIC,   SINGULAR, BOOL,     detected,          1)
+#define stackchan_websocket_v1_WakeWordEvent_CALLBACK NULL
+#define stackchan_websocket_v1_WakeWordEvent_DEFAULT NULL
+
+#define stackchan_websocket_v1_StateEvent_FIELDLIST(X, a) \
+X(a, STATIC,   SINGULAR, UENUM,    state,             1)
+#define stackchan_websocket_v1_StateEvent_CALLBACK NULL
+#define stackchan_websocket_v1_StateEvent_DEFAULT NULL
+
+#define stackchan_websocket_v1_SpeakDoneEvent_FIELDLIST(X, a) \
+X(a, STATIC,   SINGULAR, BOOL,     done,              1)
+#define stackchan_websocket_v1_SpeakDoneEvent_CALLBACK NULL
+#define stackchan_websocket_v1_SpeakDoneEvent_DEFAULT NULL
+
+#define stackchan_websocket_v1_ServoCommandSequence_FIELDLIST(X, a) \
+X(a, STATIC,   REPEATED, MESSAGE,  commands,          1)
+#define stackchan_websocket_v1_ServoCommandSequence_CALLBACK NULL
+#define stackchan_websocket_v1_ServoCommandSequence_DEFAULT NULL
+#define stackchan_websocket_v1_ServoCommandSequence_commands_MSGTYPE stackchan_websocket_v1_ServoCommand
+
+#define stackchan_websocket_v1_ServoCommand_FIELDLIST(X, a) \
+X(a, STATIC,   SINGULAR, UENUM,    op,                1) \
+X(a, STATIC,   SINGULAR, SINT32,   angle,             2) \
+X(a, STATIC,   SINGULAR, SINT32,   duration_ms,       3)
+#define stackchan_websocket_v1_ServoCommand_CALLBACK NULL
+#define stackchan_websocket_v1_ServoCommand_DEFAULT NULL
+
+#define stackchan_websocket_v1_ServoDoneEvent_FIELDLIST(X, a) \
+X(a, STATIC,   SINGULAR, BOOL,     done,              1)
+#define stackchan_websocket_v1_ServoDoneEvent_CALLBACK NULL
+#define stackchan_websocket_v1_ServoDoneEvent_DEFAULT NULL
+
+extern const pb_msgdesc_t stackchan_websocket_v1_WebSocketMessage_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_AudioPcmStart_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_AudioPcmEnd_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_AudioWavStart_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_AudioWavEnd_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_AudioChunk_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_StateCommand_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_WakeWordEvent_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_StateEvent_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_SpeakDoneEvent_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_ServoCommandSequence_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_ServoCommand_msg;
+extern const pb_msgdesc_t stackchan_websocket_v1_ServoDoneEvent_msg;
+
+/* Defines for backwards compatibility with code written before nanopb-0.4.0 */
+#define stackchan_websocket_v1_WebSocketMessage_fields &stackchan_websocket_v1_WebSocketMessage_msg
+#define stackchan_websocket_v1_AudioPcmStart_fields &stackchan_websocket_v1_AudioPcmStart_msg
+#define stackchan_websocket_v1_AudioPcmEnd_fields &stackchan_websocket_v1_AudioPcmEnd_msg
+#define stackchan_websocket_v1_AudioWavStart_fields &stackchan_websocket_v1_AudioWavStart_msg
+#define stackchan_websocket_v1_AudioWavEnd_fields &stackchan_websocket_v1_AudioWavEnd_msg
+#define stackchan_websocket_v1_AudioChunk_fields &stackchan_websocket_v1_AudioChunk_msg
+#define stackchan_websocket_v1_StateCommand_fields &stackchan_websocket_v1_StateCommand_msg
+#define stackchan_websocket_v1_WakeWordEvent_fields &stackchan_websocket_v1_WakeWordEvent_msg
+#define stackchan_websocket_v1_StateEvent_fields &stackchan_websocket_v1_StateEvent_msg
+#define stackchan_websocket_v1_SpeakDoneEvent_fields &stackchan_websocket_v1_SpeakDoneEvent_msg
+#define stackchan_websocket_v1_ServoCommandSequence_fields &stackchan_websocket_v1_ServoCommandSequence_msg
+#define stackchan_websocket_v1_ServoCommand_fields &stackchan_websocket_v1_ServoCommand_msg
+#define stackchan_websocket_v1_ServoDoneEvent_fields &stackchan_websocket_v1_ServoDoneEvent_msg
+
+/* Maximum encoded size of messages (where known) */
+#define STACKCHAN_WEBSOCKET_V1_WEBSOCKET_MESSAGE_PB_H_MAX_SIZE stackchan_websocket_v1_WebSocketMessage_size
+#define stackchan_websocket_v1_AudioChunk_size   4099
+#define stackchan_websocket_v1_AudioPcmEnd_size  0
+#define stackchan_websocket_v1_AudioPcmStart_size 0
+#define stackchan_websocket_v1_AudioWavEnd_size  0
+#define stackchan_websocket_v1_AudioWavStart_size 12
+#define stackchan_websocket_v1_ServoCommandSequence_size 4080
+#define stackchan_websocket_v1_ServoCommand_size 14
+#define stackchan_websocket_v1_ServoDoneEvent_size 2
+#define stackchan_websocket_v1_SpeakDoneEvent_size 2
+#define stackchan_websocket_v1_StateCommand_size 2
+#define stackchan_websocket_v1_StateEvent_size   2
+#define stackchan_websocket_v1_WakeWordEvent_size 2
+#define stackchan_websocket_v1_WebSocketMessage_size 4113
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
diff --git a/firmware/src/listening.cpp b/firmware/src/listening.cpp
index 5ca9193..2aa9d77 100644
--- a/firmware/src/listening.cpp
+++ b/firmware/src/listening.cpp
@@ -4,6 +4,11 @@
 #include <vector>
 #include <cstdlib>
 
+namespace
+{
+stackchan_websocket_v1_WebSocketMessage g_listening_tx_message = stackchan_websocket_v1_WebSocketMessage_init_zero; // file-local scratch message; Listening::sendPacket resets it to init_zero and refills it for every packet
+} // namespace
+
 Listening::Listening(WebSocketsClient &ws, StateMachine &sm, int sampleRate)
     : ws_(ws), state_(sm), sample_rate_(sampleRate),
       chunk_samples_(static_cast<size_t>(sampleRate) / 8),
@@ -189,19 +194,38 @@ bool Listening::sendPacket(MessageType type, const int16_t *samples, size_t samp
     return false;
   }
 
-  WsHeader header{};
-  header.kind = static_cast<uint8_t>(MessageKind::AudioPcm);
-  header.messageType = static_cast<uint8_t>(type);
-  header.reserved = 0;
-  header.seq = seq_counter_++;
-  header.payloadBytes = static_cast<uint16_t>(sampleCount * sizeof(int16_t));
+  auto &message = g_listening_tx_message;
+  message = stackchan_websocket_v1_WebSocketMessage_init_zero;
+  message.kind = toProtoMessageKind(MessageKind::AudioPcm);
+  message.message_type = toProtoMessageType(type);
+  message.seq = seq_counter_++;
+
+  switch (type)
+  {
+  case MessageType::START:
+    message.which_body = stackchan_websocket_v1_WebSocketMessage_audio_pcm_start_tag;
+    break;
+  case MessageType::DATA:
+    message.which_body = stackchan_websocket_v1_WebSocketMessage_audio_pcm_data_tag;
+    if (!setProtoAudioChunk(
+            message.body.audio_pcm_data,
+            reinterpret_cast<const uint8_t *>(samples),
+            sampleCount * sizeof(int16_t)))
+    {
+      return false;
+    }
+    break;
+  case MessageType::END:
+    message.which_body = stackchan_websocket_v1_WebSocketMessage_audio_pcm_end_tag;
+    break;
+  default:
+    return false;
+  }
 
   std::vector<uint8_t> packet;
-  packet.resize(sizeof(WsHeader) + header.payloadBytes);
-  memcpy(packet.data(), &header, sizeof(WsHeader));
-  if (header.payloadBytes > 0 && samples != nullptr)
+  if (!encodeWebSocketMessage(message, packet))
   {
-    memcpy(packet.data() + sizeof(WsHeader), samples, header.payloadBytes);
+    return false;
   }
 
   ws_.sendBIN(packet.data(), packet.size());
diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp
index bb0f904..ca932b3 100644
--- a/firmware/src/main.cpp
+++ b/firmware/src/main.cpp
@@ -6,6 +6,7 @@
 #include <WebSocketsClient.h>
 #include <algorithm>
 #include <cstring>
+#include <limits>
 #include <vector>
 #include "config.h"
 #include "../include/protocols.hpp"
@@ -37,9 +38,11 @@ static BodyServo servo;
 // Protocol types are defined in include/protocols.hpp
 namespace
 {
-uint16_t g_uplink_seq = 0;
+uint32_t g_uplink_seq = 0;
 uint32_t g_last_comm_ms = 0;
 constexpr uint32_t kCommTimeoutMs = 60000;
+stackchan_websocket_v1_WebSocketMessage g_tx_message = stackchan_websocket_v1_WebSocketMessage_init_zero; // outgoing scratch message shared by the notify*() senders; each one resets it to init_zero before filling it
+stackchan_websocket_v1_WebSocketMessage g_rx_message = stackchan_websocket_v1_WebSocketMessage_init_zero; // decode target for incoming WStype_BIN frames in handleWsEvent; reset before each decode
 
 void markCommunicationActive()
 {
@@ -64,36 +67,41 @@ void handleCommunicationTimeout()
   }
 }
 
-bool sendUplinkPacket(MessageKind kind, MessageType msgType, const uint8_t *payload, size_t payload_len)
+bool sendUplinkMessage(const stackchan_websocket_v1_WebSocketMessage &message) // Encodes message and sends it as one binary WebSocket frame; returns false when WiFi/WebSocket is down or encoding fails. The caller fills in message.seq.
 {
   if ((WiFi.status() != WL_CONNECTED) || !wsClient.isConnected())
   {
     return false;
   }
 
-  WsHeader header{};
-  header.kind = static_cast<uint8_t>(kind);
-  header.messageType = static_cast<uint8_t>(msgType);
-  header.reserved = 0;
-  header.seq = g_uplink_seq++;
-  header.payloadBytes = static_cast<uint16_t>(payload_len);
-
   std::vector<uint8_t> packet;
-  packet.resize(sizeof(WsHeader) + payload_len);
-  memcpy(packet.data(), &header, sizeof(WsHeader));
-  if (payload_len > 0 && payload != nullptr)
+  if (!encodeWebSocketMessage(message, packet)) // writes the serialized message into packet
   {
-    memcpy(packet.data() + sizeof(WsHeader), payload, payload_len);
+    return false; // encode failed: nothing is sent and markCommunicationActive() is not called
   }
+
   wsClient.sendBIN(packet.data(), packet.size());
   markCommunicationActive();
   return true;
 }
 
+void appendInt16Le(std::vector<uint8_t> &payload, int16_t value) // Appends value to payload as two bytes, least-significant byte first.
+{
+  const uint16_t bits = static_cast<uint16_t>(value); // unsigned bit pattern, so the byte order no longer depends on the host (memcpy emitted host order)
+  payload.push_back(static_cast<uint8_t>(bits & 0xFFu)); // low byte
+  payload.push_back(static_cast<uint8_t>(bits >> 8));    // high byte
+}
+
 void notifyWakeWordDetected()
 {
-  const uint8_t payload = 1; // detected
-  if (!sendUplinkPacket(MessageKind::WakeWordEvt, MessageType::DATA, &payload, sizeof(payload)))
+  auto &message = g_tx_message;
+  message = stackchan_websocket_v1_WebSocketMessage_init_zero;
+  message.kind = toProtoMessageKind(MessageKind::WakeWordEvt);
+  message.message_type = toProtoMessageType(MessageType::DATA);
+  message.seq = g_uplink_seq++;
+  message.which_body = stackchan_websocket_v1_WebSocketMessage_wake_word_evt_tag;
+  message.body.wake_word_evt.detected = true;
+  if (!sendUplinkMessage(message))
   {
     log_w("Failed to send WakeWordEvt");
   }
@@ -101,17 +109,29 @@ void notifyWakeWordDetected()
 
 void notifyCurrentState(StateMachine::State state)
 {
-  const uint8_t payload = static_cast<uint8_t>(state);
-  if (!sendUplinkPacket(MessageKind::StateEvt, MessageType::DATA, &payload, sizeof(payload)))
+  auto &message = g_tx_message; // Reports state to the server as a StateEvt message, built in the shared scratch message.
+  message = stackchan_websocket_v1_WebSocketMessage_init_zero; // clear whatever the previous send left behind
+  message.kind = toProtoMessageKind(MessageKind::StateEvt);
+  message.message_type = toProtoMessageType(MessageType::DATA);
+  message.seq = g_uplink_seq++; // the sequence number is consumed even if the send below fails
+  message.which_body = stackchan_websocket_v1_WebSocketMessage_state_evt_tag; // selects the state_evt member of the body oneof
+  message.body.state_evt.state = static_cast<stackchan_websocket_v1_StackchanState>(static_cast<uint8_t>(state)); // plain numeric cast: assumes StateMachine::State values match StackchanState - TODO confirm
+  if (!sendUplinkMessage(message)) // failure is only logged; there is no retry
   {
-    log_w("Failed to send StateEvt state=%u", static_cast<unsigned>(payload));
+    log_w("Failed to send StateEvt state=%u", static_cast<unsigned>(state));
   }
 }
 
 void notifySpeakDone()
 {
-  const uint8_t payload = 1; // done
-  if (!sendUplinkPacket(MessageKind::SpeakDoneEvt, MessageType::DATA, &payload, sizeof(payload)))
+  auto &message = g_tx_message;
+  message = stackchan_websocket_v1_WebSocketMessage_init_zero;
+  message.kind = toProtoMessageKind(MessageKind::SpeakDoneEvt);
+  message.message_type = toProtoMessageType(MessageType::DATA);
+  message.seq = g_uplink_seq++;
+  message.which_body = stackchan_websocket_v1_WebSocketMessage_speak_done_evt_tag;
+  message.body.speak_done_evt.done = true;
+  if (!sendUplinkMessage(message))
   {
     log_w("Failed to send SpeakDoneEvt");
   }
@@ -119,22 +139,22 @@ void notifySpeakDone()
 
 void notifyServoDone()
 {
-  const uint8_t payload = 1; // done
-  if (!sendUplinkPacket(MessageKind::ServoDoneEvt, MessageType::DATA, &payload, sizeof(payload)))
+  auto &message = g_tx_message; // Sends a ServoDoneEvt message with done=true, built in the shared scratch message.
+  message = stackchan_websocket_v1_WebSocketMessage_init_zero; // clear whatever the previous send left behind
+  message.kind = toProtoMessageKind(MessageKind::ServoDoneEvt);
+  message.message_type = toProtoMessageType(MessageType::DATA);
+  message.seq = g_uplink_seq++; // the sequence number is consumed even if the send below fails
+  message.which_body = stackchan_websocket_v1_WebSocketMessage_servo_done_evt_tag; // selects the servo_done_evt member of the body oneof
+  message.body.servo_done_evt.done = true;
+  if (!sendUplinkMessage(message)) // failure is only logged; there is no retry
   {
     log_w("Failed to send ServoDoneEvt");
   }
 }
 
-bool applyRemoteStateCommand(const uint8_t *body, size_t bodyLen)
+bool applyRemoteStateCommand(const stackchan_websocket_v1_StateCommand &command)
 {
-  if (body == nullptr || bodyLen < 1)
-  {
-    log_w("StateCmd payload too short: %u", static_cast<unsigned>(bodyLen));
-    return false;
-  }
-
-  RemoteState target = static_cast<RemoteState>(body[0]);
+  RemoteState target = fromProtoState(command.state);
   switch (target)
   {
   case RemoteState::Idle:
@@ -150,14 +170,54 @@ bool applyRemoteStateCommand(const uint8_t *body, size_t bodyLen)
     stateMachine.setState(StateMachine::Speaking);
     return true;
   default:
-    log_w("Unknown remote state: %u", static_cast<unsigned>(body[0]));
+    log_w("Unknown remote state");
     return false;
   }
 }
 
-bool applyServoCommand(const uint8_t *body, size_t bodyLen)
+bool applyServoCommand(const stackchan_websocket_v1_ServoCommandSequence &sequence)
 {
-  if (!servo.enqueueSequence(body, bodyLen))
+  if (sequence.commands_count > kProtoServoCommandMaxCount)
+  {
+    log_w("ServoCmd count too large: %u", static_cast<unsigned>(sequence.commands_count));
+    return false;
+  }
+
+  std::vector<uint8_t> payload;
+  payload.reserve(1 + sequence.commands_count * 4);
+  payload.push_back(static_cast<uint8_t>(sequence.commands_count));
+
+  for (pb_size_t i = 0; i < sequence.commands_count; ++i)
+  {
+    const auto &command = sequence.commands[i];
+    const ServoCommandOp op = fromProtoServoOperation(command.op);
+
+    if (command.duration_ms < std::numeric_limits<int16_t>::min() ||
+        command.duration_ms > std::numeric_limits<int16_t>::max())
+    {
+      log_w("ServoCmd duration out of range at command=%u", static_cast<unsigned>(i));
+      return false;
+    }
+
+    payload.push_back(static_cast<uint8_t>(op));
+    if (op == ServoCommandOp::Sleep)
+    {
+      appendInt16Le(payload, static_cast<int16_t>(command.duration_ms));
+      continue;
+    }
+
+    if (command.angle < std::numeric_limits<int8_t>::min() ||
+        command.angle > std::numeric_limits<int8_t>::max())
+    {
+      log_w("ServoCmd angle out of range at command=%u", static_cast<unsigned>(i));
+      return false;
+    }
+
+    payload.push_back(static_cast<uint8_t>(static_cast<int8_t>(command.angle)));
+    appendInt16Le(payload, static_cast<int16_t>(command.duration_ms));
+  }
+
+  if (!servo.enqueueSequence(payload.data(), payload.size()))
   {
     log_w("Failed to apply servo command");
     return false;
@@ -202,49 +262,75 @@ void handleWsEvent(WStype_t type, uint8_t *payload, size_t length)
   case WStype_BIN:
   {
     markCommunicationActive();
-    if (length < sizeof(WsHeader))
+    auto &rx = g_rx_message;
+    rx = stackchan_websocket_v1_WebSocketMessage_init_zero;
+    if (!decodeWebSocketMessage(payload, length, rx))
     {
-      // M5.Display.println("WS bin too short");
-      log_i("WS bin too short: %d", (int)length);
+      log_i("WS protobuf decode failed: %d", (int)length);
       break;
     }
 
-    WsHeader rx{};
-    memcpy(&rx, payload, sizeof(WsHeader));
-    size_t rx_payload_len = length - sizeof(WsHeader);
-    if (rx_payload_len != rx.payloadBytes)
-    {
-      // M5.Display.println("WS payload len mismatch");
-      log_i("WS payload len mismatch: expected=%u got=%u", (unsigned)rx.payloadBytes, (unsigned)rx_payload_len);
-      break;
-    }
+    log_i("WS protobuf kind=%u len=%d", (unsigned)rx.kind, (int)length);
 
-    const uint8_t *body = payload + sizeof(WsHeader);
-    log_i("WS bin kind=%u len=%d", (unsigned)rx.kind, (int)length);
-
-    switch (static_cast<MessageKind>(rx.kind))
+    switch (rx.kind)
+    {
+    case stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_WAV:
     {
-    case MessageKind::AudioWav:
-      speaking.handleWavMessage(rx, body, rx_payload_len);
+      WsHeader compat{};
+      compat.kind = static_cast<uint8_t>(MessageKind::AudioWav);
+      compat.messageType = static_cast<uint8_t>(rx.message_type);
+      compat.seq = rx.seq;
+
+      if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START &&
+          rx.which_body == stackchan_websocket_v1_WebSocketMessage_audio_wav_start_tag)
+      {
+        uint8_t body[6]{};
+        uint32_t sample_rate = rx.body.audio_wav_start.sample_rate;
+        uint16_t channels = static_cast<uint16_t>(rx.body.audio_wav_start.channels);
+        memcpy(body, &sample_rate, sizeof(sample_rate));
+        memcpy(body + sizeof(sample_rate), &channels, sizeof(channels));
+        compat.payloadBytes = sizeof(body);
+        speaking.handleWavMessage(compat, body, sizeof(body));
+      }
+      else if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA &&
+               rx.which_body == stackchan_websocket_v1_WebSocketMessage_audio_wav_data_tag)
+      {
+        size_t body_len = getProtoAudioChunkSize(rx.body.audio_wav_data);
+        compat.payloadBytes = body_len;
+        speaking.handleWavMessage(compat, getProtoAudioChunkBytes(rx.body.audio_wav_data), body_len);
+      }
+      else if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END &&
+               rx.which_body == stackchan_websocket_v1_WebSocketMessage_audio_wav_end_tag)
+      {
+        compat.payloadBytes = 0;
+        speaking.handleWavMessage(compat, nullptr, 0);
+      }
+      else
+      {
+        log_w("AudioWav protobuf body mismatch type=%u body=%u", (unsigned)rx.message_type, (unsigned)rx.which_body);
+      }
       break;
-    case MessageKind::StateCmd:
-      if (static_cast<MessageType>(rx.messageType) == MessageType::DATA)
+    }
+    case stackchan_websocket_v1_MessageKind_MESSAGE_KIND_STATE_CMD:
+      if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA &&
+          rx.which_body == stackchan_websocket_v1_WebSocketMessage_state_cmd_tag)
       {
-        applyRemoteStateCommand(body, rx_payload_len);
+        applyRemoteStateCommand(rx.body.state_cmd);
       }
       else
       {
-        log_w("StateCmd unsupported msgType=%u", static_cast<unsigned>(rx.messageType));
+        log_w("StateCmd protobuf body mismatch type=%u body=%u", (unsigned)rx.message_type, (unsigned)rx.which_body);
       }
       break;
-    case MessageKind::ServoCmd:
-      if (static_cast<MessageType>(rx.messageType) == MessageType::DATA)
+    case stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_CMD:
+      if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA &&
+          rx.which_body == stackchan_websocket_v1_WebSocketMessage_servo_cmd_tag)
       {
-        applyServoCommand(body, rx_payload_len);
+        applyServoCommand(rx.body.servo_cmd);
       }
       else
       {
-        log_w("ServoCmd unsupported msgType=%u", static_cast<unsigned>(rx.messageType));
+        log_w("ServoCmd protobuf body mismatch type=%u body=%u", (unsigned)rx.message_type, (unsigned)rx.which_body);
       }
       break;
     default:
diff --git a/firmware/src/protocols.cpp b/firmware/src/protocols.cpp
new file mode 100644
index 0000000..8b61312
--- /dev/null
+++ b/firmware/src/protocols.cpp
@@ -0,0 +1,163 @@
+#include "../include/protocols.hpp"
+
+#include <cstring>
+
+#include <pb_decode.h>
+#include <pb_encode.h>
+
+stackchan_websocket_v1_MessageKind toProtoMessageKind(MessageKind kind) // firmware MessageKind -> generated protobuf enum
+{
+	switch (kind)
+	{
+	case MessageKind::AudioPcm:
+		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_PCM;
+	case MessageKind::AudioWav:
+		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_WAV;
+	case MessageKind::StateCmd:
+		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_STATE_CMD;
+	case MessageKind::WakeWordEvt:
+		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_WAKE_WORD_EVT;
+	case MessageKind::StateEvt:
+		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_STATE_EVT;
+	case MessageKind::SpeakDoneEvt:
+		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SPEAK_DONE_EVT;
+	case MessageKind::ServoCmd:
+		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_CMD;
+	case MessageKind::ServoDoneEvt:
+		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_DONE_EVT;
+	default: // any value without a case above is sent as UNSPECIFIED
+		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_UNSPECIFIED;
+	}
+}
+
+stackchan_websocket_v1_MessageType toProtoMessageType(MessageType type) // firmware MessageType -> generated protobuf enum
+{
+	switch (type)
+	{
+	case MessageType::START:
+		return stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START;
+	case MessageType::DATA:
+		return stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA;
+	case MessageType::END:
+		return stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END;
+	default: // any value without a case above is sent as UNSPECIFIED
+		return stackchan_websocket_v1_MessageType_MESSAGE_TYPE_UNSPECIFIED;
+	}
+}
+
+stackchan_websocket_v1_StackchanState toProtoState(RemoteState state) // firmware RemoteState -> generated protobuf enum
+{
+	switch (state)
+	{
+	case RemoteState::Idle:
+		return stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE;
+	case RemoteState::Listening:
+		return stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING;
+	case RemoteState::Thinking:
+		return stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING;
+	case RemoteState::Speaking:
+		return stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING;
+	default: // the proto enum has no UNSPECIFIED member, so other values are reported as IDLE
+		return stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE;
+	}
+}
+
+stackchan_websocket_v1_ServoOperation toProtoServoOperation(ServoCommandOp op) // firmware ServoCommandOp -> generated protobuf enum
+{
+	switch (op)
+	{
+	case ServoCommandOp::Sleep:
+		return stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP;
+	case ServoCommandOp::MoveX:
+		return stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X;
+	case ServoCommandOp::MoveY:
+		return stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y;
+	default: // the proto enum has no UNSPECIFIED member, so other values are sent as SLEEP
+		return stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP;
+	}
+}
+
+RemoteState fromProtoState(stackchan_websocket_v1_StackchanState state) // generated protobuf enum -> firmware RemoteState
+{
+	switch (state)
+	{
+	case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE:
+		return RemoteState::Idle;
+	case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING:
+		return RemoteState::Listening;
+	case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING:
+		return RemoteState::Thinking;
+	case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING:
+		return RemoteState::Speaking;
+	default: // NOTE(review): unrecognized wire values are silently treated as Idle - confirm this is intended
+		return RemoteState::Idle;
+	}
+}
+
+ServoCommandOp fromProtoServoOperation(stackchan_websocket_v1_ServoOperation op) // generated protobuf enum -> firmware ServoCommandOp
+{
+	switch (op)
+	{
+	case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP:
+		return ServoCommandOp::Sleep;
+	case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X:
+		return ServoCommandOp::MoveX;
+	case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y:
+		return ServoCommandOp::MoveY;
+	default: // NOTE(review): an unrecognized op is silently turned into Sleep rather than rejected - confirm this is intended
+		return ServoCommandOp::Sleep;
+	}
+}
+
+// Copies data_len bytes from data into chunk.pcm_bytes; returns false and leaves chunk untouched on bad input.
+bool setProtoAudioChunk(stackchan_websocket_v1_AudioChunk &chunk, const uint8_t *data, size_t data_len)
+{
+	// Reject oversized input, and also a null source with a non-zero length: accepting
+	// the latter would set `size` without copying and publish stale buffer bytes as audio.
+	if (data_len > kProtoAudioChunkMaxBytes || (data_len > 0 && data == nullptr))
+	{
+		return false;
+	}
+
+	chunk.pcm_bytes.size = static_cast<pb_size_t>(data_len);
+	if (data_len > 0)
+	{
+		memcpy(chunk.pcm_bytes.bytes, data, data_len);
+	}
+	return true;
+}
+
+const uint8_t *getProtoAudioChunkBytes(const stackchan_websocket_v1_AudioChunk &chunk) // start of the chunk's pcm_bytes storage
+{
+	return chunk.pcm_bytes.bytes;
+}
+
+size_t getProtoAudioChunkSize(const stackchan_websocket_v1_AudioChunk &chunk) // byte count recorded in pcm_bytes.size
+{
+	return chunk.pcm_bytes.size;
+}
+
+// Serializes message into encoded, sized to the bytes written; on failure encoded is emptied and false is returned.
+bool encodeWebSocketMessage(const stackchan_websocket_v1_WebSocketMessage &message, std::vector<uint8_t> &encoded)
+{
+	encoded.assign(kMaxEncodedWebSocketMessageBytes, 0);
+	pb_ostream_t out = pb_ostream_from_buffer(encoded.data(), encoded.size());
+	const bool ok = pb_encode(&out, stackchan_websocket_v1_WebSocketMessage_fields, &message);
+	if (ok)
+	{
+		encoded.resize(out.bytes_written); // keep only the bytes actually written
+		return true;
+	}
+	encoded.clear(); // leave no partial output behind on failure
+	return false;
+}
+
+// Resets message to its all-zero initializer, then decodes data_len bytes from data into it; returns pb_decode's result.
+bool decodeWebSocketMessage(const uint8_t *data, size_t data_len, stackchan_websocket_v1_WebSocketMessage &message)
+{
+	// Start from a clean message so the result never depends on what it held before.
+	message = stackchan_websocket_v1_WebSocketMessage_init_zero;
+
+	pb_istream_t in = pb_istream_from_buffer(data, data_len);
+	return pb_decode(&in, stackchan_websocket_v1_WebSocketMessage_fields, &message);
+}
diff --git a/platformio.ini b/platformio.ini
index 8543ec1..394794a 100644
--- a/platformio.ini
+++ b/platformio.ini
@@ -20,6 +20,7 @@ lib_deps =
     Links2004/WebSockets@^2.7.2
     ESP32Async/AsyncTCP@^3.4.10
     madhephaestus/ESP32Servo@^3.1.3
+    nanopb/Nanopb@^0.4.91
     https://github.com/74th/ESP-SR-For-M5Unified.git@1.0.0
     https://github.com/mongonta0716/SCServo.git
 
diff --git a/protobuf/websocket-message.options b/protobuf/websocket-message.options
new file mode 100644
index 0000000..233e71d
--- /dev/null
+++ b/protobuf/websocket-message.options
@@ -0,0 +1,2 @@
+stackchan.websocket.v1.AudioChunk.pcm_bytes max_size:4096
+stackchan.websocket.v1.ServoCommandSequence.commands max_count:255
diff --git a/protobuf/websocket-message.proto b/protobuf/websocket-message.proto
new file mode 100644
index 0000000..d1de065
--- /dev/null
+++ b/protobuf/websocket-message.proto
@@ -0,0 +1,110 @@
+syntax = "proto3";
+
+package stackchan.websocket.v1;
+
+// One WebSocket binary frame carries exactly one WebSocketMessage.
+//
+// Instead of concatenating two protobuf messages such as Header + Body,
+// this envelope keeps the routing metadata and the typed body together in a
+// single protobuf message. The `kind` / `message_type` fields preserve the
+// current protocol semantics, while `body` provides strongly typed payloads
+// for Python and firmware implementations.
+message WebSocketMessage {
+  MessageKind kind = 1;
+  MessageType message_type = 2;
+  uint32 seq = 3; // uint32 on the wire (proto3 has no 16-bit scalar); the current implementation keeps seq as uint16
+
+  oneof body {
+    AudioPcmStart audio_pcm_start = 10;
+    AudioChunk audio_pcm_data = 11;
+    AudioPcmEnd audio_pcm_end = 12;
+
+    AudioWavStart audio_wav_start = 20;
+    AudioChunk audio_wav_data = 21;
+    AudioWavEnd audio_wav_end = 22;
+
+    StateCommand state_cmd = 30;
+    WakeWordEvent wake_word_evt = 31;
+    StateEvent state_evt = 32;
+    SpeakDoneEvent speak_done_evt = 33;
+    ServoCommandSequence servo_cmd = 34;
+    ServoDoneEvent servo_done_evt = 35;
+  }
+}
+
+enum MessageKind {
+  MESSAGE_KIND_UNSPECIFIED = 0;
+  MESSAGE_KIND_AUDIO_PCM = 1;
+  MESSAGE_KIND_AUDIO_WAV = 2;
+  MESSAGE_KIND_STATE_CMD = 3;
+  MESSAGE_KIND_WAKE_WORD_EVT = 4;
+  MESSAGE_KIND_STATE_EVT = 5;
+  MESSAGE_KIND_SPEAK_DONE_EVT = 6;
+  MESSAGE_KIND_SERVO_CMD = 7;
+  MESSAGE_KIND_SERVO_DONE_EVT = 8;
+}
+
+enum MessageType {
+  MESSAGE_TYPE_UNSPECIFIED = 0;
+  MESSAGE_TYPE_START = 1;
+  MESSAGE_TYPE_DATA = 2;
+  MESSAGE_TYPE_END = 3;
+}
+
+enum StackchanState {
+  STACKCHAN_STATE_IDLE = 0;
+  STACKCHAN_STATE_LISTENING = 1;
+  STACKCHAN_STATE_THINKING = 2;
+  STACKCHAN_STATE_SPEAKING = 3;
+}
+
+enum ServoOperation {
+  SERVO_OPERATION_SLEEP = 0;
+  SERVO_OPERATION_MOVE_X = 1;
+  SERVO_OPERATION_MOVE_Y = 2;
+}
+
+message AudioPcmStart {}
+
+message AudioPcmEnd {}
+
+message AudioWavStart {
+  uint32 sample_rate = 1;
+  uint32 channels = 2;
+}
+
+message AudioWavEnd {}
+
+message AudioChunk {
+  bytes pcm_bytes = 1; // nanopb caps this at 4096 bytes (max_size in websocket-message.options)
+}
+
+message StateCommand {
+  StackchanState state = 1;
+}
+
+message WakeWordEvent {
+  bool detected = 1;
+}
+
+message StateEvent {
+  StackchanState state = 1;
+}
+
+message SpeakDoneEvent {
+  bool done = 1;
+}
+
+message ServoCommandSequence {
+  repeated ServoCommand commands = 1; // nanopb caps this at 255 entries (max_count in websocket-message.options)
+}
+
+message ServoCommand {
+  ServoOperation op = 1;
+  sint32 angle = 2;       // used by MOVE_X / MOVE_Y; firmware rejects values outside int8 (-128..127)
+  sint32 duration_ms = 3; // used by all operations; firmware rejects values outside int16 (-32768..32767)
+}
+
+message ServoDoneEvent {
+  bool done = 1;
+}
diff --git a/pyproject.toml b/pyproject.toml
index 7f1a290..90638f2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,10 +16,12 @@ dependencies = [
     "voicevox-client>=1.1.0",
     "python-dotenv>=1.2.1",
     "pydantic-settings>=2.13.1",
+    "protobuf>=6.33.3",
 ]
 
 [dependency-groups]
 dev = [
+    "grpcio-tools>=1.76.0",
     "ruff>=0.15.2",
     "ty>=0.0.17",
 ]
diff --git a/stackchan_server/generated_protobuf/__init__.py b/stackchan_server/generated_protobuf/__init__.py
new file mode 100644
index 0000000..79429d2
--- /dev/null
+++ b/stackchan_server/generated_protobuf/__init__.py
@@ -0,0 +1 @@
+"""Generated protobuf modules for StackChan WebSocket messages."""
\ No newline at end of file
diff --git a/stackchan_server/generated_protobuf/websocket_message_pb2.py b/stackchan_server/generated_protobuf/websocket_message_pb2.py
new file mode 100644
index 0000000..985a939
--- /dev/null
+++ b/stackchan_server/generated_protobuf/websocket_message_pb2.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler.  DO NOT EDIT!
+# NO CHECKED-IN PROTOBUF GENCODE
+# source: websocket-message.proto
+# Protobuf Python Version: 6.31.1
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import runtime_version as _runtime_version
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+_runtime_version.ValidateProtobufRuntimeVersion(
+    _runtime_version.Domain.PUBLIC,
+    6,
+    31,
+    1,
+    '',
+    'websocket-message.proto'
+)
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17websocket-message.proto\x12\x16stackchan.websocket.v1\"\x8c\x07\n\x10WebSocketMessage\x12\x31\n\x04kind\x18\x01 \x01(\x0e\x32#.stackchan.websocket.v1.MessageKind\x12\x39\n\x0cmessage_type\x18\x02 \x01(\x0e\x32#.stackchan.websocket.v1.MessageType\x12\x0b\n\x03seq\x18\x03 \x01(\r\x12@\n\x0f\x61udio_pcm_start\x18\n \x01(\x0b\x32%.stackchan.websocket.v1.AudioPcmStartH\x00\x12<\n\x0e\x61udio_pcm_data\x18\x0b \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_pcm_end\x18\x0c \x01(\x0b\x32#.stackchan.websocket.v1.AudioPcmEndH\x00\x12@\n\x0f\x61udio_wav_start\x18\x14 \x01(\x0b\x32%.stackchan.websocket.v1.AudioWavStartH\x00\x12<\n\x0e\x61udio_wav_data\x18\x15 \x01(\x0b\x32\".stackchan.websocket.v1.AudioChunkH\x00\x12<\n\raudio_wav_end\x18\x16 \x01(\x0b\x32#.stackchan.websocket.v1.AudioWavEndH\x00\x12\x39\n\tstate_cmd\x18\x1e \x01(\x0b\x32$.stackchan.websocket.v1.StateCommandH\x00\x12>\n\rwake_word_evt\x18\x1f \x01(\x0b\x32%.stackchan.websocket.v1.WakeWordEventH\x00\x12\x37\n\tstate_evt\x18  \x01(\x0b\x32\".stackchan.websocket.v1.StateEventH\x00\x12@\n\x0espeak_done_evt\x18! 
\x01(\x0b\x32&.stackchan.websocket.v1.SpeakDoneEventH\x00\x12\x41\n\tservo_cmd\x18\" \x01(\x0b\x32,.stackchan.websocket.v1.ServoCommandSequenceH\x00\x12@\n\x0eservo_done_evt\x18# \x01(\x0b\x32&.stackchan.websocket.v1.ServoDoneEventH\x00\x42\x06\n\x04\x62ody\"\x0f\n\rAudioPcmStart\"\r\n\x0b\x41udioPcmEnd\"6\n\rAudioWavStart\x12\x13\n\x0bsample_rate\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\"\r\n\x0b\x41udioWavEnd\"\x1f\n\nAudioChunk\x12\x11\n\tpcm_bytes\x18\x01 \x01(\x0c\"E\n\x0cStateCommand\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"!\n\rWakeWordEvent\x12\x10\n\x08\x64\x65tected\x18\x01 \x01(\x08\"C\n\nStateEvent\x12\x35\n\x05state\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.StackchanState\"\x1e\n\x0eSpeakDoneEvent\x12\x0c\n\x04\x64one\x18\x01 \x01(\x08\"N\n\x14ServoCommandSequence\x12\x36\n\x08\x63ommands\x18\x01 \x03(\x0b\x32$.stackchan.websocket.v1.ServoCommand\"f\n\x0cServoCommand\x12\x32\n\x02op\x18\x01 \x01(\x0e\x32&.stackchan.websocket.v1.ServoOperation\x12\r\n\x05\x61ngle\x18\x02 \x01(\x11\x12\x13\n\x0b\x64uration_ms\x18\x03 \x01(\x11\"\x1e\n\x0eServoDoneEvent\x12\x0c\n\x04\x64one\x18\x01 
\x01(\x08*\x99\x02\n\x0bMessageKind\x12\x1c\n\x18MESSAGE_KIND_UNSPECIFIED\x10\x00\x12\x1a\n\x16MESSAGE_KIND_AUDIO_PCM\x10\x01\x12\x1a\n\x16MESSAGE_KIND_AUDIO_WAV\x10\x02\x12\x1a\n\x16MESSAGE_KIND_STATE_CMD\x10\x03\x12\x1e\n\x1aMESSAGE_KIND_WAKE_WORD_EVT\x10\x04\x12\x1a\n\x16MESSAGE_KIND_STATE_EVT\x10\x05\x12\x1f\n\x1bMESSAGE_KIND_SPEAK_DONE_EVT\x10\x06\x12\x1a\n\x16MESSAGE_KIND_SERVO_CMD\x10\x07\x12\x1f\n\x1bMESSAGE_KIND_SERVO_DONE_EVT\x10\x08*p\n\x0bMessageType\x12\x1c\n\x18MESSAGE_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12MESSAGE_TYPE_START\x10\x01\x12\x15\n\x11MESSAGE_TYPE_DATA\x10\x02\x12\x14\n\x10MESSAGE_TYPE_END\x10\x03*\x85\x01\n\x0eStackchanState\x12\x18\n\x14STACKCHAN_STATE_IDLE\x10\x00\x12\x1d\n\x19STACKCHAN_STATE_LISTENING\x10\x01\x12\x1c\n\x18STACKCHAN_STATE_THINKING\x10\x02\x12\x1c\n\x18STACKCHAN_STATE_SPEAKING\x10\x03*c\n\x0eServoOperation\x12\x19\n\x15SERVO_OPERATION_SLEEP\x10\x00\x12\x1a\n\x16SERVO_OPERATION_MOVE_X\x10\x01\x12\x1a\n\x16SERVO_OPERATION_MOVE_Y\x10\x02\x62\x06proto3')
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'websocket_message_pb2', _globals)
+if not _descriptor._USE_C_DESCRIPTORS:
+  DESCRIPTOR._loaded_options = None
+  _globals['_MESSAGEKIND']._serialized_start=1522
+  _globals['_MESSAGEKIND']._serialized_end=1803
+  _globals['_MESSAGETYPE']._serialized_start=1805
+  _globals['_MESSAGETYPE']._serialized_end=1917
+  _globals['_STACKCHANSTATE']._serialized_start=1920
+  _globals['_STACKCHANSTATE']._serialized_end=2053
+  _globals['_SERVOOPERATION']._serialized_start=2055
+  _globals['_SERVOOPERATION']._serialized_end=2154
+  _globals['_WEBSOCKETMESSAGE']._serialized_start=52
+  _globals['_WEBSOCKETMESSAGE']._serialized_end=960
+  _globals['_AUDIOPCMSTART']._serialized_start=962
+  _globals['_AUDIOPCMSTART']._serialized_end=977
+  _globals['_AUDIOPCMEND']._serialized_start=979
+  _globals['_AUDIOPCMEND']._serialized_end=992
+  _globals['_AUDIOWAVSTART']._serialized_start=994
+  _globals['_AUDIOWAVSTART']._serialized_end=1048
+  _globals['_AUDIOWAVEND']._serialized_start=1050
+  _globals['_AUDIOWAVEND']._serialized_end=1063
+  _globals['_AUDIOCHUNK']._serialized_start=1065
+  _globals['_AUDIOCHUNK']._serialized_end=1096
+  _globals['_STATECOMMAND']._serialized_start=1098
+  _globals['_STATECOMMAND']._serialized_end=1167
+  _globals['_WAKEWORDEVENT']._serialized_start=1169
+  _globals['_WAKEWORDEVENT']._serialized_end=1202
+  _globals['_STATEEVENT']._serialized_start=1204
+  _globals['_STATEEVENT']._serialized_end=1271
+  _globals['_SPEAKDONEEVENT']._serialized_start=1273
+  _globals['_SPEAKDONEEVENT']._serialized_end=1303
+  _globals['_SERVOCOMMANDSEQUENCE']._serialized_start=1305
+  _globals['_SERVOCOMMANDSEQUENCE']._serialized_end=1383
+  _globals['_SERVOCOMMAND']._serialized_start=1385
+  _globals['_SERVOCOMMAND']._serialized_end=1487
+  _globals['_SERVODONEEVENT']._serialized_start=1489
+  _globals['_SERVODONEEVENT']._serialized_end=1519
+# @@protoc_insertion_point(module_scope)
diff --git a/stackchan_server/protobuf_ws.py b/stackchan_server/protobuf_ws.py
new file mode 100644
index 0000000..94b808c
--- /dev/null
+++ b/stackchan_server/protobuf_ws.py
@@ -0,0 +1,170 @@
+from __future__ import annotations
+
+from collections.abc import Sequence
+from enum import StrEnum
+from typing import Any, Literal, cast
+
+from .generated_protobuf import websocket_message_pb2 as _ws_pb2
+
+ws_pb2: Any = _ws_pb2
+
+ServoMoveType: type[StrEnum] | None = None
+ServoWaitType: type[StrEnum] | None = None
+ServoMoveCommand = tuple[Literal["move_x", "move_y"] | StrEnum, int, int]
+ServoSleepCommand = tuple[Literal["sleep"] | StrEnum, int]
+ServoCommand = ServoMoveCommand | ServoSleepCommand
+
+
+def _ensure_range(value: int, *, minimum: int, maximum: int, label: str) -> int:
+    if value < minimum or value > maximum:  # bounds are inclusive; in-range values pass through unchanged
+        raise ValueError(f"{label} must be between {minimum} and {maximum}: {value}")
+    return value
+
+
+def parse_websocket_message(data: bytes) -> Any:
+    """Decode one binary WebSocket frame into a new WebSocketMessage."""
+    # FromString creates a fresh message and parses data into it in a single call.
+    return ws_pb2.WebSocketMessage.FromString(data)
+
+
+def _new_message(kind: int, message_type: int, seq: int) -> Any:  # envelope with the routing fields set; callers fill in the body
+    return ws_pb2.WebSocketMessage(kind=kind, message_type=message_type, seq=seq)
+
+
+def encode_audio_pcm_start_message(seq: int) -> bytes:
+    """Serialize an AUDIO_PCM / START envelope with an empty AudioPcmStart body."""
+    msg = _new_message(ws_pb2.MESSAGE_KIND_AUDIO_PCM, ws_pb2.MESSAGE_TYPE_START, seq)
+
+    # AudioPcmStart has no fields to assign, so SetInParent() is what marks the
+    # audio_pcm_start member of the body oneof as present.
+    msg.audio_pcm_start.SetInParent()
+    return msg.SerializeToString()
+
+
+def encode_audio_pcm_data_message(seq: int, pcm_bytes: bytes) -> bytes:
+    """Serialize an AUDIO_PCM / DATA envelope carrying pcm_bytes in an AudioChunk body."""
+    msg = _new_message(ws_pb2.MESSAGE_KIND_AUDIO_PCM, ws_pb2.MESSAGE_TYPE_DATA, seq)
+
+    # The chunk is the audio_pcm_data member of the body oneof.
+    chunk = msg.audio_pcm_data
+    chunk.pcm_bytes = pcm_bytes
+    return msg.SerializeToString()
+
+
+def encode_audio_pcm_end_message(seq: int) -> bytes:
+    """Serialize an AUDIO_PCM / END envelope with an empty AudioPcmEnd body."""
+    msg = _new_message(ws_pb2.MESSAGE_KIND_AUDIO_PCM, ws_pb2.MESSAGE_TYPE_END, seq)
+
+    # AudioPcmEnd has no fields to assign, so SetInParent() is what marks the
+    # audio_pcm_end member of the body oneof as present.
+    msg.audio_pcm_end.SetInParent()
+    return msg.SerializeToString()
+
+
+def encode_audio_wav_start_message(seq: int, *, sample_rate: int, channels: int) -> bytes:
+    """Serialize an AUDIO_WAV / START envelope announcing sample_rate and channels."""
+    msg = _new_message(ws_pb2.MESSAGE_KIND_AUDIO_WAV, ws_pb2.MESSAGE_TYPE_START, seq)
+
+    # Both values are coerced with int() before being stored in the uint32 fields.
+    start = msg.audio_wav_start
+    start.sample_rate = int(sample_rate)
+    start.channels = int(channels)
+    return msg.SerializeToString()
+
+
+def encode_audio_wav_data_message(seq: int, pcm_bytes: bytes) -> bytes:
+    """Serialize an AUDIO_WAV / DATA envelope carrying pcm_bytes in an AudioChunk body."""
+    msg = _new_message(ws_pb2.MESSAGE_KIND_AUDIO_WAV, ws_pb2.MESSAGE_TYPE_DATA, seq)
+
+    # The chunk is the audio_wav_data member of the body oneof.
+    chunk = msg.audio_wav_data
+    chunk.pcm_bytes = pcm_bytes
+    return msg.SerializeToString()
+
+
+def encode_audio_wav_end_message(seq: int) -> bytes:
+    """Serialize an AUDIO_WAV / END envelope with an empty AudioWavEnd body."""
+    msg = _new_message(ws_pb2.MESSAGE_KIND_AUDIO_WAV, ws_pb2.MESSAGE_TYPE_END, seq)
+
+    # AudioWavEnd has no fields to assign, so SetInParent() is what marks the
+    # audio_wav_end member of the body oneof as present.
+    msg.audio_wav_end.SetInParent()
+    return msg.SerializeToString()
+
+
+def encode_state_command_message(seq: int, state_id: int) -> bytes:
+    """Serialize a STATE_CMD / DATA envelope whose StateCommand body carries state_id."""
+    msg = _new_message(ws_pb2.MESSAGE_KIND_STATE_CMD, ws_pb2.MESSAGE_TYPE_DATA, seq)
+
+    # state_id is coerced with int() and stored as the StackchanState enum number.
+    command = msg.state_cmd
+    command.state = int(state_id)
+    return msg.SerializeToString()
+
+
+def encode_servo_command_message(seq: int, commands: Sequence[ServoCommand]) -> bytes:
+    """Serialize a SERVO_CMD / DATA envelope; raises ValueError on a bad count, command, angle or duration."""
+    normalized = list(commands)
+    _ensure_range(len(normalized), minimum=0, maximum=255, label="servo command count")
+
+    message = _new_message(ws_pb2.MESSAGE_KIND_SERVO_CMD, ws_pb2.MESSAGE_TYPE_DATA, seq)
+    # An empty sequence never reaches add() below, so mark the oneof body present
+    # explicitly; otherwise the frame would be sent with no body set at all.
+    message.servo_cmd.SetInParent()
+
+    for index, command in enumerate(normalized):
+        encoded = message.servo_cmd.commands.add()
+        if len(command) == 2:
+            name, raw_duration_ms = cast(ServoSleepCommand, command)
+            if str(name) != "sleep":
+                raise ValueError(f"unsupported servo command at index {index}: {name}")
+            encoded.op = ws_pb2.SERVO_OPERATION_SLEEP
+            encoded.duration_ms = _ensure_range(
+                int(raw_duration_ms),
+                minimum=-32768,
+                maximum=32767,
+                label="sleep duration",
+            )
+            continue
+
+        if len(command) == 3:
+            name, raw_angle, raw_duration_ms = cast(ServoMoveCommand, command)
+            if str(name) not in ("move_x", "move_y"):
+                raise ValueError(f"unsupported servo command at index {index}: {name}")
+            encoded.op = (
+                ws_pb2.SERVO_OPERATION_MOVE_X
+                if str(name) == "move_x"
+                else ws_pb2.SERVO_OPERATION_MOVE_Y
+            )
+            encoded.angle = _ensure_range(
+                int(raw_angle),
+                minimum=-128,
+                maximum=127,
+                label="servo angle",
+            )
+            encoded.duration_ms = _ensure_range(
+                int(raw_duration_ms),
+                minimum=-32768,
+                maximum=32767,
+                label="servo duration",
+            )
+            continue
+
+        raise ValueError(f"unsupported servo command at index {index}: {command}")
+
+    return message.SerializeToString()
+
+
+__all__ = [
+    "ServoCommand",
+    "encode_audio_pcm_data_message",
+    "encode_audio_pcm_end_message",
+    "encode_audio_pcm_start_message",
+    "encode_audio_wav_data_message",
+    "encode_audio_wav_end_message",
+    "encode_audio_wav_start_message",
+    "encode_servo_command_message",
+    "encode_state_command_message",
+    "parse_websocket_message",
+    "ws_pb2",
+]
diff --git a/stackchan_server/speak.py b/stackchan_server/speak.py
index b748a37..743ae36 100644
--- a/stackchan_server/speak.py
+++ b/stackchan_server/speak.py
@@ -2,7 +2,6 @@
 
 import asyncio
 import io
-import struct
 import wave
 from datetime import UTC, datetime
 from logging import getLogger
@@ -12,6 +11,11 @@
 from fastapi import WebSocket, WebSocketDisconnect
 
 from .listen import TimeoutError
+from .protobuf_ws import (
+    encode_audio_wav_data_message,
+    encode_audio_wav_end_message,
+    encode_audio_wav_start_message,
+)
 from .types import AudioFormat, SpeechSynthesizer, StreamingSpeechSynthesizer
 
 logger = getLogger(__name__)
@@ -22,11 +26,11 @@ def __init__(
         self,
         *,
         websocket: WebSocket,
-        ws_header_fmt: str,
-        wav_kind: int,
-        start_msg_type: int,
-        data_msg_type: int,
-        end_msg_type: int,
+        ws_header_fmt: str | None = None,
+        wav_kind: int | None = None,
+        start_msg_type: int | None = None,
+        data_msg_type: int | None = None,
+        end_msg_type: int | None = None,
         down_wav_chunk: int,
         down_segment_millis: int,
         down_segment_stagger_millis: int,
@@ -36,11 +40,6 @@ def __init__(
         debug_recording: bool,
     ) -> None:
         self.ws = websocket
-        self.ws_header_fmt = ws_header_fmt
-        self.wav_kind = wav_kind
-        self.start_msg_type = start_msg_type
-        self.data_msg_type = data_msg_type
-        self.end_msg_type = end_msg_type
         self.down_wav_chunk = down_wav_chunk
         self.down_segment_millis = down_segment_millis
         self.down_segment_stagger_millis = down_segment_stagger_millis
@@ -309,40 +308,21 @@ async def _send_segment(
         next_seq: Callable[[], int],
     ) -> None:
         logger.info("Sending segment bytes=%d", len(segment_pcm))
-        start_payload = struct.pack("<IH", tts_sample_rate, tts_channels)
-        start_hdr = struct.pack(
-            self.ws_header_fmt,
-            self.wav_kind,
-            self.start_msg_type,
-            0,
-            next_seq(),
-            len(start_payload),
+        await self.ws.send_bytes(
+            encode_audio_wav_start_message(
+                next_seq(),
+                sample_rate=tts_sample_rate,
+                channels=tts_channels,
+            )
         )
-        await self.ws.send_bytes(start_hdr + start_payload)
 
         seg_offset = 0
         seg_total = len(segment_pcm)
         while seg_offset < seg_total:
             chunk = segment_pcm[seg_offset : seg_offset + self.down_wav_chunk]
-            data_hdr = struct.pack(
-                self.ws_header_fmt,
-                self.wav_kind,
-                self.data_msg_type,
-                0,
-                next_seq(),
-                len(chunk),
-            )
-            await self.ws.send_bytes(data_hdr + chunk)
+            await self.ws.send_bytes(encode_audio_wav_data_message(next_seq(), chunk))
             seg_offset += len(chunk)
 
-        end_hdr = struct.pack(
-            self.ws_header_fmt,
-            self.wav_kind,
-            self.end_msg_type,
-            0,
-            next_seq(),
-            0,
-        )
-        await self.ws.send_bytes(end_hdr)
+        await self.ws.send_bytes(encode_audio_wav_end_message(next_seq()))
 
 __all__ = ["SpeakHandler"]
diff --git a/stackchan_server/ws_proxy.py b/stackchan_server/ws_proxy.py
index b92ef41..488f17c 100644
--- a/stackchan_server/ws_proxy.py
+++ b/stackchan_server/ws_proxy.py
@@ -2,29 +2,34 @@
 
 import asyncio
 import os
-import struct
 from collections import deque
 from contextlib import suppress
 from enum import IntEnum, StrEnum
 from logging import getLogger
 from pathlib import Path
-from typing import Literal, Optional, Sequence, TypeAlias, cast
+from typing import Any, Literal, Optional, Sequence, TypeAlias
 
 from fastapi import WebSocket, WebSocketDisconnect
+from google.protobuf.message import DecodeError
 
+from .generated_protobuf import websocket_message_pb2 as _ws_pb2
 from .listen import EmptyTranscriptError, ListenHandler, TimeoutError
+from .protobuf_ws import (
+    encode_servo_command_message,
+    encode_state_command_message,
+    parse_websocket_message,
+)
 from .speak import SpeakHandler
 from .static import LISTEN_AUDIO_FORMAT
 from .types import SpeechRecognizer, SpeechSynthesizer
 
 logger = getLogger(__name__)
 
+ws_pb2: Any = _ws_pb2
+
 _BASE_DIR = Path(__file__).resolve().parent
 _RECORDINGS_DIR = _BASE_DIR / "recordings"
 
-_WS_HEADER_FMT = "<BBBHH"  # kind, msg_type, reserved, seq, payload_bytes
-_WS_HEADER_SIZE = struct.calcsize(_WS_HEADER_FMT)
-
 _DOWN_WAV_CHUNK = 4096  # bytes per WebSocket frame for synthesized audio (raw PCM)
 _DOWN_SEGMENT_MILLIS = (
     2000  # duration of a single START-DATA-END segment in milliseconds
@@ -43,29 +48,6 @@ class FirmwareState(IntEnum):
     SPEAKING = 3
 
 
-class _WsKind(IntEnum):
-    PCM = 1
-    WAV = 2
-    STATE_CMD = 3
-    WAKEWORD_EVT = 4
-    STATE_EVT = 5
-    SPEAK_DONE_EVT = 6
-    SERVO_CMD = 7
-    SERVO_DONE_EVT = 8
-
-
-class _WsMsgType(IntEnum):
-    START = 1
-    DATA = 2
-    END = 3
-
-
-class _ServoOp(IntEnum):
-    SLEEP = 0
-    MOVE_X = 1
-    MOVE_Y = 2
-
-
 class ServoMoveType(StrEnum):
     MOVE_X = "move_x"
     MOVE_Y = "move_y"
@@ -82,64 +64,6 @@ class ServoWaitType(StrEnum):
 ServoCommand: TypeAlias = ServoMoveCommand | ServoSleepCommand
 
 
-def _ensure_range(value: int, *, minimum: int, maximum: int, label: str) -> int:
-    if not minimum <= value <= maximum:
-        raise ValueError(f"{label} must be between {minimum} and {maximum}: {value}")
-    return value
-
-
-def _encode_servo_commands(commands: Sequence[ServoCommand]) -> bytes:
-    normalized = list(commands)
-    _ensure_range(len(normalized), minimum=0, maximum=255, label="servo command count")
-
-    payload = bytearray()
-    payload.append(len(normalized))
-
-    for index, command in enumerate(normalized):
-        if len(command) == 2:
-            sleep_command = cast(ServoSleepCommand, command)
-            name, raw_duration_ms = sleep_command
-            name = str(name)
-            if name != "sleep":
-                raise ValueError(
-                    f"unsupported servo command at index {index}: {name}"
-                )
-            duration_ms = _ensure_range(
-                int(raw_duration_ms),
-                minimum=-32768,
-                maximum=32767,
-                label="sleep duration",
-            )
-            payload.append(_ServoOp.SLEEP)
-            payload.extend(struct.pack("<h", duration_ms))
-            continue
-
-        if len(command) == 3:
-            move_command = cast(ServoMoveCommand, command)
-            name, raw_angle, raw_duration_ms = move_command
-            name = str(name)
-            if name not in ("move_x", "move_y"):
-                raise ValueError(
-                    f"unsupported servo command at index {index}: {name}"
-                )
-            angle = _ensure_range(
-                int(raw_angle), minimum=-128, maximum=127, label="servo angle"
-            )
-            duration_ms = _ensure_range(
-                int(raw_duration_ms),
-                minimum=-32768,
-                maximum=32767,
-                label="servo duration",
-            )
-            payload.append(_ServoOp.MOVE_X if name == "move_x" else _ServoOp.MOVE_Y)
-            payload.extend(struct.pack("<bh", angle, duration_ms))
-            continue
-
-        raise ValueError(f"unsupported servo command at index {index}: {command}")
-
-    return bytes(payload)
-
-
 class WsProxy:
     def __init__(
         self,
@@ -164,11 +88,6 @@ def __init__(
         )
         self._speaker = SpeakHandler(
             websocket=self.ws,
-            ws_header_fmt=_WS_HEADER_FMT,
-            wav_kind=_WsKind.WAV.value,
-            start_msg_type=_WsMsgType.START.value,
-            data_msg_type=_WsMsgType.DATA.value,
-            end_msg_type=_WsMsgType.END.value,
             down_wav_chunk=_DOWN_WAV_CHUNK,
             down_segment_millis=_DOWN_SEGMENT_MILLIS,
             down_segment_stagger_millis=_DOWN_SEGMENT_STAGGER_MILLIS,
@@ -237,13 +156,14 @@ async def reset_state(self) -> None:
         await self.send_state_command(FirmwareState.IDLE)
 
     async def move_servo(self, commands: Sequence[ServoCommand]) -> None:
-        payload = _encode_servo_commands(commands)
         previous_counter = self._servo_sent_counter
         target_counter = previous_counter + 1
         self._servo_sent_counter = target_counter
         self._pending_servo_wait_targets.append(target_counter)
         try:
-            await self._send_packet(_WsKind.SERVO_CMD, _WsMsgType.DATA, payload)
+            await self.ws.send_bytes(
+                encode_servo_command_message(self._next_down_seq(), commands)
+            )
         except Exception:
             if (
                 self._pending_servo_wait_targets
@@ -285,60 +205,65 @@ async def start_talking(self, text: str) -> None:
     async def _receive_loop(self) -> None:
         try:
             while True:
-                message = await self.ws.receive_bytes()
-                if len(message) < _WS_HEADER_SIZE:
-                    await self.ws.close(code=1003, reason="header too short")
+                raw_message = await self.ws.receive_bytes()
+                try:
+                    message = parse_websocket_message(raw_message)
+                except DecodeError:
+                    await self.ws.close(code=1003, reason="invalid protobuf message")
                     break
 
-                kind, msg_type, _reserved, _seq, payload_bytes = struct.unpack(
-                    _WS_HEADER_FMT, message[:_WS_HEADER_SIZE]
-                )
-
-                payload = message[_WS_HEADER_SIZE:]
-                if payload_bytes != len(payload):
-                    await self.ws.close(code=1003, reason="payload length mismatch")
-                    break
+                if message.kind == ws_pb2.MESSAGE_KIND_AUDIO_PCM:
+                    body_name = message.WhichOneof("body")
 
-                if kind == _WsKind.PCM:
-                    if msg_type == _WsMsgType.START:
+                    if (
+                        message.message_type == ws_pb2.MESSAGE_TYPE_START
+                        and body_name == "audio_pcm_start"
+                    ):
                         if not await self._listener.handle_start(self.ws):
                             break
                         continue
 
-                    if msg_type == _WsMsgType.DATA:
+                    if (
+                        message.message_type == ws_pb2.MESSAGE_TYPE_DATA
+                        and body_name == "audio_pcm_data"
+                    ):
+                        payload = bytes(message.audio_pcm_data.pcm_bytes)
                         if not await self._listener.handle_data(
-                            self.ws, payload_bytes, payload
+                            self.ws, len(payload), payload
                         ):
                             break
                         continue
 
-                    if msg_type == _WsMsgType.END:
+                    if (
+                        message.message_type == ws_pb2.MESSAGE_TYPE_END
+                        and body_name == "audio_pcm_end"
+                    ):
                         await self._listener.handle_end(
                             self.ws,
-                            payload_bytes=payload_bytes,
-                            payload=payload,
+                            payload_bytes=0,
+                            payload=b"",
                             send_state_command=self.send_state_command,
                             thinking_state=FirmwareState.THINKING,
                         )
                         continue
 
-                    await self.ws.close(code=1003, reason="unknown PCM msg type")
+                    await self.ws.close(code=1003, reason="unknown PCM protobuf body")
                     break
 
-                if kind == _WsKind.WAKEWORD_EVT:
-                    self._handle_wakeword_event(msg_type, payload)
+                if message.kind == ws_pb2.MESSAGE_KIND_WAKE_WORD_EVT:
+                    self._handle_wakeword_event(message)
                     continue
 
-                if kind == _WsKind.STATE_EVT:
-                    self._handle_state_event(msg_type, payload)
+                if message.kind == ws_pb2.MESSAGE_KIND_STATE_EVT:
+                    self._handle_state_event(message)
                     continue
 
-                if kind == _WsKind.SPEAK_DONE_EVT:
-                    self._handle_speak_done_event(msg_type, payload)
+                if message.kind == ws_pb2.MESSAGE_KIND_SPEAK_DONE_EVT:
+                    self._handle_speak_done_event(message)
                     continue
 
-                if kind == _WsKind.SERVO_DONE_EVT:
-                    self._handle_servo_done_event(msg_type, payload)
+                if message.kind == ws_pb2.MESSAGE_KIND_SERVO_DONE_EVT:
+                    self._handle_servo_done_event(message)
                     continue
 
                 await self.ws.close(code=1003, reason="unsupported kind")
@@ -348,20 +273,22 @@ async def _receive_loop(self) -> None:
         finally:
             self._closed = True
 
-    def _handle_wakeword_event(self, msg_type: int, payload: bytes) -> None:
-        if msg_type != _WsMsgType.DATA:
+    def _handle_wakeword_event(self, message: Any) -> None:
+        if message.message_type != ws_pb2.MESSAGE_TYPE_DATA:
             return
-        if len(payload) < 1:
+        if message.WhichOneof("body") != "wake_word_evt":
+            return
+        if not message.wake_word_evt.detected:
             return
         logger.info("Received wakeword event")
         self._wakeword_event.set()
 
-    def _handle_state_event(self, msg_type: int, payload: bytes) -> None:
-        if msg_type != _WsMsgType.DATA:
+    def _handle_state_event(self, message: Any) -> None:
+        if message.message_type != ws_pb2.MESSAGE_TYPE_DATA:
             return
-        if len(payload) < 1:
+        if message.WhichOneof("body") != "state_evt":
             return
-        raw_state = int(payload[0])
+        raw_state = int(message.state_evt.state)
         try:
             state = FirmwareState(raw_state)
             self._current_firmware_state = state
@@ -369,38 +296,29 @@ def _handle_state_event(self, msg_type: int, payload: bytes) -> None:
         except ValueError:
             logger.info("Received firmware state=%d", raw_state)
 
-    def _handle_speak_done_event(self, msg_type: int, payload: bytes) -> None:
-        if msg_type != _WsMsgType.DATA:
+    def _handle_speak_done_event(self, message: Any) -> None:
+        if message.message_type != ws_pb2.MESSAGE_TYPE_DATA:
+            return
+        if message.WhichOneof("body") != "speak_done_evt":
             return
-        if len(payload) < 1:
+        if not message.speak_done_evt.done:
             return
         self._speaker.handle_speak_done_event()
 
-    def _handle_servo_done_event(self, msg_type: int, payload: bytes) -> None:
-        if msg_type != _WsMsgType.DATA:
+    def _handle_servo_done_event(self, message: Any) -> None:
+        if message.message_type != ws_pb2.MESSAGE_TYPE_DATA:
+            return
+        if message.WhichOneof("body") != "servo_done_evt":
             return
-        if len(payload) < 1:
+        if not message.servo_done_evt.done:
             return
         self._servo_done_counter += 1
         logger.info("Received servo done event")
 
     async def _send_state_command(self, state_id: int | FirmwareState) -> None:
-        payload = struct.pack("<B", int(state_id))
-        await self._send_packet(_WsKind.STATE_CMD, _WsMsgType.DATA, payload)
-
-    async def _send_packet(
-        self, kind: _WsKind, msg_type: _WsMsgType, payload: bytes = b""
-    ) -> None:
-        hdr = struct.pack(
-            _WS_HEADER_FMT,
-            int(kind),
-            int(msg_type),
-            0,
-            self._down_seq,
-            len(payload),
+        await self.ws.send_bytes(
+            encode_state_command_message(self._next_down_seq(), int(state_id))
         )
-        await self.ws.send_bytes(hdr + payload)
-        self._down_seq += 1
 
     async def _wait_for_counter(
         self,
diff --git a/uv.lock b/uv.lock
index 2540b7b..1a6db32 100644
--- a/uv.lock
+++ b/uv.lock
@@ -549,6 +549,39 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/8c/cc/27ba60ad5a5f2067963e6a858743500df408eb5855e98be778eaef8c9b02/grpcio_status-1.76.0-py3-none-any.whl", hash = "sha256:380568794055a8efbbd8871162df92012e0228a5f6dffaf57f2a00c534103b18", size = 14425 },
 ]
 
+[[package]]
+name = "grpcio-tools"
+version = "1.76.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "grpcio" },
+    { name = "protobuf" },
+    { name = "setuptools" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a0/77/17d60d636ccd86a0db0eccc24d02967bbc3eea86b9db7324b04507ebaa40/grpcio_tools-1.76.0.tar.gz", hash = "sha256:ce80169b5e6adf3e8302f3ebb6cb0c3a9f08089133abca4b76ad67f751f5ad88", size = 5390807 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/13/01/b16fe73f129df49811d886dc99d3813a33cf4d1c6e101252b81c895e929f/grpcio_tools-1.76.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:ff48969f81858397ef33a36b326f2dbe2053a48b254593785707845db73c8f44", size = 2546312 },
+    { url = "https://files.pythonhosted.org/packages/25/17/2594c5feb76bb0b25bfbf91ec1075b276e1b2325e4bc7ea649a7b5dbf353/grpcio_tools-1.76.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:aa2f030fd0ef17926026ee8e2b700e388d3439155d145c568fa6b32693277613", size = 5839627 },
+    { url = "https://files.pythonhosted.org/packages/c7/c6/097b1aa26fbf72fb3cdb30138a2788529e4f10d8759de730a83f5c06726e/grpcio_tools-1.76.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bacbf3c54f88c38de8e28f8d9b97c90b76b105fb9ddef05d2c50df01b32b92af", size = 2592817 },
+    { url = "https://files.pythonhosted.org/packages/03/78/d1d985b48592a674509a85438c1a3d4c36304ddfc99d1b05d27233b51062/grpcio_tools-1.76.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0d4e4afe9a0e3c24fad2f1af45f98cf8700b2bfc4d790795756ba035d2ea7bdc", size = 2905186 },
+    { url = "https://files.pythonhosted.org/packages/b9/0e/770afbb47f0b5f594b93a7b46a95b892abda5eebe60efb511e96cee52170/grpcio_tools-1.76.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fbbd4e1fc5af98001ceef5e780e8c10921d94941c3809238081e73818ef707f1", size = 2656188 },
+    { url = "https://files.pythonhosted.org/packages/3d/2b/017c2fcf4c5d3cf00cf7d5ce21eb88521de0d89bdcf26538ad2862ec6d07/grpcio_tools-1.76.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b05efe5a59883ab8292d596657273a60e0c3e4f5a9723c32feb9fc3a06f2f3ef", size = 3109141 },
+    { url = "https://files.pythonhosted.org/packages/e9/5f/2495f88e3d50c6f2c2da2752bad4fa3a30c52ece6c9d8b0c636cd8b1430b/grpcio_tools-1.76.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:be483b90e62b7892eb71fa1fc49750bee5b2ee35b5ec99dd2b32bed4bedb5d71", size = 3657892 },
+    { url = "https://files.pythonhosted.org/packages/5e/1d/c4f39d31b19d9baf35d900bf3f969ce1c842f63a8560c8003ed2e5474760/grpcio_tools-1.76.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:630cd7fd3e8a63e20703a7ad816979073c2253e591b5422583c27cae2570de73", size = 3324778 },
+    { url = "https://files.pythonhosted.org/packages/b4/b6/35ee3a6e4af85a93da28428f81f4b29bcb36f6986b486ad71910fcc02e25/grpcio_tools-1.76.0-cp313-cp313-win32.whl", hash = "sha256:eb2567280f9f6da5444043f0e84d8408c7a10df9ba3201026b30e40ef3814736", size = 993084 },
+    { url = "https://files.pythonhosted.org/packages/f3/7a/5bd72344d86ee860e5920c9a7553cfe3bc7b1fce79f18c00ac2497f5799f/grpcio_tools-1.76.0-cp313-cp313-win_amd64.whl", hash = "sha256:0071b1c0bd0f5f9d292dca4efab32c92725d418e57f9c60acdc33c0172af8b53", size = 1158151 },
+    { url = "https://files.pythonhosted.org/packages/f0/c0/aa20eebe8f3553b7851643e9c88d237c3a6ca30ade646897e25dbb27be99/grpcio_tools-1.76.0-cp314-cp314-linux_armv7l.whl", hash = "sha256:c53c5719ef2a435997755abde3826ba4087174bd432aa721d8fac781fcea79e4", size = 2546297 },
+    { url = "https://files.pythonhosted.org/packages/d9/98/6af702804934443c1d0d4d27d21b990d92d22ddd1b6bec6b056558cbbffa/grpcio_tools-1.76.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:e3db1300d7282264639eeee7243f5de7e6a7c0283f8bf05d66c0315b7b0f0b36", size = 5839804 },
+    { url = "https://files.pythonhosted.org/packages/ea/8d/7725fa7b134ef8405ffe0a37c96eeb626e5af15d70e1bdac4f8f1abf842e/grpcio_tools-1.76.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b018a4b7455a7e8c16d0fdb3655a6ba6c9536da6de6c5d4f11b6bb73378165b", size = 2593922 },
+    { url = "https://files.pythonhosted.org/packages/de/ff/5b6b5012c79fa72f9107dc13f7226d9ce7e059ea639fd8c779e0dd284386/grpcio_tools-1.76.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:ec6e4de3866e47cfde56607b1fae83ecc5aa546e06dec53de11f88063f4b5275", size = 2905327 },
+    { url = "https://files.pythonhosted.org/packages/24/01/2691d369ea462cd6b6c92544122885ca01f7fa5ac75dee023e975e675858/grpcio_tools-1.76.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b8da4d828883913f1852bdd67383713ae5c11842f6c70f93f31893eab530aead", size = 2656214 },
+    { url = "https://files.pythonhosted.org/packages/6a/e7/3f8856e6ec3dd492336a91572993344966f237b0e3819fbe96437b19d313/grpcio_tools-1.76.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:5c120c2cf4443121800e7f9bcfe2e94519fa25f3bb0b9882359dd3b252c78a7b", size = 3109889 },
+    { url = "https://files.pythonhosted.org/packages/f3/e4/ce5248072e47db276dc7e069e93978dcde490c959788ce7cce8081d0bfdc/grpcio_tools-1.76.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:8b7df5591d699cd9076065f1f15049e9c3597e0771bea51c8c97790caf5e4197", size = 3657939 },
+    { url = "https://files.pythonhosted.org/packages/f6/df/81ff88af93c52135e425cd5ec9fe8b186169c7d5f9e0409bdf2bbedc3919/grpcio_tools-1.76.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a25048c5f984d33e3f5b6ad7618e98736542461213ade1bd6f2fcfe8ce804e3d", size = 3324752 },
+    { url = "https://files.pythonhosted.org/packages/35/3d/f6b83044afbf6522254a3b509515a00fed16a819c87731a478dbdd1d35c1/grpcio_tools-1.76.0-cp314-cp314-win32.whl", hash = "sha256:4b77ce6b6c17869858cfe14681ad09ed3a8a80e960e96035de1fd87f78158740", size = 1015578 },
+    { url = "https://files.pythonhosted.org/packages/95/4d/31236cddb7ffb09ba4a49f4f56d2608fec3bbb21c7a0a975d93bca7cd22e/grpcio_tools-1.76.0-cp314-cp314-win_amd64.whl", hash = "sha256:2ccd2c8d041351cc29d0fc4a84529b11ee35494a700b535c1f820b642f2a72fc", size = 1190242 },
+]
+
 [[package]]
 name = "h11"
 version = "0.16.0"
@@ -1187,6 +1220,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6d/78/097c0798b1dab9f8affe73da9642bb4500e098cb27fd8dc9724816ac747b/ruff-0.15.2-py3-none-win_arm64.whl", hash = "sha256:cabddc5822acdc8f7b5527b36ceac55cc51eec7b1946e60181de8fe83ca8876e", size = 10941649 },
 ]
 
+[[package]]
+name = "setuptools"
+version = "82.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/4f/db/cfac1baf10650ab4d1c111714410d2fbb77ac5a616db26775db562c8fab2/setuptools-82.0.1.tar.gz", hash = "sha256:7d872682c5d01cfde07da7bccc7b65469d3dca203318515ada1de5eda35efbf9", size = 1152316 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9d/76/f789f7a86709c6b087c5a2f52f911838cad707cc613162401badc665acfe/setuptools-82.0.1-py3-none-any.whl", hash = "sha256:a59e362652f08dcd477c78bb6e7bd9d80a7995bc73ce773050228a348ce2e5bb", size = 1006223 },
+]
+
 [[package]]
 name = "sniffio"
 version = "1.3.1"
@@ -1412,6 +1454,7 @@ dependencies = [
     { name = "fastapi" },
     { name = "google-cloud-speech" },
     { name = "google-genai" },
+    { name = "protobuf" },
     { name = "pydantic-settings" },
     { name = "python-dotenv" },
     { name = "uvicorn", extra = ["standard"] },
@@ -1420,6 +1463,7 @@ dependencies = [
 
 [package.dev-dependencies]
 dev = [
+    { name = "grpcio-tools" },
     { name = "ruff" },
     { name = "ty" },
 ]
@@ -1432,6 +1476,7 @@ requires-dist = [
     { name = "fastapi", specifier = ">=0.128.0" },
     { name = "google-cloud-speech", specifier = ">=2.35.0" },
     { name = "google-genai", specifier = ">=1.59.0" },
+    { name = "protobuf", specifier = ">=6.33.3" },
     { name = "pydantic-settings", specifier = ">=2.13.1" },
     { name = "python-dotenv", specifier = ">=1.2.1" },
     { name = "uvicorn", extras = ["standard"], specifier = ">=0.40.0" },
@@ -1440,6 +1485,7 @@ requires-dist = [
 
 [package.metadata.requires-dev]
 dev = [
+    { name = "grpcio-tools", specifier = ">=1.76.0" },
     { name = "ruff", specifier = ">=0.15.2" },
     { name = "ty", specifier = ">=0.0.17" },
 ]

From 595e765cef8815a31edf7d97c3413320a3b3a6d6 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sun, 19 Apr 2026 15:14:59 +0900
Subject: [PATCH 2/5] =?UTF-8?q?feat:=20Protobuf=E3=83=A1=E3=83=83=E3=82=BB?=
 =?UTF-8?q?=E3=83=BC=E3=82=B8=E3=82=BF=E3=82=A4=E3=83=97=E3=81=AB=E5=9F=BA?=
 =?UTF-8?q?=E3=81=A5=E3=81=8FWebSocket=E3=83=A1=E3=83=83=E3=82=BB=E3=83=BC?=
 =?UTF-8?q?=E3=82=B8=E5=87=A6=E7=90=86=E3=81=AE=E6=9B=B4=E6=96=B0=E3=81=A8?=
 =?UTF-8?q?=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF=E3=82=BF=E3=83=AA=E3=83=B3?=
 =?UTF-8?q?=E3=82=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 firmware/include/listening.hpp |   2 +-
 firmware/include/protocols.hpp |  60 ----------------
 firmware/include/servo.hpp     |   2 +-
 firmware/include/speaking.hpp  |   8 ++-
 firmware/src/listening.cpp     |  20 +++---
 firmware/src/main.cpp          |  53 ++++++--------
 firmware/src/protocols.cpp     | 104 ---------------------------
 firmware/src/servo.cpp         |  20 +++---
 firmware/src/speaking.cpp      | 125 +++++++++++++++------------------
 9 files changed, 103 insertions(+), 291 deletions(-)

diff --git a/firmware/include/listening.hpp b/firmware/include/listening.hpp
index cf00113..0e18ba8 100644
--- a/firmware/include/listening.hpp
+++ b/firmware/include/listening.hpp
@@ -36,7 +36,7 @@ class Listening
 
 private:
   void updateLevelStats(const int16_t *samples, size_t sampleCount);
-  bool sendPacket(MessageType type, const int16_t *samples, size_t sampleCount);
+  bool sendPacket(stackchan_websocket_v1_MessageType type, const int16_t *samples, size_t sampleCount);
   void ringPush(const int16_t *src, size_t samples);
   size_t ringPop(int16_t *dst, size_t samples);
 
diff --git a/firmware/include/protocols.hpp b/firmware/include/protocols.hpp
index feea050..89cb7d6 100644
--- a/firmware/include/protocols.hpp
+++ b/firmware/include/protocols.hpp
@@ -7,70 +7,10 @@
 
 #include "../lib/generated_protobuf/websocket-message.pb.h"
 
-// Internal compatibility metadata for message routing after protobuf decode.
-// This is no longer sent on the wire directly.
-
-enum class MessageKind : uint8_t
-{
-	AudioPcm = 1, // uplink PCM16LE stream (client -> server)
-	AudioWav = 2, // downlink WAV bytes (server -> client)
-	StateCmd = 3, // state transition command (server -> client)
-	WakeWordEvt = 4, // wake word event (client -> server)
-	StateEvt = 5, // current state event (client -> server)
-	SpeakDoneEvt = 6, // speaking completed event (client -> server)
-	ServoCmd = 7, // servo command sequence (server -> client)
-	ServoDoneEvt = 8, // servo sequence completed event (client -> server)
-};
-
-enum class MessageType : uint8_t
-{
-	START = 1,
-	DATA = 2,
-	END = 3,
-};
-
-struct __attribute__((packed)) WsHeader
-{
-	uint8_t kind;        // MessageKind
-	uint8_t messageType; // MessageType
-	uint8_t reserved;    // 0 (flags/reserved)
-	uint32_t seq;        // sequence number
-	uint32_t payloadBytes; // bytes following the header
-};
-
-// payload for kind=StateCmd, messageType=DATA
-// 1 byte: target state id (matches StateMachine::State)
-enum class RemoteState : uint8_t
-{
-	Idle = 0,
-	Listening = 1,
-	Thinking = 2,
-	Speaking = 3,
-};
-
-// payload for kind=ServoCmd, messageType=DATA
-// <uint8_t command_count><commands...>
-//   command op=Sleep: <uint8_t op><int16_t duration_ms>
-//   command op=MoveX/Y: <uint8_t op><int8_t angle><int16_t duration_ms>
-enum class ServoCommandOp : uint8_t
-{
-	Sleep = 0,
-	MoveX = 1,
-	MoveY = 2,
-};
-
 constexpr size_t kProtoAudioChunkMaxBytes = 4096;
 constexpr size_t kProtoServoCommandMaxCount = 255;
 constexpr size_t kMaxEncodedWebSocketMessageBytes = stackchan_websocket_v1_WebSocketMessage_size;
 
-stackchan_websocket_v1_MessageKind toProtoMessageKind(MessageKind kind);
-stackchan_websocket_v1_MessageType toProtoMessageType(MessageType type);
-stackchan_websocket_v1_StackchanState toProtoState(RemoteState state);
-stackchan_websocket_v1_ServoOperation toProtoServoOperation(ServoCommandOp op);
-
-RemoteState fromProtoState(stackchan_websocket_v1_StackchanState state);
-ServoCommandOp fromProtoServoOperation(stackchan_websocket_v1_ServoOperation op);
-
 bool setProtoAudioChunk(
 	stackchan_websocket_v1_AudioChunk &chunk,
 	const uint8_t *data,
diff --git a/firmware/include/servo.hpp b/firmware/include/servo.hpp
index f9b4e3b..de7aafd 100644
--- a/firmware/include/servo.hpp
+++ b/firmware/include/servo.hpp
@@ -51,7 +51,7 @@ class BodyServo
 
   struct Step
   {
-    ServoCommandOp op;
+    stackchan_websocket_v1_ServoOperation op;
     int8_t angle = 0;
     int16_t duration_ms = 0;
   };
diff --git a/firmware/include/speaking.hpp b/firmware/include/speaking.hpp
index fbf90c1..e3630b3 100644
--- a/firmware/include/speaking.hpp
+++ b/firmware/include/speaking.hpp
@@ -19,8 +19,10 @@ class Speaking
   void begin();
   void end();
 
-  // Process one WS audio message of kind AudioWav
-  void handleWavMessage(const WsHeader &hdr, const uint8_t *body, size_t bodyLen);
+  // Process AudioWav protobuf messages.
+  void handleWavStart(uint32_t seq, uint32_t sampleRate, uint16_t channels);
+  void handleWavData(uint32_t seq, const uint8_t *body, size_t bodyLen);
+  void handleWavEnd(uint32_t seq);
 
   // Called from main loop to progress playback state
   void loop();
@@ -37,7 +39,7 @@ class Speaking
   bool playing_ = false;
   bool mic_was_enabled_ = false;
   bool streaming_ = false;
-  uint16_t next_seq_ = 0;
+  uint32_t next_seq_ = 0;
   uint32_t sample_rate_ = 24000;
   uint16_t channels_ = 1;
   std::function<void()> on_speak_finished_;
diff --git a/firmware/src/listening.cpp b/firmware/src/listening.cpp
index 2aa9d77..edb2e35 100644
--- a/firmware/src/listening.cpp
+++ b/firmware/src/listening.cpp
@@ -58,7 +58,7 @@ bool Listening::startStreaming()
   last_level_ = 0;
   silence_since_ms_ = 0;
   streaming_ = true;
-  return sendPacket(MessageType::START, nullptr, 0);
+  return sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START, nullptr, 0);
 }
 
 bool Listening::stopStreaming()
@@ -79,7 +79,7 @@ bool Listening::stopStreaming()
     {
       size_t chunk = std::min({chunk_samples_, to_send, tail_capacity});
       size_t sent = ringPop(tail_buf.data(), chunk);
-      if (!sendPacket(MessageType::DATA, tail_buf.data(), sent))
+      if (!sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA, tail_buf.data(), sent))
       {
         ok = false;
         break;
@@ -89,7 +89,7 @@ bool Listening::stopStreaming()
   }
 
   streaming_ = false;
-  ok = sendPacket(MessageType::END, nullptr, 0) && ok;
+  ok = sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END, nullptr, 0) && ok;
   return ok;
 }
 
@@ -119,7 +119,7 @@ void Listening::loop()
     }
 
     size_t got = ringPop(send_buf.data(), chunk_samples_);
-    if (!sendPacket(MessageType::DATA, send_buf.data(), got))
+    if (!sendPacket(stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA, send_buf.data(), got))
     {
       streaming_ = false;
       log_i("WS send failed (data)");
@@ -187,7 +187,7 @@ bool Listening::shouldStopForSilence() const
   return elapsed >= kSilenceDurationMs;
 }
 
-bool Listening::sendPacket(MessageType type, const int16_t *samples, size_t sampleCount)
+bool Listening::sendPacket(stackchan_websocket_v1_MessageType type, const int16_t *samples, size_t sampleCount)
 {
   if ((WiFi.status() != WL_CONNECTED) || !ws_.isConnected())
   {
@@ -196,16 +196,16 @@ bool Listening::sendPacket(MessageType type, const int16_t *samples, size_t samp
 
   auto &message = g_listening_tx_message;
   message = stackchan_websocket_v1_WebSocketMessage_init_zero;
-  message.kind = toProtoMessageKind(MessageKind::AudioPcm);
-  message.message_type = toProtoMessageType(type);
+  message.kind = stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_PCM;
+  message.message_type = type;
   message.seq = seq_counter_++;
 
   switch (type)
   {
-  case MessageType::START:
+  case stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START:
     message.which_body = stackchan_websocket_v1_WebSocketMessage_audio_pcm_start_tag;
     break;
-  case MessageType::DATA:
+  case stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA:
     message.which_body = stackchan_websocket_v1_WebSocketMessage_audio_pcm_data_tag;
     if (!setProtoAudioChunk(
             message.body.audio_pcm_data,
@@ -215,7 +215,7 @@ bool Listening::sendPacket(MessageType type, const int16_t *samples, size_t samp
       return false;
     }
     break;
-  case MessageType::END:
+  case stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END:
     message.which_body = stackchan_websocket_v1_WebSocketMessage_audio_pcm_end_tag;
     break;
   default:
diff --git a/firmware/src/main.cpp b/firmware/src/main.cpp
index ca932b3..8d41cca 100644
--- a/firmware/src/main.cpp
+++ b/firmware/src/main.cpp
@@ -96,8 +96,8 @@ void notifyWakeWordDetected()
 {
   auto &message = g_tx_message;
   message = stackchan_websocket_v1_WebSocketMessage_init_zero;
-  message.kind = toProtoMessageKind(MessageKind::WakeWordEvt);
-  message.message_type = toProtoMessageType(MessageType::DATA);
+  message.kind = stackchan_websocket_v1_MessageKind_MESSAGE_KIND_WAKE_WORD_EVT;
+  message.message_type = stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA;
   message.seq = g_uplink_seq++;
   message.which_body = stackchan_websocket_v1_WebSocketMessage_wake_word_evt_tag;
   message.body.wake_word_evt.detected = true;
@@ -111,8 +111,8 @@ void notifyCurrentState(StateMachine::State state)
 {
   auto &message = g_tx_message;
   message = stackchan_websocket_v1_WebSocketMessage_init_zero;
-  message.kind = toProtoMessageKind(MessageKind::StateEvt);
-  message.message_type = toProtoMessageType(MessageType::DATA);
+  message.kind = stackchan_websocket_v1_MessageKind_MESSAGE_KIND_STATE_EVT;
+  message.message_type = stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA;
   message.seq = g_uplink_seq++;
   message.which_body = stackchan_websocket_v1_WebSocketMessage_state_evt_tag;
   message.body.state_evt.state = static_cast<stackchan_websocket_v1_StackchanState>(static_cast<uint8_t>(state));
@@ -126,8 +126,8 @@ void notifySpeakDone()
 {
   auto &message = g_tx_message;
   message = stackchan_websocket_v1_WebSocketMessage_init_zero;
-  message.kind = toProtoMessageKind(MessageKind::SpeakDoneEvt);
-  message.message_type = toProtoMessageType(MessageType::DATA);
+  message.kind = stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SPEAK_DONE_EVT;
+  message.message_type = stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA;
   message.seq = g_uplink_seq++;
   message.which_body = stackchan_websocket_v1_WebSocketMessage_speak_done_evt_tag;
   message.body.speak_done_evt.done = true;
@@ -141,8 +141,8 @@ void notifyServoDone()
 {
   auto &message = g_tx_message;
   message = stackchan_websocket_v1_WebSocketMessage_init_zero;
-  message.kind = toProtoMessageKind(MessageKind::ServoDoneEvt);
-  message.message_type = toProtoMessageType(MessageType::DATA);
+  message.kind = stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_DONE_EVT;
+  message.message_type = stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA;
   message.seq = g_uplink_seq++;
   message.which_body = stackchan_websocket_v1_WebSocketMessage_servo_done_evt_tag;
   message.body.servo_done_evt.done = true;
@@ -154,19 +154,18 @@ void notifyServoDone()
 
 bool applyRemoteStateCommand(const stackchan_websocket_v1_StateCommand &command)
 {
-  RemoteState target = fromProtoState(command.state);
-  switch (target)
+  switch (command.state)
   {
-  case RemoteState::Idle:
+  case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE:
     stateMachine.setState(StateMachine::Idle);
     return true;
-  case RemoteState::Listening:
+  case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING:
     stateMachine.setState(StateMachine::Listening);
     return true;
-  case RemoteState::Thinking:
+  case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING:
     stateMachine.setState(StateMachine::Thinking);
     return true;
-  case RemoteState::Speaking:
+  case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING:
     stateMachine.setState(StateMachine::Speaking);
     return true;
   default:
@@ -190,7 +189,7 @@ bool applyServoCommand(const stackchan_websocket_v1_ServoCommandSequence &sequen
   for (pb_size_t i = 0; i < sequence.commands_count; ++i)
   {
     const auto &command = sequence.commands[i];
-    const ServoCommandOp op = fromProtoServoOperation(command.op);
+    const auto op = command.op;
 
     if (command.duration_ms < std::numeric_limits<int16_t>::min() ||
         command.duration_ms > std::numeric_limits<int16_t>::max())
@@ -200,7 +199,7 @@ bool applyServoCommand(const stackchan_websocket_v1_ServoCommandSequence &sequen
     }
 
     payload.push_back(static_cast<uint8_t>(op));
-    if (op == ServoCommandOp::Sleep)
+    if (op == stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP)
     {
       appendInt16Le(payload, static_cast<int16_t>(command.duration_ms));
       continue;
@@ -276,34 +275,24 @@ void handleWsEvent(WStype_t type, uint8_t *payload, size_t length)
     {
     case stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_WAV:
     {
-      WsHeader compat{};
-      compat.kind = static_cast<uint8_t>(MessageKind::AudioWav);
-      compat.messageType = static_cast<uint8_t>(rx.message_type);
-      compat.seq = rx.seq;
-
       if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START &&
           rx.which_body == stackchan_websocket_v1_WebSocketMessage_audio_wav_start_tag)
       {
-        uint8_t body[6]{};
-        uint32_t sample_rate = rx.body.audio_wav_start.sample_rate;
-        uint16_t channels = static_cast<uint16_t>(rx.body.audio_wav_start.channels);
-        memcpy(body, &sample_rate, sizeof(sample_rate));
-        memcpy(body + sizeof(sample_rate), &channels, sizeof(channels));
-        compat.payloadBytes = sizeof(body);
-        speaking.handleWavMessage(compat, body, sizeof(body));
+        speaking.handleWavStart(
+            rx.seq,
+            rx.body.audio_wav_start.sample_rate,
+            static_cast<uint16_t>(rx.body.audio_wav_start.channels));
       }
       else if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA &&
                rx.which_body == stackchan_websocket_v1_WebSocketMessage_audio_wav_data_tag)
       {
         size_t body_len = getProtoAudioChunkSize(rx.body.audio_wav_data);
-        compat.payloadBytes = body_len;
-        speaking.handleWavMessage(compat, getProtoAudioChunkBytes(rx.body.audio_wav_data), body_len);
+        speaking.handleWavData(rx.seq, getProtoAudioChunkBytes(rx.body.audio_wav_data), body_len);
       }
       else if (rx.message_type == stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END &&
                rx.which_body == stackchan_websocket_v1_WebSocketMessage_audio_wav_end_tag)
       {
-        compat.payloadBytes = 0;
-        speaking.handleWavMessage(compat, nullptr, 0);
+        speaking.handleWavEnd(rx.seq);
       }
       else
       {
diff --git a/firmware/src/protocols.cpp b/firmware/src/protocols.cpp
index 8b61312..16af592 100644
--- a/firmware/src/protocols.cpp
+++ b/firmware/src/protocols.cpp
@@ -5,110 +5,6 @@
 #include <pb_decode.h>
 #include <pb_encode.h>
 
-stackchan_websocket_v1_MessageKind toProtoMessageKind(MessageKind kind)
-{
-	switch (kind)
-	{
-	case MessageKind::AudioPcm:
-		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_PCM;
-	case MessageKind::AudioWav:
-		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_AUDIO_WAV;
-	case MessageKind::StateCmd:
-		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_STATE_CMD;
-	case MessageKind::WakeWordEvt:
-		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_WAKE_WORD_EVT;
-	case MessageKind::StateEvt:
-		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_STATE_EVT;
-	case MessageKind::SpeakDoneEvt:
-		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SPEAK_DONE_EVT;
-	case MessageKind::ServoCmd:
-		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_CMD;
-	case MessageKind::ServoDoneEvt:
-		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_SERVO_DONE_EVT;
-	default:
-		return stackchan_websocket_v1_MessageKind_MESSAGE_KIND_UNSPECIFIED;
-	}
-}
-
-stackchan_websocket_v1_MessageType toProtoMessageType(MessageType type)
-{
-	switch (type)
-	{
-	case MessageType::START:
-		return stackchan_websocket_v1_MessageType_MESSAGE_TYPE_START;
-	case MessageType::DATA:
-		return stackchan_websocket_v1_MessageType_MESSAGE_TYPE_DATA;
-	case MessageType::END:
-		return stackchan_websocket_v1_MessageType_MESSAGE_TYPE_END;
-	default:
-		return stackchan_websocket_v1_MessageType_MESSAGE_TYPE_UNSPECIFIED;
-	}
-}
-
-stackchan_websocket_v1_StackchanState toProtoState(RemoteState state)
-{
-	switch (state)
-	{
-	case RemoteState::Idle:
-		return stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE;
-	case RemoteState::Listening:
-		return stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING;
-	case RemoteState::Thinking:
-		return stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING;
-	case RemoteState::Speaking:
-		return stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING;
-	default:
-		return stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE;
-	}
-}
-
-stackchan_websocket_v1_ServoOperation toProtoServoOperation(ServoCommandOp op)
-{
-	switch (op)
-	{
-	case ServoCommandOp::Sleep:
-		return stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP;
-	case ServoCommandOp::MoveX:
-		return stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X;
-	case ServoCommandOp::MoveY:
-		return stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y;
-	default:
-		return stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP;
-	}
-}
-
-RemoteState fromProtoState(stackchan_websocket_v1_StackchanState state)
-{
-	switch (state)
-	{
-	case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_IDLE:
-		return RemoteState::Idle;
-	case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_LISTENING:
-		return RemoteState::Listening;
-	case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_THINKING:
-		return RemoteState::Thinking;
-	case stackchan_websocket_v1_StackchanState_STACKCHAN_STATE_SPEAKING:
-		return RemoteState::Speaking;
-	default:
-		return RemoteState::Idle;
-	}
-}
-
-ServoCommandOp fromProtoServoOperation(stackchan_websocket_v1_ServoOperation op)
-{
-	switch (op)
-	{
-	case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP:
-		return ServoCommandOp::Sleep;
-	case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X:
-		return ServoCommandOp::MoveX;
-	case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y:
-		return ServoCommandOp::MoveY;
-	default:
-		return ServoCommandOp::Sleep;
-	}
-}
-
 bool setProtoAudioChunk(
 	stackchan_websocket_v1_AudioChunk &chunk,
 	const uint8_t *data,
diff --git a/firmware/src/servo.cpp b/firmware/src/servo.cpp
index bd1f826..2777d02 100644
--- a/firmware/src/servo.cpp
+++ b/firmware/src/servo.cpp
@@ -134,13 +134,13 @@ void BodyServo::loop()
   bool finished = false;
   switch (step.op)
   {
-  case ServoCommandOp::Sleep:
+  case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP:
     finished = static_cast<int32_t>(now - sleep_deadline_ms_) >= 0;
     break;
-  case ServoCommandOp::MoveX:
+  case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X:
     finished = !axis_x_.moving;
     break;
-  case ServoCommandOp::MoveY:
+  case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y:
     finished = !axis_y_.moving;
     break;
   default:
@@ -187,13 +187,13 @@ bool BodyServo::enqueueSequence(const uint8_t *payload, size_t payload_len)
       return false;
     }
 
-    const ServoCommandOp op = static_cast<ServoCommandOp>(payload[offset++]);
+    const auto op = static_cast<stackchan_websocket_v1_ServoOperation>(payload[offset++]);
     Step step{};
     step.op = op;
 
     switch (op)
     {
-    case ServoCommandOp::Sleep:
+    case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP:
       if (offset + sizeof(int16_t) > payload_len)
       {
         log_w("ServoCmd sleep truncated at command=%u", static_cast<unsigned>(i));
@@ -202,8 +202,8 @@ bool BodyServo::enqueueSequence(const uint8_t *payload, size_t payload_len)
       step.duration_ms = readInt16Le(payload + offset);
       offset += sizeof(int16_t);
       break;
-    case ServoCommandOp::MoveX:
-    case ServoCommandOp::MoveY:
+    case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X:
+    case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y:
       if (offset + sizeof(int8_t) + sizeof(int16_t) > payload_len)
       {
         log_w("ServoCmd move truncated at command=%u", static_cast<unsigned>(i));
@@ -380,13 +380,13 @@ void BodyServo::startCurrentStep(uint32_t now)
   step_started_ = true;
   switch (step.op)
   {
-  case ServoCommandOp::Sleep:
+  case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_SLEEP:
     sleep_deadline_ms_ = now + clampDuration(step.duration_ms);
     break;
-  case ServoCommandOp::MoveX:
+  case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_X:
     startMove(axis_x_, step.angle, step.duration_ms);
     break;
-  case ServoCommandOp::MoveY:
+  case stackchan_websocket_v1_ServoOperation_SERVO_OPERATION_MOVE_Y:
     startMove(axis_y_, step.angle, step.duration_ms);
     break;
   default:
diff --git a/firmware/src/speaking.cpp b/firmware/src/speaking.cpp
index 24da288..c1bb8d9 100644
--- a/firmware/src/speaking.cpp
+++ b/firmware/src/speaking.cpp
@@ -1,5 +1,4 @@
 #include "speaking.hpp"
-#include <cstring>
 #include <utility>
 
 void Speaking::reset()
@@ -37,91 +36,77 @@ void Speaking::end()
   reset();
 }
 
-void Speaking::handleWavMessage(const WsHeader &hdr, const uint8_t *body, size_t bodyLen)
+void Speaking::handleWavStart(uint32_t seq, uint32_t sampleRate, uint16_t channels)
 {
-  auto msgType = static_cast<MessageType>(hdr.messageType);
+  current_buffer_ = (current_buffer_ + 1) % 3;
+  std::vector<uint8_t> &buf = buffer_[current_buffer_];
+  buf.clear();
+  playing_ = false;
+  streaming_ = true;
+  next_seq_ = seq + 1;
+  state_.setState(StateMachine::Speaking);
 
-  if (msgType == MessageType::START)
+  if (sampleRate > 0)
   {
-    current_buffer_ = (current_buffer_ + 1) % 3;
-    std::vector<uint8_t> &buf = buffer_[current_buffer_];
-    buf.clear();
-    playing_ = false;
-    streaming_ = true;
-    next_seq_ = hdr.seq + 1;
-    state_.setState(StateMachine::Speaking);
-
-    // START payload (optional): <uint32 sample_rate><uint16 channels>
-    if (body && bodyLen >= 6)
-    {
-      uint32_t sr = 0;
-      uint16_t ch = 1;
-      memcpy(&sr, body, sizeof(sr));
-      memcpy(&ch, body + sizeof(sr), sizeof(ch));
-      if (sr > 0)
-      {
-        sample_rate_ = sr;
-      }
-      if (ch > 0)
-      {
-        channels_ = ch;
-      }
-      log_i("TTS meta: sample_rate=%u channels=%u", (unsigned)sample_rate_, (unsigned)channels_);
-    }
-    else
-    {
-      log_w("TTS START without meta, fallback sr=%u ch=%u", (unsigned)sample_rate_, (unsigned)channels_);
-    }
-    log_i("TTS stream start seq=%u", (unsigned)hdr.seq);
-    return;
+    sample_rate_ = sampleRate;
+  }
+  if (channels > 0)
+  {
+    channels_ = channels;
   }
 
-  if (msgType == MessageType::DATA)
+  log_i("TTS meta: sample_rate=%u channels=%u", (unsigned)sample_rate_, (unsigned)channels_);
+  log_i("TTS stream start seq=%u", (unsigned)seq);
+}
+
+void Speaking::handleWavData(uint32_t seq, const uint8_t *body, size_t bodyLen)
+{
+  if (!streaming_)
   {
-    if (!streaming_)
-    {
-      return;
-    }
+    return;
+  }
 
-    std::vector<uint8_t> &buf = buffer_[current_buffer_];
+  std::vector<uint8_t> &buf = buffer_[current_buffer_];
 
-    if (hdr.seq != next_seq_)
-    {
-      log_w("TTS seq gap: got=%u expected=%u", (unsigned)hdr.seq, (unsigned)next_seq_);
-      // TCP 前提で再送しない。検知だけして次を受ける。
-      next_seq_ = hdr.seq + 1;
-    }
-    else
-    {
-      next_seq_++;
-    }
+  if (seq != next_seq_)
+  {
+    log_w("TTS seq gap: got=%u expected=%u", (unsigned)seq, (unsigned)next_seq_);
+    // TCP 前提で再送しない。検知だけして次を受ける。
+    next_seq_ = seq + 1;
+  }
+  else
+  {
+    next_seq_++;
+  }
 
-    buf.insert(buf.end(), body, body + bodyLen);
-    log_d("TTS chunk size=%u recv=%u", (unsigned)bodyLen, (unsigned)buf.size());
+  buf.insert(buf.end(), body, body + bodyLen);
+  log_d("TTS chunk size=%u recv=%u", (unsigned)bodyLen, (unsigned)buf.size());
+}
+
+void Speaking::handleWavEnd(uint32_t seq)
+{
+  if (!streaming_)
+  {
     return;
   }
 
-  if (msgType == MessageType::END)
+  if (seq != next_seq_)
   {
-    if (!streaming_)
-    {
-      return;
-    }
+    log_w("TTS end seq gap: got=%u expected=%u", (unsigned)seq, (unsigned)next_seq_);
+  }
 
-    std::vector<uint8_t> &buf = buffer_[current_buffer_];
-    streaming_ = false;
-    next_seq_ = 0;
+  std::vector<uint8_t> &buf = buffer_[current_buffer_];
+  streaming_ = false;
+  next_seq_ = 0;
 
-    if (!buf.empty())
-    {
-      playing_ = true;
+  if (!buf.empty())
+  {
+    playing_ = true;
 
-      const int16_t *samples = reinterpret_cast<const int16_t *>(buf.data());
-      size_t sample_len = buf.size() / sizeof(int16_t);
-      bool stereo = channels_ > 1;
-      M5.Speaker.playRaw(samples, sample_len, sample_rate_, stereo, 1, 0);
-    }
-    return;
+    const int16_t *samples = reinterpret_cast<const int16_t *>(buf.data());
+    size_t sample_len = buf.size() / sizeof(int16_t);
+    bool stereo = channels_ > 1;
+    M5.Speaker.playRaw(samples, sample_len, sample_rate_, stereo, 1, 0);
   }
 }
 

From 5db30a5123b2f2d32d987b42aa87660c510d3732 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sun, 19 Apr 2026 15:16:08 +0900
Subject: [PATCH 3/5] =?UTF-8?q?feat:=20SpeakHandler=E3=81=AE=E3=82=B3?=
 =?UTF-8?q?=E3=83=B3=E3=82=B9=E3=83=88=E3=83=A9=E3=82=AF=E3=82=BF=E3=81=8B?=
 =?UTF-8?q?=E3=82=89=E6=9C=AA=E4=BD=BF=E7=94=A8=E3=81=AE=E5=BC=95=E6=95=B0?=
 =?UTF-8?q?=E3=82=92=E5=89=8A=E9=99=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 stackchan_server/speak.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/stackchan_server/speak.py b/stackchan_server/speak.py
index 743ae36..6153576 100644
--- a/stackchan_server/speak.py
+++ b/stackchan_server/speak.py
@@ -26,11 +26,6 @@ def __init__(
         self,
         *,
         websocket: WebSocket,
-        ws_header_fmt: str | None = None,
-        wav_kind: int | None = None,
-        start_msg_type: int | None = None,
-        data_msg_type: int | None = None,
-        end_msg_type: int | None = None,
         down_wav_chunk: int,
         down_segment_millis: int,
         down_segment_stagger_millis: int,

From 0b363e9218a9e7184335454215c07ce264f99395 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sun, 19 Apr 2026 15:30:47 +0900
Subject: [PATCH 4/5] =?UTF-8?q?feat:=20=E3=83=97=E3=83=AD=E3=83=88?=
 =?UTF-8?q?=E3=82=B3=E3=83=AB=E9=96=A2=E9=80=A3=E3=81=AE=E5=9E=8B=E3=82=92?=
 =?UTF-8?q?stackchan=E3=81=8B=E3=82=89wsproto=E3=81=AB=E5=A4=89=E6=9B=B4?=
 =?UTF-8?q?=E3=81=97=E3=80=81protobuf=E3=83=A1=E3=83=83=E3=82=BB=E3=83=BC?=
 =?UTF-8?q?=E3=82=B8=E3=81=AE=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89?=
 =?UTF-8?q?/=E3=83=87=E3=82=B3=E3=83=BC=E3=83=89=E5=87=A6=E7=90=86?=
 =?UTF-8?q?=E3=82=92=E6=9B=B4=E6=96=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 AGENTS.md                      |  49 ++++++-----
 docs/websocket_protocols_ja.md | 149 +++++++++++++++++----------------
 2 files changed, 108 insertions(+), 90 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 6283f41..26b7ae4 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -5,6 +5,7 @@
 ## 全体像
 
 - CoreS3 側は `firmware/`、Python サーバー側は `stackchan_server/`。
+- WebSocket の on-wire 形式は手書きバイナリヘッダではなく `protobuf/websocket-message.proto` で定義した protobuf。
 - 音声 uplink は `AudioPcm`、音声 downlink は `AudioWav`（実体は raw PCM）。
 - サーバーは FastAPI を公開し、WebSocket と REST API の両方を持つ。
 - サーボ制御が追加済みで、WebSocket プロトコルには `ServoCmd` / `ServoDoneEvt` がある。
@@ -12,42 +13,49 @@
 ## 状態遷移の要点
 
 - ファームウェア状態: `Idle`, `Listening`, `Thinking`, `Speaking`, `Disconnected`
-- サーバーから指示できるのは `StateCmd` の `0..3` (`Idle`〜`Speaking`)
+- サーバーから指示できるのは `StateCmd` の `Idle` / `Listening` / `Thinking` / `Speaking`
 - `Disconnected` はファームウェア内部状態で、WebSocket 切断時に入る
 - `WakeWordEvt` を受けるか、REST API の wakeword 擬似発火で talk session が始まる
 
 ## WebSocket プロトコル要約
 
-- 共通ヘッダ: `WsHeader` (`<B B B H H>`, packed, little-endian)
+- 1 WebSocket binary frame = 1 protobuf `WebSocketMessage`
+- protobuf 定義: `protobuf/websocket-message.proto`
+- package: `stackchan.websocket.v1`
+- envelope fields
+  - `kind`
+  - `message_type`
+  - `seq`
+  - `oneof body`
 - `kind`
-  - `1=AudioPcm`
-  - `2=AudioWav`
-  - `3=StateCmd`
-  - `4=WakeWordEvt`
-  - `5=StateEvt`
-  - `6=SpeakDoneEvt`
-  - `7=ServoCmd`
-  - `8=ServoDoneEvt`
+  - `AudioPcm`
+  - `AudioWav`
+  - `StateCmd`
+  - `WakeWordEvt`
+  - `StateEvt`
+  - `SpeakDoneEvt`
+  - `ServoCmd`
+  - `ServoDoneEvt`
 - `messageType`
-  - `1=START`
-  - `2=DATA`
-  - `3=END`
+  - `START`
+  - `DATA`
+  - `END`
 
 ### 現行挙動
 
 - `AudioPcm`
   - PCM16LE / 16kHz / 1ch
-  - `START -> DATA* -> END`
+  - `AudioPcmStart -> AudioChunk* -> AudioPcmEnd`
   - `DATA` は 2000 samples（4000 bytes, 約 125ms）ごと
   - 無音 3 秒で自動終了
 - `AudioWav`
   - 名前に反して WAV コンテナではなく PCM ストリーム
-  - `START` payload は `<uint32 sample_rate><uint16 channels>`
+  - `AudioWavStart.sample_rate` / `AudioWavStart.channels` を送る
   - `DATA` chunk は既定 4096 bytes
   - 約 2 秒セグメントで送信し、2 本目は約 1 秒後に先行開始
 - `ServoCmd`
-  - payload: `<uint8 count><commands...>`
-  - op: `0=Sleep`, `1=MoveX`, `2=MoveY`
+  - `ServoCommandSequence.commands[]`
+  - op: `Sleep`, `MoveX`, `MoveY`
   - 新規コマンド受信時は実行中シーケンスを置き換える
 
 ## サーバー側 (`stackchan_server/`)
@@ -82,15 +90,16 @@
 
 - `src/main.cpp`
   - Wi-Fi 接続後、`/ws/stackchan` に接続
-  - `AudioWav`, `StateCmd`, `ServoCmd` を受信処理
+  - protobuf `WebSocketMessage` を decode して `AudioWav`, `StateCmd`, `ServoCmd` を受信処理
   - 通信が 60 秒止まると `Thinking` / `Speaking` から `Idle` に戻す
 - `src/listening.cpp`
   - マイク読み取り 256 サンプル単位
   - 2 秒リングバッファ
+  - protobuf の `AudioPcmStart/Data/End` を送信
   - 無音 3 秒で停止
 - `src/speaking.cpp`
-  - 3 本バッファで TTS セグメント受信
-  - `END` 後に `M5.Speaker.playRaw()` で再生
+  - 3 本バッファで protobuf `AudioWavStart/Data/End` を受信
+  - `AudioWavEnd` 後に `M5.Speaker.playRaw()` で再生
   - 再生完了時に `SpeakDoneEvt`
 - `src/servo.cpp`
   - `ServoCmd` を非同期実行
diff --git a/docs/websocket_protocols_ja.md b/docs/websocket_protocols_ja.md
index c417ab2..b816c13 100644
--- a/docs/websocket_protocols_ja.md
+++ b/docs/websocket_protocols_ja.md
@@ -2,47 +2,56 @@
 コーディングエージェント向け指示: このディレクトリにはプロトコルのみを記述し、CPP、Pythonの実装コードの例を記述する必要はありません。どんなプロトコルが実装されているか確認するために用います。
 -->
 
-# WebSocket バイナリプロトコル仕様
+# WebSocket protobuf プロトコル仕様
 
-このドキュメントは、CoreS3 ファームウェアと Python サーバーがやり取りする WebSocket バイナリプロトコルの現行実装をまとめたものです。
+このドキュメントは、CoreS3 ファームウェアと Python サーバーがやり取りする WebSocket プロトコルの現行実装をまとめたものです。
 
-## 共通ヘッダ
+現行実装では、1 回の WebSocket binary frame に 1 つの protobuf `WebSocketMessage` を格納します。
 
-共通ヘッダ `WsHeader` は `firmware/include/protocols.hpp` で定義されています。
+## protobuf 定義
 
-- packed
-- little-endian
-- 構造: `<B B B H H>`
+- proto file: `protobuf/websocket-message.proto`
+- package: `stackchan.websocket.v1`
+- top-level message: `WebSocketMessage`
+
+### `WebSocketMessage`
 
 | フィールド | 型 | 説明 |
 | --- | --- | --- |
-| `kind` | `uint8` | メッセージ種別 |
-| `messageType` | `uint8` | `1=START`, `2=DATA`, `3=END` |
-| `reserved` | `uint8` | 現在は常に `0` |
-| `seq` | `uint16` | 送信側でインクリメントするシーケンス番号 |
-| `payloadBytes` | `uint16` | ヘッダ直後に続く payload のバイト数 |
-
-### `kind` 一覧
-
-| kind | 名前 | 方向 | 用途 |
-| --- | --- | --- | --- |
-| `1` | `AudioPcm` | CoreS3 → Server | マイク音声 PCM ストリーム |
-| `2` | `AudioWav` | Server → CoreS3 | TTS 音声 PCM ストリーム |
-| `3` | `StateCmd` | Server → CoreS3 | 状態遷移指示 |
-| `4` | `WakeWordEvt` | CoreS3 → Server | ウェイクワード検出通知 |
-| `5` | `StateEvt` | CoreS3 → Server | 現在状態通知 |
-| `6` | `SpeakDoneEvt` | CoreS3 → Server | 音声再生完了通知 |
-| `7` | `ServoCmd` | Server → CoreS3 | サーボ動作シーケンス指示 |
-| `8` | `ServoDoneEvt` | CoreS3 → Server | サーボ動作完了通知 |
-
-## `AudioPcm` (`kind=1`)
+| `kind` | `MessageKind` | メッセージ種別 |
+| `message_type` | `MessageType` | `START` / `DATA` / `END` |
+| `seq` | `uint32` | 送信側でインクリメントするシーケンス番号 |
+| `body` | `oneof` | `kind` / `message_type` に対応する typed body |
+
+### `MessageKind` 一覧
+
+| 名前 | 方向 | 用途 |
+| --- | --- | --- |
+| `AudioPcm` | CoreS3 → Server | マイク音声 PCM ストリーム |
+| `AudioWav` | Server → CoreS3 | TTS 音声 PCM ストリーム |
+| `StateCmd` | Server → CoreS3 | 状態遷移指示 |
+| `WakeWordEvt` | CoreS3 → Server | ウェイクワード検出通知 |
+| `StateEvt` | CoreS3 → Server | 現在状態通知 |
+| `SpeakDoneEvt` | CoreS3 → Server | 音声再生完了通知 |
+| `ServoCmd` | Server → CoreS3 | サーボ動作シーケンス指示 |
+| `ServoDoneEvt` | CoreS3 → Server | サーボ動作完了通知 |
+
+### `MessageType` 一覧
+
+| 名前 | 用途 |
+| --- | --- |
+| `START` | ストリームまたはセグメント開始 |
+| `DATA` | データ本体 |
+| `END` | ストリームまたはセグメント終了 |
+
+## マイク入力 `AudioPcm`
 
 - 方向: CoreS3 → Server
 - フォーマット: PCM16LE / 16kHz / 1ch
-- シーケンス: `START` → `DATA` 複数回 → `END`
-- `START` payload: なし
-- `DATA` payload: PCM16LE 生データ
-- `END` payload: 現行ファームウェアではなし
+- シーケンス: `AudioPcmStart` → `AudioChunk` 複数回 → `AudioPcmEnd`
+- `START` body: `AudioPcmStart {}`
+- `DATA` body: `AudioChunk { bytes pcm_bytes; }`
+- `END` body: `AudioPcmEnd {}`
 
 ### 現行実装メモ
 
@@ -53,19 +62,20 @@
 - 無音判定は平均絶対振幅 `<= 200` が 3 秒継続したときに発火します。
 - 停止時は未送信サンプルを `DATA` で flush してから `END` を送ります。
 
-## `AudioWav` (`kind=2`)
+## スピーカ再生 `AudioWav`
 
 - 方向: Server → CoreS3
 - 名前は `AudioWav` ですが、実際に送っているのは WAV コンテナではなく PCM16LE ストリームです。
-- 1 セグメントの流れは `START` → `DATA` 複数回 → `END` です。
+- 1 セグメントの流れは `AudioWavStart` → `AudioChunk` 複数回 → `AudioWavEnd` です。
 
-### payload 形式
+### body 形式
 
-| messageType | payload |
+| messageType | body |
 | --- | --- |
-| `START` | `<uint32 sample_rate><uint16 channels>` |
-| `DATA` | PCM16LE 生データ |
-| `END` | なし |
+| `START` | `AudioWavStart { sample_rate, channels }` |
+| `DATA` | `AudioChunk { bytes pcm_bytes; }` |
+| `DATA` | `AudioChunk { pcm_bytes }` |
+| `END` | `AudioWavEnd {}` |
 
 ### 現行実装メモ
 
@@ -75,18 +85,18 @@
 - CoreS3 は 3 本の受信バッファを持ち、`END` 到達後に `M5.Speaker.playRaw()` で再生します。
 - `seq` の欠損は検知しますが、TCP 前提のため再送制御は行いません。
 
-## `StateCmd` (`kind=3`)
+## 状態指示 `StateCmd`
 
 - 方向: Server → CoreS3
 - `messageType`: `DATA` のみ
-- payload: 1 byte の target state id
+- body: `StateCommand { state }`
 
-| 値 | 状態 |
-| --- | --- |
-| `0` | `Idle` |
-| `1` | `Listening` |
-| `2` | `Thinking` |
-| `3` | `Speaking` |
+利用する状態名:
+
+- `Idle`
+- `Listening`
+- `Thinking`
+- `Speaking`
 
 ### 現行実装メモ
 
@@ -94,54 +104,53 @@
 - 音声 uplink の `END` を受けると、Server は `Thinking` を指示します。
 - `proxy.speak()` 完了後、Server は `Idle` を指示します。
 
-## `WakeWordEvt` (`kind=4`)
+## ウェイクワード検出 `WakeWordEvt`
 
 - 方向: CoreS3 → Server
 - `messageType`: `DATA` のみ
-- payload: 1 byte (`1=detected`)
+- body: `WakeWordEvent { detected }`
 - `Idle` 中のウェイクワード検出をサーバー側に通知します。
 - REST API の `POST /v1/stackchan/{ip}/wakeword` は、このイベントをサーバー内部で擬似発火させます。
 
-## `StateEvt` (`kind=5`)
+## 状態通知 `StateEvt`
 
 - 方向: CoreS3 → Server
 - `messageType`: `DATA` のみ
-- payload: 1 byte の current state id
+- body: `StateEvent { state }`
 
-| 値 | 状態 |
-| --- | --- |
-| `0` | `Idle` |
-| `1` | `Listening` |
-| `2` | `Thinking` |
-| `3` | `Speaking` |
+利用する状態名:
+
+- `Idle`
+- `Listening`
+- `Thinking`
+- `Speaking`
 
 - CoreS3 は状態遷移の entry hook で送信します。
 - WebSocket 切断中は `Disconnected` 状態になりますが、切断時は uplink 送信できないため `StateEvt` では通知されません。
 
-## `SpeakDoneEvt` (`kind=6`)
+## 発話完了通知 `SpeakDoneEvt`
 
 - 方向: CoreS3 → Server
 - `messageType`: `DATA` のみ
-- payload: 1 byte (`1=done`)
+- body: `SpeakDoneEvent { done }`
 - CoreS3 側の音声再生完了を通知します。
 - Server はこの通知を待って `proxy.speak()` を完了させます。
 
-## `ServoCmd` (`kind=7`)
+## サーボ動作指示 `ServoCmd`
 
 - 方向: Server → CoreS3
 - `messageType`: `DATA` のみ
-- payload はサーボ動作シーケンス全体です。
+- body: `ServoCommandSequence { commands }`
 
-### payload 構造
+### body 構造
 
-- 先頭 1 byte: `<uint8 command_count>`
-- 続いて `command_count` 個のコマンド
+- `commands` は最大 255 個まで（`protobuf/websocket-message.options` で nanopb の `max_count:255` を指定）
 
-| op | 名前 | payload |
-| --- | --- | --- |
-| `0` | `Sleep` | `<uint8 op><int16 duration_ms>` |
-| `1` | `MoveX` | `<uint8 op><int8 angle><int16 duration_ms>` |
-| `2` | `MoveY` | `<uint8 op><int8 angle><int16 duration_ms>` |
+| 名前 | `ServoCommand` のフィールド |
+| --- | --- |
+| `Sleep` | `op`, `duration_ms` |
+| `MoveX` | `op`, `angle`, `duration_ms` |
+| `MoveY` | `op`, `angle`, `duration_ms` |
 
 ### 現行実装メモ
 
@@ -150,10 +159,10 @@
 - `duration_ms <= 0` は即時反映になります。
 - 新しい `ServoCmd` を受けると、実行中シーケンスは置き換えられます。
 
-## `ServoDoneEvt` (`kind=8`)
+## サーボ動作完了通知 `ServoDoneEvt`
 
 - 方向: CoreS3 → Server
 - `messageType`: `DATA` のみ
-- payload: 1 byte (`1=done`)
+- body: `ServoDoneEvent { done }`
 - 直前に受信したサーボシーケンスの完了通知です。
 - Server は `proxy.wait_servo_complete()` でこの完了を待てます。

From 137bdbd8b4e0a9894e8035bbc705358e03c4b7a0 Mon Sep 17 00:00:00 2001
From: Atsushi Morimoto <74th.tech@gmail.com>
Date: Sun, 19 Apr 2026 15:33:39 +0900
Subject: [PATCH 5/5] feat: Add instructions for writing commit messages in
 English

---
 .github/copilot-instructions.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .github/copilot-instructions.md

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 0000000..9086770
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1 @@
+コミットメッセージは英語で書いてください。日本語のコミットメッセージは避けてください。