diff --git a/.env.template b/.env.template new file mode 100644 index 0000000..eadd276 --- /dev/null +++ b/.env.template @@ -0,0 +1,38 @@ +# -- Google Cloud Platform -- +GCLOUD_PROJECT="your-gcloud-project-id" +GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account-key.json" + +# -- Speech Recognition -- +# Google Cloud STT +STACKCHAN_USE_GOOGLE_CLOUD_STT=1 +STACKCHAN_GOOGLE_CLOUD_STT_LANGUAGE_CODE="ja-JP" + +# Whisper CLI +# STACKCHAN_USE_WHISPER_CLI=1 +# STACKCHAN_WHISPER_CLI_MODEL_PATH="/path/to/whisper.cpp/ggml-small.bin" +# STACKCHAN_WHISPER_CLI_VAD_MODEL_PATH="/path/to/whisper.cpp/ggml-silero-v5.1.2.bin" + +# Whisper Server +# STACKCHAN_USE_WHISPER_SERVER=1 +# STACKCHAN_WHISPER_SERVER_PORT=8431 + +# -- Speech Synthesis -- +# Google Cloud TTS +STACKCHAN_USE_GOOGLE_CLOUD_TTS=1 +STACKCHAN_GOOGLE_CLOUD_TTS_MODEL="gemini-2.5-flash-tts" +STACKCHAN_GOOGLE_CLOUD_TTS_LANGUAGE_CODE="ja-JP" +STACKCHAN_GOOGLE_CLOUD_TTS_VOICE_NAME="Despina" + +# VOICEVOX +# STACKCHAN_USE_VOICEVOX=1 +STACKCHAN_VOICEVOX_URL="http://localhost:50021" +STACKCHAN_VOICEVOX_SPEAKER=1 + +# -- Claude Agent SDK -- +# using Google Cloud Vertex AI +CLAUDE_CODE_USE_VERTEX=1 +CLOUD_ML_REGION="global" +ANTHROPIC_VERTEX_PROJECT_ID="your-gcloud-project-id" + +# using Anthropic API Key +# ANTHROPIC_API_KEY="your-anthropic-api-key" diff --git a/docs/run_sample_app_ja.md b/docs/run_sample_app_ja.md new file mode 100644 index 0000000..0f6c20c --- /dev/null +++ b/docs/run_sample_app_ja.md @@ -0,0 +1,94 @@ +# サンプルアプリケーションの実行 + +複数のサンプルアプリケーションが実装されています。 + +- [example_apps/echo_with_move.py](../example_apps/echo_with_move.py): 音声認識した内容をそのまま音声合成して返す。ボディも動かして聞くポーズをします。 +- [example_apps/gemini.py](../example_apps/gemini.py): Gemini応答 +- [example_apps/claude_agent.py](../example_apps/claude_agent.py): Claude Agent SDKを利用したエージェント + +## おうむ返しサンプル + +uv で必要なライブラリをインストールします。 + +``` +uv sync +``` + +その後、以下のコマンドでPythonサーバを起動します。 + +``` +uv run uvicorn app.echo_with_move:app.fastapi --host 0.0.0.0 --port 8000 +``` + 
+スタックチャンを起動して、「Disconnected」から「Idle」のステータス表示になれば接続されています。 + +試しに「ハイ!スタックチャン!」と話しかけて、聞くポーズになることを確認して、話しかけてみてください。 + +## Gemini応答サンプル + +uv で必要なライブラリをインストールします。 +追加でgeminiのクライアントが必要です。 + +``` +uv sync --group example-gemini +``` + +その後、以下のコマンドでPythonサーバを起動します。 + +``` +uv run uvicorn app.gemini:app.fastapi --host 0.0.0.0 --port 8000 +``` + +スタックチャンを起動して、「Disconnected」から「Idle」のステータス表示になれば接続されています。 + +試しに「ハイ!スタックチャン!」と話しかけて、聞くポーズになることを確認して、話しかけてみてください。 + +## Claude Agent SDKサンプル + +Claude Agent SDKのエージェントはファイルシステムの変更権限を持ちます。 +意図しないファイル編集するような指示を与えないように注意してください。 + +TODO: サンプルアプリはファイル編集、読取権限を剥奪する + +### NodeJSのインストール + +NodeJSのインストールも必要です。 +以下からインストールを進めてください。 + +> https://nodejs.org/ja/download + +### Pythonライブラリのインストール + +uv で必要なライブラリをインストールします。 +追加でclaude agent sdkのクライアントが必要です。 + +``` +uv sync --group example-claude-agent-sdk +``` + +### Claude Agent SDKの環境変数設定 + +Claude Agent SDKを利用するには、VertexAIを利用する場合、以下の.envもしくは環境変数の設定が必要です。 + +#### VertexAIを利用する場合 + +- `CLAUDE_CODE_USE_VERTEX`: `1` +- `CLOUD_ML_REGION`: リージョン設定 "global" +- `ANTHROPIC_VERTEX_PROJECT_ID`: Google CloudのプロジェクトID(`GCLOUD_PROJECT`と同じ値) +- `GOOGLE_APPLICATION_CREDENTIALS`: Google Cloudのサービスアカウントの秘密鍵のJSONファイルのパス + +#### Claude APIを利用する場合 + +- `ANTHROPIC_API_KEY`: Claude APIのAPIキー + +### サーバの起動 + +その後、以下のコマンドでPythonサーバを起動します。 + +``` +uv run uvicorn app.claude_agent_sdk:app.fastapi --host 0.0.0.0 --port 8000 +``` + +スタックチャンを起動して、「Disconnected」から「Idle」のステータス表示になれば接続されています。 + +試しに「ハイ!スタックチャン!」と話しかけて、聞くポーズになることを確認して、話しかけてみてください。 diff --git a/docs/server_ja.md b/docs/server_ja.md new file mode 100644 index 0000000..fdd5b58 --- /dev/null +++ b/docs/server_ja.md @@ -0,0 +1,137 @@ +# サーバの設定 + +## .envファイルの作成 + +サーバの基本設定は`.env`ファイル、もしくは環境変数に全て記述します。 +リポジトリのルートディレクトリに、`.env`という名前のファイルを作成してください。 + +次から、`.env`ファイルに記述する環境変数を説明します。 + +## Google Cloudの設定 + +[./google_cloud_ja.md](./google_cloud_ja.md) にて、Google Cloudのプロジェクトを作成し、サービスアカウントの秘密鍵のJSONファイルダウンロードしました。 + +この値を記述します。 + +- `GCLOUD_PROJECT`: プロジェクト名 +- 
`GOOGLE_APPLICATION_CREDENTIALS`: ダウンロードしたサービスアカウントの秘密鍵のJSONファイルのパス + +プロジェクト名は、秘密鍵のJSONファイル中に記載されている、`project_id`の値と同じものを指定してください。 + +``` +GCLOUD_PROJECT="your-gcloud-project-id" +GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account-key.json" +``` + +この設定は、Google Cloud Speech-to-Text(STT)、Google Cloud Text-to-Speech(TTS)の両方で使用されます。 + +## 音声認識の設定 + +音声認識エンジンとして、以下に対応しています。 + +- Google Cloud Speech-to-Text +- Whisper.cppのwhisper-server +- Whisper.cppのwhisper-cli + +### Google Cloud Speech-to-Textの設定 + +以下の値を設定します。 + +- `STACKCHAN_USE_GOOGLE_CLOUD_STT`: `1` +- `STACKCHAN_GOOGLE_CLOUD_STT_LANGUAGE_CODE`: BCP-47 言語コード + +BCP-47 言語コードは以下のページを参照してください。 + +https://docs.cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages + +``` +STACKCHAN_USE_GOOGLE_CLOUD_STT=1 +STACKCHAN_GOOGLE_CLOUD_STT_LANGUAGE_CODE="ja-JP" +``` + +### Whisper.cppのwhisper-cliの設定 + +(WIP) + +``` +STACKCHAN_USE_WHISPER_CLI=1 +STACKCHAN_WHISPER_CLI_MODEL_PATH="/path/to/whisper.cpp/ggml-small.bin" +STACKCHAN_WHISPER_CLI_VAD_MODEL_PATH="/path/to/whisper.cpp/ggml-silero-v5.1.2.bin" +``` + +### Whisper.cppのwhisper-serverの設定 + +(WIP) + +``` +STACKCHAN_USE_WHISPER_SERVER=1 +STACKCHAN_WHISPER_SERVER_PORT=8431 +``` + +## 音声合成の設定 + +音声合成エンジンとして、以下に対応しています。 + +- Google Cloud Text-to-Speech +- VOICEVOX + +### Google Cloud Text-to-Speechの設定 + +以下の値を設定します。 + +- `STACKCHAN_USE_GOOGLE_CLOUD_TTS`: `1` +- `STACKCHAN_GOOGLE_CLOUD_TTS_MODEL`: モデル +- `STACKCHAN_GOOGLE_CLOUD_TTS_LANGUAGE_CODE`: BCP-47 言語コード +- `STACKCHAN_GOOGLE_CLOUD_TTS_VOICE_NAME`: 音声の名前 + +モデルは以下から選択できます。 + +https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#available_models + +対応言語は以下から確認してください。 + +https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#available_languages + +ボイス名は以下から確認してください。(サンプル音声は英語ですが、日本語に対応しているようです) + +https://docs.cloud.google.com/text-to-speech/docs/gemini-tts#voice_options + +``` +STACKCHAN_USE_GOOGLE_CLOUD_TTS=1 +STACKCHAN_GOOGLE_CLOUD_TTS_MODEL="gemini-2.5-flash-tts" 
+STACKCHAN_GOOGLE_CLOUD_TTS_LANGUAGE_CODE="ja-JP" +STACKCHAN_GOOGLE_CLOUD_TTS_VOICE_NAME="Despina" +``` + +### VOICEVOXの設定 + +VOICEVOXを利用する際には、VOICEVOXの利用規約の参照をお願いします。 + +https://voicevox.hiroshiba.jp/ + +[./setup_ja.md](./setup_ja.md) の手順で立ち上げた場合、http://localhost:50021/ でVOICEVOXのAPIが利用できるようになっています。 + +以下の値を設定します。 + +- `STACKCHAN_USE_VOICEVOX`: `1` +- `STACKCHAN_VOICEVOX_URL`: VOICEVOXのAPIのURL +- `STACKCHAN_VOICEVOX_SPEAKER`: 使用するキャラクターのスピーカーID + +キャラクターの一覧は下記のページにあります。 +キャラクターによって利用規約が異なります。 + +https://voicevox.hiroshiba.jp/#characters + +キャラクターからスピーカーIDの確認が必要です。 + +VOICEVOXが立ち上がっている場合、/speakers にアクセスすると、スピーカーIDとキャラクターの対応表がJSON形式で表示されます。 +キャラクター毎の構造があり、その中にキャラクターのstyleがあります。 +styleの id がスピーカーIDになります。 + +> http://localhost:50021/speakers + +``` +STACKCHAN_USE_VOICEVOX=1 +STACKCHAN_VOICEVOX_URL="http://localhost:50021" +STACKCHAN_VOICEVOX_SPEAKER=1 +``` diff --git a/docs/setup_ja.md b/docs/setup_ja.md index c22d117..9b81c60 100644 --- a/docs/setup_ja.md +++ b/docs/setup_ja.md @@ -51,6 +51,9 @@ M5Stack Basic、M5Stack Core2は対応していません。 - Tower Pro SG90 - [秋月電子通商](https://akizukidenshi.com/catalog/g/g108761/) +- FEETECH SCS0009 + - [秋月電子通商](https://akizukidenshi.com/catalog/g/g131664/) + - [スイッチサイエンス](https://www.switch-science.com/products/8042) ### 対応確認済み外装ケースと接続ボード @@ -79,23 +82,48 @@ M5Stack Basic、M5Stack Core2は対応していません。 [./firmware_ja.md](./firmware_ja.md) -## VOICEVOXのDockerコンテナの実行 +## (オプション)VOICEVOXのDockerコンテナの実行 + +標準ではGoogle Cloud Text-to-Speechを利用して音声合成を行います。 +無料で利用できる中品質の音声合成エンジンのVOICEVOXも利用できます。 +キャラクターボイスが多くてかわいいので、VOICEVOXもぜひ試してみてください。 + +VOICEVOXを利用する際には、VOICEVOXの利用規約の参照をお願いします。 + +https://voicevox.hiroshiba.jp/ -標準では、音声合成にVOICEVOXを利用します。 VOICEVOXはDockerイメージが提供されているため、Docker環境を構築して実行します。 -Dockerがインストールされていない場合は以下のページヲ参照して、Dockerをインストールしてください。 +Dockerがインストールされていない場合は以下のページを参照して、Dockerをインストールしてください。 > 今すぐ始める | Docker > > https://www.docker.com/ja-jp/get-started/ -VOICEVOXのDockerコンテナの実行方法は、以下のページを参照してください。 +Dockerがインストールできたら、リポジトリのディレクトリで以下のコマンドを実行してください。 
``` docker compose run --rm --service-ports voicevox ``` +以下のサイトにアクセスし、「VOICEVOX Engine」と表示されていれば成功です。 + +> http://localhost:50021/ + +## (オプション)Whisper.cppのwhisper-cliのインストール + +標準ではGoogle Cloud Speech-to-Textを利用して音声認識を行います。 +無料で利用できるWhisper.cppのwhisper-cliも利用できます。 + +TODO + +## (オプション)Whisper.cppのwhisper-serverのインストールと実行 + +標準ではGoogle Cloud Speech-to-Textを利用して音声認識を行います。 +無料で利用できるWhisper.cppのwhisper-serverも利用できます。 + +TODO + ## Python開発環境の構築 このリポジトリでは、WebソケットサーバをPythonで実装しています。 @@ -105,23 +133,32 @@ Pythonの環境構築の方法は、パッケージマネージャuvのページ > > https://docs.astral.sh/uv/getting-started/installation/ +## サーバの設定 + +以下のページを参照して、サーバの設定を行ってください。 + +[./server_ja.md](./server_ja.md) + ## サンプルアプリケーションの実行 -uv でPythonサーバを起動します。 +まずは、サンプルアプリケーションを実行してみましょう。 -```bash -uv sync -uv run uvicorn app.gemini:app.fastapi --host 0.0.0.0 --port 8000 -``` +以下のページを参照して、サンプルアプリケーションの実行方法を確認してください。 -## アプリケーションの設定 +[./run_sample_app_ja.md](./run_sample_app_ja.md) -TODO +## アプリケーションを作る + +(WIP) + +[example_apps/gemini.py](../example_apps/gemini.py) をベースに改変を行い、アプリケーションを作ってみましょう。 ## Claude Agent SDKによるエージェントの構築と実行 -TODO +(WIP) + +[example_apps/claude_agent_sdk/claude_agent_sdk.py](../example_apps/claude_agent_sdk/claude_agent_sdk.py) をベースに改変を行い、Claude Agent SDKを利用したエージェントを作ってみましょう。 -## Docker環境で実行する +## Claude Agent SDKをDocker環境で実行する TODO diff --git a/example_apps/claude_agent_sdk/claude_agent_sdk.py b/example_apps/claude_agent_sdk/claude_agent_sdk.py index 2b0975d..f04e0d6 100644 --- a/example_apps/claude_agent_sdk/claude_agent_sdk.py +++ b/example_apps/claude_agent_sdk/claude_agent_sdk.py @@ -16,8 +16,6 @@ from pydantic import BaseModel from stackchan_server.app import StackChanApp -from stackchan_server.speech_recognition.whisper_cpp import WhisperCppSpeechToText -from stackchan_server.speech_synthesis.voicevox import VoiceVoxSpeechSynthesizer from stackchan_server.ws_proxy import ( EmptyTranscriptError, ServoMoveType, @@ -34,19 +32,7 @@ WORKSPACE_DIR = pathlib.Path(__file__).parent / "workspace" 
-def _create_app() -> StackChanApp: - whisper_model = os.getenv("STACKCHAN_WHISPER_MODEL") - if whisper_model: - return StackChanApp( - speech_recognizer=WhisperCppSpeechToText( - model_path=whisper_model, - ), - speech_synthesizer=VoiceVoxSpeechSynthesizer(), - ) - return StackChanApp() - - -app = _create_app() +app = StackChanApp() model = "claude-haiku-4-5-20251001" if os.environ.get("CLAUDE_CODE_USE_VERTEX") == "1": diff --git a/example_apps/echo.py b/example_apps/echo.py index 098c456..84726bc 100644 --- a/example_apps/echo.py +++ b/example_apps/echo.py @@ -7,10 +7,6 @@ from dotenv import load_dotenv from stackchan_server.app import StackChanApp -from stackchan_server.speech_recognition import ( - WhisperCppSpeechToText, -) -from stackchan_server.speech_synthesis import VoiceVoxSpeechSynthesizer from stackchan_server.ws_proxy import EmptyTranscriptError, WsProxy logger = getLogger(__name__) @@ -23,24 +19,7 @@ load_dotenv() -def _create_app() -> StackChanApp: - whisper_model = os.getenv("STACKCHAN_WHISPER_MODEL") - # if os.getenv("STACKCHAN_WHISPER_SERVER_URL") or os.getenv("STACKCHAN_WHISPER_SERVER_PORT"): - # return StackChanApp( - # speech_recognizer=WhisperServerSpeechToText(server_url=whisper_server_url), - # speech_synthesizer=VoiceVoxSpeechSynthesizer(), - # ) - if whisper_model: - return StackChanApp( - speech_recognizer=WhisperCppSpeechToText( - model_path=whisper_model, - ), - speech_synthesizer=VoiceVoxSpeechSynthesizer(), - ) - return StackChanApp() - - -app = _create_app() +app = StackChanApp() @app.setup diff --git a/example_apps/echo_with_move.py b/example_apps/echo_with_move.py index 196a165..905a6ce 100644 --- a/example_apps/echo_with_move.py +++ b/example_apps/echo_with_move.py @@ -7,10 +7,6 @@ from dotenv import load_dotenv from stackchan_server.app import StackChanApp -from stackchan_server.speech_recognition import ( - WhisperCppSpeechToText, -) -from stackchan_server.speech_synthesis import VoiceVoxSpeechSynthesizer from 
stackchan_server.ws_proxy import ( EmptyTranscriptError, ServoMoveType, @@ -27,24 +23,7 @@ datefmt="%H:%M:%S", ) -def _create_app() -> StackChanApp: - whisper_model = os.getenv("STACKCHAN_WHISPER_MODEL") - # if os.getenv("STACKCHAN_WHISPER_SERVER_URL") or os.getenv("STACKCHAN_WHISPER_SERVER_PORT"): - # return StackChanApp( - # speech_recognizer=WhisperServerSpeechToText(server_url=whisper_server_url), - # speech_synthesizer=VoiceVoxSpeechSynthesizer(), - # ) - if whisper_model: - return StackChanApp( - speech_recognizer=WhisperCppSpeechToText( - model_path=whisper_model, - ), - speech_synthesizer=VoiceVoxSpeechSynthesizer(), - ) - return StackChanApp() - - -app = _create_app() +app = StackChanApp() @app.setup @@ -61,15 +40,17 @@ async def talk_session(proxy: WsProxy): text = await proxy.listen() - await proxy.move_servo([ - (ServoMoveType.MOVE_Y, 100, 100), - (ServoWaitType.SLEEP, 200), - (ServoMoveType.MOVE_Y, 90, 100), - (ServoWaitType.SLEEP, 200), - (ServoMoveType.MOVE_Y, 100, 100), - (ServoWaitType.SLEEP, 200), - (ServoMoveType.MOVE_Y, 90, 100), - ]) + await proxy.move_servo( + [ + (ServoMoveType.MOVE_Y, 100, 100), + (ServoWaitType.SLEEP, 200), + (ServoMoveType.MOVE_Y, 90, 100), + (ServoWaitType.SLEEP, 200), + (ServoMoveType.MOVE_Y, 100, 100), + (ServoWaitType.SLEEP, 200), + (ServoMoveType.MOVE_Y, 90, 100), + ] + ) except EmptyTranscriptError: await proxy.move_servo([(ServoMoveType.MOVE_Y, 90, 100)]) @@ -78,7 +59,6 @@ async def talk_session(proxy: WsProxy): await proxy.speak(text) - if __name__ == "__main__": import uvicorn diff --git a/misc/whisper-server/run-whisper-server.sh b/misc/whisper-server/run-whisper-server.sh index 5935b12..138a9a5 100755 --- a/misc/whisper-server/run-whisper-server.sh +++ b/misc/whisper-server/run-whisper-server.sh @@ -4,11 +4,11 @@ set -xe whisper-server \ --host 0.0.0.0 \ --port ${STACKCHAN_WHISPER_SERVER_PORT} \ - -m ${STACKCHAN_WHISPER_MODEL} \ + -m ${STACKCHAN_WHISPER_SERVER_MODEL_PATH} \ -l ja \ -nt \ --vad \ - -vm 
${STACKCHAN_WHISPER_VAD_MODEL} \ + -vm ${STACKCHAN_WHISPER_SERVER_VAD_MODEL_PATH} \ -vt 0.6 \ -vspd 250 \ -vsd 400 \ diff --git a/pyproject.toml b/pyproject.toml index 95caadb..7f1a290 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "uvicorn[standard]>=0.40.0", "voicevox-client>=1.1.0", "python-dotenv>=1.2.1", + "pydantic-settings>=2.13.1", ] [dependency-groups] diff --git a/stackchan_server/speech_recognition/__init__.py b/stackchan_server/speech_recognition/__init__.py index aedc100..9c7a0bc 100644 --- a/stackchan_server/speech_recognition/__init__.py +++ b/stackchan_server/speech_recognition/__init__.py @@ -1,18 +1,14 @@ from __future__ import annotations -from ..types import SpeechRecognizer +from .create import create_speech_recognizer from .google_cloud import GoogleCloudSpeechToText -from .whisper_cpp import WhisperCppSpeechToText +from .whisper_cli import WhisperCLISpeechToText from .whisper_server import WhisperServerSpeechToText - -def create_speech_recognizer() -> SpeechRecognizer: - return GoogleCloudSpeechToText() - - __all__ = [ + "create_speech_recognizer", "GoogleCloudSpeechToText", - "WhisperCppSpeechToText", + "WhisperCLISpeechToText", "WhisperServerSpeechToText", "create_speech_recognizer", ] diff --git a/stackchan_server/speech_recognition/create.py b/stackchan_server/speech_recognition/create.py new file mode 100644 index 0000000..8a2f432 --- /dev/null +++ b/stackchan_server/speech_recognition/create.py @@ -0,0 +1,29 @@ +from pydantic_settings import BaseSettings + +from stackchan_server.types import SpeechRecognizer + + +class _CreateSpeechRecognizerEnv(BaseSettings): + use_whisper_cli: bool = False + use_whisper_server: bool = False + use_google_cloud_stt: bool = True + + class Config: + env_prefix = "STACKCHAN_" + + +def create_speech_recognizer() -> SpeechRecognizer: + es = _CreateSpeechRecognizerEnv() + if es.use_whisper_cli: + from .whisper_cli import WhisperCLISpeechToText + return 
WhisperCLISpeechToText() + + if es.use_whisper_server: + from .whisper_server import WhisperServerSpeechToText + return WhisperServerSpeechToText() + + if es.use_google_cloud_stt: + from .google_cloud import GoogleCloudSpeechToText + return GoogleCloudSpeechToText() + + raise ValueError("No speech recognizer configured") diff --git a/stackchan_server/speech_recognition/google_cloud.py b/stackchan_server/speech_recognition/google_cloud.py index 7a3ec73..9bee0d7 100644 --- a/stackchan_server/speech_recognition/google_cloud.py +++ b/stackchan_server/speech_recognition/google_cloud.py @@ -4,29 +4,40 @@ from logging import getLogger from google.cloud import speech +from pydantic_settings import BaseSettings -from ..static import LISTEN_AUDIO_FORMAT, LISTEN_LANGUAGE_CODE +from ..static import LISTEN_AUDIO_FORMAT from ..types import StreamingSpeechRecognizer, StreamingSpeechSession logger = getLogger(__name__) _STREAM_END = object() +class GoogleCloudSpeechToTextConfig(BaseSettings): + language_code: str = "ja-JP" + + class Config: + env_prefix = "STACKCHAN_GOOGLE_CLOUD_STT_" + + class _GoogleCloudStreamingSession(StreamingSpeechSession): def __init__( self, + config: GoogleCloudSpeechToTextConfig, client: speech.SpeechAsyncClient, ) -> None: + self._conf = config self._client = client self._config = speech.StreamingRecognitionConfig( config=speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=LISTEN_AUDIO_FORMAT.sample_rate_hz, - language_code=LISTEN_LANGUAGE_CODE, + language_code=config.language_code, ), interim_results=False, single_utterance=False, ) + self._audio_queue: asyncio.Queue[bytes | object] = asyncio.Queue() self._done = asyncio.Event() self._closed = False @@ -73,7 +84,9 @@ async def _request_iter(self): async def _run(self) -> None: try: - responses = await self._client.streaming_recognize(requests=self._request_iter()) + responses = await self._client.streaming_recognize( + requests=self._request_iter() + ) 
async for response in responses: for result in response.results: if not result.alternatives: @@ -95,7 +108,12 @@ async def _run(self) -> None: class GoogleCloudSpeechToText(StreamingSpeechRecognizer): - def __init__(self, client: speech.SpeechAsyncClient | None = None) -> None: + def __init__( + self, + config: GoogleCloudSpeechToTextConfig | None = None, + client: speech.SpeechAsyncClient | None = None, + ) -> None: + self._conf = config or GoogleCloudSpeechToTextConfig() self._client = client or speech.SpeechAsyncClient() async def transcribe(self, pcm_bytes: bytes) -> str: @@ -103,14 +121,14 @@ async def transcribe(self, pcm_bytes: bytes) -> str: config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=LISTEN_AUDIO_FORMAT.sample_rate_hz, - language_code=LISTEN_LANGUAGE_CODE, + language_code=self._conf.language_code, ) response = await self._client.recognize(config=config, audio=audio) return "".join(result.alternatives[0].transcript for result in response.results) async def start_stream(self) -> StreamingSpeechSession: - return _GoogleCloudStreamingSession(self._client) + return _GoogleCloudStreamingSession(self._conf, self._client) -__all__ = ["GoogleCloudSpeechToText"] +__all__ = ["GoogleCloudSpeechToText", "GoogleCloudSpeechToTextConfig"] diff --git a/stackchan_server/speech_recognition/whisper_cpp.py b/stackchan_server/speech_recognition/whisper_cli.py similarity index 58% rename from stackchan_server/speech_recognition/whisper_cpp.py rename to stackchan_server/speech_recognition/whisper_cli.py index 268015c..e352085 100644 --- a/stackchan_server/speech_recognition/whisper_cpp.py +++ b/stackchan_server/speech_recognition/whisper_cli.py @@ -4,7 +4,6 @@ import io import json import math -import os import shlex import shutil import tempfile @@ -12,7 +11,9 @@ from logging import getLogger from pathlib import Path -from ..static import LISTEN_AUDIO_FORMAT, LISTEN_LANGUAGE_CODE +from pydantic_settings import 
BaseSettings + +from ..static import LISTEN_AUDIO_FORMAT from ..types import SpeechRecognizer logger = getLogger(__name__) @@ -23,57 +24,59 @@ _DEFAULT_VAD_SPEECH_PAD_MS = 30 -class WhisperCppSpeechToText(SpeechRecognizer): +class WhisperCLISpeechToTextConfig(BaseSettings): + language: str = "auto" + model_path: str | None = None + cli_path: str = "whisper-cli" + threads: int | None = None + translate: bool = False + no_speech_threshold: float = 0.8 + suppress_non_speech_tokens: bool = True + vad_model_path: Path | None = None + use_vad: bool = True + vad_threshold: float = _DEFAULT_VAD_THRESHOLD + vad_min_speech_duration_ms: int = _DEFAULT_VAD_MIN_SPEECH_DURATION_MS + vad_min_silence_duration_ms: int = _DEFAULT_VAD_MIN_SILENCE_DURATION_MS + vad_speech_pad_ms: int = _DEFAULT_VAD_SPEECH_PAD_MS + silence_rms_threshold: float = _DEFAULT_SILENCE_RMS_THRESHOLD + + class Config: + env_prefix = "STACKCHAN_WHISPER_CLI_" + + +class WhisperCLISpeechToText(SpeechRecognizer): def __init__( self, *, - model_path: str | Path | None = None, - cli_path: str = "whisper-cli", - threads: int | None = None, - translate: bool = False, - no_speech_threshold: float = 0.8, - suppress_non_speech_tokens: bool = True, - vad_model_path: str | Path | None = None, - use_vad: bool = True, - vad_threshold: float = _DEFAULT_VAD_THRESHOLD, - vad_min_speech_duration_ms: int = _DEFAULT_VAD_MIN_SPEECH_DURATION_MS, - vad_min_silence_duration_ms: int = _DEFAULT_VAD_MIN_SILENCE_DURATION_MS, - vad_speech_pad_ms: int = _DEFAULT_VAD_SPEECH_PAD_MS, - silence_rms_threshold: float = _DEFAULT_SILENCE_RMS_THRESHOLD, + config: WhisperCLISpeechToTextConfig | None = None, ) -> None: - resolved_model_path = model_path or os.getenv("STACKCHAN_WHISPER_MODEL") - if not resolved_model_path: - raise ValueError("whisper.cpp model_path is required or set STACKCHAN_WHISPER_MODEL") - self._model_path = Path(resolved_model_path) - self._cli_path = cli_path - self._threads = threads - self._translate = translate - 
self._no_speech_threshold = no_speech_threshold - self._suppress_non_speech_tokens = suppress_non_speech_tokens - self._vad_model_path = _resolve_vad_model_path(vad_model_path) - self._use_vad = use_vad - self._vad_threshold = vad_threshold - self._vad_min_speech_duration_ms = vad_min_speech_duration_ms - self._vad_min_silence_duration_ms = vad_min_silence_duration_ms - self._vad_speech_pad_ms = vad_speech_pad_ms - self._silence_rms_threshold = silence_rms_threshold + self._conf = config or WhisperCLISpeechToTextConfig() + if self._conf.model_path is None: + raise ValueError( + "whisper.cpp model_path is required or set STACKCHAN_WHISPER_CLI_MODEL_PATH" + ) async def transcribe(self, pcm_bytes: bytes) -> str: - if not self._model_path.is_file(): - raise FileNotFoundError(f"whisper.cpp model not found: {self._model_path}") - if _pcm_rms_level(pcm_bytes) < self._silence_rms_threshold: + assert self._conf.model_path + + if not Path(self._conf.model_path).is_file(): + raise FileNotFoundError( + f"whisper.cpp model not found: {self._conf.model_path}" + ) + if _pcm_rms_level(pcm_bytes) < self._conf.silence_rms_threshold: logger.info( "Skipping whisper.cpp transcription because pcm rms %.2f is below silence threshold %.2f", _pcm_rms_level(pcm_bytes), - self._silence_rms_threshold, + self._conf.silence_rms_threshold, ) return "" - cli_path = shutil.which(self._cli_path) + cli_path = shutil.which(self._conf.cli_path) if cli_path is None: - raise FileNotFoundError(f"whisper.cpp CLI not found in PATH: {self._cli_path}") + raise FileNotFoundError( + f"whisper.cpp CLI not found in PATH: {self._conf.cli_path}" + ) - language = _normalize_language(LISTEN_LANGUAGE_CODE) with tempfile.TemporaryDirectory(prefix="stackchan_whisper_") as temp_dir_name: temp_dir = Path(temp_dir_name) wav_path = temp_dir / "input.wav" @@ -90,38 +93,38 @@ async def transcribe(self, pcm_bytes: bytes) -> str: command = [ cli_path, "-m", - str(self._model_path), + str(self._conf.model_path), "-f", str(wav_path), 
"-l", - language, + self._conf.language, "-nth", - str(self._no_speech_threshold), + str(self._conf.no_speech_threshold), "-nt", "-ojf", "-of", str(out_base), ] - if self._threads is not None: - command.extend(["-t", str(self._threads)]) - if self._translate: + if self._conf.threads is not None: + command.extend(["-t", str(self._conf.threads)]) + if self._conf.translate: command.append("-tr") - if self._suppress_non_speech_tokens: + if self._conf.suppress_non_speech_tokens: command.append("-sns") - if self._use_vad and self._vad_model_path is not None: + if self._conf.use_vad and self._conf.vad_model_path is not None: command.extend( [ "--vad", "-vm", - str(self._vad_model_path), + str(self._conf.vad_model_path), "-vt", - str(self._vad_threshold), + str(self._conf.vad_threshold), "-vspd", - str(self._vad_min_speech_duration_ms), + str(self._conf.vad_min_speech_duration_ms), "-vsd", - str(self._vad_min_silence_duration_ms), + str(self._conf.vad_min_silence_duration_ms), "-vp", - str(self._vad_speech_pad_ms), + str(self._conf.vad_speech_pad_ms), ] ) command.append("-np") @@ -154,12 +157,6 @@ async def transcribe(self, pcm_bytes: bytes) -> str: return transcript -def _normalize_language(language_code: str) -> str: - if not language_code: - return "auto" - return language_code.split("-", 1)[0].lower() - - def _normalize_transcript(text: str) -> str: return text.strip() @@ -194,25 +191,13 @@ def _pcm_rms_level(pcm_bytes: bytes) -> float: sample_count = len(pcm_bytes) // 2 total = 0.0 for index in range(0, sample_count * 2, 2): - sample = int.from_bytes(pcm_bytes[index : index + 2], byteorder="little", signed=True) + sample = int.from_bytes( + pcm_bytes[index : index + 2], byteorder="little", signed=True + ) total += float(sample * sample) return math.sqrt(total / sample_count) -def _resolve_vad_model_path(vad_model_path: str | Path | None) -> Path | None: - if vad_model_path is not None: - path = Path(vad_model_path) - return path if path.is_file() else None - - 
env_path = os.getenv("STACKCHAN_WHISPER_VAD_MODEL") - if env_path: - path = Path(env_path) - if path.is_file(): - return path - - return None - - def _write_wav( path: Path, pcm_bytes: bytes, @@ -230,4 +215,4 @@ def _write_wav( path.write_bytes(buffer.getvalue()) -__all__ = ["WhisperCppSpeechToText"] +__all__ = ["WhisperCLISpeechToText", "WhisperCLISpeechToTextConfig"] diff --git a/stackchan_server/speech_recognition/whisper_server.py b/stackchan_server/speech_recognition/whisper_server.py index eb5d44d..df2d387 100644 --- a/stackchan_server/speech_recognition/whisper_server.py +++ b/stackchan_server/speech_recognition/whisper_server.py @@ -4,7 +4,6 @@ import json import math import mimetypes -import os import uuid from collections.abc import Mapping from logging import getLogger @@ -13,7 +12,9 @@ from urllib.error import HTTPError, URLError from urllib.request import Request, urlopen -from ..static import LISTEN_AUDIO_FORMAT, LISTEN_LANGUAGE_CODE +from pydantic_settings import BaseSettings + +from ..static import LISTEN_AUDIO_FORMAT from ..types import SpeechRecognizer logger = getLogger(__name__) @@ -22,31 +23,38 @@ _DEFAULT_SERVER_PORT = 8080 +class WhisperServerSpeechToTextConfig(BaseSettings): + url: str | None = None + port: int = _DEFAULT_SERVER_PORT + language: str = "auto" + detect_language: bool = False + response_format: str = "verbose_json" + silence_rms_threshold: float = _DEFAULT_SILENCE_RMS_THRESHOLD + request_timeout_seconds: float = 60.0 + + class Config: + env_prefix = "STACKCHAN_WHISPER_SERVER_" + + class WhisperServerSpeechToText(SpeechRecognizer): def __init__( self, *, - server_url: str | None = None, - language: str | None = None, - detect_language: bool = False, - response_format: str = "verbose_json", - silence_rms_threshold: float = _DEFAULT_SILENCE_RMS_THRESHOLD, - request_timeout_seconds: float = 60.0, + config: WhisperServerSpeechToTextConfig | None = None, ) -> None: - self._server_url = server_url or _default_server_url() - 
self._language = language or _normalize_language(LISTEN_LANGUAGE_CODE) - self._detect_language = detect_language - self._response_format = response_format - self._silence_rms_threshold = silence_rms_threshold - self._request_timeout_seconds = request_timeout_seconds + self._conf = config or WhisperServerSpeechToTextConfig() + self._server_url = _default_server_url( + url=self._conf.url, + port=self._conf.port, + ) async def transcribe(self, pcm_bytes: bytes) -> str: rms_level = _pcm_rms_level(pcm_bytes) - if rms_level < self._silence_rms_threshold: + if rms_level < self._conf.silence_rms_threshold: logger.info( "Skipping whisper-server transcription because pcm rms %.2f is below silence threshold %.2f", rms_level, - self._silence_rms_threshold, + self._conf.silence_rms_threshold, ) return "" @@ -59,7 +67,7 @@ async def transcribe(self, pcm_bytes: bytes) -> str: transcript = await asyncio.to_thread( self._request_transcript, wav_bytes, - self._language, + self._conf.language, ) if transcript: logger.info("whisper-server transcript: %s", transcript) @@ -67,10 +75,10 @@ async def transcribe(self, pcm_bytes: bytes) -> str: def _request_transcript(self, wav_bytes: bytes, language: str) -> str: fields = { - "response_format": self._response_format, + "response_format": self._conf.response_format, "language": language, } - if self._detect_language: + if self._conf.detect_language: fields["detect_language"] = "true" body, content_type = _encode_multipart_formdata( @@ -85,7 +93,9 @@ def _request_transcript(self, wav_bytes: bytes, language: str) -> str: ) logger.info("Running whisper-server request: POST %s", self._server_url) try: - with urlopen(request, timeout=self._request_timeout_seconds) as response: + with urlopen( + request, timeout=self._conf.request_timeout_seconds + ) as response: response_body = response.read() except HTTPError as exc: detail = exc.read().decode("utf-8", errors="replace").strip() @@ -95,7 +105,7 @@ def _request_transcript(self, wav_bytes: bytes, 
language: str) -> str: except URLError as exc: raise RuntimeError(f"whisper-server request failed: {exc.reason}") from exc - if self._response_format == "json": + if self._conf.response_format == "json": payload = _load_json_response_bytes(response_body) if not isinstance(payload, Mapping): return "" @@ -107,20 +117,12 @@ def _request_transcript(self, wav_bytes: bytes, language: str) -> str: return _load_transcript_from_verbose_json(payload) -def _default_server_url() -> str: - configured = os.getenv("STACKCHAN_WHISPER_SERVER_URL") - if configured: - return configured.rstrip("/") - port = os.getenv("STACKCHAN_WHISPER_SERVER_PORT", str(_DEFAULT_SERVER_PORT)) +def _default_server_url(*, url: str | None, port: int) -> str: + if url: + return url.rstrip("/") return f"http://127.0.0.1:{port}/inference" -def _normalize_language(language_code: str) -> str: - if not language_code: - return "" - return language_code.split("-", 1)[0].lower() - - def _load_json_response_bytes(response_body: bytes) -> object: response_text = response_body.decode("utf-8", errors="replace") if "\ufffd" in response_text: @@ -155,7 +157,9 @@ def _pcm_rms_level(pcm_bytes: bytes) -> float: sample_count = len(pcm_bytes) // 2 total = 0.0 for index in range(0, sample_count * 2, 2): - sample = int.from_bytes(pcm_bytes[index : index + 2], byteorder="little", signed=True) + sample = int.from_bytes( + pcm_bytes[index : index + 2], byteorder="little", signed=True + ) total += float(sample * sample) return math.sqrt(total / sample_count) @@ -199,7 +203,11 @@ def _encode_multipart_formdata( ) for field_name, (filename, content, content_type) in files.items(): - guessed_type = content_type or mimetypes.guess_type(filename)[0] or "application/octet-stream" + guessed_type = ( + content_type + or mimetypes.guess_type(filename)[0] + or "application/octet-stream" + ) lines.extend( [ b"--" + boundary_bytes, @@ -218,4 +226,4 @@ def _encode_multipart_formdata( return body, f"multipart/form-data; boundary={boundary}" 
-__all__ = ["WhisperServerSpeechToText"] +__all__ = ["WhisperServerSpeechToText", "WhisperServerSpeechToTextConfig"] diff --git a/stackchan_server/speech_synthesis/__init__.py b/stackchan_server/speech_synthesis/__init__.py index aa72c1f..886ab3d 100644 --- a/stackchan_server/speech_synthesis/__init__.py +++ b/stackchan_server/speech_synthesis/__init__.py @@ -1,12 +1,11 @@ from __future__ import annotations -from ..types import SpeechSynthesizer +from .create import create_speech_synthesizer from .google_cloud import GoogleCloudTextToSpeech from .voicevox import VoiceVoxSpeechSynthesizer - -def create_speech_synthesizer() -> SpeechSynthesizer: - return VoiceVoxSpeechSynthesizer() - - -__all__ = ["GoogleCloudTextToSpeech", "VoiceVoxSpeechSynthesizer", "create_speech_synthesizer"] +__all__ = [ + "GoogleCloudTextToSpeech", + "VoiceVoxSpeechSynthesizer", + "create_speech_synthesizer", +] diff --git a/stackchan_server/speech_synthesis/create.py b/stackchan_server/speech_synthesis/create.py new file mode 100644 index 0000000..5d8cfa9 --- /dev/null +++ b/stackchan_server/speech_synthesis/create.py @@ -0,0 +1,24 @@ +from pydantic_settings import BaseSettings + +from stackchan_server.types import SpeechSynthesizer + + +class SpeechSynthesisEnvSetting(BaseSettings): + use_voicevox: bool = False + use_google_cloud_tts: bool = True + + class Config: + env_prefix = "STACKCHAN_" + + +def create_speech_synthesizer() -> SpeechSynthesizer: + es = SpeechSynthesisEnvSetting() + if es.use_voicevox: + from .voicevox import VoiceVoxSpeechSynthesizer + return VoiceVoxSpeechSynthesizer() + + if es.use_google_cloud_tts: + from .google_cloud import GoogleCloudTextToSpeech + return GoogleCloudTextToSpeech() + + raise ValueError("No speech synthesizer configured") diff --git a/stackchan_server/speech_synthesis/google_cloud.py b/stackchan_server/speech_synthesis/google_cloud.py index 8f3feab..59b4646 100644 --- a/stackchan_server/speech_synthesis/google_cloud.py +++ 
b/stackchan_server/speech_synthesis/google_cloud.py @@ -5,17 +5,16 @@ import wave from collections.abc import AsyncIterator from logging import getLogger -from typing import Any from google import genai from google.genai import types +from google.genai.client import AsyncClient as GenAIAsyncClient +from pydantic_settings import BaseSettings from ..types import AudioFormat, StreamingSpeechSynthesizer logger = getLogger(__name__) -_DEFAULT_MODEL = "gemini-2.5-flash-tts" -_DEFAULT_LOCATION = "global" _PCM_SAMPLE_RATE_HZ = 24000 _PCM_CHANNELS = 1 _PCM_SAMPLE_WIDTH = 2 @@ -26,11 +25,23 @@ ) -def create_vertexai_client() -> Any: +class GoogleCloudSpeechTextToSpeechConfig(BaseSettings): + model: str = "gemini-2.5-flash-tts" + language_code: str = "ja-JP" + voice_name: str = "Despina" + style_instructions: str | None = None + + class Config: + env_prefix = "STACKCHAN_GOOGLE_CLOUD_TTS_" + + +def create_vertexai_client() -> GenAIAsyncClient: return genai.Client( vertexai=True, project=os.getenv("GOOGLE_CLOUD_PROJECT"), - location=os.getenv("GOOGLE_CLOUD_LOCATION") or os.getenv("GOOGLE_CLOUD_REGION") or _DEFAULT_LOCATION, + location=os.getenv("GOOGLE_CLOUD_LOCATION") + or os.getenv("GOOGLE_CLOUD_REGION") + or "global", ).aio @@ -38,16 +49,10 @@ class GoogleCloudTextToSpeech(StreamingSpeechSynthesizer): def __init__( self, *, - model: str = _DEFAULT_MODEL, - language_code: str = "ja-JP", - voice_name: str = "Despina", - style_instructions: str | None = None, - client: Any | None = None, + config: GoogleCloudSpeechTextToSpeechConfig | None = None, + client: GenAIAsyncClient | None = None, ) -> None: - self._model = model - self._language_code = language_code - self._voice_name = voice_name - self._style_instructions = style_instructions + self._conf = config or GoogleCloudSpeechTextToSpeechConfig() self._client = client or create_vertexai_client() @property @@ -61,30 +66,30 @@ async def synthesize(self, text: str) -> bytes: logger.info( "Gemini TTS response pcm_bytes=%d 
model=%s language_code=%s voice_name=%s", len(pcm_bytes), - self._model, - self._language_code, - self._voice_name, + self._conf.model, + self._conf.language_code, + self._conf.voice_name, ) return self._wrap_pcm_as_wav(bytes(pcm_bytes)) async def synthesize_stream(self, text: str) -> AsyncIterator[bytes]: logger.info( "Requesting Gemini TTS model=%s language_code=%s voice_name=%s text_chars=%d", - self._model, - self._language_code, - self._voice_name, + self._conf.model, + self._conf.language_code, + self._conf.voice_name, len(text), ) async for response in await self._client.models.generate_content_stream( - model=self._model, + model=self._conf.model, contents=self._build_contents(text), config=types.GenerateContentConfig( response_modalities=["AUDIO"], speech_config=types.SpeechConfig( - language_code=self._language_code, + language_code=self._conf.language_code, voice_config=types.VoiceConfig( prebuilt_voice_config=types.PrebuiltVoiceConfig( - voice_name=self._voice_name, + voice_name=self._conf.voice_name, ) ), ), @@ -95,9 +100,9 @@ async def synthesize_stream(self, text: str) -> AsyncIterator[bytes]: yield chunk def _build_contents(self, text: str) -> str: - if not self._style_instructions: + if not self._conf.style_instructions: return text - return f"{self._style_instructions}\n\n{text}" + return f"{self._conf.style_instructions}\n\n{text}" def _extract_audio_bytes(self, response: types.GenerateContentResponse) -> bytes: pcm_bytes = bytearray() @@ -121,4 +126,8 @@ def _wrap_pcm_as_wav(self, pcm_bytes: bytes) -> bytes: return buffer.getvalue() -__all__ = ["GoogleCloudTextToSpeech", "create_vertexai_client"] +__all__ = [ + "GoogleCloudTextToSpeech", + "create_vertexai_client", + "GoogleCloudSpeechTextToSpeechConfig", +] diff --git a/stackchan_server/speech_synthesis/voicevox.py b/stackchan_server/speech_synthesis/voicevox.py index 8f1f1c9..7c83dff 100644 --- a/stackchan_server/speech_synthesis/voicevox.py +++ b/stackchan_server/speech_synthesis/voicevox.py 
@@ -1,25 +1,33 @@ from __future__ import annotations -import os - +from pydantic_settings import BaseSettings from vvclient import Client as VVClient from ..types import SpeechSynthesizer -def create_voicevox_client() -> VVClient: - voicevox_url = os.getenv("STACKCHAN_VOICEVOX_URL", "http://localhost:50021") - return VVClient(base_uri=voicevox_url) +class VoiceVoxSpeechSynthesizerConfig(BaseSettings): + url: str = "http://localhost:50021" + speaker: int = 29 + + class Config: + env_prefix = "STACKCHAN_VOICEVOX_" class VoiceVoxSpeechSynthesizer(SpeechSynthesizer): - def __init__(self, speaker: int = 29) -> None: - self._speaker = speaker + def __init__( + self, + config: VoiceVoxSpeechSynthesizerConfig | None = None, + ) -> None: + self._conf = config or VoiceVoxSpeechSynthesizerConfig() + + def create_voicevox_client(self) -> VVClient: + return VVClient(base_uri=self._conf.url) async def synthesize(self, text: str) -> bytes: - async with create_voicevox_client() as client: - audio_query = await client.create_audio_query(text, speaker=self._speaker) - return await audio_query.synthesis(speaker=self._speaker) + async with self.create_voicevox_client() as client: + audio_query = await client.create_audio_query(text, speaker=self._conf.speaker) + return await audio_query.synthesis(speaker=self._conf.speaker) -__all__ = ["VoiceVoxSpeechSynthesizer", "create_voicevox_client"] +__all__ = ["VoiceVoxSpeechSynthesizer"] diff --git a/stackchan_server/static.py b/stackchan_server/static.py index 872f1dc..cd7b8cf 100644 --- a/stackchan_server/static.py +++ b/stackchan_server/static.py @@ -7,6 +7,5 @@ channels=1, sample_width=2, ) -LISTEN_LANGUAGE_CODE = "ja-JP" -__all__ = ["LISTEN_AUDIO_FORMAT", "LISTEN_LANGUAGE_CODE"] +__all__ = ["LISTEN_AUDIO_FORMAT"] diff --git a/uv.lock b/uv.lock index 785ce3c..2540b7b 100644 --- a/uv.lock +++ b/uv.lock @@ -1412,6 +1412,7 @@ dependencies = [ { name = "fastapi" }, { name = "google-cloud-speech" }, { name = "google-genai" }, + { name = 
"pydantic-settings" }, { name = "python-dotenv" }, { name = "uvicorn", extra = ["standard"] }, { name = "voicevox-client" }, @@ -1431,6 +1432,7 @@ requires-dist = [ { name = "fastapi", specifier = ">=0.128.0" }, { name = "google-cloud-speech", specifier = ">=2.35.0" }, { name = "google-genai", specifier = ">=1.59.0" }, + { name = "pydantic-settings", specifier = ">=2.13.1" }, { name = "python-dotenv", specifier = ">=1.2.1" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.40.0" }, { name = "voicevox-client", specifier = ">=1.1.0" },