diff --git a/skill/SKILL.md b/skill/SKILL.md
index c128e19..cf6af88 100644
--- a/skill/SKILL.md
+++ b/skill/SKILL.md
@@ -178,7 +178,7 @@ mmx speech synthesize --text [flags]
 | `--bitrate ` | number | Bitrate (default: 128000) |
 | `--channels ` | number | Audio channels (default: 1) |
 | `--language ` | string | Language boost |
-| `--subtitles` | boolean | Include subtitle timing data |
+| `--subtitles` | boolean | Download and save subtitles as `.srt` file (alongside `--out` audio file). API must support subtitles for the selected model. |
 | `--pronunciation ` | string, repeatable | Custom pronunciation |
 | `--sound-effect ` | string | Add sound effect |
 | `--out ` | string | Save audio to file |
@@ -188,6 +188,9 @@ mmx speech synthesize --text [flags]
 mmx speech synthesize --text "Hello world" --out hello.mp3 --quiet
 # stdout: hello.mp3
 
+mmx speech synthesize --text "Hello" --subtitles --out hello.mp3
+# saves hello.mp3 + hello.srt (SRT subtitle file)
+
 echo "Breaking news." | mmx speech synthesize --text-file - --out news.mp3
 ```
 
diff --git a/src/commands/speech/synthesize.ts b/src/commands/speech/synthesize.ts
index f5f3630..756eff0 100644
--- a/src/commands/speech/synthesize.ts
+++ b/src/commands/speech/synthesize.ts
@@ -6,6 +6,7 @@ import { speechEndpoint } from '../../client/endpoints';
 import { parseSSE } from '../../client/stream';
 import { detectOutputFormat, formatOutput } from '../../output/formatter';
 import { saveAudioOutput } from '../../output/audio';
+import { writeFileSync } from 'fs';
 import { readTextFromPathOrStdin } from '../../utils/fs';
 import type { Config } from '../../config/schema';
 import type { GlobalFlags } from '../../types/flags';
@@ -37,6 +38,7 @@ export default defineCommand({
   examples: [
     'mmx speech synthesize --text "Hello, world!"',
     'mmx speech synthesize --text "Hello, world!" --out hello.mp3',
+    'mmx speech synthesize --text "Hello" --subtitles --out hello.mp3',
     'echo "Breaking news." | mmx speech synthesize --text-file - --out news.mp3',
     'mmx speech synthesize --text "Stream" --stream | mpv --no-terminal -',
   ],
@@ -85,7 +87,7 @@ export default defineCommand({
     };
 
     if (flags.language) body.language_boost = flags.language as string;
-    if (flags.subtitles) body.subtitle = true;
+    if (flags.subtitles) body.subtitle_enable = true; // API parameter name is `subtitle_enable` (not `subtitle`)
 
     if (flags.pronunciation) {
       body.pronunciation_dict = (flags.pronunciation as string[]).map(p => {
@@ -122,5 +124,60 @@ export default defineCommand({
 
     if (!config.quiet) process.stderr.write(`[Model: ${model}]\n`);
     saveAudioOutput(response, outPath, format, config.quiet);
+
+    // Download and save the subtitle file when --subtitles is requested.
+    if (flags.subtitles && response.data.subtitle_file) {
+      try {
+        // The API response carries a URL pointing at a JSON document with the cues.
+        const subtitleRes = await fetch(response.data.subtitle_file);
+        if (!subtitleRes.ok) {
+          throw new CLIError(`Failed to download subtitle file: ${subtitleRes.status}`, ExitCode.GENERAL);
+        }
+        // The payload is a flat array, not { subtitles: [...] }; validate before use.
+        const payload: unknown = await subtitleRes.json();
+        if (!Array.isArray(payload)) {
+          throw new CLIError('Unexpected subtitle payload: expected a JSON array', ExitCode.GENERAL);
+        }
+        const subtitleArray = payload as Array<{ text: string; time_begin: number; time_end: number }>;
+
+        if (subtitleArray.length) {
+          // Strip only a real file extension: excluding path separators keeps a
+          // dotted directory name (e.g. "./out.v2/audio") from being truncated.
+          const subtitlePath = outPath.replace(/\.[^./\\]+$/, '') + '.srt';
+          // Timestamps arrive in milliseconds. Round once up front so a fractional
+          // value such as 999.6 carries into the seconds field instead of
+          // rendering an invalid four-digit ",1000".
+          const fmt = (ms: number) => {
+            const total = Math.max(0, Math.round(ms));
+            const h = String(Math.floor(total / 3600000)).padStart(2, '0');
+            const m = String(Math.floor((total % 3600000) / 60000)).padStart(2, '0');
+            const sec = String(Math.floor((total % 60000) / 1000)).padStart(2, '0');
+            const mil = String(total % 1000).padStart(3, '0');
+            return `${h}:${m}:${sec},${mil}`;
+          };
+          const srtContent = subtitleArray
+            .map((s, i) => `${i + 1}\n${fmt(s.time_begin)} --> ${fmt(s.time_end)}\n${s.text}`)
+            .join('\n\n');
+          // SRT cues are blank-line separated; end the file with a newline.
+          writeFileSync(subtitlePath, `${srtContent}\n`, 'utf-8');
+          if (!config.quiet) {
+            console.log(formatOutput({ subtitles: subtitlePath }, format));
+          } else {
+            console.log(subtitlePath);
+          }
+        }
+      } catch (err) {
+        // Non-fatal: the audio has already been saved, so warn instead of failing.
+        if (!config.quiet) {
+          const reason = err instanceof Error ? err.message : String(err);
+          process.stderr.write(`Warning: failed to download subtitles: ${reason}\n`);
+        }
+      }
+    } else if (flags.subtitles) {
+      // --subtitles was requested but the API response carried no subtitle_file.
+      if (!config.quiet) {
+        process.stderr.write(`Warning: subtitles requested but not returned by API\n`);
+      }
+    }
   },
 });
diff --git a/src/types/api.ts b/src/types/api.ts
index b2b0578..fdb956c 100644
--- a/src/types/api.ts
+++ b/src/types/api.ts
@@ -108,7 +108,7 @@ export interface SpeechRequest {
   pronunciation_dict?: Array<{ tone: string; text: string }>;
   output_format?: 'url' | 'hex';
   stream?: boolean;
-  subtitle?: boolean;
+  subtitle_enable?: boolean; // API parameter name is `subtitle_enable` (not `subtitle`)
 }
 
 export interface SpeechResponse {
@@ -116,7 +116,7 @@ export interface SpeechResponse {
   data: {
     audio?: string; // hex-encoded audio data
     audio_url?: string;
-    subtitle_info?: SubtitleInfo;
+    subtitle_file?: string; // URL to download subtitle JSON file (when subtitle_enable=true)
     status: number;
   };
   extra_info?: {
@@ -129,14 +129,6 @@ export interface SpeechResponse {
   };
 }
 
-export interface SubtitleInfo {
-  subtitles: Array<{
-    text: string;
-    start_time: number;
-    end_time: number;
-  }>;
-}
-
 // ---- Voice List ----
 
 export interface SystemVoiceInfo {
diff --git a/test/auth/timeout-fix.test.ts b/test/auth/timeout-fix.test.ts
index baeaad1..2ee4976 100644
--- a/test/auth/timeout-fix.test.ts
+++ b/test/auth/timeout-fix.test.ts
@@ -124,7 +124,8 @@ describe('refreshAccessToken: timeout and error handling', () => {
     // We test the real function against a mock server via a wrapper
     // that overrides the fetch to hit our local server instead.
     const origFetch = globalThis.fetch;
-    globalThis.fetch = async (input: RequestInfo | URL, init?: RequestInit) => {
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    (globalThis as any).fetch = async (input: RequestInfo | URL, init?: RequestInit) => {
       const url = typeof input === 'string' ? input : input.toString();
       if (url.includes('oauth/token')) {
         return origFetch(`${server.url}/v1/oauth/token`, init);
@@ -156,7 +157,8 @@ describe('refreshAccessToken: timeout and error handling', () => {
     const mod = await import('../../src/auth/refresh');
 
     const origFetch = globalThis.fetch;
-    globalThis.fetch = async (input: RequestInfo | URL, init?: RequestInit) => {
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    (globalThis as any).fetch = async (input: RequestInfo | URL, init?: RequestInit) => {
       const url = typeof input === 'string' ? input : input.toString();
       if (url.includes('oauth/token')) {
         return origFetch(`${server.url}/v1/oauth/token`, init);
diff --git a/test/commands/speech/synthesize.test.ts b/test/commands/speech/synthesize.test.ts
index 360e8ac..38dc4f7 100644
--- a/test/commands/speech/synthesize.test.ts
+++ b/test/commands/speech/synthesize.test.ts
@@ -159,4 +159,47 @@ describe('speech synthesize command', () => {
       console.log = originalLog;
     }
   });
+
+  it('--subtitles sets subtitle_enable in dry-run output', async () => {
+    const config = {
+      apiKey: 'test-key',
+      region: 'global' as const,
+      baseUrl: 'https://api.mmx.io',
+      output: 'json' as const,
+      timeout: 10,
+      verbose: false,
+      quiet: false,
+      noColor: true,
+      yes: false,
+      dryRun: true,
+      nonInteractive: true,
+      async: false,
+    };
+
+    const originalLog = console.log;
+    let output = '';
+    console.log = (msg: string) => { output += msg; };
+
+    try {
+      await synthesizeCommand.execute(config, {
+        text: 'Hello',
+        subtitles: true,
+        quiet: false,
+        verbose: false,
+        noColor: true,
+        yes: false,
+        dryRun: true,
+        help: false,
+        nonInteractive: true,
+        async: false,
+      });
+
+      const parsed = JSON.parse(output);
+      expect(parsed.request.subtitle_enable).toBe(true);
+      // Verify the old incorrect parameter name is NOT used
+      expect(parsed.request.subtitle).toBeUndefined();
+    } finally {
+      console.log = originalLog;
+    }
+  });
 });