124 lines
3.8 KiB
Python
124 lines
3.8 KiB
Python
#!/usr/bin/env python3
|
|
# SPDX-License-Identifier: MIT
|
|
"""
|
|
MiniMax Sync TTS (HTTP)
|
|
Self-contained: no external dependencies beyond `requests`.
|
|
|
|
Usage:
|
|
python minimax_tts.py "Hello world" -o output.mp3
|
|
python minimax_tts.py "你好世界" -o hi.mp3 -v female-shaonv --model speech-2.8-hd
|
|
python minimax_tts.py "Welcome" -o out.wav -v male-qn-jingying --speed 0.8 --format wav
|
|
|
|
Env: MINIMAX_API_KEY (required)
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import argparse
|
|
import requests
|
|
|
|
# Bearer token for the API; read once at import time. tts() exits with an
# error message when this is unset or empty.
API_KEY = os.getenv("MINIMAX_API_KEY")
# Base URL that the endpoint path is appended to; override via MINIMAX_API_BASE.
API_BASE = os.getenv("MINIMAX_API_BASE", "https://api.minimax.io/v1")
|
|
|
|
|
|
def tts(
    text: str,
    voice_id: str = "male-qn-qingse",
    model: str = "speech-2.8-hd",
    speed: float = 1.0,
    volume: float = 1.0,
    pitch: int = 0,
    emotion: str = "",
    sample_rate: int = 32000,
    bitrate: int = 128000,
    fmt: str = "mp3",
    language_boost: str = "auto",
    timeout: int = 120,
) -> bytes:
    """Synthesize *text* with one blocking HTTP request and return the audio.

    Posts a non-streaming request to ``{API_BASE}/t2a_v2`` asking for
    hex-encoded output, then decodes the hex string into raw bytes.

    Args:
        text: Text to synthesize.
        voice_id: Voice identifier, sent as ``voice_setting.voice_id``.
        model: Model name, sent as ``model``.
        speed: Speech speed, sent as ``voice_setting.speed``.
        volume: Output volume, sent as ``voice_setting.vol``.
        pitch: Pitch adjustment, sent as ``voice_setting.pitch``.
        emotion: Optional emotion tag; omitted from the request when empty.
        sample_rate: Sent as ``audio_setting.sample_rate``.
        bitrate: Sent as ``audio_setting.bitrate``.
        fmt: Audio container/codec name, sent as ``audio_setting.format``.
        language_boost: Sent as ``language_boost``.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The audio payload decoded from the response's hex string.

    Raises:
        SystemExit: If ``MINIMAX_API_KEY`` is not set, if the response
            carries a non-zero ``base_resp.status_code``, or if the
            response contains no audio.
        requests.HTTPError: If the server answers with an HTTP error status.
        ValueError: If the returned audio string is not valid hex.
    """
    if not API_KEY:
        raise SystemExit("ERROR: MINIMAX_API_KEY is not set.\n export MINIMAX_API_KEY='your-key'")

    voice_setting = {"voice_id": voice_id, "speed": speed, "vol": volume, "pitch": pitch}
    # Only send an emotion when the caller asked for one.
    if emotion:
        voice_setting["emotion"] = emotion

    payload = {
        "model": model,
        "text": text,
        "stream": False,
        "voice_setting": voice_setting,
        "audio_setting": {
            "sample_rate": sample_rate,
            "bitrate": bitrate,
            "format": fmt,
            "channel": 1,
        },
        "language_boost": language_boost,
        "output_format": "hex",
    }

    # Tolerate a trailing slash in a user-supplied MINIMAX_API_BASE.
    url = f"{API_BASE.rstrip('/')}/t2a_v2"
    resp = requests.post(
        url,
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
        json=payload,
        timeout=timeout,
    )
    resp.raise_for_status()
    data = resp.json()

    # Check API-level error. `or {}` also covers a key that is present but
    # null, which would otherwise fail with AttributeError on `.get`.
    base_resp = data.get("base_resp") or {}
    if base_resp.get("status_code", 0) != 0:
        raise SystemExit(f"API Error [{base_resp.get('status_code')}]: {base_resp.get('status_msg')}")

    audio_hex = (data.get("data") or {}).get("audio", "")
    if not audio_hex:
        raise SystemExit(f"No audio in response: {json.dumps(data, indent=2)}")

    return bytes.fromhex(audio_hex)
|
|
|
|
|
|
def main():
    """Command-line entry point: parse arguments, synthesize, write the file.

    Creates the output file's parent directory if needed, calls ``tts`` with
    the parsed options, writes the returned bytes to ``--output`` and prints
    a one-line summary.

    Raises:
        SystemExit: On argument errors (from argparse), on any failure
            reported by ``tts``, or when the HTTP request itself fails.
    """
    p = argparse.ArgumentParser(description="MiniMax Sync TTS (HTTP)")
    p.add_argument("text", help="Text to synthesize (max 10000 chars)")
    p.add_argument("-o", "--output", required=True, help="Output file path")
    p.add_argument("-v", "--voice", default="male-qn-qingse", help="Voice ID")
    p.add_argument("--model", default="speech-2.8-hd", help="Model (default: speech-2.8-hd)")
    p.add_argument("--speed", type=float, default=1.0, help="Speed 0.5-2.0")
    p.add_argument("--volume", type=float, default=1.0, help="Volume 0.1-10")
    p.add_argument("--pitch", type=int, default=0, help="Pitch -12 to 12")
    p.add_argument("--emotion", default="", help="Emotion tag (happy/sad/angry/...)")
    p.add_argument("--format", default="mp3", dest="fmt", help="Audio format (mp3/wav/flac)")
    p.add_argument("--sample-rate", type=int, default=32000, help="Sample rate")
    p.add_argument("--lang", default="auto", help="Language boost")
    args = p.parse_args()

    # Create the destination directory before the request so an unusable
    # output path fails before any API call is made. A bare filename has an
    # empty dirname, hence the "." fallback.
    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)

    try:
        audio = tts(
            text=args.text,
            voice_id=args.voice,
            model=args.model,
            speed=args.speed,
            volume=args.volume,
            pitch=args.pitch,
            emotion=args.emotion,
            fmt=args.fmt,
            sample_rate=args.sample_rate,
            language_boost=args.lang,
        )
    except requests.RequestException as exc:
        # Match the rest of the script: exit with a message, not a traceback.
        raise SystemExit(f"ERROR: request failed: {exc}") from exc

    with open(args.output, "wb") as f:
        f.write(audio)

    print(f"OK: {len(audio)} bytes -> {args.output}")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|