"""VoiceSynth speech-synthesis plugin.

Supported commands:
- /音色列表   (list voices)
- /切换音色 xx (switch voice)
- /echo 文本   (speak text)

Also supports sending a voice reply after an AI text reply, gated by a
configurable probability.
"""

import asyncio
import base64
import random
import re
import uuid
from pathlib import Path
from urllib.parse import urlparse

import aiohttp
import tomllib
from loguru import logger

from utils.plugin_base import PluginBase
from utils.decorators import on_text_message
from WechatHook import WechatHookClient


class VoiceSynth(PluginBase):
    """Text-to-speech plugin: voice commands plus probabilistic AI voice replies."""

    description = "语音合成与语音回复插件"
    author = "ShiHao"
    version = "1.0.0"

    def __init__(self):
        super().__init__()
        self.config = {}
        # --- API defaults; overridden by config.toml in async_init() ---
        self.api_base_url = "https://dashscope.aliyuncs.com/api/v1"
        self.api_endpoint = "/services/aigc/multimodal-generation/generation"
        self.api_key = ""
        self.model = "qwen3-tts-flash"
        self.language_type = "Chinese"
        self.stream = False
        self.timeout = 30
        self.api_task = "tts"
        self.payload_mode = "auto"
        self._alt_endpoint = "/services/aigc/multimodal-generation/generation"
        # --- Voice bookkeeping ---
        # voice_map: voice code -> display name; voice_alias_map: display name -> code
        self.voice_map = {}
        self.voice_alias_map = {}
        self.default_voice = ""
        # Per-chat voice selection: chat id -> voice code
        self._chat_voice = {}
        # --- Behavior switches ---
        self.enable_group = True
        self.enable_private = True
        self.master_enabled = True
        self.ai_voice_probability = 0.0
        self.enable_auto_reply_voice = True
        self.max_duration_seconds = 60
        self.max_chars_per_second = 4
        self.allow_raw_audio = False
        self.raw_audio_format = "wav"
        self._session = None  # shared aiohttp session, created lazily
        self._temp_dir = Path(__file__).parent / "temp"
        self._temp_dir.mkdir(parents=True, exist_ok=True)

    async def async_init(self):
        """Asynchronous plugin initialization: load config.toml and open the HTTP session."""
        config_path = Path(__file__).parent / "config.toml"
        if config_path.exists():
            with open(config_path, "rb") as f:
                self.config = tomllib.load(f)

        api_config = self.config.get("api", {})
        self.api_base_url = api_config.get("base_url", self.api_base_url)
        self.api_endpoint = api_config.get("endpoint", self.api_endpoint)
        self.api_key = api_config.get("api_key", self.api_key)
        self.model = api_config.get("model", self.model)
        self.language_type = api_config.get("language_type", self.language_type)
        self.stream = bool(api_config.get("stream", self.stream))
        self.timeout = int(api_config.get("timeout", self.timeout))
        self.api_task = str(api_config.get("task", self.api_task)).strip()
        self.payload_mode = str(api_config.get("payload_mode", self.payload_mode)).strip().lower()
        self._alt_endpoint = str(api_config.get("alt_endpoint", self._alt_endpoint)).strip() or self._alt_endpoint
        if self.stream:
            # Streaming responses are not implemented; force it off.
            logger.warning("stream 暂不支持,已强制关闭")
            self.stream = False

        voice_config = self.config.get("voices", {})
        self.default_voice = str(voice_config.get("default", self.default_voice)).strip()
        voice_list = voice_config.get("list", [])
        if isinstance(voice_list, str):
            voice_list = [voice_list]
        self.voice_map, self.voice_alias_map = self._parse_voice_list(voice_list)
        if self.default_voice and self.default_voice not in self.voice_map:
            # Keep the configured default usable even if it is missing from the list.
            logger.warning(f"默认音色不在列表中: {self.default_voice}")
            self.voice_map[self.default_voice] = self.default_voice
            self.voice_alias_map[self.default_voice] = self.default_voice
        if not self.default_voice and self.voice_map:
            self.default_voice = next(iter(self.voice_map.keys()))

        behavior_config = self.config.get("behavior", {})
        self.master_enabled = bool(behavior_config.get("enabled", True))
        self.enable_group = bool(behavior_config.get("enable_group", True))
        self.enable_private = bool(behavior_config.get("enable_private", True))

        reply_config = self.config.get("reply", {})
        self.ai_voice_probability = float(reply_config.get("ai_voice_probability", 0.0))
        self.enable_auto_reply_voice = bool(reply_config.get("enable_auto_reply_voice", True))
        self.max_duration_seconds = int(reply_config.get("max_duration_seconds", 60))
        self.max_chars_per_second = int(reply_config.get("max_chars_per_second", 4))
        # Accept percentage-style values (e.g. 30 -> 0.30), then clamp into [0, 1].
        if self.ai_voice_probability > 1:
            self.ai_voice_probability = self.ai_voice_probability / 100.0
        if self.ai_voice_probability < 0:
            self.ai_voice_probability = 0.0
        if self.ai_voice_probability > 1:
            self.ai_voice_probability = 1.0

        conversion_config = self.config.get("conversion", {})
        self.allow_raw_audio = bool(conversion_config.get("allow_raw_audio", False))
        self.raw_audio_format = str(conversion_config.get("raw_audio_format", "wav")).strip().lower() or "wav"
        if self.raw_audio_format not in {"wav", "amr"}:
            self.raw_audio_format = "wav"

        if self._session is None or self._session.closed:
            timeout = aiohttp.ClientTimeout(total=self.timeout)
            self._session = aiohttp.ClientSession(timeout=timeout)

        # BUG FIX: loguru interpolates lazy arguments with str.format-style "{}"
        # placeholders, not printf-style "%s"/"%d"; the original template logged
        # the literal "%s" markers and discarded the arguments.
        logger.info(
            "VoiceSynth 配置: endpoint={} task={} payload_mode={} model={} "
            "default_voice={} voice_count={} master_enabled={} "
            "allow_raw_audio={} raw_audio_format={}",
            self._build_api_url(),
            self.api_task or "",
            self.payload_mode,
            self.model,
            self.default_voice or "",
            len(self.voice_map),
            self.master_enabled,
            self.allow_raw_audio,
            self.raw_audio_format,
        )
        logger.success("VoiceSynth 插件初始化完成")

    async def on_unload(self):
        """Called on plugin unload; closes the shared HTTP session."""
        await super().on_unload()
        if self._session and not self._session.closed:
            await self._session.close()
        self._session = None

    def _parse_voice_list(self, voice_list):
        """Parse config entries of the form "code:name" (or bare "code").

        Returns (voice_map, alias_map): code -> display name, and name -> code.
        """
        voice_map = {}
        alias_map = {}
        for item in voice_list:
            if not item:
                continue
            if ":" in item:
                code, name = item.split(":", 1)
            else:
                code, name = item, item
            code = code.strip()
            name = name.strip() or code
            if not code:
                continue
            voice_map[code] = name
            if name:
                alias_map[name] = code
        return voice_map, alias_map

    def _resolve_voice(self, voice_key: str) -> str:
        """Resolve a user-supplied voice code or display name to a voice code.

        Returns "" when the key is empty or unknown.
        """
        voice_key = (voice_key or "").strip()
        if not voice_key:
            return ""
        if voice_key in self.voice_map:
            return voice_key
        if voice_key in self.voice_alias_map:
            return self.voice_alias_map[voice_key]
        return ""

    def _get_chat_voice(self, chat_id: str) -> str:
        """Return the voice code selected for this chat, falling back to the default."""
        return self._chat_voice.get(chat_id, self.default_voice)

    def _set_chat_voice(self, chat_id: str, voice_code: str):
        """Remember a per-chat voice selection (ignored when either value is empty)."""
        if not chat_id or not voice_code:
            return
        self._chat_voice[chat_id] = voice_code

    def _build_api_url(self) -> str:
        """Build the full TTS endpoint URL; an absolute endpoint wins over base_url."""
        endpoint = (self.api_endpoint or "").strip()
        if endpoint.startswith("http://") or endpoint.startswith("https://"):
            return endpoint
        return f"{self.api_base_url.rstrip('/')}/{endpoint.lstrip('/')}"

    def _save_master_enabled(self, enabled: bool) -> bool:
        """Persist the VoiceSynth master switch into config.toml.

        Edits the [behavior] section textually (preserving the rest of the file);
        creates the section and/or the `enabled` key if missing. Returns True on
        success, False on any failure.
        """
        try:
            behavior = self.config.setdefault("behavior", {})
            behavior["enabled"] = bool(enabled)
            config_path = Path(__file__).parent / "config.toml"
            if not config_path.exists():
                return False
            text = config_path.read_text(encoding="utf-8")
            lines = text.splitlines()
            behavior_idx = -1
            for i, line in enumerate(lines):
                if line.strip().lower() == "[behavior]":
                    behavior_idx = i
                    break
            enabled_line = f"enabled = {'true' if enabled else 'false'}"
            if behavior_idx < 0:
                # No [behavior] section yet: append one at the end of the file.
                if lines and lines[-1].strip() != "":
                    lines.append("")
                lines.append("[behavior]")
                lines.append(enabled_line)
            else:
                # Find where the [behavior] section ends (next section header or EOF).
                section_end = len(lines)
                for i in range(behavior_idx + 1, len(lines)):
                    if lines[i].strip().startswith("["):
                        section_end = i
                        break
                replaced = False
                for i in range(behavior_idx + 1, section_end):
                    if re.match(r"^\s*enabled\s*=", lines[i]):
                        lines[i] = enabled_line
                        replaced = True
                        break
                if not replaced:
                    insert_at = behavior_idx + 1
                    lines.insert(insert_at, enabled_line)
            new_text = "\n".join(lines)
            if text.endswith("\n"):
                new_text += "\n"
            config_path.write_text(new_text, encoding="utf-8")
            return True
        except Exception as e:
            logger.warning(f"保存 VoiceSynth 总开关失败: {e}")
            return False

    def _truncate_text(self, text: str) -> str:
        """Cap the text length so synthesized audio stays under max_duration_seconds."""
        if not text:
            return text
        max_chars = int(self.max_duration_seconds * self.max_chars_per_second)
        if max_chars <= 0:
            # Non-positive budget disables truncation entirely.
            return text
        if len(text) > max_chars:
            logger.info(f"语音文本过长,已截断到 {max_chars} 字符")
            return text[:max_chars]
        return text

    def _build_payload(self, text: str, voice: str, mode: str) -> dict:
        """Build a flat TTS request payload (fallback HTTP shape; `mode` unused here)."""
        return {
            "model": self.model,
            "text": text,
            "voice": voice,
            "language_type": self.language_type,
            "stream": False,
        }

    async def _request_tts(self, text: str, voice: str) -> dict | None:
        """Call the TTS API over HTTP; return the parsed JSON on HTTP 200, else None."""
        if not self.api_key:
            logger.warning("VoiceSynth API Key 未配置")
            return None
        url = self._build_api_url()
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model,
            "input": {
                "text": text,
                "voice": voice,
                "language_type": self.language_type,
            }
        }
        logger.debug(f"TTS 请求: url={url} voice={voice} text_len={len(text)}")
        session = self._session
        if session is None or session.closed:
            # Recreate the session if async_init has not run or it was closed.
            timeout = aiohttp.ClientTimeout(total=self.timeout)
            session = aiohttp.ClientSession(timeout=timeout)
            self._session = session
        try:
            async with session.post(url, json=payload, headers=headers) as resp:
                # content_type=None: accept any Content-Type when decoding JSON.
                data = await resp.json(content_type=None)
                logger.debug(f"TTS 响应: status={resp.status} request_id={data.get('request_id', '')}")
                if resp.status == 200:
                    return data
                logger.warning(f"TTS 请求失败: {resp.status}, {data}")
                return None
        except Exception as e:
            logger.warning(f"TTS 请求异常: {e}")
            return None

    def _build_alt_url(self) -> str:
        """Build the alternate endpoint URL ("" when unset); absolute endpoints win."""
        endpoint = (self._alt_endpoint or "").strip()
        if not endpoint:
            return ""
        if endpoint.startswith("http://") or endpoint.startswith("https://"):
            return endpoint
        return f"{self.api_base_url.rstrip('/')}/{endpoint.lstrip('/')}"

    def _get_audio_info(self, response: dict) -> tuple[str, str]:
        """Extract (audio_url, base64_audio_data) from a TTS response, "" if absent."""
        output = (response or {}).get("output") or {}
        audio = output.get("audio") or {}
        audio_url = audio.get("url") or ""
        audio_data = audio.get("data") or ""
        return audio_url, audio_data

    def _guess_extension(self, url: str, content_type: str = "") -> str:
        """Guess an audio file extension from the URL path, then Content-Type; default .wav."""
        suffix = Path(urlparse(url).path).suffix
        if suffix:
            return suffix
        content_type = (content_type or "").lower()
        if "wav" in content_type:
            return ".wav"
        if "mpeg" in content_type or "mp3" in content_type:
            return ".mp3"
        if "ogg" in content_type:
            return ".ogg"
        return ".wav"

    async def _download_audio(self, url: str) -> Path | None:
        """Download audio from `url` into the temp dir; return the path, or None on failure."""
        session = self._session
        if session is None or session.closed:
            timeout = aiohttp.ClientTimeout(total=self.timeout)
            session = aiohttp.ClientSession(timeout=timeout)
            self._session = session
        try:
            async with session.get(url) as resp:
                if resp.status != 200:
                    logger.warning(f"下载音频失败: {resp.status}")
                    return None
                content_type = resp.headers.get("Content-Type", "")
                suffix = self._guess_extension(url, content_type)
                file_path = self._temp_dir / f"tts_{uuid.uuid4().hex}{suffix}"
                audio_bytes = await resp.read()
                file_path.write_bytes(audio_bytes)
                logger.debug(f"下载音频完成: size={len(audio_bytes)} path={file_path}")
                return file_path
        except Exception as e:
            logger.warning(f"下载音频异常: {e}")
            return None

    async def _write_audio_bytes(self, data: bytes, suffix: str = ".wav") -> Path:
        """Write raw audio bytes to a unique temp file and return its path."""
        file_path = self._temp_dir / f"tts_{uuid.uuid4().hex}{suffix}"
        file_path.write_bytes(data)
        return file_path

    def _load_pysilk(self):
        """Load a silk encoder library: prefer pysilk, fall back to pilk.

        Returns (module, lib_name, error): module/lib_name are None on failure,
        with `error` holding the last import exception.
        """
        # Try pysilk first.
        try:
            import pysilk
            return pysilk, "pysilk", None
        except Exception:
            pass
        # Fall back to pilk (64-bit compatible).
        try:
            import pilk
            return pilk, "pilk", None
        except Exception as e:
            return None, None, e

    async def _convert_to_silk(self, input_path: Path) -> Path | None:
        """Convert an audio file to silk format for WeChat voice messages.

        Falls back to raw audio when no silk library is available and
        allow_raw_audio is set. Returns the output path or None on failure.
        """
        suffix = input_path.suffix.lower()
        if suffix == ".silk":
            return input_path
        silk_lib, lib_name, err = self._load_pysilk()
        if not silk_lib:
            import sys
            if self.allow_raw_audio:
                raw_path = await self._convert_to_raw(input_path)
                if raw_path:
                    logger.warning(
                        f"缺少 silk 编码库,使用 raw 音频发送: {raw_path} | python={sys.executable}"
                    )
                    return raw_path
            logger.warning(f"缺少 silk 编码库(pysilk/pilk),无法转换: {err} | python={sys.executable}")
            return None
        source_path = input_path
        if suffix != ".wav":
            converted = await self._convert_to_wav(input_path)
            if not converted:
                logger.warning(f"不支持的音频格式: {suffix}")
                return None
            source_path = converted
        silk_path = source_path.with_suffix(".silk")
        # pilk uses a file-path API and needs properly formatted PCM first.
        if lib_name == "pilk":
            try:
                import wave
                # Inspect the WAV header (for diagnostics only).
                with wave.open(str(source_path), "rb") as wf:
                    sample_rate = wf.getframerate()
                    channels = wf.getnchannels()
                    logger.debug(f"WAV 信息: sample_rate={sample_rate} channels={channels}")
                # pilk needs mono PCM; resample to 16000 Hz mono via ffmpeg.
                converted_wav = await self._convert_to_wav_16k(source_path)
                if not converted_wav:
                    logger.warning("转换 WAV 到 16kHz 失败")
                    return None
                duration = await asyncio.to_thread(
                    silk_lib.encode,
                    str(converted_wav),
                    str(silk_path),
                    pcm_rate=16000,
                    tencent=True,
                )
                logger.debug(f"pilk 编码完成: duration={duration}ms")
                # Remove the intermediate 16k WAV.
                if converted_wav != source_path and converted_wav.exists():
                    converted_wav.unlink()
                return silk_path
            except Exception as e:
                logger.warning(f"pilk 编码失败: {e}")
                return None
        # pysilk consumes PCM bytes directly.
        try:
            import wave
            import audioop  # NOTE(review): audioop is removed in Python 3.13 — confirm runtime version
            with wave.open(str(source_path), "rb") as wf:
                sample_rate = wf.getframerate()
                channels = wf.getnchannels()
                sample_width = wf.getsampwidth()
                pcm = wf.readframes(wf.getnframes())
            if channels > 1:
                # Downmix to mono with equal channel weights.
                pcm = audioop.tomono(pcm, sample_width, 0.5, 0.5)
            silk_bytes = await silk_lib.async_encode(
                pcm,
                data_rate=sample_rate,
                sample_rate=sample_rate,
            )
            silk_path.write_bytes(silk_bytes)
            return silk_path
        except Exception as e:
            logger.warning(f"pysilk 编码失败: {e}")
            return None

    async def _convert_to_wav(self, input_path: Path) -> Path | None:
        """Convert any audio file to 16 kHz mono WAV with ffmpeg; None on failure."""
        import shutil
        import subprocess
        ffmpeg = shutil.which("ffmpeg")
        if not ffmpeg:
            return None
        output_path = input_path.with_suffix(".wav")
        cmd = [
            ffmpeg,
            "-y",
            "-i", str(input_path),
            "-ac", "1",
            "-ar", "16000",
            str(output_path),
        ]
        try:
            result = await asyncio.to_thread(
                subprocess.run,
                cmd,
                capture_output=True,
                text=True,
            )
            if result.returncode != 0:
                logger.warning(f"ffmpeg 转换失败: {result.stderr}")
                return None
            return output_path
        except Exception as e:
            logger.warning(f"ffmpeg 转换异常: {e}")
            return None

    async def _convert_to_wav_16k(self, input_path: Path) -> Path | None:
        """Convert audio to 16 kHz mono PCM WAV (required by pilk); None on failure."""
        import shutil
        import subprocess
        ffmpeg = shutil.which("ffmpeg")
        if not ffmpeg:
            logger.warning("未找到 ffmpeg,无法转换音频采样率")
            return None
        output_path = input_path.parent / f"{input_path.stem}_16k.wav"
        cmd = [
            ffmpeg,
            "-y",
            "-i", str(input_path),
            "-ac", "1",
            "-ar", "16000",
            "-acodec", "pcm_s16le",
            str(output_path),
        ]
        try:
            result = await asyncio.to_thread(
                subprocess.run,
                cmd,
                capture_output=True,
                text=True,
            )
            if result.returncode != 0:
                logger.warning(f"ffmpeg 转换 16k 失败: {result.stderr}")
                return None
            logger.debug(f"转换为 16kHz WAV: {output_path}")
            return output_path
        except Exception as e:
            logger.warning(f"ffmpeg 转换 16k 异常: {e}")
            return None

    async def _convert_to_raw(self, input_path: Path) -> Path | None:
        """Convert to the configured raw fallback format (wav or amr)."""
        if self.raw_audio_format == "wav":
            if input_path.suffix.lower() == ".wav":
                return input_path
            return await self._convert_to_wav(input_path)
        if self.raw_audio_format == "amr":
            # AMR-NB requires an 8 kHz sample rate.
            return await self._convert_with_ffmpeg(input_path, ".amr", sample_rate=8000)
        return await self._convert_to_wav(input_path)

    async def _convert_with_ffmpeg(self, input_path: Path, suffix: str, sample_rate: int = 16000) -> Path | None:
        """Transcode `input_path` to `suffix` (mono, `sample_rate` Hz) via ffmpeg."""
        import shutil
        import subprocess
        ffmpeg = shutil.which("ffmpeg")
        if not ffmpeg:
            logger.warning("未找到 ffmpeg,无法转码")
            return None
        output_path = input_path.with_suffix(suffix)
        cmd = [
            ffmpeg,
            "-y",
            "-i", str(input_path),
            "-ac", "1",
            "-ar", str(sample_rate),
            str(output_path),
        ]
        try:
            result = await asyncio.to_thread(
                subprocess.run,
                cmd,
                capture_output=True,
                text=True,
            )
            if result.returncode != 0:
                logger.warning(f"ffmpeg 转换失败: {result.stderr}")
                return None
            return output_path
        except Exception as e:
            logger.warning(f"ffmpeg 转换异常: {e}")
            return None

    async def _synthesize_to_silk(self, text: str, voice: str) -> tuple[Path | None, list[Path]]:
        """Synthesize `text` with `voice` and convert the result to silk.

        Returns (silk_path_or_None, cleanup_paths) — the caller is responsible
        for deleting every path in cleanup_paths after sending.
        """
        cleanup_paths = []
        text = self._truncate_text(text)
        if not text:
            return None, cleanup_paths
        response = await self._request_tts(text, voice)
        if not response:
            return None, cleanup_paths
        audio_url, audio_data = self._get_audio_info(response)
        logger.debug(f"音频信息: url={audio_url[:80] if audio_url else ''!r} data_len={len(audio_data) if audio_data else 0}")
        if audio_url:
            audio_path = await self._download_audio(audio_url)
        elif audio_data:
            try:
                raw = base64.b64decode(audio_data)
                audio_path = await self._write_audio_bytes(raw)
            except Exception as e:
                logger.warning(f"解码音频失败: {e}")
                return None, cleanup_paths
        else:
            logger.warning(f"未获取到音频数据: {response}")
            return None, cleanup_paths
        if not audio_path:
            return None, cleanup_paths
        if audio_path.exists():
            cleanup_paths.append(audio_path)
        if audio_path.suffix.lower() != ".wav":
            # A WAV intermediate may be produced during conversion; schedule it too.
            cleanup_paths.append(audio_path.with_suffix(".wav"))
        silk_path = await self._convert_to_silk(audio_path)
        if not silk_path:
            return None, cleanup_paths
        return silk_path, cleanup_paths

    async def _send_voice(self, bot: WechatHookClient, to_wxid: str, silk_path: Path) -> bool:
        """Send a silk voice file via the bot; returns True on success."""
        try:
            ok = await bot.http_client.send_voice(to_wxid, str(silk_path))
            return ok
        except Exception as e:
            logger.warning(f"发送语音失败: {e}")
            return False

    async def _speak(self, bot: WechatHookClient, to_wxid: str, text: str, voice: str, silent: bool = False) -> bool:
        """Synthesize `text` and send it as a voice message to `to_wxid`.

        When `silent` is True, failures are not reported back to the chat.
        Temp files are always cleaned up. Returns True if the voice was sent.
        """
        if not self.master_enabled:
            if not silent:
                await bot.send_text(to_wxid, "⚠️ VoiceSynth 总开关已关闭")
            return False
        silk_path = None
        cleanup_paths = []
        try:
            silk_path, cleanup_paths = await self._synthesize_to_silk(text, voice)
            if not silk_path:
                if not silent:
                    await bot.send_text(to_wxid, "❌ 语音生成失败")
                return False
            ok = await self._send_voice(bot, to_wxid, silk_path)
            if not ok and not silent:
                await bot.send_text(to_wxid, "❌ 语音发送失败")
            return ok
        finally:
            if silk_path:
                try:
                    if silk_path.exists():
                        silk_path.unlink()
                except Exception:
                    pass
            for path in cleanup_paths:
                try:
                    if path.exists():
                        path.unlink()
                except Exception:
                    pass

    async def maybe_send_voice_reply(self, bot: WechatHookClient, to_wxid: str, text: str, message: dict | None = None):
        """After an AI text reply, possibly send a voice version (probability-gated)."""
        if not self.enabled:
            return
        if not self.master_enabled:
            return
        if self.ai_voice_probability <= 0:
            return
        if message and not self.enable_auto_reply_voice:
            # Skip messages produced by the auto-reply pipeline when disabled.
            if message.get("_auto_reply_triggered") or message.get("_auto_reply_context"):
                return
        if random.random() > self.ai_voice_probability:
            return
        is_group = False
        if message:
            is_group = bool(message.get("IsGroup", False))
        else:
            # No message context: infer group chats from the wxid suffix.
            is_group = to_wxid.endswith("@chatroom")
        if is_group and not self.enable_group:
            return
        if not is_group and not self.enable_private:
            return
        voice_code = self._get_chat_voice(to_wxid)
        if not voice_code:
            return
        await self._speak(bot, to_wxid, text, voice_code, silent=True)

    @on_text_message(priority=70)
    async def handle_voice_command(self, bot: WechatHookClient, message: dict):
        """Handle voice-related chat commands.

        Returns False to stop further message processing (command consumed),
        True to let other handlers run.
        """
        content = message.get("Content", "").strip()
        from_wxid = message.get("FromWxid", "")
        is_group = message.get("IsGroup", False)
        if content == "/语音开":
            self.master_enabled = True
            ok = self._save_master_enabled(True)
            if ok:
                await bot.send_text(from_wxid, "✅ VoiceSynth 总开关已开启")
            else:
                await bot.send_text(from_wxid, "⚠️ VoiceSynth 已开启,但写入配置失败")
            return False
        if content == "/语音关":
            self.master_enabled = False
            ok = self._save_master_enabled(False)
            if ok:
                await bot.send_text(from_wxid, "✅ VoiceSynth 总开关已关闭")
            else:
                await bot.send_text(from_wxid, "⚠️ VoiceSynth 已关闭,但写入配置失败")
            return False
        if content == "/语音状态":
            current_voice = self._get_chat_voice(from_wxid)
            current_voice_name = self.voice_map.get(current_voice, current_voice) if current_voice else "未配置"
            lines = [
                "🎙️ VoiceSynth 状态",
                f"总开关: {'开启' if self.master_enabled else '关闭'}",
                f"群聊可用: {'是' if self.enable_group else '否'}",
                f"私聊可用: {'是' if self.enable_private else '否'}",
                f"AI回复语音概率: {self.ai_voice_probability:.2f}",
                f"AutoReply语音: {'开启' if self.enable_auto_reply_voice else '关闭'}",
                f"当前会话音色: {current_voice_name} ({current_voice or '-'})",
            ]
            await bot.send_text(from_wxid, "\n".join(lines))
            return False
        if not self.master_enabled:
            # Master switch off: reject voice commands, pass everything else through.
            if content == "/音色列表" or content.startswith("/切换音色") or content.startswith("/echo"):
                await bot.send_text(from_wxid, "⚠️ VoiceSynth 总开关已关闭")
                return False
            return True
        if is_group and not self.enable_group:
            return True
        if not is_group and not self.enable_private:
            return True
        if content == "/音色列表":
            if not self.voice_map:
                await bot.send_text(from_wxid, "❌ 未配置可用音色")
                return False
            current = self._get_chat_voice(from_wxid)
            lines = ["可用音色:"]
            for code, name in self.voice_map.items():
                # "*" marks the voice currently selected for this chat.
                marker = "*" if code == current else "-"
                lines.append(f"{marker} {name} ({code})")
            lines.append(f"当前音色: {self.voice_map.get(current, current)} ({current})")
            lines.append("切换: /切换音色 音色代码")
            await bot.send_text(from_wxid, "\n".join(lines))
            return False
        if content.startswith("/切换音色"):
            voice_key = content[len("/切换音色"):].strip()
            if not voice_key:
                await bot.send_text(from_wxid, "❌ 用法: /切换音色 音色代码")
                return False
            voice_code = self._resolve_voice(voice_key)
            if not voice_code:
                await bot.send_text(from_wxid, "❌ 未找到该音色")
                return False
            self._set_chat_voice(from_wxid, voice_code)
            display_name = self.voice_map.get(voice_code, voice_code)
            await bot.send_text(from_wxid, f"✅ 已切换音色: {display_name} ({voice_code})")
            return False
        if content.startswith("/echo"):
            text = content[len("/echo"):].strip()
            if not text:
                await bot.send_text(from_wxid, "❌ 用法: /echo 需要朗读的内容")
                return False
            voice_code = self._get_chat_voice(from_wxid)
            if not voice_code:
                await bot.send_text(from_wxid, "❌ 未配置音色")
                return False
            await self._speak(bot, from_wxid, text, voice_code, silent=False)
            return False
        return True