diff --git a/plugins/ai_auto_response/core/reply_formatter.py b/plugins/ai_auto_response/core/reply_formatter.py index a0bf84b..0570268 100644 --- a/plugins/ai_auto_response/core/reply_formatter.py +++ b/plugins/ai_auto_response/core/reply_formatter.py @@ -12,11 +12,11 @@ def finalize_reply(response: str, reply_mode: str) -> List[str]: text = text.replace("\n", " ").strip() if reply_mode == "social_short": - return split_reply_chunks(text, sentence_limit=2, char_limit=24, chunk_limit=2) + return split_reply_chunks(text, sentence_limit=2, char_limit=24, chunk_limit=2, allow_clip_split=False) if reply_mode == "qa_fast": - return split_reply_chunks(text, sentence_limit=2, char_limit=32, chunk_limit=2) + return split_reply_chunks(text, sentence_limit=2, char_limit=32, chunk_limit=2, allow_clip_split=False) if reply_mode == "qa_with_context": - return split_reply_chunks(text, sentence_limit=2, char_limit=40, chunk_limit=2) + return split_reply_chunks(text, sentence_limit=2, char_limit=40, chunk_limit=2, allow_clip_split=False) return [take_first_sentence(text, 28).strip()] @@ -45,14 +45,22 @@ def take_first_sentence(text: str, limit: int) -> str: return smart_clip(first, limit) -def split_reply_chunks(text: str, sentence_limit: int, char_limit: int, chunk_limit: int) -> List[str]: +def split_reply_chunks( + text: str, + sentence_limit: int, + char_limit: int, + chunk_limit: int, + allow_clip_split: bool = True, +) -> List[str]: parts = [item.strip() for item in re.split(r"(?<=[。!?!?;;])", text) if item.strip()] if not parts: short = text.strip() clipped = smart_clip(short, char_limit) - remainder = short[len(clipped):].strip(",,、;;:: ") if not short: return [] + if not allow_clip_split: + return [clipped] if clipped else [] + remainder = short[len(clipped):].strip(",,、;;:: ") return [item for item in [clipped, smart_clip(remainder, char_limit)] if item][:chunk_limit] chunks: List[str] = [] @@ -67,6 +75,8 @@ def split_reply_chunks(text: str, sentence_limit: int, char_limit: int, chunk_li clipped = current[:char_limit].rstrip(",,、;;:: ").strip() if clipped: chunks.append(clipped) + if not allow_clip_split: + break current = current[len(clipped):].strip(",,、;;:: ") return chunks[:chunk_limit] or [smart_clip(text, char_limit)] diff --git a/test/hifi_clone.wav b/test/hifi_clone.wav new file mode 100644 index 0000000..c523afe Binary files /dev/null and b/test/hifi_clone.wav differ diff --git a/test/lzl.mp3 b/test/lzl.mp3 new file mode 100644 index 0000000..a7c0a88 Binary files /dev/null and b/test/lzl.mp3 differ diff --git a/test/lzl.wav b/test/lzl.wav new file mode 100644 index 0000000..45d8073 Binary files /dev/null and b/test/lzl.wav differ diff --git a/test/lzl_prompt_cache.pt b/test/lzl_prompt_cache.pt new file mode 100644 index 0000000..31b786a Binary files /dev/null and b/test/lzl_prompt_cache.pt differ diff --git a/test/voice_design.wav b/test/voice_design.wav new file mode 100644 index 0000000..4213f57 Binary files /dev/null and b/test/voice_design.wav differ diff --git a/test/voxcpm_test.py b/test/voxcpm_test.py new file mode 100644 index 0000000..d2f8c94 --- /dev/null +++ b/test/voxcpm_test.py @@ -0,0 +1,67 @@ +from voxcpm import VoxCPM +import soundfile as sf +import torch +from pathlib import Path +import sys + +BASE_DIR = Path(__file__).resolve().parent +DEVICE = "cuda" +PROMPT_WAV = BASE_DIR / "lzl.wav" +CACHE_PATH = BASE_DIR / "lzl_prompt_cache.pt" +VOICE_DESIGN_PATH = BASE_DIR / "voice_design.wav" +HIFI_CLONE_PATH = BASE_DIR / "hifi_clone.wav" +PROMPT_TEXT = "亲爱的,今天的你要出发,挣钱喽,都没有系好安全带。 乖乖仔,我要到了,奇怪还没有分开就开始想你了,注意一下,闯红灯拍照,我可不喜欢明知孤犯的小坏蛋,安全带系一好,我们这边要出发喽。 小坏蛋,前方有限速拍照,姐姐给你盯着呢,车速太快了啊,慢一点慢一点降下来,不要让我害怕好吗? 过最堵的路段,千万不要着急,姐姐会一直陪着你。 今日导航就先到这里了,哥哥注意安全停车哦。" +TARGET_TEXT = "慢慢来吧,额度还在就好~" + +if DEVICE != "cuda": + raise RuntimeError(f"Unsupported device: {DEVICE}. This script only supports CUDA.") + +if not torch.cuda.is_available(): + raise RuntimeError( + "This script requires CUDA. " + f"Current python: {sys.executable}, torch: {torch.__version__}. " + "Please run it with a CUDA-enabled Python environment." + ) + +model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False) +if model.tts_model.device != DEVICE: + raise RuntimeError(f"Expected VoxCPM to run on {DEVICE}, got {model.tts_model.device}") +print(f"VoxCPM loaded on {model.tts_model.device} with torch {torch.__version__}") + +wav = model.generate( + text="(A young woman, gentle and sweet voice)Hello, welcome to VoxCPM!", + cfg_value=2.0, + inference_timesteps=10, +) +sf.write(str(VOICE_DESIGN_PATH), wav, model.tts_model.sample_rate) +print(f"Saved voice design to {VOICE_DESIGN_PATH}") + +# Build and persist the prompt cache so later runs can skip prompt encoding. +if CACHE_PATH.exists(): + prompt_cache = torch.load(CACHE_PATH, map_location="cpu") + print(f"Loaded prompt cache from {CACHE_PATH}") +else: + prompt_cache = model.tts_model.build_prompt_cache( + prompt_wav_path=str(PROMPT_WAV), + prompt_text=PROMPT_TEXT, + reference_wav_path=str(PROMPT_WAV), + ) + torch.save(prompt_cache, CACHE_PATH) + print(f"Built and saved prompt cache to {CACHE_PATH}") + +cache_devices = { + key: str(value.device) + for key, value in prompt_cache.items() + if isinstance(value, torch.Tensor) +} +print(f"Prompt cache tensor devices: {cache_devices}") + +wav, _, _ = model.tts_model.generate_with_prompt_cache( + target_text=TARGET_TEXT, + prompt_cache=prompt_cache, + cfg_value=2.0, + inference_timesteps=10, +) +wav = wav.detach().cpu().float().squeeze(0).numpy() +sf.write(str(HIFI_CLONE_PATH), wav, model.tts_model.sample_rate) +print(f"Saved cloned audio to {HIFI_CLONE_PATH}")