diff --git a/test/hifi_clone.wav b/test/hifi_clone.wav deleted file mode 100644 index c523afe..0000000 Binary files a/test/hifi_clone.wav and /dev/null differ diff --git a/test/lzl.mp3 b/test/lzl.mp3 deleted file mode 100644 index a7c0a88..0000000 Binary files a/test/lzl.mp3 and /dev/null differ diff --git a/test/lzl.wav b/test/lzl.wav deleted file mode 100644 index 45d8073..0000000 Binary files a/test/lzl.wav and /dev/null differ diff --git a/test/lzl_prompt_cache.pt b/test/lzl_prompt_cache.pt deleted file mode 100644 index 31b786a..0000000 Binary files a/test/lzl_prompt_cache.pt and /dev/null differ diff --git a/test/voice_design.wav b/test/voice_design.wav deleted file mode 100644 index 4213f57..0000000 Binary files a/test/voice_design.wav and /dev/null differ diff --git a/test/voxcpm_test.py b/test/voxcpm_test.py deleted file mode 100644 index d2f8c94..0000000 --- a/test/voxcpm_test.py +++ /dev/null @@ -1,67 +0,0 @@ -from voxcpm import VoxCPM -import soundfile as sf -import torch -from pathlib import Path -import sys - -BASE_DIR = Path(__file__).resolve().parent -DEVICE = "cuda" -PROMPT_WAV = BASE_DIR / "lzl.wav" -CACHE_PATH = BASE_DIR / "lzl_prompt_cache.pt" -VOICE_DESIGN_PATH = BASE_DIR / "voice_design.wav" -HIFI_CLONE_PATH = BASE_DIR / "hifi_clone.wav" -PROMPT_TEXT = "亲爱的,今天的你要出发,挣钱喽,都没有系好安全带。 乖乖仔,我要到了,奇怪还没有分开就开始想你了,注意一下,闯红灯拍照,我可不喜欢明知孤犯的小坏蛋,安全带系一好,我们这边要出发喽。 小坏蛋,前方有限速拍照,姐姐给你盯着呢,车速太快了啊,慢一点慢一点降下来,不要让我害怕好吗? 过最堵的路段,千万不要着急,姐姐会一直陪着你。 今日导航就先到这里了,哥哥注意安全停车哦。" -TARGET_TEXT = "慢慢来吧,额度还在就好~" - -if DEVICE != "cuda": - raise RuntimeError(f"Unsupported device: {DEVICE}. This script only supports CUDA.") - -if not torch.cuda.is_available(): - raise RuntimeError( - "This script requires CUDA. " - f"Current python: {sys.executable}, torch: {torch.__version__}. " - "Please run it with a CUDA-enabled Python environment." - ) - -model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False) -if model.tts_model.device != DEVICE: - raise RuntimeError(f"Expected VoxCPM to run on {DEVICE}, got {model.tts_model.device}") -print(f"VoxCPM loaded on {model.tts_model.device} with torch {torch.__version__}") - -wav = model.generate( - text="(A young woman, gentle and sweet voice)Hello, welcome to VoxCPM!", - cfg_value=2.0, - inference_timesteps=10, -) -sf.write(str(VOICE_DESIGN_PATH), wav, model.tts_model.sample_rate) -print(f"Saved voice design to {VOICE_DESIGN_PATH}") - -# Build and persist the prompt cache so later runs can skip prompt encoding. -if CACHE_PATH.exists(): - prompt_cache = torch.load(CACHE_PATH, map_location="cpu") - print(f"Loaded prompt cache from {CACHE_PATH}") -else: - prompt_cache = model.tts_model.build_prompt_cache( - prompt_wav_path=str(PROMPT_WAV), - prompt_text=PROMPT_TEXT, - reference_wav_path=str(PROMPT_WAV), - ) - torch.save(prompt_cache, CACHE_PATH) - print(f"Built and saved prompt cache to {CACHE_PATH}") - -cache_devices = { - key: str(value.device) - for key, value in prompt_cache.items() - if isinstance(value, torch.Tensor) -} -print(f"Prompt cache tensor devices: {cache_devices}") - -wav, _, _ = model.tts_model.generate_with_prompt_cache( - target_text=TARGET_TEXT, - prompt_cache=prompt_cache, - cfg_value=2.0, - inference_timesteps=10, -) -wav = wav.detach().cpu().float().squeeze(0).numpy() -sf.write(str(HIFI_CLONE_PATH), wav, model.tts_model.sample_rate) -print(f"Saved cloned audio to {HIFI_CLONE_PATH}")