$ git clone https://github.com/andimarafioti/faster-qwen3-tts.git
$ cd faster-qwen3-tts/
$ uv venv
$ source .venv/bin/activate
$ uv pip install -e .
$ python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'; print(f'PyTorch {torch.__version__}, CUDA {torch.version.cuda}, GPU: {torch.cuda.get_device_name(0)}')" || {
echo ""
echo "WARNING: CUDA not available. You may need to install a CUDA-enabled PyTorch wheel."
echo " uv pip install torch --index-url https://download.pytorch.org/whl/cu128 --python .venv/bin/python"
echo " Note: RTX 50xx / Blackwell GPUs need CUDA 12.8 wheels (PyTorch 2.7+)."
}
$ MAX_JOBS=6 uv pip install flash-attn --no-build-isolation
$ python3 -c "import flash_attn; print(f'Flash Attention version: {flash_attn.__version__}')"
$ uv pip install sounddevice
$ faster-qwen3-tts clone \
--model /mnt/480SSD/models/Qwen3-TTS-12Hz-1.7B-Base \
--text "你好,這是中英文語音 Zero-shot TTS 測試" \
--language English \
--ref-audio ref_audio.wav \
--ref-text "I'm confused why some people have super short timelines, yet at the same time are bullish on scaling up reinforcement learning atop LLMs. If we're actually close to a human-like learner, then this whole approach of training on verifiable outcomes is doomed." \
--output out.wav
$ faster-qwen3-tts custom --model /mnt/480SSD/models/Qwen3-TTS-12Hz-1.7B-CustomVoice --list-speakers --text aaa --output out.wav
$ faster-qwen3-tts custom \
--model /mnt/480SSD/models/Qwen3-TTS-12Hz-1.7B-CustomVoice \
--speaker aiden \
--text "你好,這是中英文語音 Zero-shot TTS 測試" \
--language English \
--output out.wav
$ uv pip install -e ".[demo]"
$ uv pip install nano_parakeet
$ python demo/server.py --model /mnt/480SSD/models/Qwen3-TTS-12Hz-1.7B-Base
# Once the server is running, open http://localhost:7860 in a browser.
$ uv pip install "faster-qwen3-tts[demo]"
$ ffmpeg -i /home/spark/DiskD/audio_llm/audio_openai/audio_openai/audio_files/6551d5b0-e35f-43ee-b262-79c55ad548ea.webm \
-vn -acodec pcm_s16le -ar 44100 -ac 2 examples/hard_way.wav
$ python examples/openai_server.py \
--model /mnt/480SSD/models/Qwen3-TTS-12Hz-1.7B-Base \
--ref-audio examples/hard_way.wav \
--ref-text "行路難,行路難,多歧路,今安在?長風破浪會有時,直掛雲帆濟滄海。" \
--language English --port 8102
$ curl http://localhost:8102/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{"model": "tts-1", "input": "你好,這是中英文語音 Zero-shot TTS 測試", "voice": "alloy", "response_format": "wav"}' \
--output speech.wav
沒有留言:
張貼留言