Reference: https://huggingface.co/Qwen/Qwen3-ASR-1.7B
$ export HF_TOKEN=<your Hugging Face access token>
$ hf download Qwen/Qwen3-ASR-1.7B --local-dir Qwen3-ASR-1.7B
$ hf download Qwen/Qwen3-ASR-0.6B --local-dir Qwen3-ASR-0.6B
$ uv init qwen3-asr
$ cd qwen3-asr/
$ rm .python-version
# See the cu130 wheel index for available versions: https://download.pytorch.org/whl/cu130/
$ uv venv --python 3.13
$ source .venv/bin/activate
$ uv pip install -e .
$ uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
$ uv pip install qwen-asr
$ uv pip uninstall torch torchvision torchaudio
$ uv pip install torch==2.9.1 --index-url https://download.pytorch.org/whl/cu130
$ uv pip install torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu130
$ uv pip install torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130
# Don't install all three packages with the single command below; doing so caused the later flash-attn build to fail
# uv pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130
# Installing flash-attn from the prebuilt wheels requires Python 3.10, which does not match this venv (Python 3.13)
# uv pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3%2Bcu130torch2.9-cp310-cp310-linux_aarch64.whl
# Build flash-attn from source instead; this requires torch 2.9.1
$ uv pip install numpy ninja packaging setuptools wheel
$ export TORCH_CUDA_ARCH_LIST="12.1"
$ export CUDA_HOME=/usr/local/cuda-13.0
$ FLASH_ATTENTION_FORCE_BUILD=TRUE MAX_JOBS=4 uv pip install flash-attn --no-build-isolation --no-cache-dir
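# Optionally verify that the source-built flash-attn imports cleanly against the pinned torch and that the GPU is visible:
$ python - <<'EOF'
import torch
import flash_attn

print("torch:", torch.__version__)               # expect 2.9.1+cu130
print("cuda available:", torch.cuda.is_available())
print("device:", torch.cuda.get_device_name(0))
print("flash-attn:", flash_attn.__version__)
EOF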
# Installing qwen-asr with the [vllm] extra fails, because the bundled vllm dependency pulls a CUDA 12 build
# git clone https://github.com/QwenLM/Qwen3-ASR.git
# uv pip install -e ./Qwen3-ASR[vllm] --no-build-isolation -v
# Install vllm directly instead
$ uv pip install https://github.com/vllm-project/vllm/releases/download/v0.14.0/vllm-0.14.0+cu130-cp38-abi3-manylinux_2_35_aarch64.whl
$ qwen-asr-serve /mnt/models/Qwen3-ASR-0.6B \
--allowed-local-media-path /home/spark/DiskD/audio_llm \
--gpu-memory-utilization 0.5 \
--host 0.0.0.0 --port 8000
$ curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-X POST \
-d '{
"messages": [
{
"role": "user",
"content": [
{
"type": "audio_url",
"audio_url": {
"url": "file:///home/spark/DiskD/audio_llm/breeze-asr/output.wav"
}
},
{
"type": "audio_url",
"audio_url": {
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"
}
}
]
}
]
}' | jq -r '.choices[0].message.content'
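# The same request can also be sent from Python; a minimal sketch using the requests package, with the same paths and URLs as the curl example above:
$ python - <<'EOF'
import requests

payload = {
    "messages": [{
        "role": "user",
        "content": [
            {"type": "audio_url",
             "audio_url": {"url": "file:///home/spark/DiskD/audio_llm/breeze-asr/output.wav"}},
            {"type": "audio_url",
             "audio_url": {"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"}},
        ],
    }]
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=300)
print(resp.json()["choices"][0]["message"]["content"])
EOF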
$ uvicorn test_c:app --host 0.0.0.0 --port 8000
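# test_c.py itself does not appear in this log. Below is a minimal sketch of what such a wrapper could look like, assuming qwen-asr-serve from above is kept running on a separate port (8001 here) and that uploads are written under the --allowed-local-media-path directory. The real app additionally returns the detected language and per-character time_stamps, which this sketch omits.

# test_c.py -- hypothetical sketch, not the original file
# Requires: fastapi, uvicorn, requests, python-multipart
from pathlib import Path
from typing import Optional

import requests
from fastapi import FastAPI, File, Form, UploadFile

app = FastAPI()

BACKEND = "http://localhost:8001/v1/chat/completions"    # assumed backend port for qwen-asr-serve
MEDIA_DIR = Path("/home/spark/DiskD/audio_llm/uploads")  # must sit under --allowed-local-media-path
MEDIA_DIR.mkdir(parents=True, exist_ok=True)

@app.post("/v1/audio/transcriptions")
async def transcribe(
    file: Optional[UploadFile] = File(None),
    file_url: Optional[str] = Form(None),
    model_name: str = Form("gpt-4o-mini-transcribe"),
    language: Optional[str] = Form(None),
):
    # Accept either a multipart upload or a remote file_url.
    if file is not None:
        local_path = MEDIA_DIR / file.filename
        local_path.write_bytes(await file.read())
        url = f"file://{local_path}"
    elif file_url is not None:
        url = file_url
    else:
        return {"error": "provide either 'file' or 'file_url'"}

    payload = {
        "messages": [{
            "role": "user",
            "content": [{"type": "audio_url", "audio_url": {"url": url}}],
        }]
    }
    resp = requests.post(BACKEND, json=payload, timeout=300)
    text = resp.json()["choices"][0]["message"]["content"]
    # The real test_c also reports the detected language and per-character time_stamps.
    return {"results": [{"language": language, "text": text}]}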
$ curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
-F "file=@/home/spark/DiskD/audio_llm/breeze-asr/output.wav" \
-F "model_name=gpt-4o-mini-transcribe" \
-F "language=zh" | jq
{
"results": [
{
"language": "Chinese",
"text": "说书相声这种东西,人靠一张嘴,通过语言的结构,把看官听众吸引到故事里面。在演出的时候,要求你身上的每个动作都必须要有含义。",
"time_stamps": {
"items": [
{
"text": "说",
"start_time": 0.08,
"end_time": 0.32
},
{
"text": "书",
"start_time": 0.32,
"end_time": 0.48
},
{
"text": "相",
"start_time": 0.48,
"end_time": 0.72
},
{
"text": "声",
"start_time": 0.72,
"end_time": 1.04
},
........
{
"text": "有",
"start_time": 11.52,
"end_time": 11.6
},
{
"text": "含",
"start_time": 11.6,
"end_time": 11.84
},
{
"text": "义",
"start_time": 11.84,
"end_time": 12.08
}
]
}
}
]
}
$ curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
-F "file_url=https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav" \
-F "model_name=gpt-4o-mini-transcribe" \
-F "language=en" | jq
$ curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
-F "file=@/home/spark/DiskD/audio_llm/breeze-asr/output.wav" \
-F "file_url=https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav" \
-F "model_name=gpt-4o-mini-transcribe" | jq