
Friday, February 6, 2026

Using Breeze-ASR-25 on DGX Spark

Reference: https://huggingface.co/MediaTek-Research/Breeze-ASR-25

$ uv init breeze-asr
$ cd breeze-asr/
$ rm .python-version
# Add the following to pyproject.toml so that torch and the related packages install correctly
$ vi pyproject.toml
[[tool.uv.index]]
name = "pytorch-cu130"
url = "https://download.pytorch.org/whl/cu130"
explicit = true  # key: this stops ordinary packages from being resolved against the PyTorch index

[tool.uv.sources]
torch = { index = "pytorch-cu130" }
torchaudio = { index = "pytorch-cu130" }
torchvision = { index = "pytorch-cu130" }
torchcodec = { index = "pytorch-cu130" }

$ uv venv --python 3.13
$ source .venv/bin/activate
$ uv pip install -e .

# Installation order matters: torch must be installed before datasets[audio]
$ uv add torch torchaudio torchcodec
$ uv add transformers
$ uv add "datasets[audio]"
$ uv add accelerate
$ sudo apt update
$ sudo apt install -y ffmpeg libavutil-dev
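
A quick sanity check, not from the original post: the wheels from the cu130 index should report a +cu130 version suffix, and the GPU should be visible from torch.

import torch

# sanity check (assumption: the cu130 wheels tag their version with "+cu130")
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no GPU")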

# Running the example from the model card as-is fails with the error below
# (it appears to come from the datasets audio-decoding path); both Google AI
# and ChatGPT suggested calling the model directly instead
KeyError: 'num_frames'

# With the following code the output has no segmentation; everything is merged into one block
result = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True
)[0]
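
As an alternative to the manual parsing loop in the final program, the tokenizer can extract the timestamps itself: WhisperTokenizer.decode accepts output_offsets=True and returns per-segment text with (start, end) times. A minimal sketch, assuming outputs comes from a generate() call made with return_timestamps=True as in the program below:

# alternative sketch: let the tokenizer compute the segment offsets
decoded = processor.tokenizer.decode(
    outputs["sequences"][0],
    skip_special_tokens=True,
    output_offsets=True,  # returns {"text": ..., "offsets": [...]}
)
for seg in decoded["offsets"]:
    start, end = seg["timestamp"]
    print(f"{start:.2f}s - {end:.2f}s\t{seg['text']}")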

# The final program is below; it produces output like the following
===== TEXT + TIME =====
0.00s 所說相生這種東西人靠一張嘴
3.28s 通過語言的結構把看官聽眾吸引到故事裡面
7.72s 在演出的時候
9.04s 要求你身上的每個動作都必須要有含義

import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

audio_path = "/home/spark/DiskD/audio_llm/breeze-asr/output.wav"

# load audio
waveform, sr = torchaudio.load(audio_path)
waveform = waveform.mean(dim=0)

if sr != 16000:
    waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

# load model
processor = WhisperProcessor.from_pretrained("/mnt/models/Breeze-ASR-25")
model = WhisperForConditionalGeneration.from_pretrained(
    "/mnt/models/Breeze-ASR-25"
).to("cuda")
model.eval()

# preprocess
inputs = processor(
    waveform,
    sampling_rate=16000,
    return_tensors="pt"
)

# inference
with torch.no_grad():
    outputs = model.generate(
        inputs.input_features.to("cuda"),
        return_timestamps=True,
        return_dict_in_generate=True,
        output_scores=True,
    )

# decode tokens into timestamped segments
token_ids = outputs["sequences"][0].tolist()

# Whisper timestamp token setup
timestamp_begin = processor.tokenizer.convert_tokens_to_ids("<|0.00|>")
time_precision = 0.02  # Whisper: 20ms
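# note: Whisper assigns the timestamp tokens <|0.00|> ... <|30.00|> the highest
# ids in the vocabulary, so any id >= timestamp_begin is a timestamp token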

current_time = None
buffer = []

print("===== TEXT + TIME =====")

for tid in token_ids:
    # timestamp token
    if tid >= timestamp_begin:
# emit the previous segment first
        if buffer and current_time is not None:
            text = processor.tokenizer.decode(
                buffer,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            if text.strip():
                print(f"{current_time:.2f}s\t{text}")

# update the segment start time
        current_time = (tid - timestamp_begin) * time_precision
        buffer = []
    else:
        buffer.append(tid)

# flush the final segment
if buffer and current_time is not None:
    text = processor.tokenizer.decode(
        buffer,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )
    if text.strip():
        print(f"{current_time:.2f}s\t{text}")
