參考 https://huggingface.co/MediaTek-Research/Breeze-ASR-25
$ uv init breeze-asr
$ cd breeze-asr/
$ rm .python-version
# 在 pyproject.toml 增加下列設定,才可順利安裝 torch 等套件
$ vi pyproject.toml
[[tool.uv.index]]
name = "pytorch-cu130"
url = "https://download.pytorch.org/whl/cu130"
explicit = true # 關鍵:這會阻止一般套件跑去 PyTorch 倉庫找
[tool.uv.sources]
torch = { index = "pytorch-cu130" }
torchaudio = { index = "pytorch-cu130" }
torchvision = { index = "pytorch-cu130" }
torchcodec = { index = "pytorch-cu130" }
$ uv venv --python 3.13
$ source .venv/bin/activate
$ uv pip install -e .
# 安裝時有順序區別,torch 要比 datasets[audio] 先安裝
$ uv add torch torchaudio torchcodec
$ uv add transformers
$ uv add datasets[audio]
$ uv add accelerate
$ sudo apt update
$ sudo apt install -y ffmpeg libavutil-dev
# 照著文件執行會出現下列錯誤,google AI 和 chatgpt 都建議直接使用 model
KeyError: 'num_frames'
# 若使用下列程式碼,輸出的文字不會分段,全部混在一起
result = processor.batch_decode(
generated_ids,
skip_special_tokens=True
)[0]
# 下面為最後的程式,可產生如下的輸出
===== TEXT + TIME =====
0.00s 所說相生這種東西人靠一張嘴
3.28s 通過語言的結構把看官聽眾吸引到故事裡面
7.72s 在演出的時候
9.04s 要求你身上的每個動作都必須要有含義
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# ---------------------------------------------------------------------------
# Transcribe a WAV file with Breeze-ASR-25 (a Whisper fine-tune) and print
# each decoded segment prefixed with its start time in seconds.
# ---------------------------------------------------------------------------
AUDIO_PATH = "/home/spark/DiskD/audio_llm/breeze-asr/output.wav"
MODEL_PATH = "/mnt/models/Breeze-ASR-25"
TARGET_SR = 16000       # Whisper models expect 16 kHz mono input
TIME_PRECISION = 0.02   # Whisper timestamp tokens advance in 20 ms steps


def load_audio(path, target_sr=TARGET_SR):
    """Load *path* with torchaudio, downmix to mono, resample to *target_sr*.

    Returns a 1-D float waveform tensor at *target_sr* Hz.
    """
    waveform, sr = torchaudio.load(path)
    waveform = waveform.mean(dim=0)  # downmix: (channels, samples) -> (samples,)
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(sr, target_sr)(waveform)
    return waveform


def iter_segments(token_ids, decode_fn, timestamp_begin,
                  time_precision=TIME_PRECISION):
    """Yield ``(start_time_seconds, text)`` pairs from a Whisper token stream.

    Any token id >= *timestamp_begin* (the id of ``"<|0.00|>"``) is a
    timestamp marker.  The ids collected between two markers are decoded with
    *decode_fn* and attributed to the start time of their segment.  Segments
    that decode to only whitespace are skipped.
    """
    current_time = None
    buffer = []
    for tid in token_ids:
        if tid >= timestamp_begin:
            # A timestamp token closes the previous segment: flush it first.
            if buffer and current_time is not None:
                text = decode_fn(buffer)
                if text.strip():
                    yield current_time, text
            current_time = (tid - timestamp_begin) * time_precision
            buffer = []
        else:
            buffer.append(tid)
    # Flush the trailing segment (the stream may not end on a timestamp).
    if buffer and current_time is not None:
        text = decode_fn(buffer)
        if text.strip():
            yield current_time, text


def main():
    # Fall back to CPU so the script still runs on machines without CUDA
    # (the original hard-coded "cuda" crashed on CPU-only hosts).
    device = "cuda" if torch.cuda.is_available() else "cpu"

    waveform = load_audio(AUDIO_PATH)

    processor = WhisperProcessor.from_pretrained(MODEL_PATH)
    model = WhisperForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)
    model.eval()

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_features.to(device),
            return_timestamps=True,
            return_dict_in_generate=True,
            output_scores=True,
        )

    token_ids = outputs["sequences"][0].tolist()
    # Id of the first timestamp token; all timestamp ids sit above it.
    timestamp_begin = processor.tokenizer.convert_tokens_to_ids("<|0.00|>")

    def decode(ids):
        return processor.tokenizer.decode(
            ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

    print("===== TEXT + TIME =====")
    for start, text in iter_segments(token_ids, decode, timestamp_begin):
        print(f"{start:.2f}s\t{text}")


if __name__ == "__main__":
    main()
沒有留言:
張貼留言