
Monday, February 9, 2026

Installing piper TTS on DGX Spark

Reference: https://github.com/OHF-Voice/piper1-gpl/tree/main
Reference: https://huggingface.co/csukuangfj/vits-piper-zh_CN-huayan-medium
Reference: https://huggingface.co/csukuangfj/vits-piper-zh_CN-huayan-x_low

$ uv venv --python 3.13
$ source .venv/bin/activate
$ uv pip install piper-tts
$ uv pip install g2pw
$ uv pip install requests
$ uv pip install torch --index-url https://download.pytorch.org/whl/cu130
$ uv pip install unicode_rbnf
$ uv pip install sentence_stream
$ uv pip install fastapi
$ uv pip install uvicorn
$ uv pip install python-multipart


$ python3 -m piper.download_voices
$ python3 -m piper.download_voices zh_CN-huayan-x_low --download-dir models
$ ls models/
$ python3 -m piper.download_voices zh_CN-huayan-medium --download-dir models
$ python3 -m piper.download_voices zh_CN-chaowen-medium --download-dir models
$ python3 -m piper.download_voices zh_CN-xiao_ya-medium --download-dir models
$ python3 -m piper.download_voices en_US-lessac-medium --download-dir models

# for voice.synthesize: curl succeeds, but the open-webui test fails
$ curl -X POST http://127.0.0.1:8100/v1/audio/speech \
     -H "Content-Type: application/json" \
     -d '{"input": "你好,這是一段測試語音。"}' \
     --output output.pcm
$ ffmpeg -f s16le -ar 16000 -ac 1 -i output.pcm \
       -codec:a libmp3lame -b:a 128k output.mp3

# voice.synthesize_wav: curl succeeds, and the open-webui test also succeeds
$ curl -X POST http://127.0.0.1:8100/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{"input":"你好,這是 synthesize_wav 測試"}' \
  --output output.wav
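
The two curl tests above talk to a small self-written OpenAI-style /v1/audio/speech server on port 8100 whose code is not recorded here. Below is a minimal sketch of the synthesize_wav variant (the module name tts_server, the model path, and every request field other than "input" are my own assumptions), run with: uvicorn tts_server:app --host 0.0.0.0 --port 8100

import io
import wave

from fastapi import FastAPI
from fastapi.responses import Response
from pydantic import BaseModel
from piper import PiperVoice

app = FastAPI()
# assumed path: one of the voices downloaded into models/ above
voice = PiperVoice.load("models/zh_CN-huayan-medium.onnx")

class SpeechRequest(BaseModel):
    input: str                  # only "input" is sent by the curl tests above
    model: str | None = None    # accepted for OpenAI compatibility, otherwise ignored
    voice: str | None = None

@app.post("/v1/audio/speech")
def speech(req: SpeechRequest):
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wav_file:
        voice.synthesize_wav(req.input, wav_file)   # writes a complete WAV container
    # returning a full WAV (rather than the raw PCM from voice.synthesize) is the
    # variant that also worked from open-webui
    return Response(content=buf.getvalue(), media_type="audio/wav")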

Friday, February 6, 2026

Using Qwen3-ASR-1.7B on DGX Spark

Reference: https://huggingface.co/Qwen/Qwen3-ASR-1.7B

$ export HF_TOKEN=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
$ hf download Qwen/Qwen3-ASR-1.7B --local-dir Qwen3-ASR-1.7B
$ hf download Qwen/Qwen3-ASR-0.6B --local-dir Qwen3-ASR-0.6B

$ uv init qwen3-asr
$ cd qwen3-asr/
$ rm .python-version
# cu130 wheel versions are listed at https://download.pytorch.org/whl/cu130/
$ uv venv --python 3.13
$ source .venv/bin/activate
$ uv pip install -e .
$ uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
$ uv pip install qwen-asr
$ uv pip uninstall torch torchvision torchaudio
$ uv pip install torch==2.9.1 --index-url https://download.pytorch.org/whl/cu130
$ uv pip install torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu130
$ uv pip install torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130
# Do not install them all with the single command below; it makes the later flash-attn install fail
# uv pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130

# Installing flash-attn from a prebuilt wheel requires Python 3.10:
# uv pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3%2Bcu130torch2.9-cp310-cp310-linux_aarch64.whl
# Building flash-attn from source instead requires torch 2.9.1
$ uv pip install numpy ninja packaging setuptools wheel
$ export TORCH_CUDA_ARCH_LIST="12.1"
$ export CUDA_HOME=/usr/local/cuda-13.0
$ FLASH_ATTENTION_FORCE_BUILD=TRUE MAX_JOBS=4 uv pip install flash-attn --no-build-isolation --no-cache-dir
# The build is very memory-hungry, so it often fails with Out of memory; look for "Killed" in the long build log,
# or confirm with the two commands below
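# (the two commands were not recorded here; typical OOM-kill checks, my assumption, are:)
$ sudo dmesg | grep -iE "out of memory|killed process"
$ journalctl -k | grep -iE "out of memory|killed process"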

# Installing qwen-asr from the repo fails because its bundled vllm dependency looks for a CUDA 12 build
# git clone https://github.com/QwenLM/Qwen3-ASR.git
# uv pip install -e ./Qwen3-ASR[vllm] --no-build-isolation -v
# Install vllm directly instead
$ uv pip install https://github.com/vllm-project/vllm/releases/download/v0.14.0/vllm-0.14.0+cu130-cp38-abi3-manylinux_2_35_aarch64.whl

$ qwen-asr-serve /mnt/models/Qwen3-ASR-0.6B \
  --allowed-local-media-path /home/spark/DiskD/audio_llm \
  --gpu-memory-utilization 0.5 \
  --host 0.0.0.0 --port 8000
$ curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -X POST \
  -d '{
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "audio_url",
            "audio_url": {
              "url": "file:///home/spark/DiskD/audio_llm/breeze-asr/output.wav"
            }
          },
          {
            "type": "audio_url",
            "audio_url": {
              "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"
            }
          }
        ]
      }
    ]
  }' | jq -r '.choices[0].message.content'
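
The same chat-completions call can be made from Python with requests (a sketch; it reuses the local file URL above and assumes qwen-asr-serve is still listening on port 8000):

import requests

payload = {
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio_url",
                    "audio_url": {"url": "file:///home/spark/DiskD/audio_llm/breeze-asr/output.wav"},
                }
            ],
        }
    ]
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=300)
print(resp.json()["choices"][0]["message"]["content"])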

$ uvicorn test_c:app --host 0.0.0.0 --port 8000
$ curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
  -F "file=@/home/spark/DiskD/audio_llm/breeze-asr/output.wav" \
  -F "model_name=gpt-4o-mini-transcribe" \
  -F "language=zh" | jq
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  525k  100  2955  100  522k   2355   416k  0:00:01  0:00:01 --:--:--  418k
{
  "results": [
    {
      "language": "Chinese",
      "text": "说书相声这种东西,人靠一张嘴,通过语言的结构,把看官听众吸引到故事里面。在演出的时候,要求你身上的每个动作都必须要有含义。",
      "time_stamps": {
        "items": [
          {
            "text": "说",
            "start_time": 0.08,
            "end_time": 0.32
          },
          {
            "text": "书",
            "start_time": 0.32,
            "end_time": 0.48
          },
          {
            "text": "相",
            "start_time": 0.48,
            "end_time": 0.72
          },
          {
            "text": "声",
            "start_time": 0.72,
            "end_time": 1.04
          },
          ........
          {
            "text": "有",
            "start_time": 11.52,
            "end_time": 11.6
          },
          {
            "text": "含",
            "start_time": 11.6,
            "end_time": 11.84
          },
          {
            "text": "义",
            "start_time": 11.84,
            "end_time": 12.08
          }
        ]
      }
    }
  ]
}

$ curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
  -F "file_url=https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav" \
  -F "model_name=gpt-4o-mini-transcribe" \
  -F "language=en" | jq
$ curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
  -F "file=@/home/spark/DiskD/audio_llm/breeze-asr/output.wav" \
  -F "file_url=https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav" \
  -F "model_name=gpt-4o-mini-transcribe" | jq

Using Breeze-ASR-25 on DGX Spark

Reference: https://huggingface.co/MediaTek-Research/Breeze-ASR-25

$ uv init breeze-asr
$ cd breeze-asr/
$ rm .python-version
# Add the following to pyproject.toml; otherwise torch and related packages will not install cleanly
$ vi pyproject.toml
[[tool.uv.index]]
name = "pytorch-cu130"
url = "https://download.pytorch.org/whl/cu130"
explicit = true  # key: prevents ordinary packages from being resolved from the PyTorch index

[tool.uv.sources]
torch = { index = "pytorch-cu130" }
torchaudio = { index = "pytorch-cu130" }
torchvision = { index = "pytorch-cu130" }
torchcodec = { index = "pytorch-cu130" }

$ uv venv --python 3.13
$ source .venv/bin/activate
$ uv pip install -e .

# Installation order matters: torch must be installed before datasets[audio]
$ uv add torch torchaudio torchcodec
$ uv add transformers
$ uv add datasets[audio]
$ uv add accelerate
$ sudo apt update
$ sudo apt install -y ffmpeg libavutil-dev

# Following the documentation as-is raises the error below; both Google AI and ChatGPT suggested using the model directly
KeyError: 'num_frames'

# With the code below, the transcript is not segmented; everything comes out merged together
result = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True
)[0]

# The final program is shown below; it produces output like the following
===== TEXT + TIME =====
0.00s 所說相生這種東西人靠一張嘴
3.28s 通過語言的結構把看官聽眾吸引到故事裡面
7.72s 在演出的時候
9.04s 要求你身上的每個動作都必須要有含義

import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

audio_path = "/home/spark/DiskD/audio_llm/breeze-asr/output.wav"

# load audio
waveform, sr = torchaudio.load(audio_path)
waveform = waveform.mean(dim=0)

if sr != 16000:
    waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

# load model
processor = WhisperProcessor.from_pretrained("/mnt/models/Breeze-ASR-25")
model = WhisperForConditionalGeneration.from_pretrained(
    "/mnt/models/Breeze-ASR-25"
).to("cuda")
model.eval()

# preprocess
inputs = processor(
    waveform,
    sampling_rate=16000,
    return_tensors="pt"
)

# inference
with torch.no_grad():
    outputs = model.generate(
        inputs.input_features.to("cuda"),
        return_timestamps=True,
        return_dict_in_generate=True,
        output_scores=True,
    )

# --------------------------------------------------
# 5. Decode tokens
# --------------------------------------------------
token_ids = outputs["sequences"][0].tolist()
tokens = processor.tokenizer.convert_ids_to_tokens(token_ids)

# Whisper timestamp token setup
timestamp_begin = processor.tokenizer.convert_tokens_to_ids("<|0.00|>")
time_precision = 0.02  # Whisper: 20ms

current_time = None
buffer = []

print("===== TEXT + TIME =====")

for tid in token_ids:
    # timestamp token
    if tid >= timestamp_begin:
        # flush the previous segment first
        if buffer and current_time is not None:
            text = processor.tokenizer.decode(
                buffer,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            if text.strip():
                print(f"{current_time:.2f}s\t{text}")

        # advance the current timestamp
        current_time = (tid - timestamp_begin) * time_precision
        buffer = []
    else:
        buffer.append(tid)

# flush the final segment
if buffer and current_time is not None:
    text = processor.tokenizer.decode(
        buffer,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )
    if text.strip():
        print(f"{current_time:.2f}s\t{text}")

Tuesday, February 3, 2026

Installing Riva ASR on DGX Spark

Reference: https://docs.nvidia.com/nim/riva/asr/latest/overview.html

$ export NGC_API_KEY="NTVwdDZqbTdrNnBva285Y3EzbmQxOGNodjY6NWM0NjYzYjYtMzczMy00MjVkLTg1NWQtZTE2MDNmZTAxNDBj"
$ echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin

parakeet-0-6b-ctc-en-us: no arm64 build
parakeet-1-1b-ctc-en-us: OK
parakeet-tdt-0.6b-v2: no arm64 build
parakeet-1-1b-rnnt-multilingual: failed
parakeet-ctc-0.6b-zh-cn: no arm64 build
parakeet-ctc-0.6b-zh-tw: no arm64 build

$ export CONTAINER_ID=parakeet-ctc-0.6b-zh-tw
$ export NIM_TAGS_SELECTOR="mode=str,vad=silero,diarizer=sortformer"

$ docker run -it --rm --name=$CONTAINER_ID \
   --gpus '"device=0"' \
   --shm-size=8GB \
   -e NGC_API_KEY \
   -e NIM_HTTP_API_PORT=9001 \
   -e NIM_GRPC_API_PORT=50052 \
   -p 9001:9001 \
   -p 50052:50052 \
   -e NIM_TAGS_SELECTOR \
   nvcr.io/nim/nvidia/$CONTAINER_ID:latest

# Create the cache directory on the host machine:
$ export LOCAL_NIM_CACHE=$(pwd)/cache/nim
$ mkdir -p $LOCAL_NIM_CACHE
$ chmod 777 $LOCAL_NIM_CACHE

# Set the appropriate values
$ export CONTAINER_ID=parakeet-1-1b-ctc-en-us
$ export NIM_TAGS_SELECTOR="name=parakeet-1-1b-ctc-en-us,mode=all,vad=silero,diarizer=sortformer,model_type=prebuilt"

# Run the container with the cache directory mounted in the appropriate location:
$ docker run -it --rm --name=$CONTAINER_ID \
      --gpus '"device=0"' \
      --shm-size=8GB \
      -e NGC_API_KEY \
      -e NIM_TAGS_SELECTOR \
      -e NIM_HTTP_API_PORT=9001 \
      -e NIM_GRPC_API_PORT=50052 \
      -p 9001:9001 \
      -p 50052:50052 \
      -v $LOCAL_NIM_CACHE:/opt/nim/.cache \
      nvcr.io/nim/nvidia/$CONTAINER_ID:latest
   
$ curl -X 'GET' 'http://localhost:9001/v1/health/ready'
{"status":"ready"}

$ uv init riva-client
$ cd riva-client/
$ rm .python-version
$ uv venv --python 3.10
$ uv add nvidia-riva-client
$ git clone https://github.com/nvidia-riva/python-clients.git

$ sudo apt-get install python3-pip
$ pip install -U nvidia-riva-client
$ git clone https://github.com/nvidia-riva/python-clients.git

$ docker cp $CONTAINER_ID:/opt/riva/wav/zh-TW_sample.wav .
$ python3 python-clients/scripts/asr/transcribe_file.py \
   --server 0.0.0.0:50052 \
   --list-models
Available ASR models
{'en-US': [{'model': ['parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer']}]}
$ python3 python-clients/scripts/asr/transcribe_file_offline.py \
   --server 0.0.0.0:50052 \
   --list-models
Available ASR models
{}
$ curl -s http://0.0.0.0:9001/v1/audio/transcriptions -F language=zh-TW \
   -F file="@zh-TW_sample.wav"
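
The same recognition can also be driven from Python with nvidia-riva-client over gRPC. A minimal offline sketch (parameters mirror the curl above; it assumes a profile with offline mode, ofl or all, is actually deployed, since the str-only profile above only exposes the streaming model):

import riva.client

auth = riva.client.Auth(uri="0.0.0.0:50052")        # gRPC port published above
asr = riva.client.ASRService(auth)

config = riva.client.RecognitionConfig(
    encoding=riva.client.AudioEncoding.LINEAR_PCM,
    language_code="zh-TW",
    max_alternatives=1,
    enable_automatic_punctuation=True,
)
# copy the sample rate / channel count from the WAV header into the config
riva.client.add_audio_file_specs_to_config(config, "zh-TW_sample.wav")

with open("zh-TW_sample.wav", "rb") as f:
    response = asr.offline_recognize(f.read(), config)

for result in response.results:
    print(result.alternatives[0].transcript)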
   
$ docker stop $CONTAINER_ID
$ docker rm $CONTAINER_ID

https://docs.nvidia.com/nim/riva/asr/latest/support-matrix.html#parakeet-0-6b-ctc-taiwanese-mandarin-english
Available modes include streaming low latency (str), 
streaming high throughput (str-thr), and offline (ofl). 
Setting the mode to (all) deploys all inference modes where applicable.
The profiles with silero and sortformer use Silero VAD to detect start and end of utterance 
and Sortformer SD for speaker diarization.

CONTAINER_ID=parakeet-ctc-0.6b-zh-tw
NIM_TAGS_SELECTOR
mode=ofl,vad=default,diarizer=disabled
mode=str,vad=default,diarizer=disabled
mode=str-thr,vad=default,diarizer=disabled
mode=all,vad=default,diarizer=disabled
mode=ofl,vad=silero,diarizer=sortformer
mode=str,vad=silero,diarizer=sortformer
mode=str-thr,vad=silero,diarizer=sortformer
mode=all,vad=silero,diarizer=sortformer

Installing Riva TTS on DGX Spark

Reference: https://docs.nvidia.com/nim/riva/tts/latest/overview.html

$ export NGC_API_KEY="Y6NWM0NjYzYjYtMzczMy00MjVkLTg1NWQtZTE2MDNmZTAxNDBj"
$ echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin

$ export CONTAINER_ID=magpie-tts-multilingual
$ export NIM_TAGS_SELECTOR=name=magpie-tts-multilingual

$ docker run -it --rm --name=$CONTAINER_ID \
   --gpus '"device=0"' \
   --shm-size=8GB \
   -e NGC_API_KEY \
   -e NIM_HTTP_API_PORT=9000 \
   -e NIM_GRPC_API_PORT=50051 \
   -p 9000:9000 \
   -p 50051:50051 \
   -e NIM_TAGS_SELECTOR \
   nvcr.io/nim/nvidia/$CONTAINER_ID:latest

# Create the cache directory on the host machine
$ export LOCAL_NIM_CACHE=~/.cache/nim
$ mkdir -p $LOCAL_NIM_CACHE
$ chmod 777 $LOCAL_NIM_CACHE

# Set appropriate value for container ID
$ export CONTAINER_ID=magpie-tts-multilingual

# Set the appropriate values for NIM_TAGS_SELECTOR.
$ export NIM_TAGS_SELECTOR="name=magpie-tts-multilingual,model_type=prebuilt"

# Run the container with the cache directory mounted in the appropriate location
$ docker run -it --rm --name=$CONTAINER_ID \
   --gpus '"device=0"' \
   --shm-size=8GB \
   -e NGC_API_KEY \
   -e NIM_TAGS_SELECTOR \
   -e NIM_HTTP_API_PORT=9000 \
   -e NIM_GRPC_API_PORT=50051 \
   -p 9000:9000 \
   -p 50051:50051 \
   -v $LOCAL_NIM_CACHE:/opt/nim/.cache \
   nvcr.io/nim/nvidia/$CONTAINER_ID:latest
   
$ curl -X 'GET' 'http://localhost:9000/v1/health/ready'
{"status":"ready"}

$ sudo apt-get install python3-pip
$ pip install -U nvidia-riva-client
$ cd $HOME
$ git clone https://github.com/nvidia-riva/python-clients.git

$ curl -sS http://localhost:9000/v1/audio/list_voices | jq
{
   "en-US,es-US,fr-FR,de-DE,zh-CN": {
      "voices": [
            "Magpie-Multilingual.ZH-CN.HouZhen",
            "Magpie-Multilingual.ZH-CN.Siwei",
            "Magpie-Multilingual.ZH-CN.Louise",
            "Magpie-Multilingual.ZH-CN.Pascal",
            "Magpie-Multilingual.ZH-CN.Isabela",
            "Magpie-Multilingual.ZH-CN.Diego",
            "Magpie-Multilingual.ZH-CN.Ray",
            "Magpie-Multilingual.ZH-CN.Mia",
            "Magpie-Multilingual.ZH-CN.Aria",
            "Magpie-Multilingual.ZH-CN.Long",
            "Magpie-Multilingual.ZH-CN.North",
      ]
   }
}

$ curl -sS http://localhost:9000/v1/audio/synthesize --fail-with-body \
   -F language=zh-CN  \
   -F text="說書,相聲這種東西,人靠一張嘴,通過語言的結構,把看官聽眾吸引到故事里面。在演出的時候要求你身上的每個動作,都必須要有含義。" \
   -F voice=Magpie-Multilingual.ZH-CN.Mia \
   --output output.wav
$ curl -sS http://localhost:9000/v1/audio/synthesize_online --fail-with-body \
   -F language=zh-CN  \
   -F text="說書,相聲這種東西,人靠一張嘴,通過語言的結構,把看官聽眾吸引到故事里面。在演出的時候要求你身上的每個動作,都必須要有含義。" \
   -F voice=Magpie-Multilingual.ZH-CN.Ray \
   -F sample_rate_hz=22050 \
   --output output.raw
$ sudo apt update
$ sudo apt install -y sox
$ sox -b 16 -e signed -c 1 -r 22050 output.raw output.wav
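
Batch synthesis can also be driven from Python with nvidia-riva-client; a minimal sketch (the synthesize() parameters follow the riva.client examples, the output file name is my own choice):

import wave

import riva.client

auth = riva.client.Auth(uri="0.0.0.0:50051")            # gRPC port published above
tts = riva.client.SpeechSynthesisService(auth)

resp = tts.synthesize(
    "說書,相聲這種東西,人靠一張嘴,通過語言的結構,把看官聽眾吸引到故事里面。",
    voice_name="Magpie-Multilingual.ZH-CN.Mia",
    language_code="zh-CN",
    encoding=riva.client.AudioEncoding.LINEAR_PCM,
    sample_rate_hz=22050,
)

# resp.audio is raw 16-bit mono PCM; wrap it in a WAV container (same as the sox step above)
with wave.open("output_riva.wav", "wb") as wav:
    wav.setnchannels(1)
    wav.setsampwidth(2)
    wav.setframerate(22050)
    wav.writeframes(resp.audio)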

Available Voices
Magpie-Multilingual.ZH-CN.Aria
Magpie-Multilingual.ZH-CN.Diego
Magpie-Multilingual.ZH-CN.HouZhen
Magpie-Multilingual.ZH-CN.Isabela
Magpie-Multilingual.ZH-CN.Long
Magpie-Multilingual.ZH-CN.Louise
Magpie-Multilingual.ZH-CN.Mia
Magpie-Multilingual.ZH-CN.North
Magpie-Multilingual.ZH-CN.Pascal
Magpie-Multilingual.ZH-CN.Ray
Magpie-Multilingual.ZH-CN.Siwei

$ docker stop $CONTAINER_ID
$ docker rm $CONTAINER_ID

Installing and using NIM on DGX Spark

Reference: https://build.nvidia.com/spark/nim-llm/overview

Download the NGC CLI
$ wget --content-disposition \
https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/4.11.1/files/ngccli_arm64.zip \
-O ngccli_arm64.zip && unzip ngccli_arm64.zip
$ find ngc-cli/ -type f -exec md5sum {} + | LC_ALL=C sort | md5sum -c ngc-cli.md5
$ sha256sum ngccli_arm64.zip
9c285ef8250c30c21e70a836700121945b296a8441005b829179bde6106ce6d1
$ chmod u+x ngc-cli/ngc
$ echo "export PATH=\"\$PATH:$(pwd)/ngc-cli\"" >> ~/.bash_profile && source ~/.bash_profile

$ ngc config set
Enter API key [no-apikey]. Choices: [<VALID_APIKEY>, 'no-apikey']: 
Enter CLI output format type [ascii]. Choices: ['ascii', 'csv', 'json']: 
Validating configuration...
Successfully validated configuration.
Saving configuration...
Successfully saved NGC configuration to /home/spark/.ngc/config
$ docker login nvcr.io
Username: $oauthtoken
Password: Y6NWM0NjYzYjYtMzczMy00MjVkLTg1NWQtZTE2MDNmZTAxNDBj

$ newgrp docker
$ export NGC_API_KEY="Y6NWM0NjYzYjYtMzczMy00MjVkLTg1NWQtZTE2MDNmZTAxNDBj"
$ echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin

$ export CONTAINER_NAME="nim-llm-demo"
$ export IMG_NAME="nvcr.io/nim/meta/llama-3.1-8b-instruct-dgx-spark:latest"
$ export LOCAL_NIM_CACHE=$(pwd)/cache/nim
$ export LOCAL_NIM_WORKSPACE=~/workspace
$ mkdir -p "$LOCAL_NIM_WORKSPACE"
$ chmod -R a+w "$LOCAL_NIM_WORKSPACE"
$ mkdir -p "$LOCAL_NIM_CACHE"
$ chmod -R a+w "$LOCAL_NIM_CACHE"

$ docker run -it --rm --name=$CONTAINER_NAME \
  --gpus all \
  --shm-size=16GB \
  -e NGC_API_KEY=$NGC_API_KEY \
  -v "$LOCAL_NIM_CACHE:/opt/nim/.cache" \
  -v "$LOCAL_NIM_WORKSPACE:/opt/nim/workspace" \
  -p 8000:8000 \
  $IMG_NAME

$ curl -X 'POST' \
    'http://0.0.0.0:8000/v1/chat/completions' \
    -H 'accept: application/json' \
    -H 'Content-Type: application/json' \
    -d '{
      "model": "meta/llama-3.1-8b-instruct",
      "messages": [
        {
          "role":"system",
          "content":"detailed thinking on"
        },
        {
          "role":"user",
          "content":"Can you write me a song?"
        }
      ],
      "top_p": 1,
      "n": 1,
      "max_tokens": 15,
      "frequency_penalty": 1.0,
      "stop": ["hello"]

    }'
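
The same request through the openai Python client (a sketch; a local NIM endpoint just needs any placeholder api_key):

from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="not-used")
resp = client.chat.completions.create(
    model="meta/llama-3.1-8b-instruct",
    messages=[
        {"role": "system", "content": "detailed thinking on"},
        {"role": "user", "content": "Can you write me a song?"},
    ],
    max_tokens=15,
)
print(resp.choices[0].message.content)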
    
$ docker stop $CONTAINER_NAME
$ docker rm $CONTAINER_NAME

Monday, February 2, 2026

Installing and using LocalAI on DGX Spark

Reference: https://localai.io/

$ mkdir models backends

$ docker run -ti --gpus all --name local-ai \
-p 8080:8080 \
-v $PWD/models:/models \
-v $PWD/backends:/backends \
-e DEBUG=true \
localai/localai:latest-nvidia-l4t-arm64-cuda-13 \
--models-path /models \
--context-size 700 \
--threads 4 

Open http://192.168.0.108:8080/ in a browser
Top right: Settings/Backends
Install whisper, qwen-tss, piper
Top right: Settings/Models
Install qwen3-tts-1.7b-custom-voice, voice-zh_CN-huayan-medium, whisper-medium

The available speakers are listed at https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice

$ docker exec -it local-ai sh

$ curl http://192.168.0.108:8080/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F file="@/home/mark/Data/Whisper/speaches/aaa.mp3" \
  -F model="whisper-1"

$ curl http://192.168.0.108:8080/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{
  "backend": "piper",
  "input": "這是一段來自 LocalAI 的測試語音。",
  "model": "voice-zh_CN-huayan-medium",
  "response_format": "wav"
  }' \
  --output test_audio.wav

$ curl http://192.168.0.108:8080/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{
  "backend": "qwen-tss",
  "input": "這是一段來自 LocalAI 的測試語音。",
  "model": "qwen3-tts-1.7b-custom-voice",
  "response_format": "wav"
  }' \
  --output test_audio.wav

$ curl http://192.168.0.108:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen3-4B.Q4_K_M.gguf",
     "prompt": "A long time ago in a galaxy far, far away",
     "temperature": 0.7
   }'
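
Since LocalAI exposes an OpenAI-compatible API, the transcription call above can also be made with the openai Python client; a minimal sketch reusing the same host and file path:

from openai import OpenAI

client = OpenAI(base_url="http://192.168.0.108:8080/v1", api_key="not-needed")

# equivalent of the multipart curl to /v1/audio/transcriptions above
with open("/home/mark/Data/Whisper/speaches/aaa.mp3", "rb") as f:
    result = client.audio.transcriptions.create(model="whisper-1", file=f)

print(result.text)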


================
What follows is a failed attempt at building the image with docker build

$ mkdir -p localai/models
$ cd localai

Download the model directly (the backend fetches it)
$ cat <<EOF > models/breeze.yaml
name: "whisper-1"
backend: "whisper"
parameters:
  model: "MediaTek-Research/Breeze-ASR-25"
EOF

Use a pre-downloaded model
$ vi models/breeze.yaml
name: "whisper-1"
backend: "faster-whisper"
parameters:
  # this must be the path inside the container
  model: "/models/Breeze-ASR-25"

This does not work:
$ docker run -d --name local-ai \
  --gpus all \
  -p 8080:8080 \
  -v $(pwd)/models:/build/models:ro \
  -v /mnt/models:/models:ro \
  localai/localai:latest-nvidia-l4t-arm64-cuda-13
It fails with the following error:
$ curl http://192.168.0.108:8080/v1/audio/transcriptions \
>   -H "Content-Type: multipart/form-data" \
>   -F file="@/mnt/Data/Whisper/speaches/aaa.mp3" \
>   -F model="whisper-1"
{"error":{"code":500,"message":"failed to load model with internal loader: backend not found: whisper","type":""}}

$ git clone https://github.com/mudler/LocalAI
$ cd LocalAI
$ docker build --build-arg SKIP_DRIVERS=false \
--build-arg BUILD_TYPE=cublas \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg BASE_IMAGE=ubuntu:24.04 \
--build-arg IMAGE_TYPE=aio \
-t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-aio .

$ docker run -d --name local-ai \
  --gpus all \
  -p 8080:8080 \
  -v $(pwd)/models:/build/models:ro \
  -v /mnt/models:/models:ro \
  -e EXTERNAL_GRPC_BACKENDS="faster-whisper:/build/backend/python/faster-whisper/backend.py" \
  quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-aio

$ docker build --build-arg SKIP_DRIVERS=true \
--build-arg BUILD_TYPE=cublas \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg IMAGE_TYPE=core \
-t local-ai-cuda13:latest .

$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=faster-whisper \
-t local-ai-faster-whisper-cuda13:latest \
-f backend/Dockerfile.python .

$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=piper \
-t local-ai-piper-cuda13:latest \
-f backend/Dockerfile.golang .

$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=vllm \
-t local-ai-vllm-omni-cuda13:latest \
-f backend/Dockerfile.python .

# Make sure you are in the LocalAI root directory and your kokoro implementation already exists under backend/python
$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=kokoro \
-t local-ai-kokoro-cuda13:latest \
-f backend/Dockerfile.python .

$ vi docker-compose.yml
version: '3.8'
services:
  asr-service:
    image: local-ai-faster-whisper-cuda13:latest
    container_name: faster-whisper-backend
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: always
  tts-service:
    image: local-ai-piper-cuda13:latest
    container_name: tts-backend
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
  kokoro-service:
    image: local-ai-kokoro-cuda13:latest
    container_name: kokoro-backend
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
    command: ["/usr/bin/python3", "/build/backend/python/kokoro/kokoro.py"] # example path
    deploy:
      resources:
        reservations:
          devices: [{driver: nvidia, count: all, capabilities: [gpu]}]
  vllm-omni:
    image: local-ai-vllm-omni-cuda13:latest
    container_name: vllm-backend
    restart: always
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all # on a DGX it is better to pin specific GPUs, e.g. "device=0,1"
              capabilities: [gpu]
  localai:
    image: local-ai-cuda13:latest
    container_name: localai-api
    depends_on:
      - asr-service
      - tts-service
      - vllm-omni
      - kokoro-service
    networks:
      - ai-network
    ports:
      - "8080:8080"
    volumes:
      - ./models:/build/models:ro
      - /mnt/models:/models:ro
    environment:
      - DEBUG=true
      - EXTERNAL_GRPC_BACKENDS=vllm:vllm-omni:9000,faster-whisper:asr-service:9000,piper:tts-service:9000,kokoro:kokoro-service:9000
    restart: always
networks:
  ai-network:
    driver: bridge

$ docker compose -f docker-compose.yml up -d
$ docker compose -f docker-compose.yml logs -f
$ docker compose -f docker-compose.yaml down

$ curl http://192.168.0.108:8080/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F file="@/home/mark/Data/Whisper/speaches/aaa.mp3" \
  -F model="whisper-1"

$ vi models/piper.yaml
name: "tts-1"
backend: "piper"
parameters:
  model: "/models/piper-zh_CN-huayan-medium/model.onnx"

$ vi models/vllm-omni.yaml
name: "vllm-omni"
backend: "vllm"
parameters:
  model: "/models/llama-3.2-vision"
  extra_args:
    - "--dtype"
    - "bfloat16"
    - "--limit-mm-per-prompt" # 限制單個提示的多模態輸入數量
    - "image=1,video=0"
    - "--max-model-len"
    - "4096"
    - "--gpu-memory-utilization"
    - "0.8"
    
$ vi models/kokoro.yaml
name: "kokoro-tts"
# the backend is given directly as the subdirectory name
backend: "kokoro" 
parameters:
  # path to the Kokoro ONNX model you downloaded
  model: "kokoro-v1.0.onnx" 
# if the model needs a specific voice/style file, it is usually set via an environment variable or parameter
environment:
  KOKORO_VOICES_PATH: "/build/models/kokoro/voices.bin"


git clone https://github.com/mudler/LocalAI

cd LocalAI

$ docker build \
--build-arg SKIP_DRIVERS=false \
--build-arg BUILD_TYPE=cublas \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg BASE_IMAGE=ubuntu:24.04 \
--build-arg IMAGE_TYPE=core \
-t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-core .

$ docker run -ti --gpus all --name local-ai \
-p 8080:8080 \
-v $PWD/models:/models \
-v $PWD/backends:/backends \
-e DEBUG=true \
quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-core \
--models-path /models \
--context-size 700 \
--threads 4 


$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=qwen-tts \
-t local-ai-qwen-tts-cuda13:latest \
-f backend/Dockerfile.python .

$ vi docker-compose.yml
version: "3.9"
services:
  localai:
    image: localai/localai:latest
    container_name: localai
    runtime: nvidia
    ports:
      - "8080:8080"
    volumes:
      - ./models:/models
      - ./backends:/build/backend
    environment:
      # 👇 multiple backend images, comma-separated
      - LOCALAI_BACKEND_IMAGE=local-ai-qwen-asr-cuda13,local-ai-qwen-tts-cuda13

      # common GPU settings
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility


$ docker create --name tmp-qwen local-ai-qwen-tts-cuda13:latest /bin/true
$ mkdir qwen-tss-rootfs
$ docker export tmp-qwen | tar -xvf - -C ./qwen-tss-rootfs
$ docker rm tmp-qwen

cd qwen-tss-rootfs
mv venv venv.bak
python3 -m venv venv
source venv/bin/activate
which python
# should be .../qwen-asr-rootfs/venv/bin/python
pip --version
# should point inside the venv
pip install grpcio-tools==1.71.0 grpcio==1.71.0
python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. backend.proto

DGX Spark temperature

I have not found a way to control the fans; for now the only options are better ventilation and, beyond that, lowering clocks.

$ nvidia-smi -q -d CLOCK
Query the clocks
$ sudo nvidia-smi -lgc 1200,2400
Lock the GPU clock range
$ sudo nvidia-smi -rgc
Restore the default setting

The following commands have no effect:
$ sudo nvidia-smi -pl 350
Changing power management limit is not supported in current scope for GPU: 0000000F:01:00.0.
All done.
$ sudo nvidia-smi -gtt 80
GPU Target Temperature Threshold not supported for GPU 0000000F:01:00.0.
Treating as warning and moving on.
All done.

$ sudo nvidia-settings -a "[gpu:0]/GPUFanControlState=1"
$ sudo nvidia-settings -a "[fan:0]/GPUTargetFanSpeed=85"
ERROR: Error resolving target specification 'fan:0' (No targets match target
       specification), specified in assignment '[fan:0]/GPUTargetFanSpeed=85'.

$ sudo cpupower frequency-set -g performance
$ sudo cpupower frequency-set -u 3.5GHz

Saturday, January 31, 2026

DGX Spark Manual System Updates

sudo apt update
sudo apt dist-upgrade
sudo fwupdmgr refresh
sudo fwupdmgr upgrade
sudo reboot

Tuesday, January 6, 2026

Developing Python in VS Code with uv and git

VS Code shortcuts
Ctrl + Shift + P (command palette)
Win + ↑ (maximize the window)
Ctrl + ` (open the terminal)

Developing Python
1. Connect to Ubuntu with Remote-SSH
2. mkdir my_python_project; cd my_python_project
3. python3 -m venv venv
4. source venv/bin/activate
5. Ctrl + Shift + P (command palette), type Python: Select Interpreter, choose ./venv/bin/python
6. Start writing main.py
7. pip install requests
8. pip freeze > requirements.txt

Developing Python with uv
1. Ctrl + ` (open the terminal)
2a. mkdir my_python_project; cd my_python_project
2b. uv init my_python_project; cd my_python_project
3. uv venv
4. source .venv/bin/activate
5. Ctrl + Shift + P (command palette), type Python: Select Interpreter, choose ./.venv/bin/python
6. Start writing main.py
7. uv add requests
8. uv pip install -e .

Adding git
1. git init
2. cat > .gitignore <<EOS
.venv/
__pycache__/
*.pyc
.env
EOS
3. git add .
4. git commit -m "Initial commit"
5. Create a repository on GitHub
5a. Open the GitHub website
5b. Click New repository
5c. Repository name: same as the folder (recommended)
5d. Do not check README / .gitignore / license
5e. Create the repository
6. Push to GitHub from VS Code / Ubuntu
git branch -M master
git remote add origin https://github.com/<your-account>/<project-name>.git
git push -u origin master
After the command above runs, note the prompt at the top of the window asking for your credentials
7. After changing code
git add .
git commit -m "說明你改了什麼"
git push
8. If the repo was updated elsewhere
git pull
9. Using the VS Code UI
Source Control in the left sidebar
Stage(+)
Commit
Push / Pull
10a. Check the local project settings
git config user.name
git config user.email
10b. Check the global settings
git config --global user.name
git config --global user.email
10c. Set your Git identity
git config --global user.name "your name"
git config --global user.email "your GitHub email"
11. List all remotes
git remote -v
12. Show details for one remote
git remote show origin
13. Common remote names
origin: the default remote, usually the main GitHub repository
upstream: when you fork someone else's repo, points at the original repo
backup: a backup remote, e.g. GitLab or a private server


Friday, January 2, 2026

Vibe coding in VS Code

Click the Continue icon in the left sidebar
Widen the Continue panel
Click the gear at the top right of the Continue panel
Click Models on the left
Click the gear next to Chat
config.yaml opens automatically, or open %USERPROFILE%\.continue\config.yaml manually

name: Local Config
version: 1.0.0
schema: v1
models:
  - name: gpt-oss-120b
    provider: vllm
    model: gpt-oss-120b
    apiBase: http://192.168.0.108:8000/v1
    title: gpt-oss-120b
    roles:
      - chat
      - edit
      - autocomplete
      - apply
  - name: Llama-3.1-8B-Instruct
    provider: vllm
    model: Llama-3.1-8B-Instruct
    apiBase: https://www.host.com.tw/trtllm-Llama-3.1-8B-Instruct/v1
    title: Llama-3.1-8B-Instruct
    roles:
      - chat
      - edit
      - autocomplete
      - apply
      
Click "Local Config v" at the top right of the Continue panel and choose Reload

=========================
After installing Cline, choose Login to Cline; a web page opens, approve the authorization
Back in VS Code, a dialog appears: Allow 'Cline' extension to open this URL
Click Open
Click the Cline icon in the left sidebar
Click the gear at the top right of the Cline panel
API Provider: OpenAI Compatible
Base URL: http://192.168.0.108:8000
OpenAI Compatible API Key: token-abc123
Model ID: gpt-oss-120b
Click Done at the top right of the Cline panel

=========================
Install Remote-SSH
Click Remote Explorer in the left sidebar
Click the gear next to REMOTE/SSH
In the picker at the top, choose C:/Users/user_name/.ssh/config
# Read more about SSH config files: https://linux.die.net/man/5/ssh_config
Host hostA
    HostName user_ip
    User user_name
    Port 2201
Host hostB
    HostName user_ip
    User user_name
    Port 2202
Next to REMOTE/SSH/spark you can choose
Connect in Current Window... or Connect in New Window...
On first login, choose linux in the picker at the top, then choose continue
Enter the password in the prompt at the top

Experience running LLMs on DGX Spark

For the same model, TRT-LLM works better than vLLM, whether or not the model is converted to nvfp4

Under 10 GB: Qwen2.5-Coder-7B
Under 10 GB: Qwen2.5-Coder-32B-Instruct
Under 30 GB: gpt-oss-20b
Under 30 GB: Llama-3.1-8B-Instruct
The models above have reasonable memory use and response speed; only Qwen2.5-Coder-7B is no good for chat

gpt-oss-120b uses a lot of memory and is somewhat slow

Llama-3.3-70B-Instruct uses a lot of memory and is extremely slow
# The plan was to download meta-llama/Llama-3.3-70B-Instruct and convert it to nvfp4,
# but the downloaded model could neither be converted to nvfp4 nor run.
# Then I found nvidia/Llama-3.3-70B-Instruct-NVFP4 and downloaded it for direct use.

Switching open-webui from http to https

To issue the certificate, first run an nginx on port 80 so that certbot can complete the challenge
$ sta_nginx_certbot.sh
$ docker run --rm -it \
  -v $(pwd)/nginx/certs:/etc/letsencrypt \
  -v $(pwd)/nginx/acme:/var/www/certbot \
  certbot/certbot certonly \
  --webroot \
  -w /var/www/certbot \
  -d www.host.com.tw
The certificates end up in:
nginx/certs/live/www.host.com.tw/fullchain.pem
nginx/certs/live/www.host.com.tw/privkey.pem
Stop nginx_certbot

Add nginx_certbot.conf to the docker nginx so that renewing the certificate does not require stopping nginx
server {
    listen 80 default_server;
    server_name _;

    root /var/www/html;
    index index.html index.htm index.nginx-debian.html;
    
    location /.well-known/acme-challenge/ {
        root /var/www/certbot;
    }

    location / {
        return 200 "nginx certbot ok\n";
    }
}

Renew the certificate
$ docker run --rm \
  -v $(pwd)/nginx/certs:/etc/letsencrypt \
  -v $(pwd)/nginx/acme:/var/www/certbot \
  certbot/certbot renew

Test the renewal (dry run)
$ docker run --rm \
  -v $(pwd)/nginx/certs:/etc/letsencrypt \
  -v $(pwd)/nginx/acme:/var/www/certbot \
  certbot/certbot renew --dry-run


# 1. Is nginx on webnet?
docker network inspect webnet
# 2. Is open-webui on webnet?
docker ps
# 3. Can nginx resolve the name from inside?
docker exec -it nginx getent hosts open-webui
# 4. Direct connection test from inside nginx
docker exec -it nginx curl http://open-webui:8080
# restart nginx after changing nginx.conf
docker restart nginx
# test command
curl -k -H "Cache-Control: no-cache" \
     -H "Pragma: no-cache" \
     https://www.host.com.tw/

To keep nginx from failing to start when some containers are not running, change
    location /sub-path/ {
        proxy_pass http://sub-path:8001;
    }
to
    location /sub-path/ {
        set $p8001 http://sub-path:8001;
        rewrite ^/sub-path/(.*)$ /$1 break;
        proxy_pass $p8001;
    }

open-webui cannot be mounted at https://www.host.com.tw/open-webui; it has to be served at /
    location / {
        set $webui http://open-webui:8080;
        # rewrite cannot be used here; only location / works
        proxy_pass $webui;
        # avoid the following WebSocket error:
        # "GET /ws/socket.io/?EIO=4&transport=websocket HTTP/1.1" 400 46
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
    }

The dashboard was supposed to be added too, but both it and open-webui use WebSocket with paths starting with GET /ws, so I gave up on the dashboard
    location /dashboard/ {
        set $dashboard http://dashboard:8080;
        rewrite ^/dashboard/(.*)$ /$1 break;
        proxy_pass $dashboard;
        # avoid the following WebSocket error:
        # "GET /ws HTTP/1.1" 500 21
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
    }