
Monday, February 2, 2026

Installing and Using LocalAI on DGX Spark

Reference: https://localai.io/

$ mkdir models backends

$ docker run -ti --gpus all --name local-ai \
-p 8080:8080 \
-v $PWD/models:/models \
-v $PWD/backends:/backends \
-e DEBUG=true \
localai/localai:latest-nvidia-l4t-arm64-cuda-13 \
--models-path /models \
--context-size 700 \
--threads 4 
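
Once the container is up, a quick sanity check is to list the models the server can see through the OpenAI-compatible endpoint:

$ curl http://192.168.0.108:8080/v1/models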

Open the site http://192.168.0.108:8080/
In the top right, go to Settings/Backends
Install whisper, qwen-tts, and piper
In the top right, go to Settings/Models
Install qwen3-tts-1.7b-custom-voice, voice-zh_CN-huayan-medium, and whisper-medium
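
As an alternative to the WebUI, models can also be installed through the gallery API. This is a sketch assuming the /models/apply endpoint and that the WebUI gallery names above are valid gallery IDs:

$ curl http://192.168.0.108:8080/models/apply \
  -H "Content-Type: application/json" \
  -d '{"id": "whisper-medium"}'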

The available speakers can be looked up at https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice (see the speaker-selection example after the TTS requests below).

$ docker exec -it local-ai sh

$ curl http://192.168.0.108:8080/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F file="@/home/mark/Data/Whisper/speaches/aaa.mp3" \
  -F model="whisper-1"

$ curl http://192.168.0.108:8080/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{
  "backend": "piper",
  "input": "這是一段來自 LocalAI 的測試語音。",
  "model": "voice-zh_CN-huayan-medium",
  "response_format": "wav"
  }' \
  --output test_audio.wav

$ curl http://192.168.0.108:8080/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{
  "backend": "qwen-tss",
  "input": "這是一段來自 LocalAI 的測試語音。",
  "model": "qwen3-tts-1.7b-custom-voice",
  "response_format": "wav"
  }' \
  --output test_audio.wav
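
To pick one of the speakers listed on the Hugging Face page above, the request can presumably carry an OpenAI-style "voice" field; "SPEAKER_NAME" below is a placeholder to replace with a speaker name from that page:

$ curl http://192.168.0.108:8080/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{
  "backend": "qwen-tts",
  "input": "這是一段來自 LocalAI 的測試語音。",
  "model": "qwen3-tts-1.7b-custom-voice",
  "voice": "SPEAKER_NAME",
  "response_format": "wav"
  }' \
  --output test_audio_speaker.wav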

$ curl http://192.168.0.108:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen3-4B.Q4_K_M.gguf",
     "prompt": "A long time ago in a galaxy far, far away",
     "temperature": 0.7
   }'
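
The same GGUF model can also be called through the chat endpoint, which follows the OpenAI format:

$ curl http://192.168.0.108:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen3-4B.Q4_K_M.gguf",
     "messages": [{"role": "user", "content": "Tell me a short story about a distant galaxy."}],
     "temperature": 0.7
   }'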


================
Below is my failed attempt at building the images myself with docker build.

$ mkdir -p localai/models
$ cd localai

Download the model directly (referenced by its Hugging Face repo ID)
$ cat <<EOF > models/breeze.yaml
name: "whisper-1"
backend: "whisper"
parameters:
  model: "MediaTek-Research/Breeze-ASR-25"
EOF

Pre-download the model (see the huggingface-cli sketch after this config)
$ vi models/breeze.yaml
name: "whisper-1"
backend: "faster-whisper"
parameters:
  # this must be the path inside the container
  model: "/models/Breeze-ASR-25"

This does not work:
$ docker run -d --name local-ai \
  --gpus all \
  -p 8080:8080 \
  -v $(pwd)/models:/build/models:ro \
  -v /mnt/models:/models:ro \
  localai/localai:latest-nvidia-l4t-arm64-cuda-13
The following error appears:
$ curl http://192.168.0.108:8080/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F file="@/mnt/Data/Whisper/speaches/aaa.mp3" \
  -F model="whisper-1"
{"error":{"code":500,"message":"failed to load model with internal loader: backend not found: whisper","type":""}}

$ git clone https://github.com/mudler/LocalAI
$ cd LocalAI
$ docker build --build-arg SKIP_DRIVERS=false \
--build-arg BUILD_TYPE=cublas \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg BASE_IMAGE=ubuntu:24.04 \
--build-arg IMAGE_TYPE=aio \
-t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-aio .

$ docker run -d --name local-ai \
  --gpus all \
  -p 8080:8080 \
  -v $(pwd)/models:/build/models:ro \
  -v /mnt/models:/models:ro \
  -e EXTERNAL_GRPC_BACKENDS="faster-whisper:/build/backend/python/faster-whisper/backend.py" \
  quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-aio

$ docker build --build-arg SKIP_DRIVERS=true \
--build-arg BUILD_TYPE=cublas \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg IMAGE_TYPE=core \
-t local-ai-cuda13:latest .

$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=faster-whisper \
-t local-ai-faster-whisper-cuda13:latest \
-f backend/Dockerfile.python .

$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=piper \
-t local-ai-piper-cuda13:latest \
-f backend/Dockerfile.golang .

$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=vllm \
-t local-ai-vllm-omni-cuda13:latest \
-f backend/Dockerfile.python .

# Make sure you are in the LocalAI root directory and that your kokoro implementation already exists under backend/python
$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=kokoro \
-t local-ai-kokoro-cuda13:latest \
-f backend/Dockerfile.python .

$ vi docker-compose.yml
version: '3.8'
services:
  asr-service:
    image: local-ai-faster-whisper-cuda13:latest
    container_name: faster-whisper-backend
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: always
  tts-service:
    image: local-ai-piper-cuda13:latest
    container_name: tts-backend
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
  kokoro-service:
    image: local-ai-kokoro-cuda13:latest
    container_name: kokoro-backend
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
    command: ["/usr/bin/python3", "/build/backend/python/kokoro/kokoro.py"] # 範例路徑
    deploy:
      resources:
        reservations:
          devices: [{driver: nvidia, count: all, capabilities: [gpu]}]
  vllm-omni:
    image: local-ai-vllm-omni-cuda13:latest
    container_name: vllm-backend
    restart: always
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all # on DGX it is recommended to pin specific GPUs, e.g. "device=0,1"
              capabilities: [gpu]
  localai:
    image: local-ai-cuda13:latest
    container_name: localai-api
    depends_on:
      - asr-service
      - tts-service
      - vllm-omni
      - kokoro-service
    networks:
      - ai-network
    ports:
      - "8080:8080"
    volumes:
      - ./models:/build/models:ro
      - /mnt/models:/models:ro
    environment:
      - DEBUG=true
      - EXTERNAL_GRPC_BACKENDS=vllm:vllm-omni:9000,faster-whisper:asr-service:9000,piper:tts-service:9000,kokoro:kokoro-service:9000
    restart: always
networks:
  ai-network:
    driver: bridge

$ docker compose -f docker-compose.yml up -d
$ docker compose -f docker-compose.yml logs -f
$ docker compose -f docker-compose.yml down

$ curl http://192.168.0.108:8080/v1/audio/transcriptions \
  -H "Content-Type: multipart/form-data" \
  -F file="@/home/mark/Data/Whisper/speaches/aaa.mp3" \
  -F model="whisper-1"

$ vi models/piper.yaml
name: "tts-1"
backend: "piper"
parameters:
  model: "/models/piper-zh_CN-huayan-medium/model.onnx"

$ vi models/vllm-omni.yaml
name: "vllm-omni"
backend: "vllm"
parameters:
  model: "/models/llama-3.2-vision"
  extra_args:
    - "--dtype"
    - "bfloat16"
    - "--limit-mm-per-prompt" # 限制單個提示的多模態輸入數量
    - "image=1,video=0"
    - "--max-model-len"
    - "4096"
    - "--gpu-memory-utilization"
    - "0.8"
    
$ vi models/kokoro.yaml
name: "kokoro-tts"
# the backend is referenced here by its subdirectory name
backend: "kokoro"
parameters:
  # path to the Kokoro ONNX model you downloaded
  model: "kokoro-v1.0.onnx"
# if the model needs a specific voices file, it is usually specified via an environment variable or parameter
environment:
  KOKORO_VOICES_PATH: "/build/models/kokoro/voices.bin"


$ git clone https://github.com/mudler/LocalAI

$ cd LocalAI

$ docker build \
--build-arg SKIP_DRIVERS=false \
--build-arg BUILD_TYPE=cublas \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg BASE_IMAGE=ubuntu:24.04 \
--build-arg IMAGE_TYPE=core \
-t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-core .

$ docker run -ti --gpus all --name local-ai \
-p 8080:8080 \
-v $PWD/models:/models \
-v $PWD/backends:/backends \
-e DEBUG=true \
quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-core \
--models-path /models \
--context-size 700 \
--threads 4 


$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=qwen-tts \
-t local-ai-qwen-tts-cuda13:latest \
-f backend/Dockerfile.python .

$ vi docker-compose.yml
version: "3.9"
services:
  localai:
    image: localai/localai:latest
    container_name: localai
    runtime: nvidia
    ports:
      - "8080:8080"
    volumes:
      - ./models:/models
      - ./backends:/build/backend
    environment:
      # multiple backend images, separated by commas
      - LOCALAI_BACKEND_IMAGE=local-ai-qwen-asr-cuda13,local-ai-qwen-tts-cuda13
      # common GPU settings
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility


$ docker create --name tmp-qwen local-ai-qwen-tts-cuda13:latest /bin/true
$ mkdir qwen-tts-rootfs
$ docker export tmp-qwen | tar -xvf - -C ./qwen-tts-rootfs
$ docker rm tmp-qwen

$ cd qwen-tts-rootfs
$ mv venv venv.bak
$ python3 -m venv venv
$ source venv/bin/activate
$ which python
# should be .../qwen-tts-rootfs/venv/bin/python
$ pip --version
# should point inside the venv
$ pip install grpcio-tools==1.71.0 grpcio==1.71.0
$ python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. backend.proto
