Reference: https://localai.io/
$ mkdir models backends
$ docker run -ti --gpus all --name local-ai \
-p 8080:8080 \
-v $PWD/models:/models \
-v $PWD/backends:/backends \
-e DEBUG=true \
localai/localai:latest-nvidia-l4t-arm64-cuda-13 \
--models-path /models \
--context-size 700 \
--threads 4
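Once the container is up, the API can be sanity-checked by listing the loaded models (a minimal check; run from another shell on the same host, or substitute the machine's IP):
$ curl http://localhost:8080/v1/models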
Open the web UI at http://192.168.0.108:8080/
Top right: Settings / Backends
Install whisper, qwen-tts, piper
Top right: Settings / Models
Install qwen3-tts-1.7b-custom-voice, voice-zh_CN-huayan-medium, whisper-medium
The available speakers can be looked up at https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
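A chosen speaker can be passed through the voice field of the speech request; a sketch (the speaker name below is a placeholder, substitute one listed on the model card):
$ curl http://192.168.0.108:8080/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"backend": "qwen-tts",
"model": "qwen3-tts-1.7b-custom-voice",
"input": "這是一段來自 LocalAI 的測試語音。",
"voice": "SPEAKER_NAME",
"response_format": "wav"
}' \
--output test_speaker.wav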
$ docker exec -it local-ai sh
$ curl http://192.168.0.108:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@/home/mark/Data/Whisper/speaches/aaa.mp3" \
-F model="whisper-1"
$ curl http://192.168.0.108:8080/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"backend": "piper",
"input": "這是一段來自 LocalAI 的測試語音。",
"model": "voice-zh_CN-huayan-medium",
"response_format": "wav"
}' \
--output test_audio.wav
$ curl http://192.168.0.108:8080/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"backend": "qwen-tss",
"input": "這是一段來自 LocalAI 的測試語音。",
"model": "qwen3-tts-1.7b-custom-voice",
"response_format": "wav"
}' \
--output test_audio.wav
$ curl http://192.168.0.108:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{"model": "Qwen3-4B.Q4_K_M.gguf",
"prompt": "A long time ago in a galaxy far, far away",
"temperature": 0.7
}'
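The same GGUF model can also be exercised through the chat endpoint; a minimal sketch (assuming the model file is present under /models as above):
$ curl http://192.168.0.108:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen3-4B.Q4_K_M.gguf",
"messages": [{"role": "user", "content": "Say hello from LocalAI"}],
"temperature": 0.7
}'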
================
The following records the failed attempts at building the image with docker build.
$ mkdir -p localai/models
$ cd localai
Download the model directly (the model field points at a Hugging Face repo id):
$ cat <<EOF > models/breeze.yaml
name: "whisper-1"
backend: "whisper"
parameters:
model: "MediaTek-Research/Breeze-ASR-25"
EOF
Pre-download the model (the model field points at a path inside the container):
$ vi models/breeze.yaml
name: "whisper-1"
backend: "faster-whisper"
parameters:
  # this must be the path inside the container
  model: "/models/Breeze-ASR-25"
This does not work:
$ docker run -d --name local-ai \
--gpus all \
-p 8080:8080 \
-v $(pwd)/models:/build/models:ro \
-v /mnt/models:/models:ro \
localai/localai:latest-nvidia-l4t-arm64-cuda-13
The following error appears:
$ curl http://192.168.0.108:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@/mnt/Data/Whisper/speaches/aaa.mp3" \
-F model="whisper-1"
{"error":{"code":500,"message":"failed to load model with internal loader: backend not found: whisper","type":""}}
$ git clone https://github.com/mudler/LocalAI
$ cd LocalAI
$ docker build --build-arg SKIP_DRIVERS=false \
--build-arg BUILD_TYPE=cublas \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg BASE_IMAGE=ubuntu:24.04 \
--build-arg IMAGE_TYPE=aio \
-t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-aio .
$ docker run -d --name local-ai \
--gpus all \
-p 8080:8080 \
-v $(pwd)/models:/build/models:ro \
-v /mnt/models:/models:ro \
-e EXTERNAL_GRPC_BACKENDS="faster-whisper:/build/backend/python/faster-whisper/backend.py" \
quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-aio
$ docker build --build-arg SKIP_DRIVERS=true \
--build-arg BUILD_TYPE=cublas \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg IMAGE_TYPE=core \
-t local-ai-cuda13:latest .
$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=faster-whisper \
-t local-ai-faster-whisper-cuda13:latest \
-f backend/Dockerfile.python .
$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=piper \
-t local-ai-piper-cuda13:latest \
-f backend/Dockerfile.golang .
$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=vllm \
-t local-ai-vllm-omni-cuda13:latest \
-f backend/Dockerfile.python .
# Make sure you are in the LocalAI root directory and that your kokoro implementation already exists under backend/python
$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=kokoro \
-t local-ai-kokoro-cuda13:latest \
-f backend/Dockerfile.python .
$ vi docker-compose.yml
version: '3.8'
services:
  asr-service:
    image: local-ai-faster-whisper-cuda13:latest
    container_name: faster-whisper-backend
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: always
  tts-service:
    image: local-ai-piper-cuda13:latest
    container_name: tts-backend
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
  kokoro-service:
    image: local-ai-kokoro-cuda13:latest
    container_name: kokoro-backend
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
    command: ["/usr/bin/python3", "/build/backend/python/kokoro/kokoro.py"]  # example path
    deploy:
      resources:
        reservations:
          devices: [{driver: nvidia, count: all, capabilities: [gpu]}]
  vllm-omni:
    image: local-ai-vllm-omni-cuda13:latest
    container_name: vllm-backend
    restart: always
    networks:
      - ai-network
    volumes:
      - ./models:/build/models:ro
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all  # on DGX it is better to pin specific GPUs, e.g. "device=0,1"
              capabilities: [gpu]
  localai:
    image: local-ai-cuda13:latest
    container_name: localai-api
    depends_on:
      - asr-service
      - tts-service
      - vllm-omni
      - kokoro-service
    networks:
      - ai-network
    ports:
      - "8080:8080"
    volumes:
      - ./models:/build/models:ro
      - /mnt/models:/models:ro
    environment:
      - DEBUG=true
      - EXTERNAL_GRPC_BACKENDS=vllm:vllm-omni:9000,faster-whisper:asr-service:9000,piper:tts-service:9000,kokoro:kokoro-service:9000
    restart: always
networks:
  ai-network:
    driver: bridge
$ docker compose -f docker-compose.yml up -d
$ docker compose -f docker-compose.yml logs -f
$ docker compose -f docker-compose.yml down
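Once the stack is up, a quick readiness check against the API container (LocalAI exposes a /readyz probe; listing the models works as a fallback):
$ curl http://192.168.0.108:8080/readyz
$ curl http://192.168.0.108:8080/v1/models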
$ curl http://192.168.0.108:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@/home/mark/Data/Whisper/speaches/aaa.mp3" \
-F model="whisper-1"
$ vi models/piper.yaml
name: "tts-1"
backend: "piper"
parameters:
model: "/models/piper-zh_CN-huayan-medium/model.onnx"
$ vi models/vllm-omni.yaml
name: "vllm-omni"
backend: "vllm"
parameters:
  model: "/models/llama-3.2-vision"
  extra_args:
    - "--dtype"
    - "bfloat16"
    - "--limit-mm-per-prompt"  # limit the number of multimodal inputs per prompt
    - "image=1,video=0"
    - "--max-model-len"
    - "4096"
    - "--gpu-memory-utilization"
    - "0.8"
$ vi models/kokoro.yaml
name: "kokoro-tts"
# the subdirectory name is given directly here
backend: "kokoro"
parameters:
  # path to the downloaded Kokoro ONNX model
  model: "kokoro-v1.0.onnx"
# if the model needs a specific voice style file, it is usually set via an environment variable or parameter
environment:
  KOKORO_VOICES_PATH: "/build/models/kokoro/voices.bin"
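A matching test call for the Kokoro config (a sketch; af_sky is only an example Kokoro voice id and must exist in your voices file):
$ curl http://192.168.0.108:8080/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{"model": "kokoro-tts", "input": "Hello from LocalAI.", "voice": "af_sky", "response_format": "wav"}' \
--output kokoro_test.wav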
$ git clone https://github.com/mudler/LocalAI
$ cd LocalAI
$ docker build \
--build-arg SKIP_DRIVERS=false \
--build-arg BUILD_TYPE=cublas \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg BASE_IMAGE=ubuntu:24.04 \
--build-arg IMAGE_TYPE=core \
-t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-core .
$ docker run -ti --gpus all --name local-ai \
-p 8080:8080 \
-v $PWD/models:/models \
-v $PWD/backends:/backends \
-e DEBUG=true \
quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-core \
--models-path /models \
--context-size 700 \
--threads 4
$ docker build \
--build-arg BUILD_TYPE=cublas \
--build-arg BASE_IMAGE=nvidia/cuda:13.0.0-devel-ubuntu24.04 \
--build-arg CUDA_MAJOR_VERSION=13 \
--build-arg CUDA_MINOR_VERSION=0 \
--build-arg UBUNTU_VERSION=2404 \
--build-arg TARGETARCH=arm64 \
--build-arg BACKEND=qwen-tts \
-t local-ai-qwen-tts-cuda13:latest \
-f backend/Dockerfile.python .
$ vi docker-compose.yml
version: "3.9"
services:
localai:
image: localai/localai:latest
container_name: localai
runtime: nvidia
ports:
- "8080:8080"
volumes:
- ./models:/models
- ./backends:/build/backend
environment:
# 👇 多個 backend image,用逗號分隔
LOCALAI_BACKEND_IMAGE=local-ai-qwen-asr-cuda13,local-ai-qwen-tts-cuda13
# 常見 GPU 設定
NVIDIA_VISIBLE_DEVICES=all
NVIDIA_DRIVER_CAPABILITIES=compute,utility
Export the backend image's root filesystem to work on its contents directly:
$ docker create --name tmp-qwen local-ai-qwen-tts-cuda13:latest /bin/true
$ mkdir qwen-tss-rootfs
$ docker export tmp-qwen | tar -xvf - -C ./qwen-tss-rootfs
$ docker rm tmp-qwen
$ cd qwen-tss-rootfs
$ mv venv venv.bak
$ python3 -m venv venv
$ source venv/bin/activate
$ which python
# should be .../qwen-tss-rootfs/venv/bin/python
$ pip --version
# should point inside the venv
$ pip install grpcio-tools==1.71.0 grpcio==1.71.0
$ python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. backend.proto
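To confirm the regenerated stubs are importable from the new venv (grpc_tools.protoc emits backend_pb2.py and backend_pb2_grpc.py from backend.proto):
$ python -c "import backend_pb2, backend_pb2_grpc; print('stubs OK')"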