
Tuesday, December 30, 2025

Using TRT LLM on DGX Spark

Reference: under https://build.nvidia.com/spark, see
TRT LLM for Inference
NVFP4 Quantization
Reference: https://nvidia.github.io/TensorRT-LLM/1.0.0rc2/commands/trtllm-serve.html

# Configure Docker permissions
$ sudo usermod -aG docker $USER
$ newgrp docker
$ id
uid=1000(spark) gid=988(docker) groups=988(docker),4(adm),27(sudo),29(audio),30(dip),46(plugdev),100(users),122(lpadmin),1000(spark)
$ ps
    PID TTY          TIME CMD
   6123 pts/1    00:00:00 bash
  24590 pts/1    00:00:00 bash
  24597 pts/1    00:00:00 ps

# Verify environment prerequisites
$ nvidia-smi
$ docker run --rm --gpus all nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev nvidia-smi

$ export HF_TOKEN=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
$ docker run --rm -it --gpus all \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  python -c "import tensorrt_llm; print(f'TensorRT-LLM version: {tensorrt_llm.__version__}')"
The last two lines of output:
[TensorRT-LLM] TensorRT-LLM version: 1.1.0rc3
TensorRT-LLM version: 1.1.0rc3

# Create Hugging Face cache directory
$ mkdir -p $HOME/.cache/huggingface/
## If you need to use a different cache location
$ export HF_HOME=/mnt/Data/huggingface
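If HF_HOME is moved like this, the cache mount in the docker run commands below should point at the new location instead of $HOME/.cache/huggingface (a hedged adjustment; the rest of these notes keep the default path):
$ mkdir -p $HF_HOME
# and in each docker run replace the cache mount with, e.g.
#   -v $HF_HOME:/root/.cache/huggingface/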

$ export MODEL_HANDLE="openai/gpt-oss-20b"
$ docker run \
  -e MODEL_HANDLE=$MODEL_HANDLE \
  -e HF_TOKEN=$HF_TOKEN \
  -v $HOME/.cache/huggingface/:/root/.cache/huggingface/ \
  --rm -it --ulimit memlock=-1 --ulimit stack=67108864 \
  --gpus=all --ipc=host --network host \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c '
    export TIKTOKEN_ENCODINGS_BASE="/tmp/harmony-reqs" && \
    mkdir -p $TIKTOKEN_ENCODINGS_BASE && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken && \
    hf download $MODEL_HANDLE && \
    python examples/llm-api/quickstart_advanced.py \
      --model_dir $MODEL_HANDLE \
      --prompt "Paris is great because" \
      --max_tokens 64
    '

==================
# Serve LLM with OpenAI-compatible API
$ export MODEL_HANDLE="openai/gpt-oss-20b"
$ export MODEL_HANDLE="openai/gpt-oss-120b"
$ export MODEL_HANDLE="meta-llama/Llama-3.3-70B-Instruct"
$ export MODEL_HANDLE="Qwen/Qwen3-4B-Instruct-2507"
$ export MODEL_HANDLE="deepseek-ai/DeepSeek-R1-Distill-Llama-8B'"

$ docker run --name trtllm_llm_server --rm -it --gpus all --ipc host --network host \
  -e HF_TOKEN=$HF_TOKEN \
  -e MODEL_HANDLE="$MODEL_HANDLE" \
  -v $HOME/.cache/huggingface/:/root/.cache/huggingface/ \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c '
    export TIKTOKEN_ENCODINGS_BASE="/tmp/harmony-reqs" && \
    mkdir -p $TIKTOKEN_ENCODINGS_BASE && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken && \
    hf download $MODEL_HANDLE && \
    cat > /tmp/extra-llm-api-config.yml <<EOF
print_iter_log: false
kv_cache_config:
  dtype: "auto"
  free_gpu_memory_fraction: 0.4
cuda_graph_config:
  enable_padding: true
disable_overlap_scheduler: true
EOF
    trtllm-serve "$MODEL_HANDLE" \
      --max_batch_size 8 \
      --trust_remote_code \
      --host 0.0.0.0 \
      --port 8000 \
      --extra_llm_api_options /tmp/extra-llm-api-config.yml
  '
$ curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"$MODEL_HANDLE"'",
    "messages": [{"role": "user", "content": "請你自我介紹"}],
    "max_tokens": 64
  }'
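Before sending chat requests, the server can also be checked with the standard OpenAI-compatible model listing (assuming the default /v1/models route):
$ curl -s http://localhost:8000/v1/models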

# Cleanup and rollback
sudo chown -R "$USER:$USER" "$HOME/.cache/huggingface"
rm -rf $HOME/.cache/huggingface/
docker image prune -f
docker rmi nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev


==================
# NVFP4 Quantization
$ mkdir -p ./output_models
$ chmod 755 ./output_models
# Quantize a model downloaded from Hugging Face
$ docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
  -v "./output_models:/workspace/output_models" \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  -e HF_TOKEN=$HF_TOKEN \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c "
    git clone -b 0.35.0 --single-branch https://github.com/NVIDIA/Model-Optimizer.git /app/TensorRT-Model-Optimizer && \
    cd /app/TensorRT-Model-Optimizer && pip install -e '.[dev]' && \
    export ROOT_SAVE_PATH='/workspace/output_models' && \
    /app/TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh \
    --model $MODEL_HANDLE \
    --quant nvfp4 \
    --tp 1 \
    --export_fmt hf
  "
# A pynvml.NVMLError_NotSupported: Not Supported error may appear; it can be ignored
# Quantize a local model
$ docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
  -v "./output_models:/workspace/output_models" \
  -v /mnt/models:/mnt/models \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  -e HF_TOKEN=$HF_TOKEN \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c "
    git clone -b 0.35.0 --single-branch https://github.com/NVIDIA/Model-Optimizer.git /app/TensorRT-Model-Optimizer && \
    cd /app/TensorRT-Model-Optimizer && pip install -e '.[dev]' && \
    export ROOT_SAVE_PATH='/workspace/output_models' && \
    /app/TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh \
    --model /mnt/models/Qwen2.5-Coder-7B \
    --quant nvfp4 \
    --tp 1 \
    --export_fmt hf
  "

$ ls -la ./output_models/
$ find ./output_models/ -name "*.bin" -o -name "*.safetensors" -o -name "config.json"
$ export MODEL_PATH="./output_models/saved_models_DeepSeek-R1-Distill-Llama-8B_nvfp4_hf/"
$ export MODEL_PATH="./output_models/saved_models_Qwen3-4B-Instruct-2507_nvfp4_hf/"
# Run inference with the quantized model
$ docker run \
  -e HF_TOKEN=$HF_TOKEN \
  -v $HOME/.cache/huggingface/:/root/.cache/huggingface/ \
  -v "$MODEL_PATH:/workspace/model" \
  --rm -it --ulimit memlock=-1 --ulimit stack=67108864 \
  --gpus=all --ipc=host --network host \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c '
    python examples/llm-api/quickstart_advanced.py \
      --model_dir /workspace/model/ \
      --prompt "Paris is great because" \
      --max_tokens 64
    '
# Serve the model with OpenAI-compatible API
$ docker run \
  -e HF_TOKEN=$HF_TOKEN \
  -v "$MODEL_PATH:/workspace/model" \
  --rm -it --ulimit memlock=-1 --ulimit stack=67108864 \
  --gpus=all --ipc=host --network host \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  trtllm-serve /workspace/model \
    --backend pytorch \
    --max_batch_size 4 \
    --host 0.0.0.0 \
    --port 8000
$ curl -X POST http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-oss-20b",
    "messages": [{"role": "user", "content": "What is artificial intelligence?"}],
    "max_tokens": 100,
    "temperature": 0.7,
    "stream": false
  }'

==================
# trtllm-serve with a local model
$ export MODEL_HANDLE="/mnt/models/gpt-oss-20b"    # 0.8:42.6GB 48W 16s | 0.4:26.1GB 48W 21s | 0.2:26.6GB 46W 17s
$ export MODEL_HANDLE="/mnt/models/gpt-oss-120b"   # 0.8:  72GB 50W 35s | 0.5:70.4GB 49W 40s | 0.4:68.7GB 49W 39s 
$ docker run --name trtllm_llm_server --rm -it --gpus all --ipc host --network host \
  -e MODEL_HANDLE="$MODEL_HANDLE" \
  -v /mnt/models:/mnt/models \
  -v $HOME/.cache/huggingface/:/root/.cache/huggingface/ \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c '
    export TIKTOKEN_ENCODINGS_BASE="/tmp/harmony-reqs" && \
    mkdir -p $TIKTOKEN_ENCODINGS_BASE && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken && \
    cat > /tmp/extra-llm-api-config.yml <<EOF
print_iter_log: false
kv_cache_config:
  dtype: "auto"
  free_gpu_memory_fraction: 0.5
cuda_graph_config:
  enable_padding: true
disable_overlap_scheduler: true
EOF

    trtllm-serve "$MODEL_HANDLE" \
      --max_batch_size 8 \
      --max_seq_len 65536 \
      --max_num_tokens 131072 \
      --trust_remote_code \
      --host 0.0.0.0 \
      --port 8000 \
      --extra_llm_api_options /tmp/extra-llm-api-config.yml
  '
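Once the server is up, the same curl test as in the previous section works; with a local model the model field is the path that was passed to trtllm-serve (a quick sketch mirroring the earlier request):
$ curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"$MODEL_HANDLE"'",
    "messages": [{"role": "user", "content": "Introduce yourself"}],
    "max_tokens": 64
  }'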

Using the vLLM server on DGX Spark

Testing shows vLLM is worse than TRT LLM in both memory usage and power consumption

$ sudo mount -t nfs 192.168.0.107:/mnt/Data/LangGraph/HuggingFace/models /mnt/models

$ export MODEL_HANDLE="/mnt/models/gpt-oss-20b"    # 0.8:96.9GB 43W 148s | 0.4:49.4GB 39W 120s
$ export MODEL_HANDLE="/mnt/models/gpt-oss-120b"   # 0.8:97.9GB 41W  75s | 0.7:86.5GB 40W 104s

$ docker run --rm --name vllm_server -it --gpus all \
-p 8000:8000 \
-v /mnt/models:/models \
nvcr.io/nvidia/vllm:25.11-py3 \
vllm serve "/models/gpt-oss-20b" \
--trust_remote_code \
--max-num-seqs 2 \
--quantization mxfp4 \
--gpu-memory-utilization 0.3 \
--served-model-name llm_chat \
--api-key token-abc123
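A quick request against this vLLM endpoint (same pattern as the curl test in the December 22 entry; the server was started with --served-model-name llm_chat and --api-key token-abc123):
$ curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer token-abc123" \
  -d '{"model": "llm_chat", "messages": [{"role": "user", "content": "hello"}], "max_tokens": 64}'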

If the run fails, free the page cache and try again
$ sudo sh -c 'sync && echo 3 > /proc/sys/vm/drop_caches'

Open WebUI with vLLM or TRT LLM on DGX Spark

# Open WebUI connected to vLLM or TRT LLM
$ docker run -d --rm \
  --name open-webui-vllm \
  -p 8501:8080 \
  -v open-webui:/app/backend/data \
  -e OPENAI_API_BASE_URL=http://192.168.0.108:8000/v1 \
  -e OPENAI_API_KEY=token-abc123 \
  ghcr.io/open-webui/open-webui:main
The container port 8080 is fixed and does not need to be changed
Open Open WebUI (http://localhost:8501)
After logging in, click the user menu at the bottom left → Admin Panel → Settings → Connections → OpenAI
You will see an automatically configured OpenAI connection (pointing to vLLM)

The command above supports only a single vLLM model
Use the command below instead and add multiple vLLM endpoints manually in the web UI
$ docker run -d --rm --gpus=all \
  -p 8501:8080 \
  -v open-webui:/app/backend/data \
  -v open-webui-ollama:/root/.ollama \
  --name open-webui ghcr.io/open-webui/open-webui:ollama
If it fails to start, the open-webui volume is probably inconsistent; remove it with the following command
$ docker volume rm open-webui

In the web UI, click the user name at the bottom left / Admin Panel / Settings / Connections / +
URL: http://192.168.0.108:8000/v1
Click Save
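To confirm the backend is reachable from the Open WebUI host before adding it, the endpoint can be probed directly (a minimal check; drop the Authorization header if the server was started without an API key):
$ curl -s http://192.168.0.108:8000/v1/models -H "Authorization: Bearer token-abc123"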

Using a dashboard on DGX Spark

Reference: https://github.com/DanTup/dgx_dashboard
It shows memory usage, temperature, power draw, and Docker containers

$ docker run -d --gpus all \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 8080:8080 \
    --pull=always \
    --restart=unless-stopped \
    --name dashboard \
    ghcr.io/dantup/dgx_dashboard:latest
    
$ docker stop dashboard && docker rm dashboard

firefox http://192.168.0.108:8080/

Wednesday, December 24, 2025

vLLM and TRT LLM on DGX Spark

NVFP4 Quantization support is limited; models that have worked so far:
DeepSeek-R1-Distill-Llama-8B
Qwen3-4B-Instruct-2507
According to ChatGPT, none of the Qwen3-xB-AxxB variants work; the name must not contain AxxB

Comparing gpt-oss-20b and gpt-oss-120b,
trtllm indeed uses less memory than vllm and is faster

Tuesday, December 23, 2025

Temperature monitoring on DGX Spark

$ sudo apt-get install lm-sensors
$ sudo sensors-detect
It asks many questions; usually answer YES
$ sudo apt-get install psensor
$ psensor
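To confirm that the detected chips actually report readings, lm-sensors also has a terminal readout:
$ sensors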

$ sudo apt-get install gnome-shell-extension-manager
$ sudo apt-get install chrome-gnome-shell
$ sudo apt-get install gnome-browser-connector

Do not use chromium; use firefox directly
Open https://extensions.gnome.org in firefox
Install the GNOME Shell integration extension
Search for Vitals and install it


Monday, December 22, 2025

vLLM installation test on DGX Spark

$ curl -LsSf https://hf.co/cli/install.sh | bash
$ hf download openai/gpt-oss-20b --local-dir ./models/gpt-oss-20b

$ docker pull nvcr.io/nvidia/vllm:25.11-py3
$ docker run -it --gpus all -p 8000:8000 \
-v /mnt/models:/models \
nvcr.io/nvidia/vllm:25.11-py3 \
vllm serve "/models/gpt-oss-20b" \
--served-model-name llm_chat \
--api-key token-abc123

$ curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer token-abc123" \
-d '{
    "model": "llm_chat",
    "messages": [{"role": "user", "content": "你好,請自我介紹"}],
    "max_tokens": 500
}'

To avoid out-of-memory (OOM):
$ sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'

NFS server and client on Ubuntu

Host A (server)
$ sudo apt update
$ sudo apt install -y nfs-kernel-server
$ sudo vi /etc/exports
/mnt/Data/models 192.168.1.20(ro,sync,no_subtree_check)
ro: read-only (strongly recommended, so the models cannot be modified by mistake)
sync: data consistency
no_subtree_check: performance and stability
/mnt/Data/models 192.168.1.20(ro,all_squash,anonuid=1000,anongid=1000,sync,no_subtree_check)
all_squash: map every client user to the anonymous user
anonuid/anongid: map the anonymous user to a specific UID/GID

$ sudo exportfs -ra
$ sudo systemctl restart nfs-kernel-server
$ showmount -e localhost

Host B (client)
$ sudo apt update
$ sudo apt install -y nfs-common
$ sudo mkdir -p /mnt/models
$ sudo mount -t nfs 192.168.1.10:/mnt/Data/models /mnt/models
$ ls /mnt/models
$ sudo vi /etc/fstab
192.168.1.10:/mnt/Data/models  /mnt/models  nfs  ro,_netdev,auto  0  0
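After editing /etc/fstab, the entry can be verified without rebooting (mount -a applies fstab, df confirms the share is mounted):
$ sudo mount -a
$ df -h /mnt/models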

Installing Text to Knowledge Graph on DGX Spark

Reference: https://build.nvidia.com/spark/txt2kg/instructions

$ git clone https://github.com/NVIDIA/dgx-spark-playbooks
$ cd dgx-spark-playbooks/nvidia/txt2kg/assets
$ ./start.sh
Open http://localhost:3001 in a browser
$ ./stop.sh

Since the Open WebUI with Ollama Docker setup is also in use, switch the compose_ollama_data volume to open-webui-ollama
$ docker volume ls
$ docker volume inspect compose_ollama_data
[
    {
        "CreatedAt": "2025-12-19T14:38:37+08:00",
        "Driver": "local",
        "Labels": {
            "com.docker.compose.config-hash": "b60a4e44fe9b008057f3eaff8c4477427e0db99c0c9a70285f81b92ba016830d",
            "com.docker.compose.project": "compose",
            "com.docker.compose.version": "2.40.0",
            "com.docker.compose.volume": "ollama_data"
        },
        "Mountpoint": "/var/lib/docker/volumes/compose_ollama_data/_data",
        "Name": "compose_ollama_data",
        "Options": null,
        "Scope": "local"
    }
]
$ cd dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/compose
$ cp docker-compose.yml docker-compose.yml.bak
$ vi docker-compose.yml
  ollama:
    volumes:
      - ollama_data:/root/.ollama
change to
  ollama:
    volumes:
      - open-webui-ollama:/root/.ollama
volumes:
  ollama_data:
change to
volumes:
  open-webui-ollama:
    external: true

Check which volume the ollama-compose container currently uses
$ docker inspect ollama-compose --format '{{ json .Mounts }}'

Mount the volume into a temporary container and clear its contents
$ docker run --rm \
  -v compose_ollama_data:/data \
  alpine \
  sh -c "rm -rf /data/*"
Make sure no container is still using it
$ docker ps -a --filter volume=compose_ollama_data
Delete the volume
$ docker volume rm compose_ollama_data


$ export OLLAMA_MODEL=gpt-oss:20b
$ docker exec ollama-compose ollama list
$ docker exec ollama-compose ollama pull gpt-oss:20b

Friday, December 19, 2025

NVIDIA Sync terminal for DGX Spark

The following error appears:
Bad permissions. Try removing permissions for user: UNKNOWN\\UNKNOWN (S-1-15-3-1024-3299255270-1847605585-2201808924-710406709-3613095291-873286183-3101090833-2655911836) on file C:/Users/xxx/AppData/Local/NVIDIA Corporation/Sync/config/ssh_config.
Bad owner or permissions on C:/Users/xxx/AppData/Local/NVIDIA Corporation/Sync/config/ssh_config

Run in PowerShell:
takeown /f "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\ssh_config"
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\ssh_config" /inheritance:r
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\ssh_config" /grant xxx:F

Confirm with the following command:
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\ssh_config"
C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\ssh_config ThinkPad-E000\xxx:(F)

The following error appears:
Bad permissions. Try removing permissions for user: UNKNOWN\\UNKNOWN (S-1-15-3-1024-3299255270-1847605585-2201808924-710406709-3613095291-873286183-3101090833-2655911836) on file C:/Users/xxx/AppData/Local/NVIDIA Corporation/Sync/config/nvsync.key.
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@         WARNING: UNPROTECTED PRIVATE KEY FILE!          @
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Permissions for 'C:\\Users\\xxx\\AppData\\Local\\NVIDIA Corporation\\Sync\\config\\nvsync.key' are too open.
It is required that your private key files are NOT accessible by others.
This private key will be ignored.
Load key "C:\\Users\\xxx\\AppData\\Local\\NVIDIA Corporation\\Sync\\config\\nvsync.key": bad permissions
spark@gx10-spark.local's password:

Run in PowerShell:
takeown /f "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\nvsync.key"
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\nvsync.key" /inheritance:r
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\nvsync.key" /grant xxx:F

Confirm with the following command:
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\nvsync.key"
C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\nvsync.key ThinkPad-000\xxx:(F)

Monday, December 15, 2025

Installing vLLM on Jetson Orin

https://hackmd.io/@johnnynunez/S1vJlvThee
https://pypi.jetson-ai-lab.io/jp6/cu126

$ uv init uv_vllm
$ cd uv_vllm
$ rm .python-version
$ vi pyproject.toml
requires-python = "==3.10.*"
$ uv venv --python 3.10

$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/62a/1beee9f2f1470/torch-2.8.0-cp310-cp310-linux_aarch64.whl#sha256=62a1beee9f2f147076a974d2942c90060c12771c94740830327cae705b2595fc
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/81a/775c8af36ac85/torchaudio-2.8.0-cp310-cp310-linux_aarch64.whl#sha256=81a775c8af36ac859fb3f4a1b2f662d5fcf284a835b6bb4ed8d0827a6aa9c0b7
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/907/c4c1933789645/torchvision-0.23.0-cp310-cp310-linux_aarch64.whl#sha256=907c4c1933789645ebb20dd9181d40f8647978e6bd30086ae7b01febb937d2d1
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/9da/4bcb8e8f0eba0/triton-3.4.0-cp310-cp310-linux_aarch64.whl#sha256=9da4bcb8e8f0eba00a097ad8c57b26102add499e520d67fb2d5362bebf976ca3
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/014/eff8ba676c7a3/bitsandbytes-0.47.0.dev0-cp310-cp310-linux_aarch64.whl#sha256=014eff8ba676c7a3830b9430744115af50790d2f7ff1b57f155a8839bcc39104

Avoid exhausting resources during the build:
$ ulimit -v
unlimited
$ ulimit -v $((26*1024*1024)) # 26GB
$ export MAX_JOBS=6

$ export TORCH_CUDA_ARCH_LIST="8.6;8.7"
$ export TRITON_PTXAS_PATH=/usr/local/cuda-12.6/bin/ptxas
$ export PATH=/usr/local/cuda-12.6/bin:$PATH
$ export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64:$LD_LIBRARY_PATH

$ git clone --recursive https://github.com/vllm-project/vllm.git
$ cd vllm
$ git checkout v0.11.0
$ python3 use_existing_torch.py
$ uv pip install -r requirements/build.txt
Do not install directly with the command below; the install succeeds but the result is unusable, probably because the vllm source directory name conflicts with the installed python package (and the directory cannot simply be renamed)
$ uv pip install --no-build-isolation -e .
Build a wheel instead
$ uv build --no-build-isolation --wheel
$ cd ..
$ mv vllm vllm_v0.11.0
$ uv remove vllm
$ uv add vllm_v0.11.0/dist/vllm-0.11.1.dev0+gb8b302cde.d20251214.cu126-cp310-cp310-linux_aarch64.whl
$ uv pip list
$ uv run python -c "import vllm; print(vllm.__version__)"

rm -rf build dist *.egg-info
find . -name "*.so" -delete
uv cache clean
rm -rf ~/.cache/torch_extensions

$ uv run vllm serve "/mnt/Data/LangGraph/HuggingFace/models/Qwen3-4B-Instruct-2507" --trust_remote_code --tensor-parallel-size 1 --max-model-len 20k --max-num-seqs 16 --gpu-memory-utilization 0.6 --quantization bitsandbytes --api-key token-abc123

Available --quantization options: ['awq', 'deepspeedfp', 'tpu_int8', 'fp8', 'ptpc_fp8', 'fbgemm_fp8', 'modelopt', 'modelopt_fp4', 'bitblas', 'gguf', 'gptq_marlin_24', 'gptq_marlin', 'gptq_bitblas', 'awq_marlin', 'gptq', 'compressed-tensors', 'bitsandbytes', 'hqq', 'experts_int8', 'ipex', 'quark', 'moe_wna16', 'torchao', 'auto-round', 'rtn', 'inc', 'mxfp4', 'petit_nvfp4']

Quantization options verified to work: [bitsandbytes fp8 experts_int8]

Thursday, October 2, 2025

Reinstalling Jetson Orin

$ sudo apt-get update
$ sudo apt-get upgrade
$ sudo apt install python3-pip
$  sudo apt-get install curl
$ sudo pip3 install -U jetson-stats
$ sudo jtop
On the jtop info page, CUDA and cuDNN show MISSING; nvidia-jetpack needs to be installed
$ sudo apt-get install nvidia-jetpack
The following error appears:
E: Unable to locate package nvidia-jetpack
$ sudo vi /etc/apt/sources.list.d/nvidia-l4t-apt-source.list 
deb https://repo.download.nvidia.com/jetson/common r36.3 main
deb https://repo.download.nvidia.com/jetson/t234 r36.3 main
$ sudo apt-get update
$ sudo apt-get install nvidia-jetpack

Install chromium
$ sudo apt-get install chromium-browser
$ snap download snapd --revision=24724
$ sudo snap ack snapd_24724.assert
$ sudo snap install snapd_24724.snap
$ sudo snap refresh --hold snapd
$ chromium-browser


Install DeepStream: https://docs.nvidia.com/metropolis/deepstream/7.0/dev-guide/text/DS_Installation.html
$ pkg-config --modversion glib-2.0
2.72.4
$ sudo apt install meson ninja-build
$ git clone https://github.com/GNOME/glib.git
$ cd glib
$ git checkout <glib-version-branch>
# e.g. 2.76.6
$ meson build --prefix=/usr
$ ninja -C build/
$ cd build/
$ ninja install
$ pkg-config --modversion glib-2.0
$ sudo apt install \
libssl3 \
libssl-dev \
libgstreamer1.0-0 \
gstreamer1.0-tools \
gstreamer1.0-plugins-good \
gstreamer1.0-plugins-bad \
gstreamer1.0-plugins-ugly \
gstreamer1.0-libav \
libgstreamer-plugins-base1.0-dev \
libgstrtspserver-1.0-0 \
libjansson4 \
libyaml-cpp-dev
$ git clone https://github.com/confluentinc/librdkafka.git
$ cd librdkafka
$ git checkout tags/v2.2.0
$ ./configure --enable-ssl
$ make
$ sudo make install
$ sudo mkdir -p /opt/nvidia/deepstream/deepstream/lib
$ sudo cp /usr/local/lib/librdkafka* /opt/nvidia/deepstream/deepstream/lib
$ sudo ldconfig
$ wget --content-disposition 'https://api.ngc.nvidia.com/v2/resources/org/nvidia/deepstream/7.0/files?redirect=true&path=deepstream_sdk_v7.0.0_jetson.tbz2' --output-document 'deepstream_sdk_v7.0.0_jetson.tbz2'
$ sudo tar -xvf deepstream_sdk_v7.0.0_jetson.tbz2 -C /
$ cd /opt/nvidia/deepstream/deepstream-7.0
$ sudo ./install.sh
$ sudo ldconfig

Wednesday, October 1, 2025

Installing vino on NVIDIA Jetson Orin

$ sudo apt-get update
$ sudo apt-get upgrade
$ sudo apt-get install vino
$ mkdir -p ~/.config/autostart
$ cp /usr/share/applications/vino-server.desktop ~/.config/autostart/.
$ cd /usr/lib/systemd/user/graphical-session.target.wants
$ sudo ln -s ../vino-server.service ./.
$ gsettings set org.gnome.Vino prompt-enabled false
$ gsettings set org.gnome.Vino require-encryption false
$ gsettings set org.gnome.Vino authentication-methods "['vnc']"
$ gsettings set org.gnome.Vino vnc-password $(echo -n 'thepassword'|base64)
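To check that the vino server is listening after the next graphical login (a minimal check; VNC normally listens on port 5900):
$ ss -tlnp | grep 5900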

Wednesday, June 25, 2025

Changing the Docker Root Dir

Reference: https://ithelp.ithome.com.tw/articles/10235112
Reference: https://medium.com/@calvineotieno010/change-docker-default-root-data-directory-a1d9271056f4

$ docker info
$ sudo vi /etc/docker/daemon.json
{
  "data-root": "/path/to/new/docker_data"
}
$ sudo systemctl stop docker
$ sudo systemctl stop docker.socket
$ sudo rsync -avh /var/lib/docker/* /path/to/new/docker_data
$ sudo systemctl start docker
$ sudo systemctl start docker.socket
$ docker info
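A quick way to confirm the new data root took effect after the restart:
$ docker info | grep "Docker Root Dir"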

Wednesday, June 18, 2025

JetPack 6.0: upgrading CUDA 12.2 to 12.6

This entry revisits the "Installing Flux & ComfyUI" entry below and spells out some of its details more clearly

Reference: https://www.jetson-ai-lab.com/tutorial_comfyui_flux.html
Reference: https://docs.nvidia.com/metropolis/deepstream/dev-guide/text/DS_Installation.html#id10

$ sudo jtop
CUDA: 12.2.140
cuDNN: 8.9.4.25
TensorRT: 8.6.2.3

Because the upgrade from cuda-12.2 to cuda-12.6
removes Deepstream-7.0 in the process,
Deepstream-7.1 has to be installed manually afterwards

Record the current package versions
$ dpkg -l>dpkg_jp6.0.txt

$ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-aarch64.sh
$ chmod +x Miniconda3-latest-Linux-aarch64.sh
$ ./Miniconda3-latest-Linux-aarch64.sh
$ conda update conda
$ conda create -n comfyui python=3.10
$ conda init bash
$ cat .bashrc
$ conda activate comfyui
$ conda info --envs
$ conda deactivate
$ conda activate comfyui
$ conda list

$ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
$ sudo dpkg -i cuda-keyring_1.1-1_all.deb
$ sudo apt-get update
$ sudo apt-get -y install cuda-toolkit-12-6 cuda-compat-12-6
$ update-alternatives --list cuda
$ update-alternatives --display cuda
$ sudo update-alternatives --config cuda
cuda-12.6

$ apt list -a cudnn
$ sudo apt-get install cudnn=9.10.2-1
$ apt list -a python3-libnvinfer
$ sudo apt-get install python3-libnvinfer=10.7.0.23+cuda12.6

The following packages will be REMOVED:
  deepstream-7.0 libnvparsers-dev nvidia-tensorrt-dev
$ sudo apt-get install python3-libnvinfer-dev=10.7.0.23-1+cuda12.6
$ sudo jtop
CUDA: 12.6.85
cuDNN: 9.10.2
TensorRT: 10.7.0.23
$ wget --content-disposition 'https://api.ngc.nvidia.com/v2/resources/org/nvidia/deepstream/7.1/files?redirect=true&path=deepstream-7.1_7.1.0-1_arm64.deb' -O deepstream-7.1_7.1.0-1_arm64.deb
$ sudo apt-get install ./deepstream-7.1_7.1.0-1_arm64.deb

$ export BNB_CUDA_VERSION=126
$ export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64:$LD_LIBRARY_PATH

$ git clone https://github.com/timdettmers/bitsandbytes.git
$ cd bitsandbytes

$ pip uninstall numpy
$ pip install "numpy<2.0"
$ mkdir -p build
$ cd build
$ cmake .. -DCOMPUTE_BACKEND=cuda -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.6
$ make -j$(nproc)
$ cd ..
$ python setup.py install

$ pip install http://jetson.webredirect.org/jp6/cu124/+f/5fe/ee5f5d1a75229/torch-2.3.0-cp310-cp310-linux_aarch64.whl
$ pip install http://jetson.webredirect.org/jp6/cu124/+f/988/cb71323efff87/torchvision-0.18.0a0+6043bc2-cp310-cp310-linux_aarch64.whl
$ pip install http://jetson.webredirect.org/jp6/cu124/+f/0aa/a066463c02b4a/torchaudio-2.3.0+952ea74-cp310-cp310-linux_aarch64.whl

$ python3
>>> import bitsandbytes as bnb
>>> print(bnb.__version__)

$ git clone https://github.com/comfyanonymous/ComfyUI.git
$ cd ComfyUI
$ pip install -r requirements.txt
$ cd custom_nodes
$ git clone https://github.com/ltdrdata/ComfyUI-Manager.git
$ cd ..
$ mkdir workflows
$ cd workflows
$ wget https://www.jetson-ai-lab.com/assets/workflow_agx_orin_4steps.json
$ cd ..
Go to https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
and download flux1-schnell.safetensors into models/unet
and ae.safetensors into models/vae/FLUX1
Go to https://huggingface.co/stabilityai/stable-diffusion-3-medium/tree/main/text_encoders
and download clip_l.safetensors and t5xxl_fp8_e4m3fn.safetensors into models/clip
$ python main.py --port=8080
On the http://127.0.0.1:8080/ page, click Queue Prompt and wait...
When loading workflow_agx_orin_4steps.json, change the vae_name of the "Load VAE" node to FLUX1/ae.safetensors

Record the current package versions
$ dpkg -l>dpkg_jp6.0_comfyui.txt

To roll cuda-12.6 back to cuda-12.2:
refer to the recorded version lists dpkg_jp6.0.txt and dpkg_jp6.0_comfyui.txt
and download the packages from https://repo.download.nvidia.com/jetson
Remove the unnecessary packages
$ sudo apt-get remove --purge package
Installing the packages often hits dependency errors, so the install order sometimes has to be changed,
and it may even be necessary to run the installation once more at the end to make sure everything is installed
$ cd jetpack_6.0
$ ./download.sh
$ ./rollback.sh
$ ./install.sh

Reference: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/l4t-cuda/tags
$ xhost +
$ docker run -it --rm --net=host --runtime nvidia\
  -e DISPLAY=$DISPLAY \
  -v /tmp/.X11-unix/:/tmp/.X11-unix \
  nvcr.io/nvidia/l4t-cuda:11.4.19-runtime
$ docker run -it --rm --net=host --runtime nvidia\
  -e DISPLAY=$DISPLAY \
  -v /tmp/.X11-unix/:/tmp/.X11-unix \
  nvcr.io/nvidia/l4t-cuda:12.2.12-runtime
$ docker run -it --rm --net=host --runtime nvidia\
  -e DISPLAY=$DISPLAY \
  -v /tmp/.X11-unix/:/tmp/.X11-unix \
  nvcr.io/nvidia/l4t-cuda:12.6.11-runtime
docker: Error response from daemon: failed to create task for container: 
failed to create shim task: OCI runtime create failed: 
failed to create NVIDIA Container Runtime: failed to construct OCI spec modifier: 
requirements not met: unsatisfied condition: cuda>=12.6 (cuda=12.2): unknown.
JetPack 6.0 ships with CUDA 12.2,
so the Docker image used cannot require a CUDA version newer than 12.2

$ docker run -it --rm --net=host --runtime nvidia\
  -e DISPLAY=$DISPLAY \
  -v /tmp/.X11-unix/:/tmp/.X11-unix \
  -w /opt/nvidia/deepstream/deepstream-7.0 \
  nvcr.io/nvidia/deepstream-l4t:7.0-samples-multiarch

JetPack 6.0 ships with CUDA 12.2,
so the Docker image used cannot require a CUDA version newer than 12.2,
which also means DeepStream 7.1 cannot be used
$ docker run -it --rm --net=host --runtime nvidia\
  -e DISPLAY=$DISPLAY \
  -v /tmp/.X11-unix/:/tmp/.X11-unix \
  -w /opt/nvidia/deepstream/deepstream-7.1 \
  nvcr.io/nvidia/deepstream-l4t:7.1-samples-multiarch

Thursday, June 12, 2025

Installing Flux & ComfyUI

Reference: https://www.jetson-ai-lab.com/tutorial_comfyui_flux.html

$ sudo jtop
CUDA: 12.2.140
cuDNN: 8.9.4.25
TensorRT: 8.6.2.3

Because the upgrade from cuda-12.2 to cuda-12.6
removes Deepstream-7.0 in the process,
Deepstream-7.1 has to be installed manually afterwards

Record the current package versions
$ dpkg -l>dpkg_jp6.0.txt

$ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-aarch64.sh
$ chmod +x Miniconda3-latest-Linux-aarch64.sh
$ ./Miniconda3-latest-Linux-aarch64.sh
$ conda update conda
$ conda create -n comfyui python=3.10
$ conda init bash
$ cat .bashrc
$ conda activate comfyui
$ conda info --envs
$ conda deactivate
$ conda activate comfyui
$ conda list

$ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
$ sudo dpkg -i cuda-keyring_1.1-1_all.deb
$ sudo apt-get update
$ sudo apt-get -y install cuda-toolkit-12-6 cuda-compat-12-6
$ update-alternatives --list cuda
$ update-alternatives --display cuda
$ sudo update-alternatives --config cuda
cuda-12.6

$ sudo apt-get install cudnn=9.5.1-1
$ sudo apt-get install python3-libnvinfer=9.5.1-1
The following packages will be REMOVED:
  deepstream-7.0 libnvparsers-dev nvidia-tensorrt-dev
$ sudo apt-get install python3-libnvinfer-dev=10.6.0.26-1+cuda12.6
$ sudo jtop
CUDA: 12.6.77
cuDNN: 9.5.1
TensorRT: 10.6.0.26
$ wget --content-disposition 'https://api.ngc.nvidia.com/v2/resources/org/nvidia/deepstream/7.1/files?redirect=true&path=deepstream-7.1_7.1.0-1_arm64.deb' -O deepstream-7.1_7.1.0-1_arm64.deb
$ sudo apt-get install ./deepstream-7.1_7.1.0-1_arm64.deb

$ export BNB_CUDA_VERSION=126
$ export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64:$LD_LIBRARY_PATH

$ git clone https://github.com/timdettmers/bitsandbytes.git
$ cd bitsandbytes

$ pip uninstall numpy
$ pip install "numpy<2.0"
$ mkdir -p build
$ cd build
$ cmake .. -DCOMPUTE_BACKEND=cuda -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.6
$ make -j$(nproc)
$ cd ..
$ python setup.py install

$ pip install http://jetson.webredirect.org/jp6/cu124/+f/5fe/ee5f5d1a75229/torch-2.3.0-cp310-cp310-linux_aarch64.whl
$ pip install http://jetson.webredirect.org/jp6/cu124/+f/988/cb71323efff87/torchvision-0.18.0a0+6043bc2-cp310-cp310-linux_aarch64.whl
$ pip install http://jetson.webredirect.org/jp6/cu124/+f/0aa/a066463c02b4a/torchaudio-2.3.0+952ea74-cp310-cp310-linux_aarch64.whl

$ python3
>>> import bitsandbytes as bnb
>>> print(bnb.__version__)

$ git clone https://github.com/comfyanonymous/ComfyUI.git
$ cd ComfyUI
$ pip install -r requirements.txt
$ cd custom_nodes
$ git clone https://github.com/ltdrdata/ComfyUI-Manager.git
$ cd ..
$ mkdir workflows
$ cd workflows
$ wget https://www.jetson-ai-lab.com/assets/workflow_agx_orin_4steps.json
$ cd ..
Go to https://huggingface.co/black-forest-labs/FLUX.1-schnell/tree/main
and download flux1-schnell.safetensors into models/unet
and ae.safetensors into models/vae/FLUX1
Go to https://huggingface.co/stabilityai/stable-diffusion-3-medium/tree/main/text_encoders
and download clip_l.safetensors and t5xxl_fp8_e4m3fn.safetensors into models/clip
$ python main.py --port=8080
http://127.0.0.1:8080/
When loading workflow_agx_orin_4steps.json, change the vae_name of the "Load VAE" node to FLUX1/ae.safetensors

Record the current package versions
$ dpkg -l>dpkg_jp6.0_cuda-12.6.txt

To roll cuda-12.6 back to cuda-12.2:
$ cd jetpack_6.0
$ ./download.sh
$ ./install.sh
Refer to the recorded version lists dpkg_jp6.0.txt and dpkg_jp6.0_cuda-12.6.txt
and remove the unnecessary packages
$ sudo apt-get remove --purge package

Reference: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/l4t-cuda/tags
$ xhost +
$ docker run -it --rm --net=host --runtime nvidia\
  -e DISPLAY=$DISPLAY \
  -v /tmp/.X11-unix/:/tmp/.X11-unix \
  nvcr.io/nvidia/l4t-cuda:11.4.19-runtime
$ docker run -it --rm --net=host --runtime nvidia\
  -e DISPLAY=$DISPLAY \
  -v /tmp/.X11-unix/:/tmp/.X11-unix \
  nvcr.io/nvidia/l4t-cuda:12.2.12-runtime
$ docker run -it --rm --net=host --runtime nvidia\
  -e DISPLAY=$DISPLAY \
  -v /tmp/.X11-unix/:/tmp/.X11-unix \
  nvcr.io/nvidia/l4t-cuda:12.6.11-runtime
docker: Error response from daemon: failed to create task for container: 
failed to create shim task: OCI runtime create failed: 
failed to create NVIDIA Container Runtime: failed to construct OCI spec modifier: 
requirements not met: unsatisfied condition: cuda>=12.6 (cuda=12.2): unknown.


$ docker run -it --rm --net=host --runtime nvidia\
  -e DISPLAY=$DISPLAY \
  -v /tmp/.X11-unix/:/tmp/.X11-unix \
  -w /opt/nvidia/deepstream/deepstream-7.0 \
  nvcr.io/nvidia/deepstream-l4t:7.0-samples-multiarch

docker run -it --rm --net=host --runtime nvidia\
  -e DISPLAY=$DISPLAY \
  -v /tmp/.X11-unix/:/tmp/.X11-unix \
  -w /opt/nvidia/deepstream/deepstream-7.1 \
  nvcr.io/nvidia/deepstream-l4t:7.1-samples-multiarch

Tuesday, April 15, 2025

PTZ dome camera via ONVIF

Reference: https://github.com/FalkTannhaeuser/python-onvif-zeep
Reference: https://www.onvif.org/onvif/ver20/util/operationIndex.html

$ python -m venv --system-site-packages /mnt/Data/envs/onvif
$ source /mnt/Data/envs/onvif/bin/activate
$ pip install --upgrade onvif_zeep
$ git clone https://github.com/FalkTannhaeuser/python-onvif-zeep.git

$ onvif-cli devicemgmt GetHostname --user 'admin' --password 'sh22463458' --host '192.168.113.203' --port 80

Query the ProfileToken
$ onvif-cli media GetProfiles --user 'admin' --password 'sh22463458' --host '192.168.113.203' --port 80 | grep -o "'token': '[^']*'" | awk -F': ' 'END {print $2}'
'MediaProfile00002'
$ onvif-cli ptz GotoPreset "{'ProfileToken':'MediaProfile00002', 'PresetToken':'9'}" --user 'admin' --password 'sh22463458' --host '192.168.113.203' --port 80
$ onvif-cli ptz GetPresets "{'ProfileToken':'MediaProfile00002'}" --user 'admin' --password 'sh22463458' --host '192.168.113.203' --port 80
$ onvif-cli ptz AbsoluteMove "{'ProfileToken':'MediaProfile00002', 'Position':{'PanTilt':{'x': -0.05, 'y': 0.6}, 'Zoom':0.5}}" --user 'admin' --password 'sh22463458' --host '192.168.113.203' --port 80

Relative move: 0 = no movement; positive = up, right, zoom in; negative = down, left, zoom out
Upper left
$ onvif-cli ptz RelativeMove "{'ProfileToken':'MediaProfile00002', 'Translation':{'PanTilt':{'x': 0.105, 'y': 0.22}, 'Zoom':0.3}}" --user 'admin' --password 'sh22463458' --host '192.168.113.203' --port 80
Lower right
$ onvif-cli ptz RelativeMove "{'ProfileToken':'MediaProfile00002', 'Translation':{'PanTilt':{'x': -0.115, 'y': -0.201}, 'Zoom':0.3}}" --user 'admin' --password 'sh22463458' --host '192.168.113.203' --port 80
Upper left
$ onvif-cli ptz RelativeMove "{'ProfileToken':'MediaProfile00002', 'Translation':{'PanTilt':{'x': 0.105, 'y': 0.21}, 'Zoom':0.15}}" --user 'admin' --password 'sh22463458' --host '192.168.113.203' --port 80
Lower right
$ onvif-cli ptz RelativeMove "{'ProfileToken':'MediaProfile00002', 'Translation':{'PanTilt':{'x': -0.105, 'y': -0.21}, 'Zoom':0.15}}" --user 'admin' --password 'sh22463458' --host '192.168.113.203' --port 80

Thursday, February 20, 2025

Web search in Open WebUI

https://developers.google.com/custom-search/v1/introduction?hl=zh-tw
Click "Programmable Search Engine (free edition) users: Get a Key"

https://programmablesearchengine.google.com/controlpanel/all
Click "Add"
Enter a search engine name
In the "Paid Element API key" field, enter the key obtained above
Note the search engine ID
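Before entering them into Open WebUI, the key and engine ID can be checked directly against the Custom Search JSON API (YOUR_KEY and YOUR_CX are placeholders):
$ curl -s "https://www.googleapis.com/customsearch/v1?key=YOUR_KEY&cx=YOUR_CX&q=test"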

In Open WebUI, click the user menu at the bottom left and choose Settings
Click Admin Settings / Web Search
Turn on Enable Web Search
Web Search Engine: google_pse
Turn on Full Context Mode
Enter the Google PSE API key
Enter the Google PSE Engine ID

Web Search Engine: duckduckgo
can be used directly

Below the message input field in Open WebUI, Web Search can now be clicked

Wednesday, February 19, 2025

Installing ollama, open-webui, and nginx

Reference: https://github.com/ollama/ollama
Reference: https://hub.docker.com/r/ollama/ollama
Reference: https://www.53ai.com/news/OpenSourceLLM/2024072585037.html

$ docker run -d --gpus=all -p 11434:11434 --name ollama \
  -v /mnt/Data/ollama/ollama_volume:/root/.ollama \
  ollama/ollama
$ docker exec -it ollama ollama run deepseek-r1
$ git clone https://github.com/ggerganov/llama.cpp.git
$ cd llama.cpp
$ cmake -B build
$ cmake --build build --config Release
$ pip install huggingface_hub

Convert a Hugging Face model to GGUF format
$ vi download.py
from huggingface_hub import snapshot_download, login

login("hf_BqLATKBqbVzOWNBJcFMwHKzCJfu")

# download the model
snapshot_download(
    "taide/Llama-3.1-TAIDE-LX-8B-Chat",
    local_dir="taide_Llama-3.1-TAIDE-LX-8B-Chat",
    local_dir_use_symlinks=False,
    ignore_patterns=["*.gguf"]
)

$ vi convert_hf_to_gguf_update.py
In the models list, add the lines below; pay attention to the choice of TOKENIZER_TYPE
    {"name": "taide_Llama-3.1-TAIDE-LX-8B-Chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/taide/Llama-3.1-TAIDE-LX-8B-Chat"},
    {"name": "yentinglin_Llama-3-Taiwan-8B-Instruct", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/yentinglin/Llama-3-Taiwan-8B-Instruct"},
$ python convert_hf_to_gguf_update.py hf_BqLATKBqbVzOWNBJcFMwHKzCJfu
$ python convert_hf_to_gguf.py taide_Llama-3.1-TAIDE-LX-8B-Chat --outtype f16 --outfile taide_Llama-3.1-TAIDE-LX-8B-Chat.fp16.gguf
$ llama.cpp/build/bin/llama-quantize taide_Llama-3.1-TAIDE-LX-8B-Chat.fp16.gguf Q4_K_M
$ mv ggml-model-Q4_K_M.gguf taide_Llama-3.1-TAIDE-LX-8B-Chat-Q4_K_M.gguf
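Optionally, the quantized GGUF can be sanity-checked with llama.cpp before importing it into ollama (llama-cli is built alongside llama-quantize in build/bin):
$ llama.cpp/build/bin/llama-cli -m taide_Llama-3.1-TAIDE-LX-8B-Chat-Q4_K_M.gguf -p "hello" -n 64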
$ vi Modelfile.taide-8b
FROM ./taide_Llama-3.1-TAIDE-LX-8B-Chat-Q4_K_M.gguf
# set the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
# set the system message
SYSTEM """
我是一個萬事通
"""

$ docker exec -it ollama /bin/bash
# cd /root/.ollama
# ollama create taide-8b -f ./Modelfile.taide-8b
# ollama list
# ollama show taide-8b
# ollama rm taide-8b
# ollama ps
# ollama run taide-8b
>>> /bye
# OLLAMA_HOST=127.0.0.1:11434 ollama serve
$ curl http://localhost:11434/api/generate -d '{
  "model": "yentinglin-8b", 
  "prompt": "建議適合ai的程式語言"
}'
$ curl http://localhost:11434/api/generate -d '{
  "model": "yentinglin-8b", 
  "prompt": "建議適合ai的程式語言",
  "stream", false
}'
$ curl http://localhost:11434/api/chat -d '{
  "model": "yentinglin-8b", 
  "messages": [
    {"role": "user", "content": "建議適合ai的程式語言"}
  ]
}'
$ curl http://localhost:11434/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "yentinglin-8b",
        "messages": [
            {
                "role": "system",
                "content": "你是一個萬事通"
            },
            {
                "role": "user",
                "content": "眼睛酸痛,怎麼辦?"
            }
        ]
    }'

$ docker logs ollama

$ python ../llama.cpp/convert_hf_to_gguf.py yentinglin_Llama-3-Taiwan-8B-Instruct --outtype f16 --outfile yentinglin_Llama-3-Taiwan-8B-Instruct.fp16.gguf
$ llama.cpp/build/bin/llama-quantize yentinglin_Llama-3-Taiwan-8B-Instruct.fp16.gguf Q4_K_M



$ docker run -d -p 3000:8080 --gpus all \
  --add-host=host.docker.internal:host-gateway \
  -v /mnt/Data/ollama/open-webui_volume:/app/backend/data \
  --name open-webui \
  --restart always \
  ghcr.io/open-webui/open-webui:cuda
  
Enter http://localhost:3000 in the Firefox Web Browser
The error "This address is restricted" appears
Open the Firefox Web Browser settings:
type about:config in the address bar and press the "Accept the Risk and Continue" button
search for network.security.ports.banned.override, select "String", press +
enter port 3000 and press the check mark
Reload http://localhost:3000

Chrome setting:
chrome://flags/#unsafely-treat-insecure-origin-as-secure
enter the URL http://localhost:3000

Install nginx
Reference: https://docs.openwebui.com/tutorials/https-nginx/
Reference: https://yingrenn.blogspot.com/2020/07/ssl-nginx.html
$ vi nginx.conf
server {
    listen 443 ssl;
    server_name  www.domain.com.tw;
    ssl_certificate /etc/nginx/conf/Certs/server.pem;
    ssl_certificate_key /etc/nginx/conf/Certs/server.key;
    ssl_trusted_certificate /etc/nginx/conf/Certs/caChain.crt;
    ssl_stapling on;
    ssl_stapling_verify on;
    ssl_session_timeout 5m;
    ssl_protocols TLSv1 TLSv1.1 TLSv1.2;
    ssl_ciphers ECDHE-RSA-AES128-GCM-SHA256:HIGH:!aNULL:!MD5:!RC4:!DHE;
    ssl_prefer_server_ciphers on;
    
    location / {
        proxy_set_header HOST $host;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_pass http://host.docker.internal:3000;
        
        # Add WebSocket support (Necessary for version 0.5.0 and up)
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        
        # (Optional) Disable proxy buffering for better streaming response from models
        proxy_buffering off;
    }
}
server {
     listen 80;
     server_name www.domain.com.tw;
     return 301 https://$host$request_uri; 
}

docker run -itd --name nginx \
  -p 80:80 -p 443:443 \
  --add-host=host.docker.internal:host-gateway \
  -v /mnt/Data/ollama/nginx/conf.d/nginx.conf:/etc/nginx/conf.d/nginx.conf \
  -v /mnt/Data/ollama/nginx/conf:/etc/nginx/conf \
  -m 100m library/nginx:latest
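Before relying on the proxy, the mounted configuration can be validated inside the container (nginx -t checks the syntax; reload applies changes):
$ docker exec nginx nginx -t
$ docker exec nginx nginx -s reload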

https://www.domain.com.tw

Tuesday, February 4, 2025

ASR speech recognition

Reference: https://speaches-ai.github.io/speaches/
Reference: https://github.com/speaches-ai/speaches/tree/master

curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.yaml
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cuda.yaml
curl --silent --remote-name https://raw.githubusercontent.com/speaches-ai/speaches/master/compose.cuda-cdi.yaml
export COMPOSE_FILE=compose.cuda-cdi.yaml

Install and use CUDA with the CDI (Container Device Interface) feature enabled
sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml

Modify compose.cuda-cdi.yaml: add the command and adjust the devices section
services:
  speaches:
    command: ["uvicorn", "--factory", "speaches.main:create_app", "--ws-ping-interval", "1000", "--ws-ping-timeout", "1200"]
          # WARN: requires Docker Compose 2.24.2
          # https://docs.docker.com/reference/compose-file/merge/#replace-value
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities:
                - gpu

The server-side log shows the following error:
websockets.exceptions.ConnectionClosedError: sent 1011 (internal error) keepalive ping timeout; no close frame received
Add ws-ping-interval and ws-ping-timeout to the command in compose.cuda-cdi.yaml

$ docker compose up --detach
$ docker compose stop
$ docker compose rm
$ docker compose logs
$ docker inspect speaches
$ docker cp speaches:/home/ubuntu/speaches/speaches/config.py .

$ docker compose exec speaches sh
$ docker compose run -d speaches uvicorn --factory speaches.main:create_app --ws-ping-interval=10 --ws-ping-timeout=12

The server-side log shows the following error:
INFO:speaches.routers.stt:audio_receiver:262:Not enough speech in the last 30.0 seconds.
$ vi speaches/src/speaches/config.py
inactivity_window_seconds: float = 1000.0

Documentation:
http://localhost:8000/docs
http://localhost:8000/redoc
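The server also exposes the OpenAI-compatible model list, which is a quick way to confirm it is up (assuming the default /v1/models route):
$ curl -s http://localhost:8000/v1/models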

$ curl -X POST -F "file=@/mnt/Data/Whisper/examples/《大隋说书人 》 01.mp3" -F "prompt=歡迎收聽第一集處女觀大隨雍洲且墨城深秋夜" -F "language=zh" http://localhost:8000/v1/audio/transcriptions

In Ubuntu Settings / Sound: Input, select the correct input source; the on-screen meter shows the input level
List the available capture devices
$ arecord -l
**** List of CAPTURE Hardware Devices ****
card 0: PCH [HDA Intel PCH], device 0: ALCS1200A Analog [ALCS1200A Analog]
  Subdevices: 1/1
  Subdevice #0: subdevice #0
card 0: PCH [HDA Intel PCH], device 2: ALCS1200A Alt Analog [ALCS1200A Alt Analog]
  Subdevices: 1/1
  Subdevice #0: subdevice #0

Use card 0
$ ffmpeg -f alsa -i hw:0 -acodec libmp3lame -b:a 128k -abr 1 aaa.mp3
Press q to stop

Because recording with -i hw:0 captures no sound, use -i default instead
$ arecord -L
default
    Playback/recording through the PulseAudio sound server

Use the default capture device
$ ffmpeg -f alsa -i default -acodec libmp3lame -b:a 128k -abr 1 aaa.mp3
Press q to stop
$ ffmpeg -loglevel quiet -f alsa -i default -ac 1 -ar 16000 -f s16le aaa.wav
Press q to stop
Strip the mp3 metadata
$ ffmpeg -hide_banner -i '/mnt/Data/Whisper/examples/《大隋说书人 》 01.mp3' -c:v copy -c:a copy -map_metadata -1 test.mp3
Convert mp3 to pcm
$ ffmpeg -i test.mp3 -f s16le -ar 16000 -ac 1 test.pcm

$ cat test.pcm | pv -qL 32000 | websocat --no-close --binary 'ws://localhost:8000/v1/audio/transcriptions?language=zh'

The client shows the following error:
Closing WebSocket connection due to ping timeout
Add --ping-timeout and --ping-interval to the command

$ cat test.pcm | pv -qL 32000 | websocat --no-close --binary --ping-timeout 12000 --ping-interval 10000 'ws://localhost:8000/v1/audio/transcriptions?language=zh'

Record from the microphone into a pcm file
$ ffmpeg -f alsa -ar 16000 -i default -ac 1 -f s16le aaa.pcm
Convert it to mp3
$ ffmpeg -f s16le -ar 16000 -ac 1 -i aaa.pcm -codec:a libmp3lame aaa.mp3
$ cat aaa.pcm | pv -aL 32000 | websocat --no-close --binary --ping-timeout 12000 --ping-interval 10000 'ws://localhost:8000/v1/audio/transcriptions?language=zh'

Test the CLI
export OPENAI_BASE_URL=http://localhost:8000/v1/
export OPENAI_API_KEY="cant-be-empty"
openai api audio.transcriptions.create -m Systran/faster-whisper-large-v3 -f '/mnt/Data/Whisper/examples/《大隋说书人 》 01.mp3' --response-format text


Request an OPENAI_API_KEY and test it
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

curl https://api.openai.com/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer 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" \
  -d '{
    "model": "gpt-4o-mini",
    "store": true,
    "messages": [
      {"role": "user", "content": "write a haiku about ai"}
    ]
  }'

Friday, January 17, 2025

Testing websocat

Reference: https://github.com/vi/websocat
Choose an installer from
https://github.com/vi/websocat/releases
For my Ubuntu I chose websocat.x86_64-unknown-linux-musl

$ wget https://github.com/vi/websocat/releases/download/v1.14.0/websocat.x86_64-unknown-linux-musl -O websocat
$ chmod +x websocat
$ sudo mv websocat /usr/local/bin/

Check that it runs by querying the version
$ websocat --version

Connect to the public echo server
$ websocat ws://ws.vi-server.org/mirror
123
123
ABC
ABC

Connect to the public echo server using docker
$ docker run --rm -ti ghcr.io/vi/websocat:nightly wss://ws.vi-server.org/mirror
123
123
ABC
ABC

Start a listening server
A$ websocat -s 1234
Listening on ws://127.0.0.1:1234/
ABC
123

Connect to the server and send data
B$ websocat ws://127.0.0.1:1234/
ABC
123

Install chromium
$ sudo snap install chromium

Start chromium with remote debugging enabled
$ chromium --remote-debugging-port=9222&

Run the command in another terminal to open a new tab in chromium
$ curl -X PUT http://127.0.0.1:9222/json/new | grep webSocketDebuggerUrl | cut -d'"' -f4 | head -1
ws://127.0.0.1:9222/devtools/page/DC8E8EF5B872E141E8F60FDB4764F648

Note the URL returned by the previous command, adjust it, and run the following command
$ echo 'Page.navigate {"url":"https://example.com"}' | websocat -n1 --jsonrpc --jsonrpc-omit-jsonrpc ws://127.0.0.1:9222/devtools/page/DC8E8EF5B872E141E8F60FDB4764F648

The chromium tab navigates to https://example.com/

Bridge a WebSocket connection to TCP
$ websocat --oneshot -b tcp-l:127.0.0.1:1234 ws://ws.vi-server.org/mirror&
$ nc 127.0.0.1 1234

Test ssh through a TCP and WebSocket bridge
$ websocat --oneshot -b ws-l:127.0.0.1:1234 tcp:127.0.0.1:22&
$ websocat --oneshot -b tcp-l:127.0.0.1:1236 ws://127.0.0.1:1234/&
$ nc 127.0.0.1 1236
SSH-2.0-OpenSSH_8.2p1 Ubuntu-4ubuntu0.9
aaaaasdf
Invalid SSH identification string.