Tuesday, December 30, 2025

Using TRT LLM on DGX Spark

Reference: under https://build.nvidia.com/spark, see
TRT LLM for Inference
NVFP4 Quantization
Reference: https://nvidia.github.io/TensorRT-LLM/1.0.0rc2/commands/trtllm-serve.html

# Configure Docker permissions
$ sudo usermod -aG docker $USER
$ newgrp docker
$ id
uid=1000(spark) gid=988(docker) groups=988(docker),4(adm),27(sudo),29(audio),30(dip),46(plugdev),100(users),122(lpadmin),1000(spark)
$ ps
    PID TTY          TIME CMD
   6123 pts/1    00:00:00 bash
  24590 pts/1    00:00:00 bash
  24597 pts/1    00:00:00 ps
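To confirm the new docker group membership works in this shell without sudo (hello-world is Docker's standard test image):
$ docker run --rm hello-world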

# Verify environment prerequisites
$ nvidia-smi
$ docker run --rm --gpus all nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev nvidia-smi

$ export HF_TOKEN=<your_hf_token>
$ docker run --rm -it --gpus all \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  python -c "import tensorrt_llm; print(f'TensorRT-LLM version: {tensorrt_llm.__version__}')"
The last two lines of the output:
[TensorRT-LLM] TensorRT-LLM version: 1.1.0rc3
TensorRT-LLM version: 1.1.0rc3

# Create Hugging Face cache directory
$ mkdir -p $HOME/.cache/huggingface/
## If you need to change the cache directory location
$ export HF_HOME=/mnt/Data/huggingface
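If the cache is relocated this way, the container mounts in the docker run commands below should point at the new path as well; a minimal sketch, assuming the /mnt/Data/huggingface location above:
$ mkdir -p "$HF_HOME"
$ docker run --rm --gpus all \
  -v "$HF_HOME":/root/.cache/huggingface/ \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  ls /root/.cache/huggingface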

$ export MODEL_HANDLE="openai/gpt-oss-20b"
$ docker run \
  -e MODEL_HANDLE=$MODEL_HANDLE \
  -e HF_TOKEN=$HF_TOKEN \
  -v $HOME/.cache/huggingface/:/root/.cache/huggingface/ \
  --rm -it --ulimit memlock=-1 --ulimit stack=67108864 \
  --gpus=all --ipc=host --network host \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c '
    export TIKTOKEN_ENCODINGS_BASE="/tmp/harmony-reqs" && \
    mkdir -p $TIKTOKEN_ENCODINGS_BASE && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken && \
    hf download $MODEL_HANDLE && \
    python examples/llm-api/quickstart_advanced.py \
      --model_dir $MODEL_HANDLE \
      --prompt "Paris is great because" \
      --max_tokens 64
    '
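To confirm the model landed in the mounted cache (the directory naming below follows the standard Hugging Face hub cache layout; adjust the path if HF_HOME was moved):
$ du -sh $HOME/.cache/huggingface/hub/models--openai--gpt-oss-20b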

==================
# Serve LLM with OpenAI-compatible API
$ export MODEL_HANDLE="openai/gpt-oss-20b"
$ export MODEL_HANDLE="openai/gpt-oss-120b"
$ export MODEL_HANDLE="meta-llama/Llama-3.3-70B-Instruct"
$ export MODEL_HANDLE="Qwen/Qwen3-4B-Instruct-2507"
$ export MODEL_HANDLE="deepseek-ai/DeepSeek-R1-Distill-Llama-8B'"

$ docker run --name trtllm_llm_server --rm -it --gpus all --ipc host --network host \
  -e HF_TOKEN=$HF_TOKEN \
  -e MODEL_HANDLE="$MODEL_HANDLE" \
  -v $HOME/.cache/huggingface/:/root/.cache/huggingface/ \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c '
    export TIKTOKEN_ENCODINGS_BASE="/tmp/harmony-reqs" && \
    mkdir -p $TIKTOKEN_ENCODINGS_BASE && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken && \
    hf download $MODEL_HANDLE && \
    cat > /tmp/extra-llm-api-config.yml <<EOF
print_iter_log: false
kv_cache_config:
  dtype: "auto"
  free_gpu_memory_fraction: 0.4
cuda_graph_config:
  enable_padding: true
disable_overlap_scheduler: true
EOF
    trtllm-serve "$MODEL_HANDLE" \
      --max_batch_size 8 \
      --trust_remote_code \
      --host 0.0.0.0 \
      --port 8000 \
      --extra_llm_api_options /tmp/extra-llm-api-config.yml
  '
$ curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"$MODEL_HANDLE"'",
    "messages": [{"role": "user", "content": "請你自我介紹"}],
    "max_tokens": 64
  }'
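The same endpoint also accepts the standard OpenAI-style streaming flag; a hedged example (responses arrive as server-sent-event chunks):
$ curl -s -N http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'"$MODEL_HANDLE"'",
    "messages": [{"role": "user", "content": "Introduce yourself briefly"}],
    "max_tokens": 64,
    "stream": true
  }'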

# Cleanup and rollback
sudo chown -R "$USER:$USER" "$HOME/.cache/huggingface"
rm -rf $HOME/.cache/huggingface/
docker image prune -f
docker rmi nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev


==================
# NVFP4 Quantization
$ mkdir -p ./output_models
$ chmod 755 ./output_models
# Convert a model downloaded from Hugging Face
$ docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
  -v "./output_models:/workspace/output_models" \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  -e HF_TOKEN=$HF_TOKEN \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c "
    git clone -b 0.35.0 --single-branch https://github.com/NVIDIA/Model-Optimizer.git /app/TensorRT-Model-Optimizer && \
    cd /app/TensorRT-Model-Optimizer && pip install -e '.[dev]' && \
    export ROOT_SAVE_PATH='/workspace/output_models' && \
    /app/TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh \
    --model $MODEL_HANDLE \
    --quant nvfp4 \
    --tp 1 \
    --export_fmt hf
  "
# A pynvml.NVMLError_NotSupported: Not Supported error may appear; it can be ignored
# Convert a local model
$ docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
  -v "./output_models:/workspace/output_models" \
  -v /mnt/models:/mnt/models \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  -e HF_TOKEN=$HF_TOKEN \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c "
    git clone -b 0.35.0 --single-branch https://github.com/NVIDIA/Model-Optimizer.git /app/TensorRT-Model-Optimizer && \
    cd /app/TensorRT-Model-Optimizer && pip install -e '.[dev]' && \
    export ROOT_SAVE_PATH='/workspace/output_models' && \
    /app/TensorRT-Model-Optimizer/examples/llm_ptq/scripts/huggingface_example.sh \
    --model /mnt/models/Qwen2.5-Coder-7B \
    --quant nvfp4 \
    --tp 1 \
    --export_fmt hf
  "

$ ls -la ./output_models/
$ find ./output_models/ -name "*.bin" -o -name "*.safetensors" -o -name "config.json"
$ export MODEL_PATH="./output_models/saved_models_DeepSeek-R1-Distill-Llama-8B_nvfp4_hf/"
$ export MODEL_PATH="./output_models/saved_models_Qwen3-4B-Instruct-2507_nvfp4_hf/"
# Run inference with the converted model
$ docker run \
  -e HF_TOKEN=$HF_TOKEN \
  -v $HOME/.cache/huggingface/:/root/.cache/huggingface/ \
  -v "$MODEL_PATH:/workspace/model" \
  --rm -it --ulimit memlock=-1 --ulimit stack=67108864 \
  --gpus=all --ipc=host --network host \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c '
    python examples/llm-api/quickstart_advanced.py \
      --model_dir /workspace/model/ \
      --prompt "Paris is great because" \
      --max_tokens 64
    '
# Serve the model with OpenAI-compatible API
$ docker run \
  -e HF_TOKEN=$HF_TOKEN \
  -v "$MODEL_PATH:/workspace/model" \
  --rm -it --ulimit memlock=-1 --ulimit stack=67108864 \
  --gpus=all --ipc=host --network host \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  trtllm-serve /workspace/model \
    --backend pytorch \
    --max_batch_size 4 \
    --host 0.0.0.0 \
    --port 8000
$ curl -X POST http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-oss-20b",
    "messages": [{"role": "user", "content": "What is artificial intelligence?"}],
    "max_tokens": 100,
    "temperature": 0.7,
    "stream": false
  }'
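If this returns a model-not-found error, the "model" field likely has to match the name the server registered for the local path; it can be checked with the standard listing endpoint:
$ curl -s http://localhost:8000/v1/models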

==================
# trtllm-serve with a local model
$ export MODEL_HANDLE="/mnt/models/gpt-oss-20b"    # 0.8:42.6GB 48W 16s | 0.4:26.1GB 48W 21s | 0.2:26.6GB 46W 17s
$ export MODEL_HANDLE="/mnt/models/gpt-oss-120b"   # 0.8:  72GB 50W 35s | 0.5:70.4GB 49W 40s | 0.4:68.7GB 49W 39s 
$ docker run --name trtllm_llm_server --rm -it --gpus all --ipc host --network host \
  -e MODEL_HANDLE="$MODEL_HANDLE" \
  -v /mnt/models:/mnt/models \
  -v $HOME/.cache/huggingface/:/root/.cache/huggingface/ \
  nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev \
  bash -c '
    export TIKTOKEN_ENCODINGS_BASE="/tmp/harmony-reqs" && \
    mkdir -p $TIKTOKEN_ENCODINGS_BASE && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken && \
    wget -P $TIKTOKEN_ENCODINGS_BASE https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken && \
    cat > /tmp/extra-llm-api-config.yml <<EOF
print_iter_log: false
kv_cache_config:
  dtype: "auto"
  free_gpu_memory_fraction: 0.5
cuda_graph_config:
  enable_padding: true
disable_overlap_scheduler: true
EOF

    trtllm-serve "$MODEL_HANDLE" \
      --max_batch_size 8 \
      --max_seq_len 65536 \
      --max_num_tokens 131072 \
      --trust_remote_code \
      --host 0.0.0.0 \
      --port 8000 \
      --extra_llm_api_options /tmp/extra-llm-api-config.yml
  '
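The memory / power / load-time numbers in the comments above presumably reflect what was observed while the model loads; one rough way to watch similar figures (some nvidia-smi query fields may report N/A on DGX Spark's unified memory):
$ watch -n 1 "nvidia-smi --query-gpu=memory.used,power.draw,temperature.gpu --format=csv,noheader"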

Using the vLLM server on DGX Spark

Testing shows that vLLM is worse than TRT LLM in both memory usage and power consumption.

$ sudo mount -t nfs 192.168.0.107:/mnt/Data/LangGraph/HuggingFace/models /mnt/models

$ export MODEL_HANDLE="/mnt/models/gpt-oss-20b"    # 0.8:96.9GB 43W 148s | 0.4:49.4GB 39W 120s
$ export MODEL_HANDLE="/mnt/models/gpt-oss-120b"   # 0.8:97.9GB 41W  75s | 0.7:86.5GB 40W 104s

$ docker run --rm --name vllm_server -it --gpus all \
-p 8000:8000 \
-v /mnt/models:/models \
nvcr.io/nvidia/vllm:25.11-py3 \
vllm serve "/models/gpt-oss-20b" \
--trust_remote_code \
--max-num-seqs 2 \
--quantization mxfp4 \
--gpu-memory-utilization 0.3 \
--served-model-name llm_chat \
--api-key token-abc123
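Once the server is up, a quick check against the OpenAI-compatible endpoint (the key matches --api-key and the name matches --served-model-name above):
$ curl -s http://localhost:8000/v1/models \
  -H "Authorization: Bearer token-abc123"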

If the run fails, free the page cache and try again:
$ sudo sh -c 'sync && echo 3 > /proc/sys/vm/drop_caches'

Open WebUI with vLLM or TRT LLM on DGX Spark

# Connect Open WebUI to vLLM or TRT LLM
$ docker run -d --rm \
  --name open-webui-vllm \
  -p 8501:8080 \
  -v open-webui:/app/backend/data \
  -e OPENAI_API_BASE_URL=http://192.168.0.108:8000/v1 \
  -e OPENAI_API_KEY=token-abc123 \
  ghcr.io/open-webui/open-webui:main
The container-side port 8080 is fixed and should not be changed.
Open Open WebUI (http://localhost:8501).
After logging in, click the user menu at the bottom left → Admin Panel → Settings → Connections → OpenAI.
You will see an automatically configured OpenAI connection (pointing to vLLM).
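If the connection does not appear or models fail to load, confirm the backend is reachable from the host running Open WebUI (include the Authorization header only if the backend was started with an API key):
$ curl -s http://192.168.0.108:8000/v1/models -H "Authorization: Bearer token-abc123"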

The command above can only use a single vLLM model.
Use the command below instead, and add multiple vLLM endpoints manually in the web UI.
$ docker run -d --rm --gpus=all \
  -p 8501:8080 \
  -v open-webui:/app/backend/data \
  -v open-webui-ollama:/root/.ollama \
  --name open-webui ghcr.io/open-webui/open-webui:ollama
If startup fails, the open-webui volume is probably inconsistent; remove it with the following command
$ docker volume rm open-webui

In the web UI: bottom-left user name → Admin Panel → Settings → Connections → +
URL: http://192.168.0.108:8000/v1
Click Save.

Using a dashboard on DGX Spark

Reference: https://github.com/DanTup/dgx_dashboard
It can show memory usage, temperature, power consumption, and Docker containers.

$ docker run -d --gpus all \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -p 8080:8080 \
    --pull=always \
    --restart=unless-stopped \
    --name dashboard \
    ghcr.io/dantup/dgx_dashboard:latest
    
$ docker stop dashboard && docker rm dashboard

firefox http://192.168.0.108:8080/

Wednesday, December 24, 2025

vLLM and TRT LLM on DGX Spark

NVFP4 quantization support is limited; the models converted successfully so far are:
DeepSeek-R1-Distill-Llama-8B
Qwen3-4B-Instruct-2507
ChatGPT says that none of the Qwen3-xB-AxxB models work; the name must not contain AxxB.

Comparing gpt-oss-20b and gpt-oss-120b:
trtllm indeed uses less memory than vllm, and is faster.

Tuesday, December 23, 2025

Temperature monitoring on DGX Spark

$ sudo apt-get install lm-sensors
$ sudo sensors-detect
It asks many questions; usually answer YES.
$ sudo apt-get install psensor
$ psensor
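Readings can also be checked from the terminal with the standard lm-sensors command:
$ sensors
$ watch -n 2 sensors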

$ sudo apt-get install gnome-shell-extension-manager
$ sudo apt-get install chrome-gnome-shell
$ sudo apt-get install gnome-browser-connector

Do not use chromium; use firefox directly.
Open https://extensions.gnome.org in firefox.
Install the GNOME Shell integration extension.
Search for Vitals and install it.


Monday, December 22, 2025

Installing and testing vLLM on DGX Spark

$ curl -LsSf https://hf.co/cli/install.sh | bash
$ hf download openai/gpt-oss-20b --local-dir ./models/gpt-oss-20b

$ docker pull nvcr.io/nvidia/vllm:25.11-py3
$ docker run -it --gpus all -p 8000:8000 \
-v /mnt/models:/models \
nvcr.io/nvidia/vllm:25.11-py3 \
vllm serve "/models/gpt-oss-20b" \
--served-model-name llm_chat \
--api-key token-abc123

$ curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer token-abc123" \
-d '{
    "model": "llm_chat",
    "messages": [{"role": "user", "content": "你好,請自我介紹"}],
    "max_tokens": 500
}'

To avoid out-of-memory (OOM) errors:
$ sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'

NFS server and client on Ubuntu

Host A (NFS server)
$ sudo apt update
$ sudo apt install -y nfs-kernel-server
$ sudo vi /etc/exports
/mnt/Data/models 192.168.1.20(ro,sync,no_subtree_check)
ro: read-only (strongly recommended, so the models cannot be modified by mistake)
sync: data consistency
no_subtree_check: performance and stability
/mnt/Data/models 192.168.1.20(ro,all_squash,anonuid=1000,anongid=1000,sync,no_subtree_check)
all_squash: map all client users to anonymous
anonuid/anongid: map them to a specific UID/GID

$ sudo exportfs -ra
$ sudo systemctl restart nfs-kernel-server
$ showmount -e localhost

Host B (NFS client)
$ sudo apt update
$ sudo apt install -y nfs-common
$ sudo mkdir -p /mnt/models
$ sudo mount -t nfs 192.168.1.10:/mnt/Data/models /mnt/models
$ ls /mnt/models
$ sudo vi /etc/fstab
192.168.1.10:/mnt/Data/models  /mnt/models  nfs  ro,_netdev,auto  0  0
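To verify the fstab entry without rebooting (unmount first if the share is already mounted):
$ sudo umount /mnt/models
$ sudo mount -a
$ df -h /mnt/models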

Installing Text to Knowledge Graph on DGX Spark

Reference: https://build.nvidia.com/spark/txt2kg/instructions

$ git clone https://github.com/NVIDIA/dgx-spark-playbooks
$ cd dgx-spark-playbooks/nvidia/txt2kg/assets
$ ./start.sh
Open http://localhost:3001 in a browser
$ ./stop.sh

Because the Open WebUI with Ollama docker is also in use, switch compose_ollama_data to open-webui-ollama.
$ docker volume ls
$ docker volume inspect compose_ollama_data
[
    {
        "CreatedAt": "2025-12-19T14:38:37+08:00",
        "Driver": "local",
        "Labels": {
            "com.docker.compose.config-hash": "b60a4e44fe9b008057f3eaff8c4477427e0db99c0c9a70285f81b92ba016830d",
            "com.docker.compose.project": "compose",
            "com.docker.compose.version": "2.40.0",
            "com.docker.compose.volume": "ollama_data"
        },
        "Mountpoint": "/var/lib/docker/volumes/compose_ollama_data/_data",
        "Name": "compose_ollama_data",
        "Options": null,
        "Scope": "local"
    }
]
$ cd dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/compose
$ cp docker-compose.yml docker-compose.yml.bak
$ vi docker-compose.yml
  ollama:
    volumes:
      - ollama_data:/root/.ollama
change to
  ollama:
    volumes:
      - open-webui-ollama:/root/.ollama
volumes:
  ollama_data:
change to
volumes:
  open-webui-ollama:
    external: true
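Because external: true expects the volume to exist already, it is worth checking for it (and creating it if missing) before starting the stack; a small sanity check:
$ docker volume inspect open-webui-ollama >/dev/null 2>&1 || docker volume create open-webui-ollama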

Check which volume the ollama-compose container is currently using
$ docker inspect ollama-compose --format '{{ json .Mounts }}'

Mount the volume into a temporary container and clear its contents
$ docker run --rm \
  -v compose_ollama_data:/data \
  alpine \
  sh -c "rm -rf /data/*"
Make sure no container is using the volume
$ docker ps -a --filter volume=compose_ollama_data
Delete the volume
$ docker volume rm compose_ollama_data


$ export OLLAMA_MODEL=gpt-oss:20b
$ docker exec ollama-compose ollama list
$ docker exec ollama-compose ollama pull gpt-oss:20b

Friday, December 19, 2025

NVIDIA Sync terminal on DGX Spark

Error encountered:
Bad permissions. Try removing permissions for user: UNKNOWN\\UNKNOWN (S-1-15-3-1024-3299255270-1847605585-2201808924-710406709-3613095291-873286183-3101090833-2655911836) on file C:/Users/xxx/AppData/Local/NVIDIA Corporation/Sync/config/ssh_config.
Bad owner or permissions on C:/Users/xxx/AppData/Local/NVIDIA Corporation/Sync/config/ssh_config

Run in PowerShell:
takeown /f "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\ssh_config"
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\ssh_config" /inheritance:r
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\ssh_config" /grant xxx:F

Confirm with the following command:
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\ssh_config"
C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\ssh_config ThinkPad-E000\xxx:(F)

Error encountered:
Bad permissions. Try removing permissions for user: UNKNOWN\\UNKNOWN (S-1-15-3-1024-3299255270-1847605585-2201808924-710406709-3613095291-873286183-3101090833-2655911836) on file C:/Users/xxx/AppData/Local/NVIDIA Corporation/Sync/config/nvsync.key.
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@         WARNING: UNPROTECTED PRIVATE KEY FILE!          @
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Permissions for 'C:\\Users\\xxx\\AppData\\Local\\NVIDIA Corporation\\Sync\\config\\nvsync.key' are too open.
It is required that your private key files are NOT accessible by others.
This private key will be ignored.
Load key "C:\\Users\\xxx\\AppData\\Local\\NVIDIA Corporation\\Sync\\config\\nvsync.key": bad permissions
spark@gx10-spark.local's password:

Run in PowerShell:
takeown /f "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\nvsync.key"
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\nvsync.key" /inheritance:r
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\nvsync.key" /grant xxx:F

Confirm with the following command:
icacls "C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\nvsync.key"
C:\Users\xxx\AppData\Local\NVIDIA Corporation\Sync\config\nvsync.key ThinkPad-000\xxx:(F)

Monday, December 15, 2025

Installing vLLM on Jetson Orin

https://hackmd.io/@johnnynunez/S1vJlvThee
https://pypi.jetson-ai-lab.io/jp6/cu126

$ uv init uv_vllm
$ cd uv_vllm
$ rm .python-version
$ vi pyproject.toml
requires-python = "==3.10.*"
$ uv venv --python 3.10

$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/62a/1beee9f2f1470/torch-2.8.0-cp310-cp310-linux_aarch64.whl#sha256=62a1beee9f2f147076a974d2942c90060c12771c94740830327cae705b2595fc
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/81a/775c8af36ac85/torchaudio-2.8.0-cp310-cp310-linux_aarch64.whl#sha256=81a775c8af36ac859fb3f4a1b2f662d5fcf284a835b6bb4ed8d0827a6aa9c0b7
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/907/c4c1933789645/torchvision-0.23.0-cp310-cp310-linux_aarch64.whl#sha256=907c4c1933789645ebb20dd9181d40f8647978e6bd30086ae7b01febb937d2d1
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/9da/4bcb8e8f0eba0/triton-3.4.0-cp310-cp310-linux_aarch64.whl#sha256=9da4bcb8e8f0eba00a097ad8c57b26102add499e520d67fb2d5362bebf976ca3
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/014/eff8ba676c7a3/bitsandbytes-0.47.0.dev0-cp310-cp310-linux_aarch64.whl#sha256=014eff8ba676c7a3830b9430744115af50790d2f7ff1b57f155a8839bcc39104
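A quick sanity check that the Jetson wheels installed into the uv environment and that CUDA is visible (assumes the project created above):
$ uv run python -c "import torch; print(torch.__version__, torch.cuda.is_available())"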

To avoid resource exhaustion:
$ ulimit -v
unlimited
$ ulimit -v $((26*1024*1024)) # 26GB
$ export MAX_JOBS=6

$ export TORCH_CUDA_ARCH_LIST="8.6;8.7"
$ export TRITON_PTXAS_PATH=/usr/local/cuda-12.6/bin/ptxas
$ export PATH=/usr/local/cuda-12.6/bin:$PATH
$ export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64:$LD_LIBRARY_PATH

$ git clone --recursive https://github.com/vllm-project/vllm.git
$ cd vllm
$ git checkout v0.11.0
$ python3 use_existing_torch.py
$ uv pip install -r requirements/build.txt
Do not install directly: the install succeeds, but the result cannot be used, presumably because the vllm source directory name conflicts with the python package; renaming the directory arbitrarily does not work either.
$ uv pip install --no-build-isolation -e .
Build a wheel instead:
$ uv build --no-build-isolation --wheel
$ cd ..
$ mv vllm vllm_v0.11.0
$ uv remove vllm
$ uv add vllm_v0.11.0/dist/vllm-0.11.1.dev0+gb8b302cde.d20251214.cu126-cp310-cp310-linux_aarch64.whl
$ uv pip list
$ uv run python -c "import vllm; print(vllm.__version__)"

rm -rf build dist *.egg-info
find . -name "*.so" -delete
uv cache clean
rm -rf ~/.cache/torch_extensions

$ uv run vllm serve "/mnt/Data/LangGraph/HuggingFace/models/Qwen3-4B-Instruct-2507" --trust_remote_code --tensor-parallel-size 1 --max-model-len 20k --max-num-seqs 16 --gpu-memory-utilization 0.6 --quantization bitsandbytes --api-key token-abc123

Available --quantization values: ['awq', 'deepspeedfp', 'tpu_int8', 'fp8', 'ptpc_fp8', 'fbgemm_fp8', 'modelopt', 'modelopt_fp4', 'bitblas', 'gguf', 'gptq_marlin_24', 'gptq_marlin', 'gptq_bitblas', 'awq_marlin', 'gptq', 'compressed-tensors', 'bitsandbytes', 'hqq', 'experts_int8', 'ipex', 'quark', 'moe_wna16', 'torchao', 'auto-round', 'rtn', 'inc', 'mxfp4', 'petit_nvfp4']

Quantization values tested and working: [bitsandbytes fp8 experts_int8]