https://hackmd.io/@johnnynunez/S1vJlvThee
https://pypi.jetson-ai-lab.io/jp6/cu126
$ uv init uv_vllm
$ cd uv_vllm
$ rm .python-version
$ vi pyproject.toml
requires-python = "==3.10.*"
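After editing, the [project] table should look roughly like this (a minimal sketch; name and version are the defaults uv init generates for this directory):

[project]
name = "uv-vllm"
version = "0.1.0"
requires-python = "==3.10.*"
dependencies = []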
$ uv venv --python 3.10
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/62a/1beee9f2f1470/torch-2.8.0-cp310-cp310-linux_aarch64.whl#sha256=62a1beee9f2f147076a974d2942c90060c12771c94740830327cae705b2595fc
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/81a/775c8af36ac85/torchaudio-2.8.0-cp310-cp310-linux_aarch64.whl#sha256=81a775c8af36ac859fb3f4a1b2f662d5fcf284a835b6bb4ed8d0827a6aa9c0b7
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/907/c4c1933789645/torchvision-0.23.0-cp310-cp310-linux_aarch64.whl#sha256=907c4c1933789645ebb20dd9181d40f8647978e6bd30086ae7b01febb937d2d1
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/9da/4bcb8e8f0eba0/triton-3.4.0-cp310-cp310-linux_aarch64.whl#sha256=9da4bcb8e8f0eba00a097ad8c57b26102add499e520d67fb2d5362bebf976ca3
$ uv add --no-cache https://pypi.jetson-ai-lab.io/jp6/cu126/+f/014/eff8ba676c7a3/bitsandbytes-0.47.0.dev0-cp310-cp310-linux_aarch64.whl#sha256=014eff8ba676c7a3830b9430744115af50790d2f7ff1b57f155a8839bcc39104
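With the Jetson wheels installed, a quick sanity check that PyTorch imports and sees the GPU (should print 2.8.0 and True on a working JetPack 6 setup):

$ uv run python -c "import torch; print(torch.__version__, torch.cuda.is_available())"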
Avoid resource exhaustion during the build:
$ ulimit -v
unlimited
$ ulimit -v $((26*1024*1024)) # 26GB
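Note that ulimit -v takes kilobytes, so 26*1024*1024 KB is 26 GB; re-running the query should now report the cap:

$ ulimit -v
27262976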
$ export MAX_JOBS=6
$ export TORCH_CUDA_ARCH_LIST="8.6;8.7"
$ export TRITON_PTXAS_PATH=/usr/local/cuda-12.6/bin/ptxas
$ export PATH=/usr/local/cuda-12.6/bin:$PATH
$ export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64:$LD_LIBRARY_PATH
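Before kicking off the build, it is worth confirming the CUDA 12.6 toolchain actually resolves through the exported paths:

$ which ptxas
/usr/local/cuda-12.6/bin/ptxas
$ nvcc --version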
$ git clone --recursive https://github.com/vllm-project/vllm.git
$ cd vllm
$ git checkout v0.11.0
$ python3 use_existing_torch.py  # strip torch pins from the requirements files so the build reuses the Jetson wheels installed above
$ uv pip install -r requirements/build.txt
Don't install it directly this way: although the install succeeds, the result is unusable. The suspicion is that the vllm directory name conflicts with the python package, and the directory can't simply be renamed either:
$ uv pip install --no-build-isolation -e .
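A quick way to see whether the checkout is shadowing the installed package (a diagnostic sketch: if the printed path points into the source tree rather than .venv/.../site-packages, the install is being masked):

$ uv run python -c "import vllm; print(vllm.__file__)"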
Build a wheel instead:
$ uv build --no-build-isolation --wheel
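The wheel is written to dist/; list it to pick up the exact filename used in the install step below:

$ ls dist/
vllm-0.11.1.dev0+gb8b302cde.d20251214.cu126-cp310-cp310-linux_aarch64.whl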
$ cd ..
$ mv vllm vllm_v0.11.0
$ uv remove vllm
$ uv add vllm_v0.11.0/dist/vllm-0.11.1.dev0+gb8b302cde.d20251214.cu126-cp310-cp310-linux_aarch64.whl
$ uv pip list
$ uv run python -c "import vllm; print(vllm.__version__)"
Clean up build artifacts and caches:
$ rm -rf build dist *.egg-info
$ find . -name "*.so" -delete
$ uv cache clean
$ rm -rf ~/.cache/torch_extensions
$ uv run vllm serve "/mnt/Data/LangGraph/HuggingFace/models/Qwen3-4B-Instruct-2507" --trust-remote-code --tensor-parallel-size 1 --max-model-len 20k --max-num-seqs 16 --gpu-memory-utilization 0.6 --quantization bitsandbytes --api-key token-abc123
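Once the server is up, a quick smoke test against the OpenAI-compatible endpoint (assuming the default port 8000; the API key is the one passed above):

$ curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer token-abc123" \
    -d '{
      "model": "/mnt/Data/LangGraph/HuggingFace/models/Qwen3-4B-Instruct-2507",
      "messages": [{"role": "user", "content": "Hello"}]
    }'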
Available --quantization choices: ['awq', 'deepspeedfp', 'tpu_int8', 'fp8', 'ptpc_fp8', 'fbgemm_fp8', 'modelopt', 'modelopt_fp4', 'bitblas', 'gguf', 'gptq_marlin_24', 'gptq_marlin', 'gptq_bitblas', 'awq_marlin', 'gptq', 'compressed-tensors', 'bitsandbytes', 'hqq', 'experts_int8', 'ipex', 'quark', 'moe_wna16', 'torchao', 'auto-round', 'rtn', 'inc', 'mxfp4', 'petit_nvfp4']
Quantization modes tested and working here: bitsandbytes, fp8, experts_int8