# Install the Hugging Face CLI via its official install script.
# NOTE(review): piping a remote script straight into bash runs it
# unverified — download and inspect the script first on hosts that matter.
$ curl -LsSf https://hf.co/cli/install.sh | bash
# Download the gpt-oss-20b weights into ./models/gpt-oss-20b
# (this local path must match whatever is bind-mounted into the
# serving container in the next step).
$ hf download openai/gpt-oss-20b --local-dir ./models/gpt-oss-20b
# Pull NVIDIA's prebuilt vLLM container image.
$ docker pull nvcr.io/nvidia/vllm:25.11-py3
# Serve the model with an OpenAI-compatible API on port 8000.
# FIX: the weights were downloaded to ./models/gpt-oss-20b (see the
# `hf download` step), so mount $(pwd)/models — not /mnt/models —
# otherwise /models/gpt-oss-20b is empty inside the container and
# `vllm serve` fails to find the model.
$ docker run -it --gpus all -p 8000:8000 \
  -v "$(pwd)/models:/models" \
  nvcr.io/nvidia/vllm:25.11-py3 \
  vllm serve "/models/gpt-oss-20b" \
  --served-model-name llm_chat \
  --api-key token-abc123
# Smoke-test the server with an OpenAI-style chat-completion request.
# (`--data` implies POST; `--request POST` just makes the verb explicit.)
$ curl --request POST http://localhost:8000/v1/chat/completions \
--header "Content-Type: application/json" \
--header "Authorization: Bearer token-abc123" \
--data '{
"model": "llm_chat",
"messages": [{"role": "user", "content": "你好,請自我介紹"}],
"max_tokens": 500
}'
為了避免出現 out-of-memory (OOM),在載入大型模型前可先同步磁碟並清除系統快取:
# Flush dirty pages, then drop the page cache / dentries / inodes.
# FIX: `echo 3 > /proc/sys/vm/drop_caches` fails for a non-root user
# because the redirection is performed by the unprivileged shell before
# any command runs; route the write through `sudo tee` instead.
# NOTE(review): dropping caches only reclaims host RAM — it does not
# prevent GPU out-of-memory; confirm which OOM this step is meant to avoid.
$ sync && echo 3 | sudo tee /proc/sys/vm/drop_caches
沒有留言:
張貼留言