網頁

2026年3月12日 星期四

DGX Spark 如何避免 OOM 當機

# 參考 https://forums.developer.nvidia.com/t/mitigating-oom-system-freezes-on-uma-based-single-board-computers/362769
# 另外可參考 DGX Spark 之溫度 https://yingrenn.blogspot.com/2026/02/dgx-spark.html

# 安裝輕量級的 Dropbear SSH
$ sudo apt update && sudo apt install dropbear
$ sudo vi /etc/default/dropbear
NO_START=0
DROPBEAR_PORT=2222

$ sudo systemctl enable dropbear
$ sudo systemctl start dropbear

# Standard connection (OpenSSH)
$ ssh spark@<your-ip>
# Emergency connection (Dropbear)
$ ssh spark@<your-ip> -p 2222

# 安裝 earlyoom
$ sudo apt update
$ sudo apt install earlyoom
$ sudo vi /etc/default/earlyoom
EARLYOOM_ARGS="-m 5 -s 10 --avoid 'pipewire|wireplumber|systemd|ssh|journald' --prefer 'vllm|python|triton'"
# This tells earlyoom to intervene when RAM is under 5% AND Swap is under 10%. 
# It will aggressively target vllm or Python scripts over other processes

$ sudo EDITOR=vi systemctl edit earlyoom
### Editing /etc/systemd/system/earlyoom.service.d/override.conf
### Anything between here and the comment below will become the contents of the drop-in file

[Service]
LimitMEMLOCK=infinity
CapabilityBoundingSet=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
AmbientCapabilities=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
MemoryLock=infinity
# 注意:systemd 並沒有 MemoryLock= 這個設定(啟動時會記錄 "Unknown key"),
# 鎖定記憶體實際上是由上面的 LimitMEMLOCK= 處理;此行可移除
OOMScoreAdjust=-1000

### Edits below this comment will be discarded


### /usr/lib/systemd/system/earlyoom.service
# [Unit]
# Description=Early OOM Daemon
# Documentation=man:earlyoom(1) https://github.com/rfjakob/earlyoom
# [Service]
# EnvironmentFile=-/etc/default/earlyoom
# ExecStart=/usr/bin/earlyoom $EARLYOOM_ARGS
# # Run as an unprivileged user with random user id
# DynamicUser=true
# # Allow killing processes and calling mlockall()
# AmbientCapabilities=CAP_KILL CAP_IPC_LOCK
# # We don't need write access anywhere
# ProtectSystem=strict
# # We don't need /home at all, make it inaccessible
# ProtectHome=true
# # earlyoom never exits on it's own, so have systemd
# # restart it should it get killed for some reason.
# Restart=always
# # set memory limits and max tasks number
# TasksMax=10
# MemoryMax=50M
# [Install]
# WantedBy=multi-user.target

$ sudo systemctl daemon-reload
$ sudo systemctl restart earlyoom
$ sudo systemctl status earlyoom

# 查詢 log
$ journalctl -u earlyoom -f

$ cat /etc/systemd/system/earlyoom.service.d/override.conf 
[Service]
LimitMEMLOCK=infinity
CapabilityBoundingSet=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
AmbientCapabilities=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
MemoryLock=infinity
OOMScoreAdjust=-1000
$ ps aux |grep earlyoom
earlyoom   80791  0.0  0.0   2288  1688 ?        SLs  11:55   0:00 /usr/bin/earlyoom -m 5 -s 10 --avoid pipewire|wireplumber|systemd|ssh|journald --prefer vllm|python|triton
$ cat /proc/$(pgrep earlyoom)/oom_score_adj
-1000

2026年2月9日 星期一

DGX Spark 安裝 piper tts

參考 https://github.com/OHF-Voice/piper1-gpl/tree/main
參考 https://huggingface.co/csukuangfj/vits-piper-zh_CN-huayan-medium
參考 https://huggingface.co/csukuangfj/vits-piper-zh_CN-huayan-x_low

$ uv venv --python 3.13
$ source .venv/bin/activate
$ uv pip install piper-tts
$ uv pip install g2pw
$ uv pip install requests
$ uv pip install torch --index-url https://download.pytorch.org/whl/cu130
$ uv pip install unicode_rbnf
$ uv pip install sentence_stream
$ uv pip install fastapi
$ uv pip install uvicorn
$ uv pip install python-multipart


$ python3 -m piper.download_voices
$ python3 -m piper.download_voices zh_CN-huayan-x_low --download-dir models
$ ls models/
$ python3 -m piper.download_voices zh_CN-huayan-medium --download-dir models
$ python3 -m piper.download_voices zh_CN-chaowen-medium --download-dir models
$ python3 -m piper.download_voices zh_CN-xiao_ya-medium --download-dir models
$ python3 -m piper.download_voices en_US-lessac-medium --download-dir models

# for voice.synthesize, 用 curl 可以成功,但 open-webui 測試失敗
$ curl -X POST http://127.0.0.1:8100/v1/audio/speech \
     -H "Content-Type: application/json" \
     -d '{"input": "你好,這是一段測試語音。"}' \
     --output output.pcm
$ ffmpeg -f s16le -ar 16000 -ac 1 -i output.pcm \
       -codec:a libmp3lame -b:a 128k output.mp3

# voice.synthesize_wav, 用 curl 可以成功,並且 open-webui 測試成功
$ curl -X POST http://127.0.0.1:8100/v1/audio/speech \
  -H "Content-Type: application/json" \
  -d '{"input":"你好,這是 synthesize_wav 測試"}' \
  --output output.wav

2026年2月6日 星期五

DGX Spark 使用 Qwen3-ASR-1.7B

參考 https://huggingface.co/Qwen/Qwen3-ASR-1.7B

$ export HF_TOKEN=<YOUR_HF_TOKEN>  # 請勿將真實 token 寫入公開筆記;原貼出的 token 已外洩,應立即到 Hugging Face 撤銷
$ hf download Qwen/Qwen3-ASR-1.7B --local-dir Qwen3-ASR-1.7B
$ hf download Qwen/Qwen3-ASR-0.6B --local-dir Qwen3-ASR-0.6B

$ uv init qwen3-asr
$ cd qwen3-asr/
$ rm .python-version
# 參考 cu130 版本資訊 https://download.pytorch.org/whl/cu130/
$ uv venv --python 3.13
$ source .venv/bin/activate
$ uv pip install -e .
$ uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
$ uv pip install qwen-asr
$ uv pip uninstall torch torchvision torchaudio
$ uv pip install torch==2.9.1 --index-url https://download.pytorch.org/whl/cu130
$ uv pip install torchvision==0.24.1 --index-url https://download.pytorch.org/whl/cu130
$ uv pip install torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130
# 別想著直接使用下列命令安裝,會導致之後的安裝 flash-attn 失敗
# uv pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130

# 安裝 flash-attn, 使用 wheels 要求 python 3.10
# uv pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3%2Bcu130torch2.9-cp310-cp310-linux_aarch64.whl
# 安裝 flash-attn 使用編譯安裝,要求使用 torch 2.9.1
$ uv pip install numpy ninja packaging setuptools wheel
$ export TORCH_CUDA_ARCH_LIST="12.1"
$ export CUDA_HOME=/usr/local/cuda-13.0
$ FLASH_ATTENTION_FORCE_BUILD=TRUE MAX_JOBS=4 uv pip install flash-attn --no-build-isolation --no-cache-dir
# 因為很耗記憶體,所以時常會出現 Out of memory, 可以在很長的 log 中找到 Killed
# 或者使用下列兩個命令查詢確認
# $ sudo dmesg | grep -i "killed process"
# $ journalctl -k | grep -i oom

# 安裝 qwen-asr, 因為內建 vllm 會去找 CUDA 12 版本,所以失敗
# git clone https://github.com/QwenLM/Qwen3-ASR.git
# uv pip install -e ./Qwen3-ASR[vllm] --no-build-isolation -v
# 直接安裝 vllm
$ uv pip install https://github.com/vllm-project/vllm/releases/download/v0.14.0/vllm-0.14.0+cu130-cp38-abi3-manylinux_2_35_aarch64.whl

$ qwen-asr-serve /mnt/models/Qwen3-ASR-0.6B \
  --allowed-local-media-path /home/spark/DiskD/audio_llm \
  --gpu-memory-utilization 0.5 \
  --host 0.0.0.0 --port 8000
$ curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -X POST \
  -d '{
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "audio_url",
            "audio_url": {
              "url": "file:///home/spark/DiskD/audio_llm/breeze-asr/output.wav"
            }
          },
          {
            "type": "audio_url",
            "audio_url": {
              "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"
            }
          }
        ]
      }
    ]
  }' | jq -r '.choices[0].message.content'

$ uvicorn test_c:app --host 0.0.0.0 --port 8000
$ curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
  -F "file=@/home/spark/DiskD/audio_llm/breeze-asr/output.wav" \
  -F "model_name=gpt-4o-mini-transcribe" \
  -F "language=zh" | jq
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  525k  100  2955  100  522k   2355   416k  0:00:01  0:00:01 --:--:--  418k
{
  "results": [
    {
      "language": "Chinese",
      "text": "说书相声这种东西,人靠一张嘴,通过语言的结构,把看官听众吸引到故事里面。在演出的时候,要求你身上的每个动作都必须要有含义。",
      "time_stamps": {
        "items": [
          {
            "text": "说",
            "start_time": 0.08,
            "end_time": 0.32
          },
          {
            "text": "书",
            "start_time": 0.32,
            "end_time": 0.48
          },
          {
            "text": "相",
            "start_time": 0.48,
            "end_time": 0.72
          },
          {
            "text": "声",
            "start_time": 0.72,
            "end_time": 1.04
          },
          ........
          {
            "text": "有",
            "start_time": 11.52,
            "end_time": 11.6
          },
          {
            "text": "含",
            "start_time": 11.6,
            "end_time": 11.84
          },
          {
            "text": "义",
            "start_time": 11.84,
            "end_time": 12.08
          }
        ]
      }
    }
  ]
}

$ curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
  -F "file_url=https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav" \
  -F "model_name=gpt-4o-mini-transcribe" \
  -F "language=en" | jq
$ curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
  -F "file=@/home/spark/DiskD/audio_llm/breeze-asr/output.wav" \
  -F "file_url=https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav" \
  -F "model_name=gpt-4o-mini-transcribe" | jq

DGX Spark 使用 Breeze-ASR-25

參考 https://huggingface.co/MediaTek-Research/Breeze-ASR-25

$ uv init breeze-asr
$ cd breeze-asr/
$ rm .python-version
# 在 pyproject.toml 增加下列文件,才可順利安裝 torch 等套件
$ vi pyproject.toml
[[tool.uv.index]]
name = "pytorch-cu130"
url = "https://download.pytorch.org/whl/cu130"
explicit = true  # 關鍵:這會阻止一般套件跑去 PyTorch 倉庫找

[tool.uv.sources]
torch = { index = "pytorch-cu130" }
torchaudio = { index = "pytorch-cu130" }
torchvision = { index = "pytorch-cu130" }
torchcodec = { index = "pytorch-cu130" }

$ uv venv --python 3.13
$ source .venv/bin/activate
$ uv pip install -e .

# 安裝時有順序區別,torch 要比 datasets[audio] 先安裝
$ uv add torch torchaudio torchcodec
$ uv add transformers
$ uv add datasets[audio]
$ uv add accelerate
$ sudo apt update
$ sudo apt install -y ffmpeg libavutil-dev

# 照著文件執行會出現下列錯誤,google AI 和 chatgpt 都建議直接使用 model
KeyError: 'num_frames'

# 若使用下列程式碼,輸出的文件不會分段,全部混在一起
result = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True
)[0]

# 下面為最後的程式,可產生如下的輸出
===== TEXT + TIME =====
0.00s 所說相生這種東西人靠一張嘴
3.28s 通過語言的結構把看官聽眾吸引到故事裡面
7.72s 在演出的時候
9.04s 要求你身上的每個動作都必須要有含義

# Transcribe a local WAV file with Breeze-ASR-25 (Whisper architecture) and
# print each decoded segment prefixed with its start time in seconds.
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

audio_path = "/home/spark/DiskD/audio_llm/breeze-asr/output.wav"

# load audio
waveform, sr = torchaudio.load(audio_path)
waveform = waveform.mean(dim=0)  # downmix multi-channel audio to mono

if sr != 16000:
    # Whisper-family models expect 16 kHz input; resample anything else
    waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

# load model
processor = WhisperProcessor.from_pretrained("/mnt/models/Breeze-ASR-25")
model = WhisperForConditionalGeneration.from_pretrained(
    "/mnt/models/Breeze-ASR-25"
).to("cuda")
model.eval()

# preprocess
inputs = processor(
    waveform,
    sampling_rate=16000,
    return_tensors="pt"
)

# inference
with torch.no_grad():
    outputs = model.generate(
        inputs.input_features.to("cuda"),
        return_timestamps=True,          # emit <|t.tt|> timestamp tokens in the sequence
        return_dict_in_generate=True,
        output_scores=True,
    )

# --------------------------------------------------
# 5. Decode tokens
# --------------------------------------------------
token_ids = outputs["sequences"][0].tolist()
# NOTE(review): `tokens` is computed but never used below — kept for debugging?
tokens = processor.tokenizer.convert_ids_to_tokens(token_ids)

# Whisper timestamp-token setup: every token id >= timestamp_begin encodes a
# time offset of (id - timestamp_begin) * time_precision seconds
timestamp_begin = processor.tokenizer.convert_tokens_to_ids("<|0.00|>")
time_precision = 0.02  # Whisper: 20ms

current_time = None  # start time of the segment currently being accumulated
buffer = []          # text token ids collected since the last timestamp token

print("===== TEXT + TIME =====")

for tid in token_ids:
    # timestamp token
    if tid >= timestamp_begin:
        # flush the previous segment first
        if buffer and current_time is not None:
            text = processor.tokenizer.decode(
                buffer,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            if text.strip():
                print(f"{current_time:.2f}s\t{text}")

        # update the running start time for the next segment
        current_time = (tid - timestamp_begin) * time_precision
        buffer = []
    else:
        buffer.append(tid)

# flush the final segment (sequence may end without a closing timestamp pair)
if buffer and current_time is not None:
    text = processor.tokenizer.decode(
        buffer,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )
    if text.strip():
        print(f"{current_time:.2f}s\t{text}")

2026年2月3日 星期二

DGX Spark 安裝 Riva ASR

參考文件 https://docs.nvidia.com/nim/riva/asr/latest/overview.html

$ export NGC_API_KEY="<YOUR_NGC_API_KEY>"  # 請勿公開真實 API key;原貼出的 key 已外洩,應至 NGC 撤銷重發
$ echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin

parakeet-0-6b-ctc-en-us 缺 arm64 版本
parakeet-1-1b-ctc-en-us ok
parakeet-tdt-0.6b-v2 缺 arm64 版本
parakeet-1-1b-rnnt-multilingual 失敗
parakeet-ctc-0.6b-zh-cn 缺 arm64 版本
parakeet-ctc-0.6b-zh-tw 缺 arm64 版本

$ export CONTAINER_ID=parakeet-ctc-0.6b-zh-tw
$ export NIM_TAGS_SELECTOR="mode=str,vad=silero,diarizer=sortformer"

$ docker run -it --rm --name=$CONTAINER_ID \
   --gpus '"device=0"' \
   --shm-size=8GB \
   -e NGC_API_KEY \
   -e NIM_HTTP_API_PORT=9001 \
   -e NIM_GRPC_API_PORT=50052 \
   -p 9001:9001 \
   -p 50052:50052 \
   -e NIM_TAGS_SELECTOR \
   nvcr.io/nim/nvidia/$CONTAINER_ID:latest

# Create the cache directory on the host machine:
$ export LOCAL_NIM_CACHE=$(pwd)/cache/nim
$ mkdir -p $LOCAL_NIM_CACHE
$ chmod 777 $LOCAL_NIM_CACHE

# Set the appropriate values
$ export CONTAINER_ID=parakeet-1-1b-ctc-en-us
$ export NIM_TAGS_SELECTOR="name=parakeet-1-1b-ctc-en-us,mode=all,vad=silero,diarizer=sortformer,model_type=prebuilt"

# Run the container with the cache directory mounted in the appropriate location:
$ docker run -it --rm --name=$CONTAINER_ID \
      --gpus '"device=0"' \
      --shm-size=8GB \
      -e NGC_API_KEY \
      -e NIM_TAGS_SELECTOR \
      -e NIM_HTTP_API_PORT=9001 \
      -e NIM_GRPC_API_PORT=50052 \
      -p 9001:9001 \
      -p 50052:50052 \
      -v $LOCAL_NIM_CACHE:/opt/nim/.cache \
      nvcr.io/nim/nvidia/$CONTAINER_ID:latest
   
$ curl -X 'GET' 'http://localhost:9001/v1/health/ready'
{"status":"ready"}

$ uv init riva-client
$ cd riva-client/
$ rm .python-version
$ uv venv --python 3.10
$ uv add nvidia-riva-client
$ git clone https://github.com/nvidia-riva/python-clients.git

$ sudo apt-get install python3-pip
$ pip install -U nvidia-riva-client
$ git clone https://github.com/nvidia-riva/python-clients.git

$ docker cp $CONTAINER_ID:/opt/riva/wav/zh-TW_sample.wav .
$ python3 python-clients/scripts/asr/transcribe_file.py \
   --server 0.0.0.0:50052 \
   --list-models
Available ASR models
{'en-US': [{'model': ['parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer']}]}
$ python3 python-clients/scripts/asr/transcribe_file_offline.py \
   --server 0.0.0.0:50052 \
   --list-models
Available ASR models
{}
$ curl -s http://0.0.0.0:9001/v1/audio/transcriptions -F language=zh-TW \
   -F file="@zh-TW_sample.wav"
   
$ docker stop $CONTAINER_ID
$ docker rm $CONTAINER_ID

https://docs.nvidia.com/nim/riva/asr/latest/support-matrix.html#parakeet-0-6b-ctc-taiwanese-mandarin-english
Available modes include streaming low latency (str), 
streaming high throughput (str-thr), and offline (ofl). 
Setting the mode to (all) deploys all inference modes where applicable.
The profiles with silero and sortformer use Silero VAD to detect start and end of utterance 
and Sortformer SD for speaker diarization.

CONTAINER_ID=parakeet-ctc-0.6b-zh-tw
NIM_TAGS_SELECTOR
mode=ofl,vad=default,diarizer=disabled
mode=str,vad=default,diarizer=disabled
mode=str-thr,vad=default,diarizer=disabled
mode=all,vad=default,diarizer=disabled
mode=ofl,vad=silero,diarizer=sortformer
mode=str,vad=silero,diarizer=sortformer
mode=str-thr,vad=silero,diarizer=sortformer
mode=all,vad=silero,diarizer=sortformer

DGX Spark 安裝 Riva TTS

參考文件 https://docs.nvidia.com/nim/riva/tts/latest/overview.html

$ export NGC_API_KEY="<YOUR_NGC_API_KEY>"  # 請勿公開真實 API key;原貼出的 key 已外洩,應至 NGC 撤銷重發
$ echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin

$ export CONTAINER_ID=magpie-tts-multilingual
$ export NIM_TAGS_SELECTOR=name=magpie-tts-multilingual

$ docker run -it --rm --name=$CONTAINER_ID \
   --gpus '"device=0"' \
   --shm-size=8GB \
   -e NGC_API_KEY \
   -e NIM_HTTP_API_PORT=9000 \
   -e NIM_GRPC_API_PORT=50051 \
   -p 9000:9000 \
   -p 50051:50051 \
   -e NIM_TAGS_SELECTOR \
   nvcr.io/nim/nvidia/$CONTAINER_ID:latest

# Create the cache directory on the host machine
$ export LOCAL_NIM_CACHE=~/.cache/nim
$ mkdir -p $LOCAL_NIM_CACHE
$ chmod 777 $LOCAL_NIM_CACHE

# Set appropriate value for container ID
$ export CONTAINER_ID=magpie-tts-multilingual

# Set the appropriate values for NIM_TAGS_SELECTOR.
$ export NIM_TAGS_SELECTOR="name=magpie-tts-multilingual,model_type=prebuilt"

# Run the container with the cache directory mounted in the appropriate location
$ docker run -it --rm --name=$CONTAINER_ID \
   --gpus '"device=0"' \
   --shm-size=8GB \
   -e NGC_API_KEY \
   -e NIM_TAGS_SELECTOR \
   -e NIM_HTTP_API_PORT=9000 \
   -e NIM_GRPC_API_PORT=50051 \
   -p 9000:9000 \
   -p 50051:50051 \
   -v $LOCAL_NIM_CACHE:/opt/nim/.cache \
   nvcr.io/nim/nvidia/$CONTAINER_ID:latest
   
$ curl -X 'GET' 'http://localhost:9000/v1/health/ready'
{"status":"ready"}

$ sudo apt-get install python3-pip
$ pip install -U nvidia-riva-client
$ cd $HOME
$ git clone https://github.com/nvidia-riva/python-clients.git

$ curl -sS http://localhost:9000/v1/audio/list_voices | jq
{
   "en-US,es-US,fr-FR,de-DE,zh-CN": {
      "voices": [
            "Magpie-Multilingual.ZH-CN.HouZhen",
            "Magpie-Multilingual.ZH-CN.Siwei",
            "Magpie-Multilingual.ZH-CN.Louise",
            "Magpie-Multilingual.ZH-CN.Pascal",
            "Magpie-Multilingual.ZH-CN.Isabela",
            "Magpie-Multilingual.ZH-CN.Diego",
            "Magpie-Multilingual.ZH-CN.Ray",
            "Magpie-Multilingual.ZH-CN.Mia",
            "Magpie-Multilingual.ZH-CN.Aria",
            "Magpie-Multilingual.ZH-CN.Long",
            "Magpie-Multilingual.ZH-CN.North",
      ]
   }
}

$ curl -sS http://localhost:9000/v1/audio/synthesize --fail-with-body \
   -F language=zh-CN  \
   -F text="說書,相聲這種東西,人靠一張嘴,通過語言的結構,把看官聽眾吸引到故事里面。在演出的時候要求你身上的每個動作,都必須要有含義。" \
   -F voice=Magpie-Multilingual.ZH-CN.Mia \
   --output output.wav
$ curl -sS http://localhost:9000/v1/audio/synthesize_online --fail-with-body \
   -F language=zh-CN  \
   -F text="說書,相聲這種東西,人靠一張嘴,通過語言的結構,把看官聽眾吸引到故事里面。在演出的時候要求你身上的每個動作,都必須要有含義。" \
   -F voice=Magpie-Multilingual.ZH-CN.Ray \
   -F sample_rate_hz=22050 \
   --output output.raw
$ sudo apt update
$ sudo apt install -y sox
$ sox -b 16 -e signed -c 1 -r 22050 output.raw output.wav

Available Voices
Magpie-Multilingual.ZH-CN.Aria
Magpie-Multilingual.ZH-CN.Diego
Magpie-Multilingual.ZH-CN.HouZhen
Magpie-Multilingual.ZH-CN.Isabela
Magpie-Multilingual.ZH-CN.Long
Magpie-Multilingual.ZH-CN.Louise
Magpie-Multilingual.ZH-CN.Mia
Magpie-Multilingual.ZH-CN.North
Magpie-Multilingual.ZH-CN.Pascal
Magpie-Multilingual.ZH-CN.Ray
Magpie-Multilingual.ZH-CN.Siwei

$ docker stop $CONTAINER_ID
$ docker rm $CONTAINER_ID

DGX Spark 安裝使用 NIM

參考文件 https://build.nvidia.com/spark/nim-llm/overview

下載 ngc
$ wget --content-disposition \
https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/4.11.1/files/ngccli_arm64.zip \
-O ngccli_arm64.zip && unzip ngccli_arm64.zip
$ find ngc-cli/ -type f -exec md5sum {} + | LC_ALL=C sort | md5sum -c ngc-cli.md5
$ sha256sum ngccli_arm64.zip
9c285ef8250c30c21e70a836700121945b296a8441005b829179bde6106ce6d1
$ chmod u+x ngc-cli/ngc
$ echo "export PATH=\"\$PATH:$(pwd)/ngc-cli\"" >> ~/.bash_profile && source ~/.bash_profile

$ ngc config set
Enter API key [no-apikey]. Choices: [<VALID_APIKEY>, 'no-apikey']: 
Enter CLI output format type [ascii]. Choices: ['ascii', 'csv', 'json']: 
Validating configuration...
Successfully validated configuration.
Saving configuration...
Successfully saved NGC configuration to /home/spark/.ngc/config
$ docker login nvcr.io
Username: $oauthtoken
Password: <YOUR_NGC_API_KEY>

$ newgrp docker
$ export NGC_API_KEY="<YOUR_NGC_API_KEY>"  # 請勿公開真實 API key;原貼出的 key 已外洩,應至 NGC 撤銷重發
$ echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin

$ export CONTAINER_NAME="nim-llm-demo"
$ export IMG_NAME="nvcr.io/nim/meta/llama-3.1-8b-instruct-dgx-spark:latest"
$ export LOCAL_NIM_CACHE=$(pwd)/cache/nim
$ export LOCAL_NIM_WORKSPACE=~/workspace
$ mkdir -p "$LOCAL_NIM_WORKSPACE"
$ chmod -R a+w "$LOCAL_NIM_WORKSPACE"
$ mkdir -p "$LOCAL_NIM_CACHE"
$ chmod -R a+w "$LOCAL_NIM_CACHE"

$ docker run -it --rm --name=$CONTAINER_NAME \
  --gpus all \
  --shm-size=16GB \
  -e NGC_API_KEY=$NGC_API_KEY \
  -v "$LOCAL_NIM_CACHE:/opt/nim/.cache" \
  -v "$LOCAL_NIM_WORKSPACE:/opt/nim/workspace" \
  -p 8000:8000 \
  $IMG_NAME

$ curl -X 'POST' \
    'http://0.0.0.0:8000/v1/chat/completions' \
    -H 'accept: application/json' \
    -H 'Content-Type: application/json' \
    -d '{
      "model": "meta/llama-3.1-8b-instruct",
      "messages": [
        {
          "role":"system",
          "content":"detailed thinking on"
        },
        {
          "role":"user",
          "content":"Can you write me a song?"
        }
      ],
      "top_p": 1,
      "n": 1,
      "max_tokens": 15,
      "frequency_penalty": 1.0,
      "stop": ["hello"]

    }'
    
$ docker stop $CONTAINER_NAME
$ docker rm $CONTAINER_NAME