網頁

2026年3月12日 星期四

DGX Spark 如何避免 OOM 當機

# 參考 https://forums.developer.nvidia.com/t/mitigating-oom-system-freezes-on-uma-based-single-board-computers/362769
# 另外可參考 DGX Spark 之溫度 https://yingrenn.blogspot.com/2026/02/dgx-spark.html

# 安裝輕量級的 Dropbear SSH，作為系統記憶體吃緊時的緊急備援連線 (使用 port 2222)
$ sudo apt update && sudo apt install dropbear
$ sudo vi /etc/default/dropbear
NO_START=0
DROPBEAR_PORT=2222

$ sudo systemctl enable dropbear
$ sudo systemctl start dropbear

# Standard connection (OpenSSH)
$ ssh spark@<your-ip>
# Emergency connection (Dropbear)
$ ssh spark@<your-ip> -p 2222

# 安裝 earlyoom
$ sudo apt update
$ sudo apt install earlyoom
$ sudo vi /etc/default/earlyoom
EARLYOOM_ARGS="-m 5 -s 10 --avoid 'pipewire|wireplumber|systemd|ssh|journald' --prefer 'vllm|python|triton'"
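# This tells earlyoom to start killing processes when available RAM drops below 5% AND free swap drops below 10%.
# It prefers to kill processes matching vllm, python, or triton, and avoids killing those matching pipewire, wireplumber, systemd, ssh, or journald.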

$ sudo EDITOR=vi systemctl edit earlyoom
### Editing /etc/systemd/system/earlyoom.service.d/override.conf
### Anything between here and the comment below will become the contents of the drop-in file

[Service]
LimitMEMLOCK=infinity
CapabilityBoundingSet=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
AmbientCapabilities=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
MemoryLock=infinity
OOMScoreAdjust=-1000

### Edits below this comment will be discarded


### /usr/lib/systemd/system/earlyoom.service
# [Unit]
# Description=Early OOM Daemon
# Documentation=man:earlyoom(1) https://github.com/rfjakob/earlyoom
# [Service]
# EnvironmentFile=-/etc/default/earlyoom
# ExecStart=/usr/bin/earlyoom $EARLYOOM_ARGS
# # Run as an unprivileged user with random user id
# DynamicUser=true
# # Allow killing processes and calling mlockall()
# AmbientCapabilities=CAP_KILL CAP_IPC_LOCK
# # We don't need write access anywhere
# ProtectSystem=strict
# # We don't need /home at all, make it inaccessible
# ProtectHome=true
# # earlyoom never exits on it's own, so have systemd
# # restart it should it get killed for some reason.
# Restart=always
# # set memory limits and max tasks number
# TasksMax=10
# MemoryMax=50M
# [Install]
# WantedBy=multi-user.target

$ sudo systemctl daemon-reload
$ sudo systemctl restart earlyoom
$ sudo systemctl status earlyoom

# 查詢 log
$ journalctl -u earlyoom -f

$ cat /etc/systemd/system/earlyoom.service.d/override.conf 
[Service]
LimitMEMLOCK=infinity
CapabilityBoundingSet=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
AmbientCapabilities=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
MemoryLock=infinity
OOMScoreAdjust=-1000
$ ps aux |grep earlyoom
earlyoom   80791  0.0  0.0   2288  1688 ?        SLs  11:55   0:00 /usr/bin/earlyoom -m 5 -s 10 --avoid pipewire|wireplumber|systemd|ssh|journald --prefer vllm|python|triton
$ cat /proc/$(pgrep earlyoom)/oom_score_adj
-1000