# 參考 https://forums.developer.nvidia.com/t/mitigating-oom-system-freezes-on-uma-based-single-board-computers/362769
# 另外可參考 DGX Spark 之溫度 https://yingrenn.blogspot.com/2026/02/dgx-spark.html
# 安裝輕量級的 Dropbear SSH
$ sudo apt update && sudo apt install dropbear
$ sudo vi /etc/default/dropbear
NO_START=0
DROPBEAR_PORT=2222
$ sudo systemctl enable dropbear
$ sudo systemctl start dropbear
# Standard connection (OpenSSH)
$ ssh spark@<your-ip>
# Emergency connection (Dropbear)
$ ssh spark@<your-ip> -p 2222
# 安裝 earlyoom
$ sudo apt update
$ sudo apt install earlyoom
$ sudo vi /etc/default/earlyoom
EARLYOOM_ARGS="-m 5 -s 10 --avoid 'pipewire|wireplumber|systemd|ssh|journald' --prefer 'vllm|python|triton'"
# This tells earlyoom to intervene once available RAM is below 5% of total AND free swap is below 10% of total.
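# It will preferentially kill processes whose names match vllm, python, or triton, and avoid those matching the --avoid pattern.
# Note: the 'ssh' pattern matches sshd but not the Dropbear server (process name "dropbear"); add 'dropbear' to --avoid if the emergency SSH server should be protected as well.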
$ sudo EDITOR=vi systemctl edit earlyoom
### Editing /etc/systemd/system/earlyoom.service.d/override.conf
### Anything between here and the comment below will become the contents of the drop-in file
[Service]
LimitMEMLOCK=infinity
CapabilityBoundingSet=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
AmbientCapabilities=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
MemoryLock=infinity
OOMScoreAdjust=-1000
### Edits below this comment will be discarded
### /usr/lib/systemd/system/earlyoom.service
# [Unit]
# Description=Early OOM Daemon
# Documentation=man:earlyoom(1) https://github.com/rfjakob/earlyoom
#
# [Service]
# EnvironmentFile=-/etc/default/earlyoom
# ExecStart=/usr/bin/earlyoom $EARLYOOM_ARGS
# # Run as an unprivileged user with random user id
# DynamicUser=true
# # Allow killing processes and calling mlockall()
# AmbientCapabilities=CAP_KILL CAP_IPC_LOCK
# # We don't need write access anywhere
# ProtectSystem=strict
# # We don't need /home at all, make it inaccessible
# ProtectHome=true
# # earlyoom never exits on it's own, so have systemd
# # restart it should it get killed for some reason.
# Restart=always
# # set memory limits and max tasks number
# TasksMax=10
# MemoryMax=50M
#
# [Install]
# WantedBy=multi-user.target
$ sudo systemctl daemon-reload
$ sudo systemctl restart earlyoom
$ sudo systemctl status earlyoom
# 查詢 log
$ journalctl -u earlyoom -f
$ cat /etc/systemd/system/earlyoom.service.d/override.conf
[Service]
LimitMEMLOCK=infinity
CapabilityBoundingSet=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
AmbientCapabilities=CAP_IPC_LOCK CAP_SYS_NICE CAP_KILL
MemoryLock=infinity
OOMScoreAdjust=-1000
$ ps aux |grep earlyoom
earlyoom 80791 0.0 0.0 2288 1688 ? SLs 11:55 0:00 /usr/bin/earlyoom -m 5 -s 10 --avoid pipewire|wireplumber|systemd|ssh|journald --prefer vllm|python|triton
$ cat /proc/$(pgrep earlyoom)/oom_score_adj
-1000
沒有留言:
張貼留言