Technical notes: Linux troubleshooting.
Handbook for diagnosing and remediating common Linux issues in production/DevOps environments.
# System load averages: 1-min, 5-min, 15-min
# Rule of thumb: load > number of CPU cores = overloaded
uptime
# Interactive process viewer — shows CPU%, MEM%, PID, command
top
# Inside top:
# M → sort by memory usage
# P → sort by CPU usage
# k → kill a process by PID
# q → quit
# Enhanced top with color, tree view, and mouse support (install if missing)
htop
# Top 5 processes by CPU or memory (non-interactive snapshot)
# head -n 6 = 1 header line + 5 process rows
ps aux --sort=-%cpu | head -n 6
ps aux --sort=-%mem | head -n 6
# Full process list with PPID (parent PID) — useful for tracing spawned processes
ps -ef
# CPU core count — calibrate load average thresholds
nproc
# Human-readable summary: total, used, free, buff/cache, available
free -h
# Virtual memory stats every 1 second: r=run queue, si/so=swap in/out, us/sy/id=CPU breakdown
# High si/so = swapping heavily → memory pressure
vmstat 1
# Per-process memory snapshot sorted by RSS (resident set size)
ps aux --sort=-rss | head -n 10
# Identify OOM (Out Of Memory) kills in kernel log
dmesg | grep -i "oom\|killed process"
# Show logs for a specific service (add -f to follow new entries as a live stream)
journalctl -u <service_name>
journalctl -u <service_name> -f
# Show recent errors with context (-x = explanations, -e = jump to end)
journalctl -xe
# Filter by time range
journalctl -u <service_name> --since "1 hour ago"
journalctl -u <service_name> --since "2024-01-01 00:00" --until "2024-01-01 06:00"
# Kernel messages (hardware errors, OOM, driver issues)
dmesg -T | tail -50
dmesg -T | grep -i "error\|warn\|fail"
# Find PID by process name
pgrep <process_name>
pgrep -a <process_name> # also show full command line
# Show process tree — visualize parent/child relationships
pstree -p
# Trace system calls in real time — diagnose hangs, permission errors, missing files
strace -p <PID>
strace -p <PID> -e trace=network # filter to network calls only
strace -p <PID> -e trace=file # filter to file I/O only
# Graceful → forceful kill escalation
kill <PID> # SIGTERM (15): ask process to shut down cleanly
kill -9 <PID> # SIGKILL (9): force kill — use when SIGTERM is ignored
pkill <process_name> # kill by name (SIGTERM)
killall <process_name> # kill all instances by name
# Check if process is still alive after kill
ps -p <PID>
# Filesystem usage (human-readable) — spot full or near-full mounts
df -h
# Alert on filesystems over 80% full
df -h | awk '$5+0 > 80 { print $0 }'
# Directory size summary — find what's consuming space
du -sh /path
du -sh /* 2>/dev/null | sort -rh | head -20 # top space consumers from /
# I/O performance stats — check read/write throughput and await time
iostat -x 1 # extended stats every 1 second
# High %util (>80%) or high await = I/O bottleneck
# Real-time I/O per process (requires iotop)
iotop -o # show only processes doing I/O
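# List open files on a mount point — find what's blocking umount
# (+D walks the whole directory tree and can be slow on large mounts)
lsof +D /mount/point
lsof /mount/point # faster: when given a mount point, lists all open files on that filesystem
fuser -vm /mount/point # alternative: processes using the mounted filesystem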
# Hostname and OS info
hostnamectl
# Network interfaces and IP addresses
ip a
# Routing table — check default gateway and routes
ip r
# Active listening ports (t=TCP, u=UDP, l=listening, n=numeric, p=process)
# Preferred over netstat on modern systems
ss -tulnp
# Legacy equivalent (may not be installed by default)
netstat -tulnp
# Test connectivity and latency
ping -c 4 <host>
# Trace network path to a host — identify where packets drop
traceroute <host>
# DNS resolution
dig <domain> # detailed DNS query with TTL
nslookup <domain> # quick lookup (less detail)
dig +short <domain> # IP only
# Test if a port is reachable (replaces telnet)
nc -zv <host> <port>
# Capture packets on an interface (requires root)
tcpdump -i eth0 -n port 80
tcpdump -i any -n 'host <IP>'
# Check service status (shows recent log lines too)
systemctl status <service_name>
systemctl start <service_name>
systemctl stop <service_name>
systemctl restart <service_name> # stop + start
systemctl reload <service_name> # reload config without full restart (if supported)
# Enable/disable auto-start on boot
systemctl enable <service_name>
systemctl disable <service_name>
# List all failed services — first stop after an incident
systemctl --failed
# Reload systemd after editing a unit file
systemctl daemon-reload
# Locate binary path — verify which version of a command is used
which <command>
type <command> # also shows aliases and builtins
# List open files for a process or file path
lsof -p <PID> # all files opened by a process
lsof -c <process_name> # by process name
lsof /path/to/file # who has this file open
lsof -i :<port> # process using a specific port
# Trace system calls (file I/O focus) — diagnose "permission denied" or "no such file"
strace -p <PID> -e trace=file
# File permissions
chmod +x <file> # make executable
chmod 644 <file> # rw-r--r-- (typical config file)
chmod 600 <file> # rw------- (private key, .env)
chmod -R 755 /path/to/dir # recursive: rwxr-xr-x
# Ownership
chown <user>:<group> <file>
chown -R <user>:<group> /path/to/dir
# Check SELinux/AppArmor denials if permissions look correct but still denied
ausearch -m avc -ts recent # SELinux audit denials
dmesg -T | grep -i 'apparmor.*denied' # AppArmor denials (kernel log)
# Debian / Ubuntu (apt)
apt update # refresh package index
apt upgrade # upgrade all installed packages
apt install <package_name>
apt remove <package_name> # remove package, keep config
apt purge <package_name> # remove package + config files
apt autoremove # remove unused dependencies
# Search for a package
apt search <keyword>
apt show <package_name> # show version, deps, description
# RHEL / CentOS / Amazon Linux (dnf/yum)
dnf update
dnf install <package_name>
dnf remove <package_name>
dnf search <keyword>