KISS server monitoring tool with email alerts
go monitoring

refactor: split packages and add alerts (journalctl, reboot)

+1069 -418
-3
.dockerignore
··· 1 - .git 2 - *.md 3 - .tangled
···
+14
.servmon.example.yaml
··· 20 failure_threshold: 20 21 check_interval: 1m0s 22 cooldown: 15m0s 23 email: 24 smtp_server: smtp.example.com 25 smtp_port: 587
··· 20 failure_threshold: 20 21 check_interval: 1m0s 22 cooldown: 15m0s 23 + journalctl: 24 + enabled: true 25 + check_interval: 5m0s 26 + lookback_period: 5m0s 27 + error_threshold: 10 28 + priorities: 29 + - err 30 + - crit 31 + - alert 32 + - emerg 33 + cooldown: 30m0s 34 + reboot: 35 + enabled: true 36 + uptime_threshold: 10m0s 37 email: 38 smtp_server: smtp.example.com 39 smtp_port: 587
-8
Dockerfile
··· 1 - FROM golang:1.25-alpine AS builder 2 - WORKDIR /build 3 - COPY . . 4 - RUN go build -o servmon . 5 - 6 - FROM alpine:latest 7 - COPY --from=builder /build/servmon /servmon 8 - ENTRYPOINT ["/servmon"]
···
+72 -9
config.go internal/config/config.go
··· 1 - package main 2 3 import ( 4 "fmt" ··· 16 } 17 18 type Thresholds struct { 19 - CPU ThresholdConfig `yaml:"cpu"` 20 - Memory ThresholdConfig `yaml:"memory"` 21 - Disks []DiskConfig `yaml:"disks"` 22 - HTTP HTTP `yaml:"http"` 23 } 24 25 type ThresholdConfig struct { ··· 45 Cooldown time.Duration `yaml:"cooldown"` 46 } 47 48 type Email struct { 49 SMTPServer string `yaml:"smtp_server"` 50 SMTPPort int `yaml:"smtp_port"` ··· 67 return nil 68 } 69 70 - // defaultConfig returns a default configuration for the monitoring service. 71 - func defaultConfig() *Config { 72 return &Config{ 73 AlertThresholds: Thresholds{ 74 CPU: ThresholdConfig{ ··· 98 CheckInterval: 1 * time.Minute, 99 Cooldown: 15 * time.Minute, 100 }, 101 }, 102 Email: Email{ 103 SMTPServer: "smtp.example.com", ··· 110 } 111 } 112 113 - // loadConfig loads a configuration from a file. 114 - func loadConfig(path string) (*Config, error) { 115 data, err := os.ReadFile(path) 116 if err != nil { 117 return nil, fmt.Errorf("error reading config file: %w", err) ··· 194 } 195 } 196 197 // Validate Email configuration 198 if c.Email.SMTPServer == "" { 199 return fmt.Errorf("SMTP server cannot be empty") ··· 210 211 return nil 212 }
··· 1 + package config 2 3 import ( 4 "fmt" ··· 16 } 17 18 type Thresholds struct { 19 + CPU ThresholdConfig `yaml:"cpu"` 20 + Memory ThresholdConfig `yaml:"memory"` 21 + Disks []DiskConfig `yaml:"disks"` 22 + HTTP HTTP `yaml:"http"` 23 + Journalctl JournalctlConfig `yaml:"journalctl"` 24 + Reboot RebootConfig `yaml:"reboot"` 25 } 26 27 type ThresholdConfig struct { ··· 47 Cooldown time.Duration `yaml:"cooldown"` 48 } 49 50 + type JournalctlConfig struct { 51 + Enabled bool `yaml:"enabled"` 52 + CheckInterval time.Duration `yaml:"check_interval"` 53 + LookbackPeriod time.Duration `yaml:"lookback_period"` 54 + ErrorThreshold int `yaml:"error_threshold"` 55 + Priorities []string `yaml:"priorities"` // err, crit, alert, emerg 56 + Cooldown time.Duration `yaml:"cooldown"` 57 + } 58 + 59 + type RebootConfig struct { 60 + Enabled bool `yaml:"enabled"` 61 + UptimeThreshold time.Duration `yaml:"uptime_threshold"` // If uptime < threshold, send reboot notification 62 + } 63 + 64 type Email struct { 65 SMTPServer string `yaml:"smtp_server"` 66 SMTPPort int `yaml:"smtp_port"` ··· 83 return nil 84 } 85 86 + // Default returns a default configuration for the monitoring service. 87 + func Default() *Config { 88 return &Config{ 89 AlertThresholds: Thresholds{ 90 CPU: ThresholdConfig{ ··· 114 CheckInterval: 1 * time.Minute, 115 Cooldown: 15 * time.Minute, 116 }, 117 + Journalctl: JournalctlConfig{ 118 + Enabled: true, 119 + CheckInterval: 5 * time.Minute, 120 + LookbackPeriod: 5 * time.Minute, 121 + ErrorThreshold: 10, 122 + Priorities: []string{"err", "crit", "alert", "emerg"}, 123 + Cooldown: 30 * time.Minute, 124 + }, 125 + Reboot: RebootConfig{ 126 + Enabled: true, 127 + UptimeThreshold: 10 * time.Minute, 128 + }, 129 }, 130 Email: Email{ 131 SMTPServer: "smtp.example.com", ··· 138 } 139 } 140 141 + // Load loads a configuration from a file. 
142 + func Load(path string) (*Config, error) { 143 data, err := os.ReadFile(path) 144 if err != nil { 145 return nil, fmt.Errorf("error reading config file: %w", err) ··· 222 } 223 } 224 225 + // Validate Journalctl configuration 226 + if c.AlertThresholds.Journalctl.Enabled { 227 + if c.AlertThresholds.Journalctl.CheckInterval <= 0 { 228 + return fmt.Errorf("journalctl check interval must be positive") 229 + } 230 + if c.AlertThresholds.Journalctl.LookbackPeriod <= 0 { 231 + return fmt.Errorf("journalctl lookback period must be positive") 232 + } 233 + if c.AlertThresholds.Journalctl.ErrorThreshold <= 0 { 234 + return fmt.Errorf("journalctl error threshold must be positive") 235 + } 236 + if len(c.AlertThresholds.Journalctl.Priorities) == 0 { 237 + return fmt.Errorf("journalctl priorities cannot be empty") 238 + } 239 + if c.AlertThresholds.Journalctl.Cooldown <= 0 { 240 + return fmt.Errorf("journalctl cooldown must be positive") 241 + } 242 + } 243 + 244 + // Validate Reboot configuration 245 + if c.AlertThresholds.Reboot.Enabled { 246 + if c.AlertThresholds.Reboot.UptimeThreshold <= 0 { 247 + return fmt.Errorf("reboot uptime threshold must be positive") 248 + } 249 + } 250 + 251 // Validate Email configuration 252 if c.Email.SMTPServer == "" { 253 return fmt.Errorf("SMTP server cannot be empty") ··· 264 265 return nil 266 } 267 + 268 + // GetDiskPaths returns a comma-separated list of monitored disk paths 269 + func (c *Config) GetDiskPaths() string { 270 + var paths []string 271 + for _, disk := range c.AlertThresholds.Disks { 272 + paths = append(paths, disk.Path) 273 + } 274 + return strings.Join(paths, ", ") 275 + }
-45
email.go
··· 1 - package main 2 - 3 - import ( 4 - "fmt" 5 - "log" 6 - 7 - "github.com/wneessen/go-mail" 8 - ) 9 - 10 - // sendEmail sends an alert email using the configuration 11 - func sendEmail(subject, body string, cfg *Config) error { 12 - log.Printf("Attempting to send email alert: %s", subject) 13 - 14 - msg := mail.NewMsg() 15 - if err := msg.From(cfg.Email.From); err != nil { 16 - return fmt.Errorf("failed to set FROM address '%s': %w", cfg.Email.From, err) 17 - } 18 - if err := msg.To(cfg.Email.To); err != nil { 19 - return fmt.Errorf("failed to set TO address '%s': %w", cfg.Email.To, err) 20 - } 21 - 22 - msg.Subject(fmt.Sprintf("[ServMon Alert] %s", subject)) 23 - msg.SetBodyString(mail.TypeTextPlain, body) 24 - 25 - // Create SMTP client with configuration 26 - client, err := mail.NewClient( 27 - cfg.Email.SMTPServer, 28 - mail.WithPort(cfg.Email.SMTPPort), 29 - mail.WithSMTPAuth(mail.SMTPAuthPlain), 30 - mail.WithTLSPortPolicy(mail.TLSMandatory), 31 - mail.WithUsername(cfg.Email.Username), 32 - mail.WithPassword(cfg.Email.Password), 33 - ) 34 - if err != nil { 35 - return fmt.Errorf("failed to create SMTP client for %s:%d: %w", cfg.Email.SMTPServer, cfg.Email.SMTPPort, err) 36 - } 37 - 38 - // Send the email 39 - if err := client.DialAndSend(msg); err != nil { 40 - return fmt.Errorf("failed to send email to %s via %s:%d: %w", cfg.Email.To, cfg.Email.SMTPServer, cfg.Email.SMTPPort, err) 41 - } 42 - 43 - log.Printf("✓ Email alert sent successfully to %s: %s", cfg.Email.To, subject) 44 - return nil 45 - }
···
+1 -1
go.mod
··· 3 go 1.25.1 4 5 require ( 6 - github.com/shirou/gopsutil/v4 v4.25.11 7 github.com/spf13/cobra v1.10.2 8 github.com/wneessen/go-mail v0.7.2 9 gopkg.in/yaml.v3 v3.0.1
··· 3 go 1.25.1 4 5 require ( 6 + github.com/shirou/gopsutil/v4 v4.25.12 7 github.com/spf13/cobra v1.10.2 8 github.com/wneessen/go-mail v0.7.2 9 gopkg.in/yaml.v3 v3.0.1
+2 -2
go.sum
··· 17 github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= 18 github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= 19 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 20 - github.com/shirou/gopsutil/v4 v4.25.11 h1:X53gB7muL9Gnwwo2evPSE+SfOrltMoR6V3xJAXZILTY= 21 - github.com/shirou/gopsutil/v4 v4.25.11/go.mod h1:EivAfP5x2EhLp2ovdpKSozecVXn1TmuG7SMzs/Wh4PU= 22 github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= 23 github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= 24 github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
··· 17 github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= 18 github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= 19 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 20 + github.com/shirou/gopsutil/v4 v4.25.12 h1:e7PvW/0RmJ8p8vPGJH4jvNkOyLmbkXgXW4m6ZPic6CY= 21 + github.com/shirou/gopsutil/v4 v4.25.12/go.mod h1:EivAfP5x2EhLp2ovdpKSozecVXn1TmuG7SMzs/Wh4PU= 22 github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= 23 github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= 24 github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+57
internal/alert/alert.go
···
··· 1 + package alert 2 + 3 + import ( 4 + "context" 5 + "os" 6 + "time" 7 + ) 8 + 9 + // Alerter is an interface for sending alerts 10 + type Alerter interface { 11 + // Send sends an alert 12 + Send(ctx context.Context, alert *Alert) error 13 + // Name returns the name of the alerter 14 + Name() string 15 + } 16 + 17 + // AlertLevel represents the severity of an alert 18 + type AlertLevel string 19 + 20 + const ( 21 + LevelInfo AlertLevel = "INFO" 22 + LevelWarning AlertLevel = "WARNING" 23 + LevelCritical AlertLevel = "CRITICAL" 24 + ) 25 + 26 + // Alert represents a monitoring alert 27 + type Alert struct { 28 + Level AlertLevel 29 + Title string 30 + Message string 31 + Timestamp time.Time 32 + Hostname string 33 + Metadata map[string]any 34 + } 35 + 36 + // NewAlert creates a new alert with hostname and timestamp 37 + func NewAlert(level AlertLevel, title, message string) *Alert { 38 + hostname, err := os.Hostname() 39 + if err != nil { 40 + hostname = "unknown" 41 + } 42 + 43 + return &Alert{ 44 + Level: level, 45 + Title: title, 46 + Message: message, 47 + Timestamp: time.Now(), 48 + Hostname: hostname, 49 + Metadata: make(map[string]any), 50 + } 51 + } 52 + 53 + // WithMetadata adds metadata to the alert 54 + func (a *Alert) WithMetadata(key string, value any) *Alert { 55 + a.Metadata[key] = value 56 + return a 57 + }
+124
internal/alert/email.go
···
··· 1 + package alert 2 + 3 + import ( 4 + "context" 5 + "fmt" 6 + "log" 7 + "strings" 8 + "time" 9 + 10 + "github.com/wneessen/go-mail" 11 + ) 12 + 13 + // EmailConfig holds email configuration 14 + type EmailConfig struct { 15 + SMTPServer string 16 + SMTPPort int 17 + From string 18 + To string 19 + Username string 20 + Password string 21 + } 22 + 23 + // EmailAlerter sends alerts via email 24 + type EmailAlerter struct { 25 + config EmailConfig 26 + } 27 + 28 + // NewEmailAlerter creates a new email alerter 29 + func NewEmailAlerter(config EmailConfig) *EmailAlerter { 30 + return &EmailAlerter{ 31 + config: config, 32 + } 33 + } 34 + 35 + // Send sends an alert via email 36 + func (e *EmailAlerter) Send(ctx context.Context, alert *Alert) error { 37 + log.Printf("Sending %s alert via email: %s", alert.Level, alert.Title) 38 + 39 + msg := mail.NewMsg() 40 + if err := msg.From(e.config.From); err != nil { 41 + return fmt.Errorf("failed to set FROM address '%s': %w", e.config.From, err) 42 + } 43 + if err := msg.To(e.config.To); err != nil { 44 + return fmt.Errorf("failed to set TO address '%s': %w", e.config.To, err) 45 + } 46 + 47 + subject := e.formatSubject(alert) 48 + msg.Subject(subject) 49 + 50 + body := e.formatBody(alert) 51 + msg.SetBodyString(mail.TypeTextPlain, body) 52 + 53 + // Create SMTP client with configuration 54 + client, err := mail.NewClient( 55 + e.config.SMTPServer, 56 + mail.WithPort(e.config.SMTPPort), 57 + mail.WithSMTPAuth(mail.SMTPAuthPlain), 58 + mail.WithTLSPortPolicy(mail.TLSMandatory), 59 + mail.WithUsername(e.config.Username), 60 + mail.WithPassword(e.config.Password), 61 + ) 62 + if err != nil { 63 + return fmt.Errorf("failed to create SMTP client for %s:%d: %w", e.config.SMTPServer, e.config.SMTPPort, err) 64 + } 65 + 66 + if err := client.DialAndSend(msg); err != nil { 67 + return fmt.Errorf("failed to send email to %s via %s:%d: %w", e.config.To, e.config.SMTPServer, e.config.SMTPPort, err) 68 + } 69 + 70 + log.Printf("✓ Email 
alert sent successfully to %s: %s", e.config.To, alert.Title) 71 + return nil 72 + } 73 + 74 + // Name returns the name of the alerter 75 + func (e *EmailAlerter) Name() string { 76 + return "Email" 77 + } 78 + 79 + // formatSubject formats the email subject with emoji and level 80 + func (e *EmailAlerter) formatSubject(alert *Alert) string { 81 + var emoji string 82 + switch alert.Level { 83 + case LevelInfo: 84 + emoji = "ℹ️" 85 + case LevelWarning: 86 + emoji = "⚠️" 87 + case LevelCritical: 88 + emoji = "🚨" 89 + default: 90 + emoji = "📢" 91 + } 92 + 93 + return fmt.Sprintf("%s [ServMon %s] %s", emoji, alert.Level, alert.Title) 94 + } 95 + 96 + // formatBody formats the email body with detailed information 97 + func (e *EmailAlerter) formatBody(alert *Alert) string { 98 + var sb strings.Builder 99 + 100 + // Header 101 + sb.WriteString("ServMon Alert\n") 102 + 103 + // Alert details 104 + sb.WriteString(fmt.Sprintf("Level: %s\n", alert.Level)) 105 + sb.WriteString(fmt.Sprintf("Hostname: %s\n", alert.Hostname)) 106 + sb.WriteString(fmt.Sprintf("Time: %s\n", alert.Timestamp.Format(time.RFC1123))) 107 + sb.WriteString("\n") 108 + 109 + // Title 110 + sb.WriteString(fmt.Sprintf("Title:\n%s\n\n", alert.Title)) 111 + 112 + // Message 113 + sb.WriteString(fmt.Sprintf("Details:\n%s\n", alert.Message)) 114 + 115 + // Metadata if present 116 + if len(alert.Metadata) > 0 { 117 + sb.WriteString("\nAdditional Information:\n") 118 + for key, value := range alert.Metadata { 119 + sb.WriteString(fmt.Sprintf("%s: %v\n", key, value)) 120 + } 121 + } 122 + 123 + return sb.String() 124 + }
+618
internal/monitor/monitor.go
···
··· 1 + package monitor 2 + 3 + import ( 4 + "context" 5 + "fmt" 6 + "log" 7 + "net/http" 8 + "os/exec" 9 + "regexp" 10 + "strings" 11 + "time" 12 + 13 + "pkg.rbrt.fr/servmon/internal/alert" 14 + "pkg.rbrt.fr/servmon/internal/config" 15 + 16 + "github.com/shirou/gopsutil/v4/cpu" 17 + "github.com/shirou/gopsutil/v4/disk" 18 + "github.com/shirou/gopsutil/v4/host" 19 + "github.com/shirou/gopsutil/v4/mem" 20 + ) 21 + 22 + // Monitor represents a system monitor 23 + type Monitor struct { 24 + config *config.Config 25 + alerter alert.Alerter 26 + } 27 + 28 + // New creates a new monitor 29 + func New(cfg *config.Config, alerter alert.Alerter) *Monitor { 30 + return &Monitor{ 31 + config: cfg, 32 + alerter: alerter, 33 + } 34 + } 35 + 36 + // Start starts all monitoring goroutines 37 + func (m *Monitor) Start(ctx context.Context) { 38 + log.Println("Starting monitoring services...") 39 + 40 + // Start CPU monitoring 41 + go m.MonitorCPU(ctx) 42 + 43 + // Start Memory monitoring 44 + go m.MonitorMemory(ctx) 45 + 46 + // Start Disk monitoring for each configured disk 47 + for _, diskCfg := range m.config.AlertThresholds.Disks { 48 + go m.MonitorDisk(ctx, diskCfg) 49 + } 50 + 51 + // Start HTTP monitoring if configured 52 + if m.config.AlertThresholds.HTTP.URL != "" { 53 + go m.MonitorHTTP(ctx) 54 + } 55 + 56 + // Start Journalctl monitoring if configured 57 + if m.config.AlertThresholds.Journalctl.Enabled { 58 + go m.MonitorJournalctl(ctx) 59 + } 60 + 61 + log.Println("✓ All monitoring services started successfully") 62 + } 63 + 64 + // MonitorCPU monitors CPU usage 65 + func (m *Monitor) MonitorCPU(ctx context.Context) { 66 + cfg := m.config.AlertThresholds.CPU 67 + log.Printf("CPU Monitor: threshold=%.1f%%, interval=%v, cooldown=%v", 68 + cfg.Threshold, cfg.CheckInterval, cfg.Cooldown) 69 + 70 + // Initialize cooldown timer in expired state 71 + alertCooldown := time.NewTimer(0) 72 + <-alertCooldown.C 73 + 74 + ticker := time.NewTicker(cfg.CheckInterval) 75 + defer 
ticker.Stop() 76 + 77 + for { 78 + select { 79 + case <-ctx.Done(): 80 + log.Println("CPU monitor shutting down") 81 + return 82 + case <-ticker.C: 83 + } 84 + 85 + percent, err := cpu.Percent(cfg.Duration, false) 86 + if err != nil { 87 + log.Printf("Error getting CPU usage: %v", err) 88 + continue 89 + } 90 + 91 + // Calculate average CPU usage across all cores 92 + if len(percent) == 0 { 93 + log.Printf("CPU percentage returned empty array, skipping check") 94 + continue 95 + } 96 + 97 + var total float64 98 + for _, p := range percent { 99 + total += p 100 + } 101 + avg := total / float64(len(percent)) 102 + 103 + // Check threshold 104 + if avg > cfg.Threshold { 105 + select { 106 + case <-alertCooldown.C: 107 + // Cooldown expired, send alert 108 + a := alert.NewAlert( 109 + alert.LevelWarning, 110 + fmt.Sprintf("High CPU Usage: %.2f%%", avg), 111 + fmt.Sprintf("CPU usage of %.2f%% has exceeded the threshold of %.2f%%", avg, cfg.Threshold), 112 + ) 113 + a.WithMetadata("current_usage", fmt.Sprintf("%.2f%%", avg)) 114 + a.WithMetadata("threshold", fmt.Sprintf("%.2f%%", cfg.Threshold)) 115 + a.WithMetadata("duration", cfg.Duration.String()) 116 + 117 + if err := m.alerter.Send(ctx, a); err != nil { 118 + log.Printf("Failed to send CPU alert: %v", err) 119 + } 120 + alertCooldown.Reset(cfg.Cooldown) 121 + default: 122 + // Within cooldown, skip alert 123 + } 124 + } 125 + } 126 + } 127 + 128 + // MonitorMemory monitors memory usage 129 + func (m *Monitor) MonitorMemory(ctx context.Context) { 130 + cfg := m.config.AlertThresholds.Memory 131 + log.Printf("Memory Monitor: threshold=%.1f%%, interval=%v, cooldown=%v", 132 + cfg.Threshold, cfg.CheckInterval, cfg.Cooldown) 133 + 134 + alertCooldown := time.NewTimer(0) 135 + <-alertCooldown.C 136 + 137 + ticker := time.NewTicker(cfg.CheckInterval) 138 + defer ticker.Stop() 139 + 140 + for { 141 + select { 142 + case <-ctx.Done(): 143 + log.Println("Memory monitor shutting down") 144 + return 145 + case <-ticker.C: 146 + 
} 147 + 148 + vm, err := mem.VirtualMemory() 149 + if err != nil { 150 + log.Printf("Error getting memory usage: %v", err) 151 + continue 152 + } 153 + 154 + usedPercent := vm.UsedPercent 155 + 156 + if usedPercent > cfg.Threshold { 157 + select { 158 + case <-alertCooldown.C: 159 + // Cooldown expired, send alert 160 + a := alert.NewAlert( 161 + alert.LevelWarning, 162 + fmt.Sprintf("High Memory Usage: %.2f%%", usedPercent), 163 + fmt.Sprintf("Memory usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.Threshold), 164 + ) 165 + a.WithMetadata("current_usage", fmt.Sprintf("%.2f%%", usedPercent)) 166 + a.WithMetadata("threshold", fmt.Sprintf("%.2f%%", cfg.Threshold)) 167 + a.WithMetadata("used", fmt.Sprintf("%.2f GB", float64(vm.Used)/(1024*1024*1024))) 168 + a.WithMetadata("total", fmt.Sprintf("%.2f GB", float64(vm.Total)/(1024*1024*1024))) 169 + a.WithMetadata("available", fmt.Sprintf("%.2f GB", float64(vm.Available)/(1024*1024*1024))) 170 + 171 + if err := m.alerter.Send(ctx, a); err != nil { 172 + log.Printf("Failed to send memory alert: %v", err) 173 + } 174 + alertCooldown.Reset(cfg.Cooldown) 175 + default: 176 + // Within cooldown, skip alert 177 + } 178 + } 179 + } 180 + } 181 + 182 + // MonitorDisk monitors disk usage 183 + func (m *Monitor) MonitorDisk(ctx context.Context, diskCfg config.DiskConfig) { 184 + log.Printf("📊 Disk Monitor [%s]: threshold=%.1f%%, interval=%v, cooldown=%v", 185 + diskCfg.Path, diskCfg.Threshold, diskCfg.CheckInterval, diskCfg.Cooldown) 186 + 187 + alertCooldown := time.NewTimer(0) 188 + <-alertCooldown.C 189 + 190 + ticker := time.NewTicker(diskCfg.CheckInterval) 191 + defer ticker.Stop() 192 + 193 + for { 194 + select { 195 + case <-ctx.Done(): 196 + log.Printf("Disk monitor [%s] shutting down", diskCfg.Path) 197 + return 198 + case <-ticker.C: 199 + } 200 + 201 + usage, err := disk.Usage(diskCfg.Path) 202 + if err != nil { 203 + log.Printf("Error getting disk usage for %s: %v", diskCfg.Path, err) 204 + 
continue 205 + } 206 + 207 + usedPercent := usage.UsedPercent 208 + 209 + if usedPercent > diskCfg.Threshold { 210 + select { 211 + case <-alertCooldown.C: 212 + // Cooldown expired, send alert 213 + a := alert.NewAlert( 214 + alert.LevelWarning, 215 + fmt.Sprintf("High Disk Usage: %s %.2f%%", diskCfg.Path, usedPercent), 216 + fmt.Sprintf("Disk usage for %s of %.2f%% has exceeded the threshold of %.2f%%", diskCfg.Path, usedPercent, diskCfg.Threshold), 217 + ) 218 + a.WithMetadata("path", diskCfg.Path) 219 + a.WithMetadata("current_usage", fmt.Sprintf("%.2f%%", usedPercent)) 220 + a.WithMetadata("threshold", fmt.Sprintf("%.2f%%", diskCfg.Threshold)) 221 + a.WithMetadata("used", fmt.Sprintf("%.2f GB", float64(usage.Used)/(1024*1024*1024))) 222 + a.WithMetadata("total", fmt.Sprintf("%.2f GB", float64(usage.Total)/(1024*1024*1024))) 223 + a.WithMetadata("free", fmt.Sprintf("%.2f GB", float64(usage.Free)/(1024*1024*1024))) 224 + 225 + if err := m.alerter.Send(ctx, a); err != nil { 226 + log.Printf("Failed to send disk alert: %v", err) 227 + } 228 + alertCooldown.Reset(diskCfg.Cooldown) 229 + default: 230 + // Within cooldown, skip alert 231 + } 232 + } 233 + } 234 + } 235 + 236 + // MonitorHTTP monitors HTTP endpoint health 237 + func (m *Monitor) MonitorHTTP(ctx context.Context) { 238 + cfg := m.config.AlertThresholds.HTTP 239 + log.Printf("📊 HTTP Monitor [%s]: failure_threshold=%.1f%%, interval=%v, cooldown=%v", 240 + cfg.URL, cfg.FailureThreshold, cfg.CheckInterval, cfg.Cooldown) 241 + 242 + alertCooldown := time.NewTimer(0) 243 + <-alertCooldown.C 244 + 245 + client := &http.Client{ 246 + Timeout: cfg.Timeout, 247 + } 248 + 249 + for { 250 + select { 251 + case <-ctx.Done(): 252 + log.Println("HTTP monitor shutting down") 253 + return 254 + case <-time.After(cfg.CheckInterval): 255 + } 256 + 257 + // Perform HTTP checks in batch to calculate failure rate 258 + failureCount := 0 259 + for i := 0; i < cfg.SampleRate; i++ { 260 + req, err := http.NewRequest("GET", 
cfg.URL, nil) 261 + if err != nil { 262 + failureCount++ 263 + continue 264 + } 265 + 266 + reqCtx, cancel := context.WithTimeout(context.Background(), cfg.Timeout) 267 + 268 + resp, err := client.Do(req.WithContext(reqCtx)) 269 + if err != nil { 270 + failureCount++ 271 + } else { 272 + if resp.StatusCode >= 400 { 273 + failureCount++ 274 + } 275 + resp.Body.Close() 276 + } 277 + 278 + cancel() 279 + } 280 + 281 + // Calculate failure rate 282 + failureRate := (float64(failureCount) / float64(cfg.SampleRate)) * 100 283 + 284 + if failureRate > cfg.FailureThreshold { 285 + select { 286 + case <-alertCooldown.C: 287 + // Cooldown expired, send alert 288 + level := alert.LevelWarning 289 + if failureRate >= 50 { 290 + level = alert.LevelCritical 291 + } 292 + 293 + a := alert.NewAlert( 294 + level, 295 + fmt.Sprintf("HTTP Health Check Failed: %.2f%%", failureRate), 296 + fmt.Sprintf("HTTP endpoint %s has a failure rate of %.2f%% (threshold: %.2f%%)", cfg.URL, failureRate, cfg.FailureThreshold), 297 + ) 298 + a.WithMetadata("url", cfg.URL) 299 + a.WithMetadata("failure_rate", fmt.Sprintf("%.2f%%", failureRate)) 300 + a.WithMetadata("threshold", fmt.Sprintf("%.2f%%", cfg.FailureThreshold)) 301 + a.WithMetadata("failures", fmt.Sprintf("%d/%d", failureCount, cfg.SampleRate)) 302 + a.WithMetadata("sample_rate", cfg.SampleRate) 303 + 304 + if err := m.alerter.Send(ctx, a); err != nil { 305 + log.Printf("Failed to send HTTP alert: %v", err) 306 + } 307 + alertCooldown.Reset(cfg.Cooldown) 308 + default: 309 + // Within cooldown, skip alert 310 + } 311 + } 312 + } 313 + } 314 + 315 + // GetCurrentStatus returns the current system status 316 + func GetCurrentStatus() (string, error) { 317 + // Get CPU usage 318 + cpuPercent, err := cpu.Percent(1*time.Second, false) 319 + if err != nil { 320 + return "", fmt.Errorf("failed to get CPU usage: %w", err) 321 + } 322 + var cpuTotal float64 323 + for _, p := range cpuPercent { 324 + cpuTotal += p 325 + } 326 + cpuAvg := cpuTotal / 
float64(len(cpuPercent)) 327 + 328 + // Get memory usage 329 + vm, err := mem.VirtualMemory() 330 + if err != nil { 331 + return "", fmt.Errorf("failed to get memory usage: %w", err) 332 + } 333 + 334 + // Get root disk usage 335 + rootDisk, err := disk.Usage("/") 336 + if err != nil { 337 + return "", fmt.Errorf("failed to get disk usage: %w", err) 338 + } 339 + 340 + status := fmt.Sprintf( 341 + "Current System Status:\n"+ 342 + "CPU: %.2f%%\n"+ 343 + "Memory: %.2f%% (%.2f GB used / %.2f GB total)\n"+ 344 + "Disk: %.2f%% (%.2f GB used / %.2f GB total)", 345 + cpuAvg, 346 + vm.UsedPercent, 347 + float64(vm.Used)/(1024*1024*1024), 348 + float64(vm.Total)/(1024*1024*1024), 349 + rootDisk.UsedPercent, 350 + float64(rootDisk.Used)/(1024*1024*1024), 351 + float64(rootDisk.Total)/(1024*1024*1024), 352 + ) 353 + 354 + return status, nil 355 + } 356 + 357 + // MonitorJournalctl monitors systemd journal logs for errors and critical messages 358 + func (m *Monitor) MonitorJournalctl(ctx context.Context) { 359 + cfg := m.config.AlertThresholds.Journalctl 360 + log.Printf("📊 Journalctl Monitor: error_threshold=%d, interval=%v, lookback=%v, priorities=%v, cooldown=%v", 361 + cfg.ErrorThreshold, cfg.CheckInterval, cfg.LookbackPeriod, cfg.Priorities, cfg.Cooldown) 362 + 363 + alertCooldown := time.NewTimer(0) 364 + <-alertCooldown.C 365 + 366 + ticker := time.NewTicker(cfg.CheckInterval) 367 + defer ticker.Stop() 368 + 369 + for { 370 + select { 371 + case <-ctx.Done(): 372 + log.Println("Journalctl monitor shutting down") 373 + return 374 + case <-ticker.C: 375 + } 376 + 377 + // Build journalctl command with priority filters 378 + priorityArgs := strings.Join(cfg.Priorities, ",") 379 + sinceArg := fmt.Sprintf("%dm ago", int(cfg.LookbackPeriod.Minutes())) 380 + 381 + cmd := exec.CommandContext(ctx, "journalctl", "-x", "-e", 382 + "--since", sinceArg, 383 + "-p", priorityArgs, 384 + "--no-pager", 385 + "-o", "short-precise") 386 + 387 + output, err := cmd.CombinedOutput() 388 + 
if err != nil { 389 + log.Printf("Error running journalctl: %v", err) 390 + continue 391 + } 392 + 393 + // Parse the output 394 + logs := string(output) 395 + if logs == "" || strings.TrimSpace(logs) == "" { 396 + // No errors found, which is good 397 + continue 398 + } 399 + 400 + // Count and aggregate errors by message pattern 401 + lines := strings.Split(logs, "\n") 402 + errorCount := 0 403 + errorPatterns := make(map[string]int) 404 + criticalCount := 0 405 + 406 + // Regex to extract the main error message (after the process name) 407 + messageRegex := regexp.MustCompile(`\]: (.+)$`) 408 + 409 + for _, line := range lines { 410 + line = strings.TrimSpace(line) 411 + if line == "" || strings.HasPrefix(line, "--") { 412 + continue 413 + } 414 + 415 + errorCount++ 416 + 417 + // Check for critical level 418 + if strings.Contains(strings.ToLower(line), "crit") || 419 + strings.Contains(strings.ToLower(line), "alert") || 420 + strings.Contains(strings.ToLower(line), "emerg") { 421 + criticalCount++ 422 + } 423 + 424 + // Extract and aggregate error patterns 425 + matches := messageRegex.FindStringSubmatch(line) 426 + if len(matches) > 1 { 427 + message := matches[1] 428 + // Normalize the message (remove specific IDs, paths, etc.) 429 + normalized := normalizeErrorMessage(message) 430 + errorPatterns[normalized]++ 431 + } 432 + } 433 + 434 + // Check if error count exceeds threshold 435 + if errorCount > cfg.ErrorThreshold { 436 + select { 437 + case <-alertCooldown.C: 438 + // Cooldown expired, send alert 439 + level := alert.LevelWarning 440 + if criticalCount > 0 { 441 + level = alert.LevelCritical 442 + } 443 + 444 + // Build top error patterns for the alert 445 + topErrors := getTopErrors(errorPatterns, 5) 446 + 447 + title := fmt.Sprintf("Journal Log Errors Detected: %d errors", errorCount) 448 + message := fmt.Sprintf("Found %d error/critical messages in journalctl logs (threshold: %d). 
", 449 + errorCount, cfg.ErrorThreshold) 450 + 451 + if criticalCount > 0 { 452 + message += fmt.Sprintf("%d critical-level messages detected. ", criticalCount) 453 + } 454 + 455 + message += "Review system logs for details." 456 + 457 + a := alert.NewAlert(level, title, message) 458 + a.WithMetadata("error_count", errorCount) 459 + a.WithMetadata("critical_count", criticalCount) 460 + a.WithMetadata("threshold", cfg.ErrorThreshold) 461 + a.WithMetadata("lookback_period", cfg.LookbackPeriod.String()) 462 + a.WithMetadata("priorities", strings.Join(cfg.Priorities, ", ")) 463 + 464 + if len(topErrors) > 0 { 465 + a.WithMetadata("top_errors", strings.Join(topErrors, " | ")) 466 + } 467 + 468 + if err := m.alerter.Send(ctx, a); err != nil { 469 + log.Printf("Failed to send journalctl alert: %v", err) 470 + } 471 + alertCooldown.Reset(cfg.Cooldown) 472 + default: 473 + // Within cooldown, skip alert 474 + } 475 + } 476 + } 477 + } 478 + 479 + // normalizeErrorMessage removes specific details from error messages to help aggregate similar errors 480 + func normalizeErrorMessage(msg string) string { 481 + // Remove PIDs 482 + re := regexp.MustCompile(`\b\d{3,}\b`) 483 + msg = re.ReplaceAllString(msg, "[PID]") 484 + 485 + // Remove file paths 486 + re = regexp.MustCompile(`/[\w/.-]+`) 487 + msg = re.ReplaceAllString(msg, "[PATH]") 488 + 489 + // Remove timestamps 490 + re = regexp.MustCompile(`\d{2}:\d{2}:\d{2}`) 491 + msg = re.ReplaceAllString(msg, "[TIME]") 492 + 493 + // Remove hex addresses 494 + re = regexp.MustCompile(`0x[0-9a-fA-F]+`) 495 + msg = re.ReplaceAllString(msg, "[ADDR]") 496 + 497 + // Truncate to first 100 chars 498 + if len(msg) > 100 { 499 + msg = msg[:100] + "..." 
500 + } 501 + 502 + return msg 503 + } 504 + 505 + // getTopErrors returns the top N most frequent error patterns 506 + func getTopErrors(patterns map[string]int, n int) []string { 507 + type errorCount struct { 508 + pattern string 509 + count int 510 + } 511 + 512 + var errors []errorCount 513 + for pattern, count := range patterns { 514 + errors = append(errors, errorCount{pattern, count}) 515 + } 516 + 517 + // Simple bubble sort for small lists 518 + for i := 0; i < len(errors); i++ { 519 + for j := i + 1; j < len(errors); j++ { 520 + if errors[j].count > errors[i].count { 521 + errors[i], errors[j] = errors[j], errors[i] 522 + } 523 + } 524 + } 525 + 526 + // Get top N 527 + var result []string 528 + limit := n 529 + if len(errors) < limit { 530 + limit = len(errors) 531 + } 532 + 533 + for i := 0; i < limit; i++ { 534 + result = append(result, fmt.Sprintf("%s (×%d)", errors[i].pattern, errors[i].count)) 535 + } 536 + 537 + return result 538 + } 539 + 540 + // CheckRebootAndNotify checks if the system recently rebooted and sends notification 541 + func CheckRebootAndNotify(ctx context.Context, cfg *config.Config, alerter alert.Alerter) error { 542 + if !cfg.AlertThresholds.Reboot.Enabled { 543 + return nil 544 + } 545 + 546 + // Get system boot time 547 + bootTime, err := host.BootTime() 548 + if err != nil { 549 + return fmt.Errorf("failed to get boot time: %w", err) 550 + } 551 + 552 + // Calculate uptime 553 + uptime := time.Since(time.Unix(int64(bootTime), 0)) 554 + 555 + log.Printf("System uptime: %v (reboot threshold: %v)", 556 + uptime.Round(time.Second), cfg.AlertThresholds.Reboot.UptimeThreshold) 557 + 558 + // Check if uptime is less than threshold (indicating recent reboot) 559 + if uptime < cfg.AlertThresholds.Reboot.UptimeThreshold { 560 + log.Printf("Recent reboot detected - system uptime is %v", uptime.Round(time.Second)) 561 + 562 + bootTimeFormatted := time.Unix(int64(bootTime), 0).Format(time.RFC1123) 563 + 564 + a := alert.NewAlert( 565 + 
alert.LevelInfo, 566 + "System Reboot Detected", 567 + fmt.Sprintf("The system was recently rebooted. Current uptime: %s. Boot time: %s", 568 + formatDuration(uptime), bootTimeFormatted), 569 + ) 570 + 571 + a.WithMetadata("uptime", formatDuration(uptime)) 572 + a.WithMetadata("boot_time", bootTimeFormatted) 573 + a.WithMetadata("uptime_threshold", cfg.AlertThresholds.Reboot.UptimeThreshold.String()) 574 + 575 + // Get system info 576 + if info, err := host.Info(); err == nil { 577 + a.WithMetadata("os", info.OS) 578 + a.WithMetadata("platform", info.Platform) 579 + a.WithMetadata("platform_version", info.PlatformVersion) 580 + a.WithMetadata("kernel_version", info.KernelVersion) 581 + } 582 + 583 + if err := alerter.Send(ctx, a); err != nil { 584 + return fmt.Errorf("failed to send reboot notification: %w", err) 585 + } 586 + 587 + log.Println("✓ Reboot notification sent successfully") 588 + } 589 + 590 + return nil 591 + } 592 + 593 + // formatDuration formats a duration in a human-readable way 594 + func formatDuration(d time.Duration) string { 595 + d = d.Round(time.Second) 596 + 597 + days := d / (24 * time.Hour) 598 + d -= days * 24 * time.Hour 599 + 600 + hours := d / time.Hour 601 + d -= hours * time.Hour 602 + 603 + minutes := d / time.Minute 604 + d -= minutes * time.Minute 605 + 606 + seconds := d / time.Second 607 + 608 + if days > 0 { 609 + return fmt.Sprintf("%dd %dh %dm %ds", days, hours, minutes, seconds) 610 + } 611 + if hours > 0 { 612 + return fmt.Sprintf("%dh %dm %ds", hours, minutes, seconds) 613 + } 614 + if minutes > 0 { 615 + return fmt.Sprintf("%dm %ds", minutes, seconds) 616 + } 617 + return fmt.Sprintf("%ds", seconds) 618 + }
+58 -93
main.go
··· 5 "errors" 6 "fmt" 7 "os" 8 - "os/exec" 9 "os/signal" 10 "path" 11 - "runtime" 12 "runtime/debug" 13 "strings" 14 "syscall" 15 "time" 16 17 "github.com/spf13/cobra" 18 ) 19 20 var ( 21 - flagConfig = "config" 22 - flagDaemon = "daemon" 23 - cfgFile string 24 - runAsDaemon bool 25 ) 26 27 func main() { ··· 39 40 rootCmd := &cobra.Command{ 41 Use: "servmon", 42 - Short: "KISS server monitoring tool with email alerts", 43 Version: version, 44 RunE: func(cmd *cobra.Command, args []string) error { 45 - runAsDaemon, err := cmd.Flags().GetBool(flagDaemon) 46 - if err != nil { 47 - return fmt.Errorf("error getting flag %s: %v", flagDaemon, err) 48 - } 49 - 50 - if runAsDaemon { 51 - pid, err := runAsDaemonProcess() 52 - if err != nil { 53 - return err 54 - } 55 - 56 - cmd.Println("Running as daemon with PID", pid) 57 - return nil 58 - } 59 - 60 cfgPath, err := cmd.Flags().GetString(flagConfig) 61 if err != nil { 62 return fmt.Errorf("error getting flag %s: %v", flagConfig, err) 63 } 64 65 if _, err := os.Stat(cfgPath); os.IsNotExist(err) { 66 - cfg := defaultConfig() 67 if err := cfg.Save(cfgFile); err != nil { 68 return err 69 } 70 71 - cmd.Println("Configuration file generated at", cfgFile) 72 return nil 73 } else if err != nil { 74 return fmt.Errorf("error checking config file: %v", err) 75 } 76 77 - cfg, err := loadConfig(cfgPath) 78 if err != nil { 79 return err 80 } 81 82 // Set up signal handling for graceful shutdown 83 sigChan := make(chan os.Signal, 1) 84 signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) ··· 87 ctx, cancel := context.WithCancel(context.Background()) 88 defer cancel() 89 90 - // Start monitoring goroutines 91 - go monitorCPU(ctx, cfg) 92 - go monitorMemory(ctx, cfg) 93 94 - for _, diskCfg := range cfg.AlertThresholds.Disks { 95 - go monitorDisk(ctx, cfg, diskCfg) 96 } 97 98 - if cfg.AlertThresholds.HTTP.URL != "" { 99 - go monitorHTTP(ctx, cfg) 100 - } 101 102 - cmd.Println("Servmon started successfully. 
// runAsDaemonProcess re-executes the current binary in the background without
// the daemon flag and returns the PID of the detached child.
//
// The child's Stdin, Stdout and Stderr are left nil so that os/exec connects
// them to the null device. The previous code wrapped file descriptors 3 and 4,
// which this process never opened, and pointed stdin and stdout at the same
// descriptor.
//
// Daemon mode is only supported on Linux and FreeBSD; any other GOOS yields an
// error.
func runAsDaemonProcess() (int, error) {
	if runtime.GOOS != "linux" && runtime.GOOS != "freebsd" {
		return 0, fmt.Errorf("daemon mode is only supported on Linux and FreeBSD, not on %s", runtime.GOOS)
	}

	// Drop every spelling of the daemon flag, including the "=value" forms,
	// otherwise the child would try to daemonize again.
	args := make([]string, 0, len(os.Args))
	for _, a := range os.Args[1:] {
		if a == "-d" || a == "--daemon" ||
			strings.HasPrefix(a, "-d=") || strings.HasPrefix(a, "--daemon=") {
			continue
		}
		args = append(args, a)
	}

	cmd := exec.Command(os.Args[0], args...)
	if err := cmd.Start(); err != nil {
		return 0, fmt.Errorf("error starting as daemon: %w", err)
	}

	// Read the PID before Release, which invalidates the Process handle.
	pid := cmd.Process.Pid

	// Detach: the parent will not Wait on the child.
	if err := cmd.Process.Release(); err != nil {
		return 0, fmt.Errorf("error detaching process: %w", err)
	}

	return pid, nil
}
··· 5 "errors" 6 "fmt" 7 "os" 8 "os/signal" 9 "path" 10 "runtime/debug" 11 "strings" 12 "syscall" 13 "time" 14 15 + "pkg.rbrt.fr/servmon/internal/alert" 16 + "pkg.rbrt.fr/servmon/internal/config" 17 + "pkg.rbrt.fr/servmon/internal/monitor" 18 + 19 "github.com/spf13/cobra" 20 ) 21 22 var ( 23 + flagConfig = "config" 24 + cfgFile string 25 ) 26 27 func main() { ··· 39 40 rootCmd := &cobra.Command{ 41 Use: "servmon", 42 + Short: "Server monitoring tool with email alerts", 43 Version: version, 44 RunE: func(cmd *cobra.Command, args []string) error { 45 cfgPath, err := cmd.Flags().GetString(flagConfig) 46 if err != nil { 47 return fmt.Errorf("error getting flag %s: %v", flagConfig, err) 48 } 49 50 if _, err := os.Stat(cfgPath); os.IsNotExist(err) { 51 + cfg := config.Default() 52 if err := cfg.Save(cfgFile); err != nil { 53 return err 54 } 55 56 + cmd.Println("✓ Configuration file generated at", cfgFile) 57 + cmd.Println("Please edit the configuration file and restart servmon") 58 return nil 59 } else if err != nil { 60 return fmt.Errorf("error checking config file: %v", err) 61 } 62 63 + cfg, err := config.Load(cfgPath) 64 if err != nil { 65 return err 66 } 67 68 + // Show current system status 69 + if status, err := monitor.GetCurrentStatus(); err == nil { 70 + cmd.Println() 71 + cmd.Println(status) 72 + cmd.Println() 73 + } 74 + 75 // Set up signal handling for graceful shutdown 76 sigChan := make(chan os.Signal, 1) 77 signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) ··· 80 ctx, cancel := context.WithCancel(context.Background()) 81 defer cancel() 82 83 + // Initialize alerter 84 + emailAlerter := alert.NewEmailAlerter(alert.EmailConfig{ 85 + SMTPServer: cfg.Email.SMTPServer, 86 + SMTPPort: cfg.Email.SMTPPort, 87 + From: cfg.Email.From, 88 + To: cfg.Email.To, 89 + Username: cfg.Email.Username, 90 + Password: cfg.Email.Password, 91 + }) 92 93 + // Check for recent reboot and send notification if needed 94 + if err := monitor.CheckRebootAndNotify(ctx, cfg, 
emailAlerter); err != nil { 95 + cmd.Printf("Warning: Failed to check reboot status: %v\n", err) 96 } 97 98 + // Create monitor with alerter 99 + mon := monitor.New(cfg, emailAlerter) 100 + 101 + // Start monitoring 102 + mon.Start(ctx) 103 104 + cmd.Println() 105 + cmd.Println("✓ ServMon started successfully. Monitoring active.") 106 + cmd.Println("Monitoring in progress... Press Ctrl+C to stop.") 107 + cmd.Println() 108 109 + // Send startup notification 110 go func() { 111 hostname, err := os.Hostname() 112 if err != nil { 113 hostname = "unknown" 114 } 115 116 + startupAlert := alert.NewAlert( 117 + alert.LevelInfo, 118 + fmt.Sprintf("Monitoring Started on %s", hostname), 119 + "ServMon has started successfully and is now actively monitoring your system.", 120 + ) 121 + 122 + startupAlert.WithMetadata("cpu_threshold", fmt.Sprintf("%.1f%%", cfg.AlertThresholds.CPU.Threshold)) 123 + startupAlert.WithMetadata("memory_threshold", fmt.Sprintf("%.1f%%", cfg.AlertThresholds.Memory.Threshold)) 124 + startupAlert.WithMetadata("disk_paths", cfg.GetDiskPaths()) 125 126 if cfg.AlertThresholds.HTTP.URL != "" { 127 + startupAlert.WithMetadata("http_endpoint", cfg.AlertThresholds.HTTP.URL) 128 } 129 130 + if err := emailAlerter.Send(ctx, startupAlert); err != nil { 131 + cmd.Printf("Warning: Failed to send startup notification: %v\n", err) 132 } else { 133 + cmd.Println("✓ Startup notification sent successfully") 134 } 135 }() 136 137 // Wait for shutdown signal 138 sig := <-sigChan 139 + cmd.Printf("Received signal %v, shutting down gracefully...\n", sig) 140 cancel() 141 + 142 + // Give goroutines time to clean up 143 + time.Sleep(1 * time.Second) 144 + cmd.Println("✓ ServMon stopped successfully") 145 return nil 146 }, 147 } 148 149 rootCmd.CompletionOptions.DisableDefaultCmd = true 150 rootCmd.PersistentFlags().StringVar(&cfgFile, flagConfig, path.Join(homeDir, ".servmon.yaml"), "config file") 151 152 if err := rootCmd.Execute(); err != nil { 153 fmt.Fprint(os.Stderr, 
err) ··· 155 } 156 } 157 158 func getVersion() (string, error) { 159 version, ok := debug.ReadBuildInfo() 160 if !ok { ··· 163 164 return strings.TrimSpace(version.Main.Version), nil 165 }
-223
monitor.go
··· 1 - package main 2 - 3 - import ( 4 - "context" 5 - "fmt" 6 - "log" 7 - "net/http" 8 - "time" 9 - 10 - "github.com/shirou/gopsutil/v4/cpu" 11 - "github.com/shirou/gopsutil/v4/disk" 12 - "github.com/shirou/gopsutil/v4/mem" 13 - ) 14 - 15 - func monitorCPU(ctx context.Context, cfg *Config) { 16 - log.Printf("Monitoring CPU usage with threshold %.2f%%, check interval %v, and cooldown %v", 17 - cfg.AlertThresholds.CPU.Threshold, cfg.AlertThresholds.CPU.CheckInterval, cfg.AlertThresholds.CPU.Cooldown) 18 - 19 - // Initialize cooldown timer in expired state so the first alert can fire immediately. 20 - // We create a timer with 0 duration and drain it right away, so the select case 21 - // <-alertCooldown.C will succeed on the first threshold breach. 22 - alertCooldown := time.NewTimer(0) 23 - <-alertCooldown.C 24 - 25 - ticker := time.NewTicker(cfg.AlertThresholds.CPU.CheckInterval) 26 - defer ticker.Stop() 27 - 28 - for { 29 - select { 30 - case <-ctx.Done(): 31 - log.Println("CPU monitor shutting down") 32 - return 33 - case <-ticker.C: 34 - } 35 - percent, err := cpu.Percent(cfg.AlertThresholds.CPU.Duration, false) 36 - if err != nil { 37 - log.Printf("Error getting CPU usage: %v", err) 38 - continue 39 - } 40 - 41 - // Average CPU usage across all cores 42 - var total float64 43 - for _, p := range percent { 44 - total += p 45 - } 46 - 47 - // Safety check: prevent division by zero 48 - if len(percent) == 0 { 49 - log.Printf("Warning: CPU percentage returned empty array, skipping check") 50 - continue 51 - } 52 - avg := total / float64(len(percent)) 53 - 54 - if avg > cfg.AlertThresholds.CPU.Threshold { 55 - // Check if we're within the cooldown period using non-blocking select. 56 - // If the timer has expired, we can send an alert and reset the timer. 57 - // If not, we skip the alert to prevent spam. 
58 - select { 59 - case <-alertCooldown.C: 60 - // Cooldown expired, send alert 61 - err := sendEmail(fmt.Sprintf("CPU Usage Alert: %.2f%%", avg), 62 - fmt.Sprintf("CPU usage of %.2f%% has exceeded the threshold of %.2f%%", avg, cfg.AlertThresholds.CPU.Threshold), cfg) 63 - if err != nil { 64 - log.Printf("Error sending email: %v", err) 65 - } 66 - // Reset timer to start a new cooldown period 67 - alertCooldown.Reset(cfg.AlertThresholds.CPU.Cooldown) 68 - default: 69 - // Within cooldown, skip alert 70 - } 71 - } 72 - } 73 - } 74 - 75 - func monitorMemory(ctx context.Context, cfg *Config) { 76 - log.Printf("Monitoring memory usage with threshold %.2f%%, check interval %v, and cooldown %v", 77 - cfg.AlertThresholds.Memory.Threshold, cfg.AlertThresholds.Memory.CheckInterval, cfg.AlertThresholds.Memory.Cooldown) 78 - 79 - alertCooldown := time.NewTimer(0) 80 - <-alertCooldown.C // Drain the initial timer immediately so first alert can fire 81 - 82 - ticker := time.NewTicker(cfg.AlertThresholds.Memory.CheckInterval) 83 - defer ticker.Stop() 84 - 85 - for { 86 - select { 87 - case <-ctx.Done(): 88 - log.Println("Memory monitor shutting down") 89 - return 90 - case <-ticker.C: 91 - } 92 - vm, err := mem.VirtualMemory() 93 - if err != nil { 94 - log.Printf("Error getting memory usage: %v", err) 95 - continue 96 - } 97 - 98 - usedPercent := vm.UsedPercent 99 - 100 - if usedPercent > cfg.AlertThresholds.Memory.Threshold { 101 - // Check if we're within the cooldown period 102 - select { 103 - case <-alertCooldown.C: 104 - // Cooldown expired, send alert 105 - err := sendEmail(fmt.Sprintf("Memory Usage Alert: %.2f%%", usedPercent), 106 - fmt.Sprintf("Memory usage of %.2f%% has exceeded the threshold of %.2f%%", usedPercent, cfg.AlertThresholds.Memory.Threshold), cfg) 107 - if err != nil { 108 - log.Printf("Error sending email: %v", err) 109 - } 110 - alertCooldown.Reset(cfg.AlertThresholds.Memory.Cooldown) 111 - default: 112 - // Within cooldown, skip alert 113 - } 114 - } 
115 - } 116 - } 117 - 118 - func monitorDisk(ctx context.Context, cfg *Config, diskCfg DiskConfig) { 119 - log.Printf("Monitoring disk %s usage with threshold %.2f%%, check interval %v, and cooldown %v", 120 - diskCfg.Path, diskCfg.Threshold, diskCfg.CheckInterval, diskCfg.Cooldown) 121 - 122 - alertCooldown := time.NewTimer(0) 123 - <-alertCooldown.C // Drain the initial timer immediately so first alert can fire 124 - 125 - ticker := time.NewTicker(diskCfg.CheckInterval) 126 - defer ticker.Stop() 127 - 128 - for { 129 - select { 130 - case <-ctx.Done(): 131 - log.Printf("Disk monitor for %s shutting down\n", diskCfg.Path) 132 - return 133 - case <-ticker.C: 134 - } 135 - usage, err := disk.Usage(diskCfg.Path) 136 - if err != nil { 137 - log.Printf("Error getting disk usage for %s: %v", diskCfg.Path, err) 138 - continue 139 - } 140 - 141 - usedPercent := usage.UsedPercent 142 - if usedPercent > diskCfg.Threshold { 143 - // Check if we're within the cooldown period 144 - select { 145 - case <-alertCooldown.C: 146 - // Cooldown expired, send alert 147 - err := sendEmail(fmt.Sprintf("Disk Usage Alert: %s %.2f%%", diskCfg.Path, usedPercent), 148 - fmt.Sprintf("Disk usage for %s of %.2f%% has exceeded the threshold of %.2f%%", diskCfg.Path, usedPercent, diskCfg.Threshold), cfg) 149 - if err != nil { 150 - log.Printf("Error sending email: %v", err) 151 - } 152 - alertCooldown.Reset(diskCfg.Cooldown) 153 - default: 154 - // Within cooldown, skip alert 155 - } 156 - } 157 - } 158 - } 159 - 160 - func monitorHTTP(ctx context.Context, cfg *Config) { 161 - log.Printf("Monitoring HTTP checks (%s) with threshold %.2f%% and cooldown %v", cfg.AlertThresholds.HTTP.URL, cfg.AlertThresholds.HTTP.FailureThreshold, cfg.AlertThresholds.HTTP.Cooldown) 162 - 163 - alertCooldown := time.NewTimer(0) 164 - <-alertCooldown.C // Drain the initial timer immediately so first alert can fire 165 - client := &http.Client{ 166 - Timeout: cfg.AlertThresholds.HTTP.Timeout, 167 - } 168 - 169 - for { 
170 - // Wait for check interval or context cancellation 171 - select { 172 - case <-ctx.Done(): 173 - log.Println("HTTP monitor shutting down") 174 - return 175 - case <-time.After(cfg.AlertThresholds.HTTP.CheckInterval): 176 - } 177 - 178 - // Perform HTTP checks in batch to calculate failure rate. 179 - // We make sample_rate number of requests and track how many fail. 180 - failureCount := 0 181 - for i := 0; i < cfg.AlertThresholds.HTTP.SampleRate; i++ { 182 - req, err := http.NewRequest("GET", cfg.AlertThresholds.HTTP.URL, nil) 183 - if err != nil { 184 - failureCount++ 185 - continue 186 - } 187 - 188 - ctx, cancel := context.WithTimeout(context.Background(), cfg.AlertThresholds.HTTP.Timeout) 189 - 190 - resp, err := client.Do(req.WithContext(ctx)) 191 - if err != nil { 192 - failureCount++ 193 - } else { 194 - if resp.StatusCode >= 400 { 195 - failureCount++ 196 - } 197 - resp.Body.Close() 198 - } 199 - 200 - cancel() 201 - } 202 - 203 - // Calculate failure rate 204 - failureRate := (float64(failureCount) / float64(cfg.AlertThresholds.HTTP.SampleRate)) * 100 205 - if failureRate > cfg.AlertThresholds.HTTP.FailureThreshold { 206 - // Check if we're within the cooldown period 207 - select { 208 - case <-alertCooldown.C: 209 - // Cooldown expired, check again 210 - alertCooldown.Reset(cfg.AlertThresholds.HTTP.Cooldown) 211 - default: 212 - // Within cooldown, skip alert 213 - continue 214 - } 215 - 216 - err := sendEmail(fmt.Sprintf("HTTP Failure Alert: %.2f%%", failureRate), 217 - fmt.Sprintf("HTTP failure rate of %.2f%% has exceeded the threshold of %.2f%%", failureRate, cfg.AlertThresholds.HTTP.FailureThreshold), cfg) 218 - if err != nil { 219 - log.Printf("Error sending email: %v", err) 220 - } 221 - } 222 - } 223 - }
···
+79 -34
readme.md
··· 1 - # Servmon 2 3 KISS (Keep It Simple, Stupid) server monitoring tool with email alerts. 4 5 For those who want to keep it simple instead of using complex setups like Prometheus, Grafana, and Alertmanager. 6 - It uses the awesome [gopsutil](https://github.com/shirou/gopsutil) library to get system metrics. 7 8 ## Features 9 10 - - [x] **CPU Monitoring** - Monitor CPU usage with configurable thresholds and duration 11 - - [x] **Memory Monitoring** - Track memory usage with percentage-based alerts 12 - - [x] **Disk Monitoring** - Monitor multiple disk partitions independently 13 - - [x] **HTTP Health Checks** - Periodic health checks with failure rate monitoring 14 - - [x] **Email Alerts** - SMTP-based email notifications with configurable cooldowns 15 - - [x] **Graceful Shutdown** - Clean shutdown on SIGTERM/SIGINT 16 - - [x] **Config Validation** - Automatic validation of configuration parameters 17 - - [ ] Disk Write/Read performance monitoring 18 19 ## Installation 20 21 - ### Go 22 - 23 ```bash 24 go install pkg.rbrt.fr/servmon@latest 25 ``` 26 27 - ### Docker 28 29 - ```bash 30 - docker build -t servmon . 31 ``` 32 33 - ## How to use 34 35 - ### Go 36 37 ```bash 38 - servmon --help 39 ``` 40 41 - ### Docker 42 43 ```bash 44 - # Create config directory 45 - mkdir -p config 46 - cp .servmon.example.yaml config/.servmon.yaml 47 - # Edit config/.servmon.yaml with your settings 48 49 - # Run 50 - docker run -d \ 51 - --name servmon \ 52 - --restart unless-stopped \ 53 - --pid=host \ 54 - --network=host \ 55 - -v /:/host:ro \ 56 - -v /sys:/sys:ro \ 57 - -v /proc:/proc:ro \ 58 - -v $(pwd)/config/.servmon.yaml:/root/.servmon.yaml:ro \ 59 - servmon 60 ```
··· 1 + # ServMon 2 3 KISS (Keep It Simple, Stupid) server monitoring tool with email alerts. 4 5 For those who want to keep it simple instead of using complex setups like Prometheus, Grafana, and Alertmanager. 6 7 ## Features 8 9 + - **CPU & Memory Monitoring** - Alert on high usage 10 + - **Disk Monitoring** - Monitor multiple partitions 11 + - **HTTP Health Checks** - Monitor endpoint availability 12 + - **Journalctl Monitoring** - Scan systemd logs for errors 13 + - **Reboot Detection** - Email notification on system reboot 14 + - **Email Alerts** - Rich formatting with severity levels 15 + - **Smart Cooldowns** - Prevent alert spam 16 17 ## Installation 18 19 ```bash 20 go install pkg.rbrt.fr/servmon@latest 21 ``` 22 23 + ## Configuration 24 + 25 + Edit `~/.servmon.yaml` or `/etc/servmon/config.yaml`: 26 + 27 + ```yaml 28 + alert_thresholds: 29 + cpu: 30 + threshold: 90 31 + duration: 5m0s 32 + cooldown: 30m0s 33 + check_interval: 10s 34 + 35 + memory: 36 + threshold: 80 37 + cooldown: 30m0s 38 + check_interval: 10s 39 + 40 + disks: 41 + - path: / 42 + threshold: 90 43 + cooldown: 4h0m0s 44 + check_interval: 1m0s 45 + 46 + http: 47 + url: http://localhost:8080/health 48 + timeout: 5s 49 + sample_rate: 10 50 + failure_threshold: 20 51 + check_interval: 1m0s 52 + cooldown: 15m0s 53 + 54 + journalctl: 55 + enabled: true 56 + check_interval: 5m0s 57 + lookback_period: 5m0s 58 + error_threshold: 10 59 + priorities: 60 + - err 61 + - crit 62 + - alert 63 + - emerg 64 + cooldown: 30m0s 65 66 + reboot: 67 + enabled: true 68 + uptime_threshold: 10m0s 69 + 70 + email: 71 + smtp_server: smtp.gmail.com 72 + smtp_port: 587 73 + from: alerts@yourdomain.com 74 + to: admin@yourdomain.com 75 + username: alerts@yourdomain.com 76 + password: your-app-password 77 ``` 78 79 + ## Usage 80 81 + ### Run Manually 82 83 ```bash 84 + servmon --config ~/.servmon.yaml 85 ``` 86 87 + ### Systemd Service 88 89 ```bash 90 + # Start 91 + sudo systemctl start servmon 92 93 + # Enable 
auto-start 94 + sudo systemctl enable servmon 95 + 96 + # View logs 97 + sudo journalctl -u servmon -f 98 + 99 + # Check status 100 + sudo systemctl status servmon 101 ``` 102 + 103 + ## License 104 + 105 + [MIT](license).
+44
servmon.service
···
··· 1 + [Unit] 2 + Description=ServMon - KISS Server Monitoring with Email Alerts 3 + Documentation=https://pkg.rbrt.fr/servmon 4 + After=network-online.target 5 + Wants=network-online.target 6 + 7 + [Service] 8 + Type=simple 9 + User=root 10 + Group=root 11 + 12 + # Path to servmon binary (adjust if installed elsewhere) 13 + ExecStart=/usr/local/bin/servmon --config /etc/servmon/config.yaml 14 + 15 + # Restart policy 16 + Restart=always 17 + RestartSec=10 18 + 19 + # Logging 20 + StandardOutput=journal 21 + StandardError=journal 22 + SyslogIdentifier=servmon 23 + 24 + # Security hardening 25 + NoNewPrivileges=true 26 + PrivateTmp=true 27 + 28 + # Restrict filesystem access: read-only file system, home directories inaccessible 29 + ProtectSystem=strict 30 + ProtectHome=true 31 + ReadOnlyPaths=/ 32 + 33 + # Allow writing to specific paths if needed 34 + # ReadWritePaths=/var/log/servmon 35 + 36 + # Process limits 37 + LimitNOFILE=65536 38 + LimitNPROC=512 39 + 40 + # Environment 41 + Environment="PATH=/usr/local/bin:/usr/bin:/bin" 42 + 43 + [Install] 44 + WantedBy=multi-user.target