---
# Prometheus alerting rules for the mockupAWS platform.
# Reconstructed into valid block YAML — the source had been collapsed onto
# single physical lines, which the Prometheus rule loader cannot parse.
groups:
  - name: mockupaws-application
    interval: 30s
    rules:
      #------------------------------------------------------------------------
      # Availability & Uptime
      #------------------------------------------------------------------------
      - alert: ServiceDown
        expr: up{job="mockupaws-backend"} == 0
        for: 1m
        labels:
          severity: critical
          service: backend
        annotations:
          summary: "mockupAWS Backend is down"
          description: "The mockupAWS backend has been down for more than 1 minute."
          runbook_url: "https://docs.mockupaws.com/runbooks/service-down"

      - alert: ServiceUnhealthy
        expr: probe_success{job="blackbox-http"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "mockupAWS is unreachable"
          description: "Health check has failed for {{ $labels.instance }} for more than 2 minutes."

      #------------------------------------------------------------------------
      # Error Rate Alerts
      #------------------------------------------------------------------------
      - alert: HighErrorRate
        # Ratio of 5xx responses to all responses over 5m; fires above 1%.
        expr: |
          (
            sum(rate(http_requests_total{job="mockupaws-backend",status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="mockupaws-backend"}[5m]))
          ) > 0.01
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes."

      - alert: High5xxRate
        # NOTE(review): rate() yields a per-SECOND rate, so "> 10" means more
        # than 10 errors/second (~600/minute), while the description says
        # "per minute" — confirm which threshold is intended.
        expr: sum(rate(http_requests_total{status=~"5.."}[1m])) > 10
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "High 5xx error rate"
          description: "More than 10 5xx errors per minute."

      #------------------------------------------------------------------------
      # Latency Alerts
      #------------------------------------------------------------------------
      - alert: HighLatencyP95
        # NOTE(review): buckets are not aggregated; with multiple instances the
        # quantile is computed per series group. Consider
        # histogram_quantile(0.95, sum by (le) (rate(...[5m]))) — confirm intent.
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected (p95 > 500ms)"
          description: "95th percentile latency is {{ $value }}s."
- alert: VeryHighLatencyP95 expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1.0 for: 2m labels: severity: critical annotations: summary: "Very high latency detected (p95 > 1s)" description: "95th percentile latency is {{ $value }}s." - alert: HighLatencyP50 expr: histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m])) > 0.2 for: 5m labels: severity: warning annotations: summary: "Latency above target (p50 > 200ms)" description: "50th percentile latency is {{ $value }}s." #------------------------------------------------------------------------------ # Throughput Alerts #------------------------------------------------------------------------------ - alert: LowRequestRate expr: rate(http_requests_total[5m]) < 0.1 for: 10m labels: severity: warning annotations: summary: "Low request rate detected" description: "Request rate is unusually low ({{ $value }}/s)." - alert: TrafficSpike expr: | ( rate(http_requests_total[5m]) / avg_over_time(rate(http_requests_total[1h] offset 1h)[1h:5m]) ) > 5 for: 2m labels: severity: warning annotations: summary: "Traffic spike detected" description: "Traffic is {{ $value }}x higher than average." - name: infrastructure interval: 30s rules: #------------------------------------------------------------------------------ # CPU Alerts #------------------------------------------------------------------------------ - alert: HighCPUUsage expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 for: 5m labels: severity: warning annotations: summary: "High CPU usage on {{ $labels.instance }}" description: "CPU usage is above 80% for more than 5 minutes." - alert: CriticalCPUUsage expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95 for: 2m labels: severity: critical annotations: summary: "Critical CPU usage on {{ $labels.instance }}" description: "CPU usage is above 95%." 
#------------------------------------------------------------------------------ # Memory Alerts #------------------------------------------------------------------------------ - alert: HighMemoryUsage expr: | ( node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes ) / node_memory_MemTotal_bytes * 100 > 85 for: 5m labels: severity: warning annotations: summary: "High memory usage on {{ $labels.instance }}" description: "Memory usage is above 85% for more than 5 minutes." - alert: CriticalMemoryUsage expr: | ( node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes ) / node_memory_MemTotal_bytes * 100 > 95 for: 2m labels: severity: critical annotations: summary: "Critical memory usage on {{ $labels.instance }}" description: "Memory usage is above 95%." #------------------------------------------------------------------------------ # Disk Alerts #------------------------------------------------------------------------------ - alert: HighDiskUsage expr: | ( node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_avail_bytes{mountpoint="/"} ) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 80 for: 5m labels: severity: warning annotations: summary: "High disk usage on {{ $labels.instance }}" description: "Disk usage is above 80% for more than 5 minutes." - alert: CriticalDiskUsage expr: | ( node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_avail_bytes{mountpoint="/"} ) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 90 for: 2m labels: severity: critical annotations: summary: "Critical disk usage on {{ $labels.instance }}" description: "Disk usage is above 90%." 
- name: database interval: 30s rules: #------------------------------------------------------------------------------ # PostgreSQL Alerts #------------------------------------------------------------------------------ - alert: PostgreSQLDown expr: pg_up == 0 for: 1m labels: severity: critical annotations: summary: "PostgreSQL is down" description: "PostgreSQL instance {{ $labels.instance }} is down." - alert: PostgreSQLHighConnections expr: | ( pg_stat_activity_count{state="active"} / pg_settings_max_connections ) * 100 > 80 for: 5m labels: severity: warning annotations: summary: "High PostgreSQL connection usage" description: "PostgreSQL connection usage is {{ $value }}%." - alert: PostgreSQLReplicationLag expr: pg_replication_lag > 30 for: 5m labels: severity: warning annotations: summary: "PostgreSQL replication lag" description: "Replication lag is {{ $value }} seconds." - alert: PostgreSQLSlowQueries expr: | rate(pg_stat_statements_calls[5m]) > 0 and ( rate(pg_stat_statements_total_time[5m]) / rate(pg_stat_statements_calls[5m]) ) > 1000 for: 5m labels: severity: warning annotations: summary: "Slow PostgreSQL queries detected" description: "Average query time is above 1 second." - name: redis interval: 30s rules: #------------------------------------------------------------------------------ # Redis Alerts #------------------------------------------------------------------------------ - alert: RedisDown expr: redis_up == 0 for: 1m labels: severity: critical annotations: summary: "Redis is down" description: "Redis instance {{ $labels.instance }} is down." - alert: RedisHighMemoryUsage expr: | ( redis_memory_used_bytes / redis_memory_max_bytes ) * 100 > 85 for: 5m labels: severity: warning annotations: summary: "High Redis memory usage" description: "Redis memory usage is {{ $value }}%." 
- alert: RedisLowHitRate expr: | ( rate(redis_keyspace_hits_total[5m]) / ( rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]) ) ) < 0.8 for: 10m labels: severity: warning annotations: summary: "Low Redis cache hit rate" description: "Redis cache hit rate is below 80%." - alert: RedisTooManyConnections expr: redis_connected_clients > 100 for: 5m labels: severity: warning annotations: summary: "High Redis connection count" description: "Redis has {{ $value }} connected clients." - name: business interval: 60s rules: #------------------------------------------------------------------------------ # Business Metrics Alerts #------------------------------------------------------------------------------ - alert: LowScenarioCreationRate expr: rate(scenarios_created_total[1h]) < 0.1 for: 30m labels: severity: warning annotations: summary: "Low scenario creation rate" description: "Scenario creation rate is unusually low." - alert: HighReportGenerationFailures expr: | ( rate(reports_failed_total[5m]) / rate(reports_total[5m]) ) > 0.1 for: 5m labels: severity: warning annotations: summary: "High report generation failure rate" description: "Report failure rate is {{ $value | humanizePercentage }}." - alert: IngestionBacklog expr: ingestion_queue_depth > 1000 for: 5m labels: severity: warning annotations: summary: "Log ingestion backlog" description: "Ingestion queue has {{ $value }} pending items."