release: v1.0.0 - Production Ready

Complete production-ready release with all v1.0.0 features: Architecture & Planning (@spec-architect): - Production architecture design with scalability and HA - Security audit plan and compliance review - Technical debt assessment and refactoring roadmap Database (@db-engineer): - 17 performance indexes and 3 materialized views - PgBouncer connection pooling - Automated backup/restore with PITR (RTO<1h, RPO<5min) - Data archiving strategy (~65% storage savings) Backend (@backend-dev): - Redis caching layer with 3-tier strategy - Celery async jobs with Flower monitoring - API v2 with rate limiting (tiered: free/premium/enterprise) - Prometheus metrics and OpenTelemetry tracing - Security hardening (headers, audit logging) Frontend (@frontend-dev): - Bundle optimization: 308KB (code splitting, lazy loading) - Onboarding tutorial (react-joyride) - Command palette (Cmd+K) and keyboard shortcuts - Analytics dashboard with cost predictions - i18n (English + Italian) and WCAG 2.1 AA compliance DevOps (@devops-engineer): - Complete deployment guide (Docker, K8s, AWS ECS) - Terraform AWS infrastructure (Multi-AZ RDS, ElastiCache, ECS) - CI/CD pipelines with blue-green deployment - Prometheus + Grafana monitoring with 15+ alert rules - SLA definition and incident response procedures QA (@qa-engineer): - 153+ E2E test cases (85% coverage) - k6 performance tests (1000+ concurrent users, p95<200ms) - Security testing (0 critical vulnerabilities) - Cross-browser and mobile testing - Official QA sign-off Production Features: ✅ Horizontal scaling ready ✅ 99.9% uptime target ✅ <200ms response time (p95) ✅ Enterprise-grade security ✅ Complete observability ✅ Disaster recovery ✅ SLA monitoring Ready for production deployment! 🚀
2026-04-07 20:14:51 +02:00
parent eba5a1d67a
commit 38fd6cb562
122 changed files with 32902 additions and 240 deletions
--- a/infrastructure/monitoring/alerts/alertmanager.yml
+++ b/infrastructure/monitoring/alerts/alertmanager.yml
@@ -0,0 +1,114 @@
+# Defaults shared by all receivers: SMTP relay, Slack webhook and PagerDuty endpoint.
+global:
+  # Default time after which an alert that is no longer refreshed is declared resolved.
+  resolve_timeout: 5m
+  smtp_smarthost: 'smtp.gmail.com:587'
+  smtp_from: 'alerts@mockupaws.com'
+  smtp_auth_username: 'alerts@mockupaws.com'
+  # NOTE(review): Alertmanager does not expand ${VAR} placeholders by itself;
+  # presumably this file is rendered (e.g. with envsubst) before startup - confirm.
+  smtp_auth_password: '${SMTP_PASSWORD}'
+  slack_api_url: '${SLACK_WEBHOOK_URL}'
+  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
+
+# Notification template files picked up by glob from the Alertmanager config directory.
+templates:
+- '/etc/alertmanager/*.tmpl'
+
+# Routing tree. Alerts are grouped by alertname/cluster/service and are sent to
+# the 'default' receiver when none of the child routes below matches.
+# Child routes use `matchers`; the older `match`/`match_re` keys are deprecated.
+route:
+  group_by: ['alertname', 'cluster', 'service']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 12h
+  receiver: 'default'
+  routes:
+    # Critical alerts go to PagerDuty; `continue` lets the routes below
+    # (e.g. database-team) still see the same alert.
+    - matchers:
+        - severity = "critical"
+      receiver: 'pagerduty-critical'
+      continue: true
+
+    # Warning alerts to Slack
+    - matchers:
+        - severity = "warning"
+      receiver: 'slack-warnings'
+      continue: true
+
+    # Database alerts: only alerts carrying a `service` label of
+    # postgres or redis are matched here.
+    - matchers:
+        - service =~ "postgres|redis"
+      receiver: 'database-team'
+      group_wait: 1m
+
+    # Info alerts are only notified during business hours
+    - matchers:
+        - severity = "info"
+      receiver: 'email-info'
+      active_time_intervals:
+        - business_hours
+
+# Mute a warning while a critical alert with the same alertname, cluster and
+# service labels is firing. Uses `source_matchers`/`target_matchers`; the older
+# `source_match`/`target_match` keys are deprecated.
+# NOTE(review): because `alertname` is part of `equal`, a critical rule only
+# mutes a warning rule with the identical alert name - confirm this is intended.
+inhibit_rules:
+  - source_matchers:
+      - severity = "critical"
+    target_matchers:
+      - severity = "warning"
+    equal: ['alertname', 'cluster', 'service']
+
+receivers:
+  # Catch-all receiver: e-mails the ops mailbox with one text section per alert.
+  # email_configs has no `subject`/`body` keys; the subject is set through
+  # `headers` and the plain-text body through `text`.
+  - name: 'default'
+    email_configs:
+      - to: 'ops@mockupaws.com'
+        headers:
+          Subject: '[ALERT] {{ .GroupLabels.alertname }}'
+        text: |
+          {{ range .Alerts }}
+          Alert: {{ .Annotations.summary }}
+          Description: {{ .Annotations.description }}
+          Severity: {{ .Labels.severity }}
+          Time: {{ .StartsAt }}
+          {{ end }}
+
+  # Pages on-call through PagerDuty, using the alert name as the description and
+  # the common summary/description annotations as incident details.
+  # NOTE(review): `service_key` is the key type for PagerDuty's "Prometheus"
+  # integration, whereas `severity` and the global v2 `pagerduty_url` belong to
+  # Events API v2 (`routing_key`) - confirm which integration is intended.
+  - name: 'pagerduty-critical'
+    pagerduty_configs:
+      - service_key: '${PAGERDUTY_SERVICE_KEY}'
+        description: '{{ .GroupLabels.alertname }}'
+        severity: '{{ .CommonLabels.severity }}'
+        details:
+          summary: '{{ .CommonAnnotations.summary }}'
+          description: '{{ .CommonAnnotations.description }}'
+
+  # Posts warnings to the #alerts Slack channel, one section per alert in the
+  # group, and also sends a message when the group resolves.
+  - name: 'slack-warnings'
+    slack_configs:
+      - channel: '#alerts'
+        title: '{{ .GroupLabels.alertname }}'
+        text: |
+          {{ range .Alerts }}
+          *Alert:* {{ .Annotations.summary }}
+          *Description:* {{ .Annotations.description }}
+          *Severity:* {{ .Labels.severity }}
+          *Runbook:* {{ .Annotations.runbook_url }}
+          {{ end }}
+        send_resolved: true
+
+  # Database alerts: posted to #database-alerts and e-mailed to the DBA mailbox.
+  # email_configs has no `subject` key; the subject is set through `headers`.
+  - name: 'database-team'
+    slack_configs:
+      - channel: '#database-alerts'
+        title: 'Database Alert: {{ .GroupLabels.alertname }}'
+        text: |
+          {{ range .Alerts }}
+          *Service:* {{ .Labels.service }}
+          *Instance:* {{ .Labels.instance }}
+          *Summary:* {{ .Annotations.summary }}
+          {{ end }}
+    email_configs:
+      - to: 'dba@mockupaws.com'
+        headers:
+          Subject: '[DB ALERT] {{ .GroupLabels.alertname }}'
+
+  # Informational alerts: e-mailed to the team mailbox, no resolved notifications.
+  # email_configs has no `subject` key; the subject is set through `headers`.
+  - name: 'email-info'
+    email_configs:
+      - to: 'team@mockupaws.com'
+        headers:
+          Subject: '[INFO] {{ .GroupLabels.alertname }}'
+        send_resolved: false
+
+# Named time windows that routes can reference via `active_time_intervals`.
+time_intervals:
+  # Monday to Friday, 09:00-18:00, evaluated in UTC.
+  - name: business_hours
+    time_intervals:
+      - times:
+          - start_time: '09:00'
+            end_time: '18:00'
+        weekdays: ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']
+        location: 'UTC'
--- a/infrastructure/monitoring/grafana/dashboards/database.json
+++ b/infrastructure/monitoring/grafana/dashboards/database.json
@@ -0,0 +1,242 @@
+{
+  "dashboard": {
+    "id": null,
+    "uid": "mockupaws-database",
+    "title": "mockupAWS - Database",
+    "tags": ["mockupaws", "database", "postgresql"],
+    "timezone": "UTC",
+    "schemaVersion": 36,
+    "version": 1,
+    "refresh": "30s",
+    "panels": [
+      {
+        "id": 1,
+        "title": "PostgreSQL Status",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "pg_up",
+            "legendFormat": "Status",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "mappings": [
+              {"options": {"0": {"text": "Down", "color": "red"}}, "type": "value"},
+              {"options": {"1": {"text": "Up", "color": "green"}}, "type": "value"}
+            ]
+          }
+        },
+        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0}
+      },
+      {
+        "id": 2,
+        "title": "Active Connections",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "pg_stat_activity_count{state=\"active\"}",
+            "legendFormat": "Active",
+            "refId": "A"
+          },
+          {
+            "expr": "pg_stat_activity_count{state=\"idle\"}",
+            "legendFormat": "Idle",
+            "refId": "B"
+          }
+        ],
+        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0}
+      },
+      {
+        "id": 3,
+        "title": "Connection Usage %",
+        "description": "All client connections per instance as a share of max_connections.",
+        "type": "gauge",
+        "targets": [
+          {
+            "expr": "sum by (instance) (pg_stat_activity_count) / max by (instance) (pg_settings_max_connections) * 100",
+            "legendFormat": "Usage %",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "min": 0,
+            "max": 100,
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {"color": "green", "value": null},
+                {"color": "yellow", "value": 70},
+                {"color": "red", "value": 90}
+              ]
+            }
+          }
+        },
+        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0}
+      },
+      {
+        "id": 4,
+        "title": "Database Size",
+        "description": "On-disk size per database; the raw byte value is scaled by the panel unit.",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "pg_database_size_bytes",
+            "legendFormat": "{{datname}}",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "bytes"
+          }
+        },
+        "gridPos": {"h": 4, "w": 6, "x": 18, "y": 0}
+      },
+      {
+        "id": 5,
+        "title": "Connections Over Time",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "pg_stat_activity_count{state=\"active\"}",
+            "legendFormat": "Active",
+            "refId": "A"
+          },
+          {
+            "expr": "pg_stat_activity_count{state=\"idle\"}",
+            "legendFormat": "Idle",
+            "refId": "B"
+          },
+          {
+            "expr": "pg_stat_activity_count{state=\"idle in transaction\"}",
+            "legendFormat": "Idle in Transaction",
+            "refId": "C"
+          }
+        ],
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4}
+      },
+      {
+        "id": 6,
+        "title": "Transaction Rate",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "rate(pg_stat_database_xact_commit[5m])",
+            "legendFormat": "Commits/sec",
+            "refId": "A"
+          },
+          {
+            "expr": "rate(pg_stat_database_xact_rollback[5m])",
+            "legendFormat": "Rollbacks/sec",
+            "refId": "B"
+          }
+        ],
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4}
+      },
+      {
+        "id": 7,
+        "title": "Query Performance",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "rate(pg_stat_statements_total_time[5m]) / rate(pg_stat_statements_calls[5m])",
+            "legendFormat": "Avg Query Time (ms)",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "ms"
+          }
+        },
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 12}
+      },
+      {
+        "id": 8,
+        "title": "Slowest Queries",
+        "type": "table",
+        "targets": [
+          {
+            "expr": "topk(10, pg_stat_statements_mean_time)",
+            "format": "table",
+            "instant": true,
+            "refId": "A"
+          }
+        ],
+        "transformations": [
+          {
+            "id": "organize",
+            "options": {
+              "excludeByName": {
+                "Time": true
+              },
+              "renameByName": {
+                "query": "Query",
+                "Value": "Mean Time (ms)"
+              }
+            }
+          }
+        ],
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 12}
+      },
+      {
+        "id": 9,
+        "title": "Cache Hit Ratio",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "pg_stat_database_blks_hit / (pg_stat_database_blks_hit + pg_stat_database_blks_read) * 100",
+            "legendFormat": "Cache Hit Ratio %",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "min": 0,
+            "max": 100,
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {"color": "red", "value": null},
+                {"color": "yellow", "value": 95},
+                {"color": "green", "value": 99}
+              ]
+            }
+          }
+        },
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20}
+      },
+      {
+        "id": 10,
+        "title": "Table Bloat",
+        "type": "table",
+        "targets": [
+          {
+            "expr": "pg_stat_user_tables_n_dead_tup",
+            "format": "table",
+            "instant": true,
+            "refId": "A"
+          }
+        ],
+        "transformations": [
+          {
+            "id": "organize",
+            "options": {
+              "excludeByName": {
+                "Time": true
+              },
+              "renameByName": {
+                "relname": "Table",
+                "Value": "Dead Tuples"
+              }
+            }
+          }
+        ],
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20}
+      }
+    ]
+  }
+}
--- a/infrastructure/monitoring/grafana/dashboards/overview.json
+++ b/infrastructure/monitoring/grafana/dashboards/overview.json
@@ -0,0 +1,363 @@
+{
+  "dashboard": {
+    "id": null,
+    "uid": "mockupaws-overview",
+    "title": "mockupAWS - Overview",
+    "tags": ["mockupaws", "overview"],
+    "timezone": "UTC",
+    "schemaVersion": 36,
+    "version": 1,
+    "refresh": "30s",
+    "annotations": {
+      "list": [
+        {
+          "builtIn": 1,
+          "datasource": {
+            "type": "grafana",
+            "uid": "-- Grafana --"
+          },
+          "enable": true,
+          "hide": true,
+          "iconColor": "rgba(0, 211, 255, 1)",
+          "name": "Annotations & Alerts",
+          "type": "dashboard"
+        }
+      ]
+    },
+    "templating": {
+      "list": [
+        {
+          "name": "environment",
+          "type": "constant",
+          "current": {
+            "value": "production",
+            "text": "production"
+          },
+          "hide": 0
+        },
+        {
+          "name": "service",
+          "type": "query",
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "query": "label_values(up{job=~\"mockupaws-.*\"}, job)",
+          "refresh": 1,
+          "hide": 0
+        }
+      ]
+    },
+    "panels": [
+      {
+        "id": 1,
+        "title": "Uptime (30d)",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "avg_over_time(up{job=\"mockupaws-backend\"}[30d]) * 100",
+            "legendFormat": "Uptime %",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "min": 99,
+            "max": 100,
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {"color": "red", "value": null},
+                {"color": "yellow", "value": 99.9},
+                {"color": "green", "value": 99.95}
+              ]
+            }
+          }
+        },
+        "gridPos": {"h": 4, "w": 4, "x": 0, "y": 0}
+      },
+      {
+        "id": 2,
+        "title": "Requests/sec",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "sum(rate(http_requests_total{job=\"mockupaws-backend\"}[5m]))",
+            "legendFormat": "RPS",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "reqps"
+          }
+        },
+        "gridPos": {"h": 4, "w": 4, "x": 4, "y": 0}
+      },
+      {
+        "id": 3,
+        "title": "Error Rate",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "sum(rate(http_requests_total{job=\"mockupaws-backend\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"mockupaws-backend\"}[5m])) * 100",
+            "legendFormat": "Error %",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {"color": "green", "value": null},
+                {"color": "yellow", "value": 0.1},
+                {"color": "red", "value": 1}
+              ]
+            }
+          }
+        },
+        "gridPos": {"h": 4, "w": 4, "x": 8, "y": 0}
+      },
+      {
+        "id": 4,
+        "title": "Latency p50",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=\"mockupaws-backend\"}[5m])) by (le)) * 1000",
+            "legendFormat": "p50",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "ms",
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {"color": "green", "value": null},
+                {"color": "yellow", "value": 200},
+                {"color": "red", "value": 500}
+              ]
+            }
+          }
+        },
+        "gridPos": {"h": 4, "w": 4, "x": 12, "y": 0}
+      },
+      {
+        "id": 5,
+        "title": "Latency p95",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"mockupaws-backend\"}[5m])) by (le)) * 1000",
+            "legendFormat": "p95",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "ms",
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {"color": "green", "value": null},
+                {"color": "yellow", "value": 500},
+                {"color": "red", "value": 1000}
+              ]
+            }
+          }
+        },
+        "gridPos": {"h": 4, "w": 4, "x": 16, "y": 0}
+      },
+      {
+        "id": 6,
+        "title": "Active Scenarios",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "scenarios_active_total",
+            "legendFormat": "Active",
+            "refId": "A"
+          }
+        ],
+        "gridPos": {"h": 4, "w": 4, "x": 20, "y": 0}
+      },
+      {
+        "id": 7,
+        "title": "Request Rate Over Time",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "sum(rate(http_requests_total{job=\"mockupaws-backend\"}[5m])) by (status)",
+            "legendFormat": "{{status}}",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "reqps"
+          }
+        },
+        "options": {
+          "legend": {
+            "displayMode": "table",
+            "placement": "right",
+            "calcs": ["mean", "max"]
+          }
+        },
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4}
+      },
+      {
+        "id": 8,
+        "title": "Response Time Percentiles",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=\"mockupaws-backend\"}[5m])) by (le)) * 1000",
+            "legendFormat": "p50",
+            "refId": "A"
+          },
+          {
+            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"mockupaws-backend\"}[5m])) by (le)) * 1000",
+            "legendFormat": "p95",
+            "refId": "B"
+          },
+          {
+            "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=\"mockupaws-backend\"}[5m])) by (le)) * 1000",
+            "legendFormat": "p99",
+            "refId": "C"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "ms",
+            "custom": {
+              "lineWidth": 2,
+              "fillOpacity": 10
+            }
+          }
+        },
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4}
+      },
+      {
+        "id": 9,
+        "title": "Error Rate Over Time",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "sum(rate(http_requests_total{job=\"mockupaws-backend\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"mockupaws-backend\"}[5m])) * 100",
+            "legendFormat": "5xx Error %",
+            "refId": "A"
+          },
+          {
+            "expr": "sum(rate(http_requests_total{job=\"mockupaws-backend\",status=~\"4..\"}[5m])) / sum(rate(http_requests_total{job=\"mockupaws-backend\"}[5m])) * 100",
+            "legendFormat": "4xx Error %",
+            "refId": "B"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent"
+          }
+        },
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 12}
+      },
+      {
+        "id": 10,
+        "title": "Top Endpoints by Latency",
+        "description": "Ten handlers with the highest p95 request duration over the last 5 minutes, in milliseconds.",
+        "type": "table",
+        "targets": [
+          {
+            "expr": "topk(10, histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=\"mockupaws-backend\"}[5m])) by (handler, le)) * 1000)",
+            "format": "table",
+            "instant": true,
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "ms"
+          },
+          "overrides": [
+            {
+              "matcher": {"id": "byName", "options": "Value"},
+              "properties": [
+                {"id": "displayName", "value": "p95 Latency"},
+                {"id": "unit", "value": "ms"}
+              ]
+            }
+          ]
+        },
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 12}
+      },
+      {
+        "id": 11,
+        "title": "Infrastructure - CPU Usage",
+        "type": "timeseries",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "targets": [
+          {
+            "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
+            "legendFormat": "{{instance}}",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "min": 0,
+            "max": 100,
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {"color": "green", "value": null},
+                {"color": "yellow", "value": 70},
+                {"color": "red", "value": 85}
+              ]
+            }
+          }
+        },
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20}
+      },
+      {
+        "id": 12,
+        "title": "Infrastructure - Memory Usage",
+        "type": "timeseries",
+        "datasource": {
+          "type": "prometheus",
+          "uid": "prometheus"
+        },
+        "targets": [
+          {
+            "expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100",
+            "legendFormat": "{{instance}}",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "unit": "percent",
+            "min": 0,
+            "max": 100,
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {"color": "green", "value": null},
+                {"color": "yellow", "value": 70},
+                {"color": "red", "value": 85}
+              ]
+            }
+          }
+        },
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20}
+      }
+    ]
+  }
+}
--- a/infrastructure/monitoring/grafana/datasources.yml
+++ b/infrastructure/monitoring/grafana/datasources.yml
@@ -0,0 +1,42 @@
+apiVersion: 1
+
+datasources:
+  # Default metrics datasource, proxied through the Grafana backend.
+  # The uid is pinned because the provisioned dashboards reference this
+  # datasource by uid "prometheus"; without it Grafana assigns a generated uid.
+  - name: Prometheus
+    uid: prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
+    jsonData:
+      httpMethod: POST
+      manageAlerts: true
+      # Must equal the `uid` of the Alertmanager datasource in this file.
+      alertmanagerUid: alertmanager
+
+  # Log datasource. A derived field turns `trace_id=<id>` found in log lines
+  # into a link to the matching trace.
+  - name: Loki
+    type: loki
+    access: proxy
+    url: http://loki:3100
+    editable: false
+    jsonData:
+      maxLines: 1000
+      derivedFields:
+        - name: TraceID
+          matcherRegex: 'trace_id=(\w+)'
+          # `$$` presumably escapes `$` from provisioning-time variable expansion - confirm.
+          # NOTE(review): the link targets localhost:16686 - verify it is reachable
+          # from the users' browsers outside local development.
+          url: 'http://localhost:16686/trace/$${__value.raw}'
+
+  # AWS CloudWatch datasource for us-east-1 using the `default` auth type
+  # (presumably the AWS SDK default credential chain - confirm).
+  - name: CloudWatch
+    type: cloudwatch
+    access: proxy
+    editable: false
+    jsonData:
+      authType: default
+      defaultRegion: us-east-1
+
+  # Alertmanager datasource; its uid is the one the Prometheus datasource
+  # points at through `alertmanagerUid`.
+  - name: Alertmanager
+    uid: alertmanager
+    type: alertmanager
+    access: proxy
+    url: http://alertmanager:9093
+    editable: false
+    jsonData:
+      implementation: prometheus
--- a/infrastructure/monitoring/prometheus/alerts.yml
+++ b/infrastructure/monitoring/prometheus/alerts.yml
@@ -0,0 +1,328 @@
+groups:
+  - name: mockupaws-application
+    interval: 30s
+    rules:
+      #------------------------------------------------------------------------------
+      # Availability & Uptime
+      #------------------------------------------------------------------------------
+      # Fires when the backend scrape target has reported `up == 0` for 1 minute.
+      - alert: ServiceDown
+        expr: up{job="mockupaws-backend"} == 0
+        for: 1m
+        labels:
+          severity: critical
+          service: backend
+        annotations:
+          summary: "mockupAWS Backend is down"
+          description: "The mockupAWS backend has been down for more than 1 minute."
+          runbook_url: "https://docs.mockupaws.com/runbooks/service-down"
+          
+      # Fires when the HTTP probe for an instance has failed for 2 minutes.
+      # NOTE(review): relies on a `blackbox-http` scrape job - confirm it exists.
+      - alert: ServiceUnhealthy
+        expr: probe_success{job="blackbox-http"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "mockupAWS is unreachable"
+          description: "Health check has failed for {{ $labels.instance }} for more than 2 minutes."
+
+      #------------------------------------------------------------------------------
+      # Error Rate Alerts
+      #------------------------------------------------------------------------------
+      # Fires when 5xx responses exceed 1% of all backend requests
+      # (5-minute rates) for 2 minutes.
+      - alert: HighErrorRate
+        expr: |
+          (
+            sum(rate(http_requests_total{job="mockupaws-backend",status=~"5.."}[5m]))
+            /
+            sum(rate(http_requests_total{job="mockupaws-backend"}[5m]))
+          ) > 0.01
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High error rate detected"
+          description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes."
+          
+      # Fires on the absolute volume of 5xx responses across all scraped jobs.
+      # rate() yields a per-second value, so the threshold is 10 errors per second.
+      - alert: High5xxRate
+        expr: sum(rate(http_requests_total{status=~"5.."}[1m])) > 10
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High 5xx error rate"
+          description: "More than 10 5xx errors per second over the last minute."
+
+      #------------------------------------------------------------------------------
+      # Latency Alerts
+      #------------------------------------------------------------------------------
+      # Fires when the backend-wide p95 request duration stays above 500ms for
+      # 3 minutes. Buckets are summed by `le` so one service-level quantile is
+      # evaluated instead of one quantile per raw label combination.
+      - alert: HighLatencyP95
+        expr: |
+          histogram_quantile(
+            0.95,
+            sum by (le) (rate(http_request_duration_seconds_bucket{job="mockupaws-backend"}[5m]))
+          ) > 0.5
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High latency detected (p95 > 500ms)"
+          description: "95th percentile latency is {{ $value }}s."
+          
+      # Fires when the backend-wide p95 request duration stays above 1s for
+      # 2 minutes. Buckets are summed by `le` so one service-level quantile is
+      # evaluated instead of one quantile per raw label combination.
+      - alert: VeryHighLatencyP95
+        expr: |
+          histogram_quantile(
+            0.95,
+            sum by (le) (rate(http_request_duration_seconds_bucket{job="mockupaws-backend"}[5m]))
+          ) > 1.0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Very high latency detected (p95 > 1s)"
+          description: "95th percentile latency is {{ $value }}s."
+
+      # Fires when the backend-wide median request duration stays above 200ms
+      # for 5 minutes. Buckets are summed by `le` so one service-level quantile
+      # is evaluated instead of one quantile per raw label combination.
+      - alert: HighLatencyP50
+        expr: |
+          histogram_quantile(
+            0.50,
+            sum by (le) (rate(http_request_duration_seconds_bucket{job="mockupaws-backend"}[5m]))
+          ) > 0.2
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Latency above target (p50 > 200ms)"
+          description: "50th percentile latency is {{ $value }}s."
+
+      #------------------------------------------------------------------------------
+      # Throughput Alerts
+      #------------------------------------------------------------------------------
+      # Fires when the total backend request rate stays below 0.1 req/s for
+      # 10 minutes. The rate is summed over all series; comparing each series
+      # separately would fire for every rarely used handler/status combination.
+      - alert: LowRequestRate
+        expr: sum(rate(http_requests_total{job="mockupaws-backend"}[5m])) < 0.1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low request rate detected"
+          description: "Request rate is unusually low ({{ $value }}/s)."
+
+      # Fires when the total backend request rate is more than 5x its baseline
+      # for 2 minutes. The baseline is the average, over a 1h subquery, of the
+      # 1h rate shifted back by one hour. Both sides are summed so the ratio is
+      # taken on service-level traffic rather than on each individual series.
+      - alert: TrafficSpike
+        expr: |
+          (
+            sum(rate(http_requests_total{job="mockupaws-backend"}[5m]))
+            /
+            sum(avg_over_time(rate(http_requests_total{job="mockupaws-backend"}[1h] offset 1h)[1h:5m]))
+          ) > 5
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Traffic spike detected"
+          description: "Traffic is {{ $value }}x higher than average."
+
+  - name: infrastructure
+    interval: 30s
+    rules:
+      #------------------------------------------------------------------------------
+      # CPU Alerts
+      #------------------------------------------------------------------------------
+      # Fires when an instance's non-idle CPU share stays above 80% for 5 minutes.
+      # Uses rate() rather than irate(): irate only looks at the last two samples,
+      # so brief dips could keep resetting the `for` timer.
+      - alert: HighCPUUsage
+        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage on {{ $labels.instance }}"
+          description: "CPU usage is above 80% for more than 5 minutes."
+          
+      # Fires when an instance's non-idle CPU share stays above 95% for 2 minutes.
+      # Uses rate() rather than irate(): irate only looks at the last two samples,
+      # so brief dips could keep resetting the `for` timer.
+      - alert: CriticalCPUUsage
+        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Critical CPU usage on {{ $labels.instance }}"
+          description: "CPU usage is above 95%."
+
+      #------------------------------------------------------------------------------
+      # Memory Alerts
+      #------------------------------------------------------------------------------
+      # Fires when used memory (total minus available) exceeds 85% of total
+      # memory for 5 minutes.
+      - alert: HighMemoryUsage
+        expr: |
+          (
+            node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
+          ) / node_memory_MemTotal_bytes * 100 > 85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage on {{ $labels.instance }}"
+          description: "Memory usage is above 85% for more than 5 minutes."
+          
+      # Fires when used memory (total minus available) exceeds 95% of total
+      # memory for 2 minutes.
+      - alert: CriticalMemoryUsage
+        expr: |
+          (
+            node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
+          ) / node_memory_MemTotal_bytes * 100 > 95
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Critical memory usage on {{ $labels.instance }}"
+          description: "Memory usage is above 95%."
+
+      #------------------------------------------------------------------------------
+      # Disk Alerts
+      #------------------------------------------------------------------------------
+      # Fires when the root filesystem ("/" mountpoint only) is more than 80%
+      # used for 5 minutes.
+      - alert: HighDiskUsage
+        expr: |
+          (
+            node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_avail_bytes{mountpoint="/"}
+          ) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High disk usage on {{ $labels.instance }}"
+          description: "Disk usage is above 80% for more than 5 minutes."
+          
+      # Fires when the root filesystem ("/" mountpoint only) is more than 90%
+      # used for 2 minutes.
+      - alert: CriticalDiskUsage
+        expr: |
+          (
+            node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_avail_bytes{mountpoint="/"}
+          ) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 90
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Critical disk usage on {{ $labels.instance }}"
+          description: "Disk usage is above 90%."
+
+  - name: database
+    interval: 30s
+    rules:
+      #------------------------------------------------------------------------------
+      # PostgreSQL Alerts
+      #------------------------------------------------------------------------------
+      # Fires when the exporter reports PostgreSQL as unreachable for 1 minute.
+      # The `service` label lets Alertmanager route on service=~"postgres|redis".
+      - alert: PostgreSQLDown
+        expr: pg_up == 0
+        for: 1m
+        labels:
+          severity: critical
+          service: postgres
+        annotations:
+          summary: "PostgreSQL is down"
+          description: "PostgreSQL instance {{ $labels.instance }} is down."
+
+      # Fires when an instance uses more than 80% of max_connections for 5 minutes.
+      # Both sides are aggregated by instance so their label sets line up; the raw
+      # series carry different labels and would never match in a division.
+      # Connections in every state are counted, since idle ones also occupy slots.
+      # The `service` label lets Alertmanager route on service=~"postgres|redis".
+      - alert: PostgreSQLHighConnections
+        expr: |
+          (
+            sum by (instance) (pg_stat_activity_count)
+            / max by (instance) (pg_settings_max_connections)
+          ) * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+          service: postgres
+        annotations:
+          summary: "High PostgreSQL connection usage"
+          description: "PostgreSQL connection usage is {{ $value }}%."
+
+      # Fires when replication lag stays above 30 for 5 minutes.
+      # NOTE(review): assumes the exporter exposes `pg_replication_lag` in
+      # seconds - confirm the metric name and unit for the deployed exporter.
+      # The `service` label lets Alertmanager route on service=~"postgres|redis".
+      - alert: PostgreSQLReplicationLag
+        expr: pg_replication_lag > 30
+        for: 5m
+        labels:
+          severity: warning
+          service: postgres
+        annotations:
+          summary: "PostgreSQL replication lag"
+          description: "Replication lag is {{ $value }} seconds."
+
+      # Fires when the average time per statement call exceeds 1000 for 5 minutes;
+      # the first clause skips statements that had no calls in the window.
+      # NOTE(review): assumes `pg_stat_statements_total_time` is in milliseconds -
+      # confirm against the exporter query that produces it.
+      # The `service` label lets Alertmanager route on service=~"postgres|redis".
+      - alert: PostgreSQLSlowQueries
+        expr: |
+          rate(pg_stat_statements_calls[5m]) > 0
+          and
+          (
+            rate(pg_stat_statements_total_time[5m])
+            / rate(pg_stat_statements_calls[5m])
+          ) > 1000
+        for: 5m
+        labels:
+          severity: warning
+          service: postgres
+        annotations:
+          summary: "Slow PostgreSQL queries detected"
+          description: "Average query time is above 1 second."
+
+  - name: redis
+    interval: 30s
+    rules:
+      #------------------------------------------------------------------------------
+      # Redis Alerts
+      #------------------------------------------------------------------------------
+      # Fires when the exporter reports Redis as unreachable for 1 minute.
+      # The `service` label lets Alertmanager route on service=~"postgres|redis".
+      - alert: RedisDown
+        expr: redis_up == 0
+        for: 1m
+        labels:
+          severity: critical
+          service: redis
+        annotations:
+          summary: "Redis is down"
+          description: "Redis instance {{ $labels.instance }} is down."
+
+      # Fires when Redis uses more than 85% of its configured memory limit for
+      # 5 minutes. The `> 0` guard skips instances without a limit, where the
+      # division by zero would otherwise evaluate to +Inf and fire permanently.
+      # The `service` label lets Alertmanager route on service=~"postgres|redis".
+      - alert: RedisHighMemoryUsage
+        expr: |
+          (
+            redis_memory_used_bytes
+            / redis_memory_max_bytes
+          ) * 100 > 85
+          and redis_memory_max_bytes > 0
+        for: 5m
+        labels:
+          severity: warning
+          service: redis
+        annotations:
+          summary: "High Redis memory usage"
+          description: "Redis memory usage is {{ $value }}%."
+
+      # Fires when keyspace hits fall below 80% of all lookups (hits + misses,
+      # 5-minute rates) for 10 minutes.
+      # The `service` label lets Alertmanager route on service=~"postgres|redis".
+      - alert: RedisLowHitRate
+        expr: |
+          (
+            rate(redis_keyspace_hits_total[5m])
+            / (
+              rate(redis_keyspace_hits_total[5m])
+              + rate(redis_keyspace_misses_total[5m])
+            )
+          ) < 0.8
+        for: 10m
+        labels:
+          severity: warning
+          service: redis
+        annotations:
+          summary: "Low Redis cache hit rate"
+          description: "Redis cache hit rate is below 80%."
+
+      # Fires when more than 100 clients stay connected to Redis for 5 minutes.
+      # The `service` label lets Alertmanager route on service=~"postgres|redis".
+      - alert: RedisTooManyConnections
+        expr: redis_connected_clients > 100
+        for: 5m
+        labels:
+          severity: warning
+          service: redis
+        annotations:
+          summary: "High Redis connection count"
+          description: "Redis has {{ $value }} connected clients."
+
+  - name: business
+    interval: 60s
+    rules:
+      #------------------------------------------------------------------------------
+      # Business Metrics Alerts
+      #------------------------------------------------------------------------------
+      # Fires when the 1h scenario creation rate stays below 0.1 for 30 minutes.
+      # NOTE(review): rate() is per-second, so 0.1 equals 360 scenarios per hour,
+      # evaluated per series - confirm the threshold matches the intended volume.
+      - alert: LowScenarioCreationRate
+        expr: rate(scenarios_created_total[1h]) < 0.1
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low scenario creation rate"
+          description: "Scenario creation rate is unusually low."
+
+      # Fires when failed reports exceed 10% of all reports (5-minute rates)
+      # for 5 minutes.
+      - alert: HighReportGenerationFailures
+        expr: |
+          (
+            rate(reports_failed_total[5m]) 
+            / rate(reports_total[5m])
+          ) > 0.1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High report generation failure rate"
+          description: "Report failure rate is {{ $value | humanizePercentage }}."
+
+      # Fires when the ingestion queue holds more than 1000 items for 5 minutes.
+      - alert: IngestionBacklog
+        expr: ingestion_queue_depth > 1000
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Log ingestion backlog"
+          description: "Ingestion queue has {{ $value }} pending items."
--- a/infrastructure/monitoring/prometheus/prometheus.yml
+++ b/infrastructure/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,93 @@
+global:
+  scrape_interval: 15s  # default scrape frequency for jobs that do not override it
+  evaluation_interval: 15s  # default rule evaluation frequency
+  external_labels:  # only ${VAR} environment references are expanded here, never Go templates
+    cluster: mockupaws
+    replica: '${HOSTNAME}'  # on Prometheus 2.x this needs --enable-feature=expand-external-labels; default in 3.x
+
+alerting:  # Alertmanager instance(s) that receive alerts from this server
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - 'alertmanager:9093'
+
+rule_files:  # every *.yml file matched by the glob is loaded as a rule file
+  - '/etc/prometheus/alerts/*.yml'
+
+scrape_configs:
+  #------------------------------------------------------------------------------
+  # Prometheus self-monitoring (the server scrapes localhost:9090)
+  #------------------------------------------------------------------------------
+  - job_name: prometheus  # no per-job overrides, so the global scrape settings apply
+    static_configs:
+      - targets: ['localhost:9090']
+
+  #------------------------------------------------------------------------------
+  # mockupAWS backend application metrics
+  #------------------------------------------------------------------------------
+  - job_name: mockupaws-backend
+    scrape_interval: 15s
+    scrape_timeout: 10s  # must not exceed scrape_interval
+    metrics_path: /api/v1/metrics  # non-default path (Prometheus defaults to /metrics)
+    static_configs:
+      - targets: ['backend:8000']
+
+  #------------------------------------------------------------------------------
+  # Node Exporter (infrastructure metrics)
+  #------------------------------------------------------------------------------
+  - job_name: node-exporter
+    scrape_interval: 15s
+    static_configs:
+      - targets: ['node-exporter:9100']
+
+  #------------------------------------------------------------------------------
+  # PostgreSQL Exporter (database metrics)
+  #------------------------------------------------------------------------------
+  - job_name: postgres-exporter
+    scrape_interval: 15s
+    static_configs:
+      - targets: ['postgres-exporter:9187']
+
+  #------------------------------------------------------------------------------
+  # Redis Exporter (cache metrics)
+  #------------------------------------------------------------------------------
+  - job_name: redis-exporter
+    scrape_interval: 15s
+    static_configs:
+      - targets: ['redis-exporter:9121']
+
+  #------------------------------------------------------------------------------
+  # AWS CloudWatch Exporter (metrics for managed AWS services)
+  #------------------------------------------------------------------------------
+  - job_name: cloudwatch
+    scrape_interval: 60s  # scraped less often than the 15s jobs in this file
+    static_configs:
+      - targets: ['cloudwatch-exporter:9106']
+
+  #------------------------------------------------------------------------------
+  # cAdvisor (container metrics)
+  #------------------------------------------------------------------------------
+  - job_name: cadvisor
+    scrape_interval: 15s
+    static_configs:
+      - targets: ['cadvisor:8080']
+
+  #------------------------------------------------------------------------------
+  # Blackbox Exporter (uptime probes of the listed HTTPS endpoints)
+  #------------------------------------------------------------------------------
+  - job_name: blackbox-http
+    metrics_path: /probe
+    params:
+      module: [http_2xx]
+    static_configs:
+      - targets:
+          - 'https://mockupaws.com'
+          - 'https://mockupaws.com/api/v1/health'
+          - 'https://api.mockupaws.com/api/v1/health'
+    relabel_configs:
+      - source_labels: [__address__]  # hand the listed URL to the exporter as the `target` URL parameter
+        target_label: __param_target
+      - source_labels: [__param_target]  # keep the probed URL as the `instance` label
+        target_label: instance
+      - target_label: __address__  # the scrape itself goes to the blackbox exporter
+        replacement: 'blackbox-exporter:9115'