Files
mockupAWS/scripts/archive_job.py
Luca Sacchi Ricciardi 38fd6cb562
Some checks failed
E2E Tests / Run E2E Tests (push) Waiting to run
E2E Tests / Visual Regression Tests (push) Blocked by required conditions
E2E Tests / Smoke Tests (push) Waiting to run
CI/CD - Build & Test / Backend Tests (push) Has been cancelled
CI/CD - Build & Test / Frontend Tests (push) Has been cancelled
CI/CD - Build & Test / Security Scans (push) Has been cancelled
CI/CD - Build & Test / Docker Build Test (push) Has been cancelled
CI/CD - Build & Test / Terraform Validate (push) Has been cancelled
Deploy to Production / Build & Test (push) Has been cancelled
Deploy to Production / Security Scan (push) Has been cancelled
Deploy to Production / Build Docker Images (push) Has been cancelled
Deploy to Production / Deploy to Staging (push) Has been cancelled
Deploy to Production / E2E Tests (push) Has been cancelled
Deploy to Production / Deploy to Production (push) Has been cancelled
release: v1.0.0 - Production Ready
Complete production-ready release with all v1.0.0 features:

Architecture & Planning (@spec-architect):
- Production architecture design with scalability and HA
- Security audit plan and compliance review
- Technical debt assessment and refactoring roadmap

Database (@db-engineer):
- 17 performance indexes and 3 materialized views
- PgBouncer connection pooling
- Automated backup/restore with PITR (RTO<1h, RPO<5min)
- Data archiving strategy (~65% storage savings)

Backend (@backend-dev):
- Redis caching layer with 3-tier strategy
- Celery async jobs with Flower monitoring
- API v2 with rate limiting (tiered: free/premium/enterprise)
- Prometheus metrics and OpenTelemetry tracing
- Security hardening (headers, audit logging)

Frontend (@frontend-dev):
- Bundle optimization: 308KB (code splitting, lazy loading)
- Onboarding tutorial (react-joyride)
- Command palette (Cmd+K) and keyboard shortcuts
- Analytics dashboard with cost predictions
- i18n (English + Italian) and WCAG 2.1 AA compliance

DevOps (@devops-engineer):
- Complete deployment guide (Docker, K8s, AWS ECS)
- Terraform AWS infrastructure (Multi-AZ RDS, ElastiCache, ECS)
- CI/CD pipelines with blue-green deployment
- Prometheus + Grafana monitoring with 15+ alert rules
- SLA definition and incident response procedures

QA (@qa-engineer):
- 153+ E2E test cases (85% coverage)
- k6 performance tests (1000+ concurrent users, p95<200ms)
- Security testing (0 critical vulnerabilities)
- Cross-browser and mobile testing
- Official QA sign-off

Production Features:
 Horizontal scaling ready
 99.9% uptime target
 <200ms response time (p95)
 Enterprise-grade security
 Complete observability
 Disaster recovery
 SLA monitoring

Ready for production deployment! 🚀
2026-04-07 20:14:51 +02:00

650 lines
24 KiB
Python
Executable File

#!/usr/bin/env python3
"""
mockupAWS Data Archive Job v1.0.0
Nightly archive job for old data:
- Scenario logs > 1 year → archive
- Scenario metrics > 2 years → aggregate → archive
- Reports > 6 months → compress → S3
Usage:
python scripts/archive_job.py --dry-run # Preview what would be archived
python scripts/archive_job.py --logs # Archive logs only
python scripts/archive_job.py --metrics # Archive metrics only
python scripts/archive_job.py --reports # Archive reports only
python scripts/archive_job.py --all # Archive all (default)
Environment:
DATABASE_URL - PostgreSQL connection string
S3_BUCKET - S3 bucket for report archiving
AWS_ACCESS_KEY_ID - AWS credentials
AWS_SECRET_ACCESS_KEY - AWS credentials
"""
import asyncio
import argparse
import logging
import os
import sys
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Any, Tuple
from uuid import UUID, uuid4
import boto3
from botocore.exceptions import ClientError
from sqlalchemy import select, insert, delete, func, text
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from sqlalchemy.dialects.postgresql import UUID as PGUUID
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
handlers=[
logging.StreamHandler(sys.stdout),
logging.FileHandler(f"storage/logs/archive_{datetime.now():%Y%m%d_%H%M%S}.log"),
],
)
logger = logging.getLogger(__name__)
# Database configuration
DATABASE_URL = os.getenv(
"DATABASE_URL", "postgresql+asyncpg://postgres:postgres@localhost:5432/mockupaws"
)
# Archive configuration
ARCHIVE_CONFIG = {
"logs": {
"table": "scenario_logs",
"archive_table": "scenario_logs_archive",
"date_column": "received_at",
"archive_after_days": 365,
"batch_size": 10000,
},
"metrics": {
"table": "scenario_metrics",
"archive_table": "scenario_metrics_archive",
"date_column": "timestamp",
"archive_after_days": 730,
"aggregate_before_archive": True,
"aggregation_period": "day",
"batch_size": 5000,
},
"reports": {
"table": "reports",
"archive_table": "reports_archive",
"date_column": "created_at",
"archive_after_days": 180,
"compress_files": True,
"s3_bucket": os.getenv("REPORTS_ARCHIVE_BUCKET", "mockupaws-reports-archive"),
"s3_prefix": "archived-reports/",
"batch_size": 100,
},
}
class ArchiveJob:
"""Data archive job runner."""
def __init__(self, dry_run: bool = False):
self.dry_run = dry_run
self.engine = create_async_engine(DATABASE_URL, echo=False)
self.session_factory = async_sessionmaker(
self.engine, class_=AsyncSession, expire_on_commit=False
)
self.job_id: Optional[UUID] = None
self.stats: Dict[str, Any] = {
"logs": {"processed": 0, "archived": 0, "deleted": 0, "bytes": 0},
"metrics": {"processed": 0, "archived": 0, "deleted": 0, "bytes": 0},
"reports": {"processed": 0, "archived": 0, "deleted": 0, "bytes": 0},
}
async def create_job_record(self, job_type: str) -> UUID:
"""Create archive job tracking record."""
job_id = uuid4()
async with self.session_factory() as session:
await session.execute(
text("""
INSERT INTO archive_jobs (id, job_type, status, started_at)
VALUES (:id, :type, 'running', NOW())
"""),
{"id": job_id, "type": job_type},
)
await session.commit()
self.job_id = job_id
return job_id
async def update_job_status(self, status: str, error_message: Optional[str] = None):
"""Update job status in database."""
if not self.job_id:
return
async with self.session_factory() as session:
total_processed = sum(s["processed"] for s in self.stats.values())
total_archived = sum(s["archived"] for s in self.stats.values())
total_deleted = sum(s["deleted"] for s in self.stats.values())
total_bytes = sum(s["bytes"] for s in self.stats.values())
await session.execute(
text("""
UPDATE archive_jobs
SET status = :status,
completed_at = CASE WHEN :status IN ('completed', 'failed') THEN NOW() ELSE NULL END,
records_processed = :processed,
records_archived = :archived,
records_deleted = :deleted,
bytes_archived = :bytes,
error_message = :error
WHERE id = :id
"""),
{
"id": self.job_id,
"status": status,
"processed": total_processed,
"archived": total_archived,
"deleted": total_deleted,
"bytes": total_bytes,
"error": error_message,
},
)
await session.commit()
async def archive_logs(self) -> Tuple[int, int, int]:
"""Archive old scenario logs (> 1 year)."""
logger.info("Starting logs archive job...")
config = ARCHIVE_CONFIG["logs"]
cutoff_date = datetime.utcnow() - timedelta(days=config["archive_after_days"])
async with self.session_factory() as session:
# Count records to archive
count_result = await session.execute(
text(f"""
SELECT COUNT(*) FROM {config["table"]}
WHERE {config["date_column"]} < :cutoff
"""),
{"cutoff": cutoff_date},
)
total_count = count_result.scalar()
if total_count == 0:
logger.info("No logs to archive")
return 0, 0, 0
logger.info(
f"Found {total_count} logs to archive (older than {cutoff_date.date()})"
)
if self.dry_run:
logger.info(f"[DRY RUN] Would archive {total_count} logs")
return total_count, 0, 0
processed = 0
archived = 0
deleted = 0
while processed < total_count:
# Archive batch
batch_result = await session.execute(
text(f"""
WITH batch AS (
SELECT id FROM {config["table"]}
WHERE {config["date_column"]} < :cutoff
LIMIT :batch_size
),
archived AS (
INSERT INTO {config["archive_table"]}
(id, scenario_id, received_at, message_hash, message_preview,
source, size_bytes, has_pii, token_count, sqs_blocks,
archived_at, archive_batch_id)
SELECT
id, scenario_id, received_at, message_hash, message_preview,
source, size_bytes, has_pii, token_count, sqs_blocks,
NOW(), :job_id
FROM {config["table"]}
WHERE id IN (SELECT id FROM batch)
ON CONFLICT (id) DO NOTHING
RETURNING id
),
deleted AS (
DELETE FROM {config["table"]}
WHERE id IN (SELECT id FROM batch)
RETURNING id
)
SELECT
(SELECT COUNT(*) FROM batch) as batch_count,
(SELECT COUNT(*) FROM archived) as archived_count,
(SELECT COUNT(*) FROM deleted) as deleted_count
"""),
{
"cutoff": cutoff_date,
"batch_size": config["batch_size"],
"job_id": self.job_id,
},
)
row = batch_result.fetchone()
batch_processed = row.batch_count
batch_archived = row.archived_count
batch_deleted = row.deleted_count
processed += batch_processed
archived += batch_archived
deleted += batch_deleted
logger.info(
f"Archived batch: {batch_archived} archived, {batch_deleted} deleted ({processed}/{total_count})"
)
await session.commit()
if batch_processed == 0:
break
self.stats["logs"]["processed"] = processed
self.stats["logs"]["archived"] = archived
self.stats["logs"]["deleted"] = deleted
logger.info(
f"Logs archive completed: {archived} archived, {deleted} deleted"
)
return processed, archived, deleted
async def aggregate_metrics(
self, session: AsyncSession, scenario_id: UUID, cutoff_date: datetime
) -> int:
"""Aggregate metrics before archiving."""
# Aggregate by day
await session.execute(
text("""
INSERT INTO scenario_metrics_archive (
id, scenario_id, timestamp, metric_type, metric_name,
value, unit, extra_data, archived_at, archive_batch_id,
is_aggregated, aggregation_period, sample_count
)
SELECT
uuid_generate_v4(),
scenario_id,
DATE_TRUNC('day', timestamp) as day,
metric_type,
metric_name,
AVG(value) as avg_value,
unit,
'{}'::jsonb as extra_data,
NOW(),
:job_id,
true,
'day',
COUNT(*) as sample_count
FROM scenario_metrics
WHERE scenario_id = :scenario_id
AND timestamp < :cutoff
GROUP BY scenario_id, DATE_TRUNC('day', timestamp), metric_type, metric_name, unit
ON CONFLICT DO NOTHING
"""),
{"scenario_id": scenario_id, "cutoff": cutoff_date, "job_id": self.job_id},
)
return 0
async def archive_metrics(self) -> Tuple[int, int, int]:
"""Archive old scenario metrics (> 2 years)."""
logger.info("Starting metrics archive job...")
config = ARCHIVE_CONFIG["metrics"]
cutoff_date = datetime.utcnow() - timedelta(days=config["archive_after_days"])
async with self.session_factory() as session:
# First, aggregate metrics
if config.get("aggregate_before_archive"):
logger.info("Aggregating metrics before archive...")
# Get distinct scenarios with old metrics
scenarios_result = await session.execute(
text(f"""
SELECT DISTINCT scenario_id
FROM {config["table"]}
WHERE {config["date_column"]} < :cutoff
"""),
{"cutoff": cutoff_date},
)
scenarios = [row[0] for row in scenarios_result.fetchall()]
for scenario_id in scenarios:
await self.aggregate_metrics(session, scenario_id, cutoff_date)
await session.commit()
logger.info(f"Aggregated metrics for {len(scenarios)} scenarios")
# Count records to archive (non-aggregated)
count_result = await session.execute(
text(f"""
SELECT COUNT(*) FROM {config["table"]}
WHERE {config["date_column"]} < :cutoff
"""),
{"cutoff": cutoff_date},
)
total_count = count_result.scalar()
if total_count == 0:
logger.info("No metrics to archive")
return 0, 0, 0
logger.info(
f"Found {total_count} metrics to archive (older than {cutoff_date.date()})"
)
if self.dry_run:
logger.info(f"[DRY RUN] Would archive {total_count} metrics")
return total_count, 0, 0
processed = 0
archived = 0
deleted = 0
while processed < total_count:
# Archive batch (non-aggregated)
batch_result = await session.execute(
text(f"""
WITH batch AS (
SELECT id FROM {config["table"]}
WHERE {config["date_column"]} < :cutoff
LIMIT :batch_size
),
archived AS (
INSERT INTO {config["archive_table"]}
(id, scenario_id, timestamp, metric_type, metric_name,
value, unit, extra_data, archived_at, archive_batch_id,
is_aggregated, aggregation_period, sample_count)
SELECT
id, scenario_id, timestamp, metric_type, metric_name,
value, unit, extra_data, NOW(), :job_id,
false, null, null
FROM {config["table"]}
WHERE id IN (SELECT id FROM batch)
ON CONFLICT (id) DO NOTHING
RETURNING id
),
deleted AS (
DELETE FROM {config["table"]}
WHERE id IN (SELECT id FROM batch)
RETURNING id
)
SELECT
(SELECT COUNT(*) FROM batch) as batch_count,
(SELECT COUNT(*) FROM archived) as archived_count,
(SELECT COUNT(*) FROM deleted) as deleted_count
"""),
{
"cutoff": cutoff_date,
"batch_size": config["batch_size"],
"job_id": self.job_id,
},
)
row = batch_result.fetchone()
batch_processed = row.batch_count
batch_archived = row.archived_count
batch_deleted = row.deleted_count
processed += batch_processed
archived += batch_archived
deleted += batch_deleted
logger.info(
f"Archived metrics batch: {batch_archived} archived ({processed}/{total_count})"
)
await session.commit()
if batch_processed == 0:
break
self.stats["metrics"]["processed"] = processed
self.stats["metrics"]["archived"] = archived
self.stats["metrics"]["deleted"] = deleted
logger.info(
f"Metrics archive completed: {archived} archived, {deleted} deleted"
)
return processed, archived, deleted
async def archive_reports(self) -> Tuple[int, int, int]:
"""Archive old reports (> 6 months) to S3."""
logger.info("Starting reports archive job...")
config = ARCHIVE_CONFIG["reports"]
cutoff_date = datetime.utcnow() - timedelta(days=config["archive_after_days"])
s3_client = None
if not self.dry_run:
try:
s3_client = boto3.client("s3")
except Exception as e:
logger.error(f"Failed to initialize S3 client: {e}")
return 0, 0, 0
async with self.session_factory() as session:
# Count records to archive
count_result = await session.execute(
text(f"""
SELECT COUNT(*), COALESCE(SUM(file_size_bytes), 0)
FROM {config["table"]}
WHERE {config["date_column"]} < :cutoff
"""),
{"cutoff": cutoff_date},
)
row = count_result.fetchone()
total_count = row[0]
total_bytes = row[1] or 0
if total_count == 0:
logger.info("No reports to archive")
return 0, 0, 0
logger.info(
f"Found {total_count} reports to archive ({total_bytes / 1024 / 1024:.2f} MB)"
)
if self.dry_run:
logger.info(f"[DRY RUN] Would archive {total_count} reports to S3")
return total_count, 0, 0
processed = 0
archived = 0
deleted = 0
bytes_archived = 0
while processed < total_count:
# Get batch of reports
batch_result = await session.execute(
text(f"""
SELECT id, scenario_id, format, file_path, file_size_bytes,
generated_by, extra_data, created_at
FROM {config["table"]}
WHERE {config["date_column"]} < :cutoff
LIMIT :batch_size
"""),
{"cutoff": cutoff_date, "batch_size": config["batch_size"]},
)
reports = batch_result.fetchall()
if not reports:
break
for report in reports:
try:
# Upload to S3
if os.path.exists(report.file_path):
s3_key = f"{config['s3_prefix']}{report.scenario_id}/{report.id}.{report.format}"
s3_client.upload_file(
report.file_path, config["s3_bucket"], s3_key
)
s3_location = f"s3://{config['s3_bucket']}/{s3_key}"
# Delete local file
os.remove(report.file_path)
deleted_files = 1
else:
s3_location = None
deleted_files = 0
# Insert archive record
await session.execute(
text(f"""
INSERT INTO {config["archive_table"]}
(id, scenario_id, format, file_path, file_size_bytes,
generated_by, extra_data, created_at, archived_at,
s3_location, deleted_locally, archive_batch_id)
VALUES
(:id, :scenario_id, :format, :file_path, :file_size,
:generated_by, :extra_data, :created_at, NOW(),
:s3_location, true, :job_id)
ON CONFLICT (id) DO NOTHING
"""),
{
"id": report.id,
"scenario_id": report.scenario_id,
"format": report.format,
"file_path": report.file_path,
"file_size": report.file_size_bytes,
"generated_by": report.generated_by,
"extra_data": report.extra_data,
"created_at": report.created_at,
"s3_location": s3_location,
"job_id": self.job_id,
},
)
# Delete from main table
await session.execute(
text(f"DELETE FROM {config['table']} WHERE id = :id"),
{"id": report.id},
)
archived += 1
deleted += deleted_files
bytes_archived += report.file_size_bytes or 0
except Exception as e:
logger.error(f"Failed to archive report {report.id}: {e}")
processed += len(reports)
await session.commit()
logger.info(
f"Archived reports batch: {archived} uploaded ({processed}/{total_count})"
)
self.stats["reports"]["processed"] = processed
self.stats["reports"]["archived"] = archived
self.stats["reports"]["deleted"] = deleted
self.stats["reports"]["bytes"] = bytes_archived
logger.info(
f"Reports archive completed: {archived} archived, {bytes_archived / 1024 / 1024:.2f} MB saved"
)
return processed, archived, deleted
async def run(self, archive_types: List[str]):
"""Run archive job for specified types."""
start_time = datetime.utcnow()
logger.info("=" * 60)
logger.info("mockupAWS Data Archive Job v1.0.0")
logger.info("=" * 60)
logger.info(f"Mode: {'DRY RUN' if self.dry_run else 'LIVE'}")
logger.info(f"Archive types: {', '.join(archive_types)}")
# Create job record
await self.create_job_record(
"all" if len(archive_types) > 1 else archive_types[0]
)
try:
# Run archive jobs
if "logs" in archive_types:
await self.archive_logs()
if "metrics" in archive_types:
await self.archive_metrics()
if "reports" in archive_types:
await self.archive_reports()
# Update job status
if not self.dry_run:
await self.update_job_status("completed")
# Print summary
duration = (datetime.utcnow() - start_time).total_seconds()
total_archived = sum(s["archived"] for s in self.stats.values())
total_bytes = sum(s["bytes"] for s in self.stats.values())
logger.info("=" * 60)
logger.info("Archive Job Summary")
logger.info("=" * 60)
logger.info(f"Duration: {duration:.1f} seconds")
logger.info(f"Total archived: {total_archived} records")
logger.info(f"Total space saved: {total_bytes / 1024 / 1024:.2f} MB")
for archive_type, stats in self.stats.items():
if stats["processed"] > 0:
logger.info(
f" {archive_type}: {stats['archived']} archived, {stats['deleted']} deleted"
)
logger.info("=" * 60)
logger.info(
"Archive job completed successfully"
if not self.dry_run
else "Dry run completed"
)
except Exception as e:
logger.error(f"Archive job failed: {e}")
if not self.dry_run:
await self.update_job_status("failed", str(e))
raise
finally:
await self.engine.dispose()
def main():
parser = argparse.ArgumentParser(description="mockupAWS Data Archive Job")
parser.add_argument(
"--dry-run", action="store_true", help="Preview without archiving"
)
parser.add_argument("--logs", action="store_true", help="Archive logs only")
parser.add_argument("--metrics", action="store_true", help="Archive metrics only")
parser.add_argument("--reports", action="store_true", help="Archive reports only")
parser.add_argument(
"--all", action="store_true", help="Archive all types (default)"
)
args = parser.parse_args()
# Determine which types to archive
types = []
if args.logs:
types.append("logs")
if args.metrics:
types.append("metrics")
if args.reports:
types.append("reports")
if not types or args.all:
types = ["logs", "metrics", "reports"]
# Run job
job = ArchiveJob(dry_run=args.dry_run)
asyncio.run(job.run(types))
if __name__ == "__main__":
main()