#!/usr/bin/env python3
"""
mockupAWS Data Archive Job v1.0.0

Nightly archive job for old data:
- Scenario logs > 1 year → archive
- Scenario metrics > 2 years → aggregate → archive
- Reports > 6 months → compress → S3

Usage:
    python scripts/archive_job.py --dry-run   # Preview what would be archived
    python scripts/archive_job.py --logs      # Archive logs only
    python scripts/archive_job.py --metrics   # Archive metrics only
    python scripts/archive_job.py --reports   # Archive reports only
    python scripts/archive_job.py --all       # Archive all (default)

Environment:
    DATABASE_URL          - PostgreSQL connection string
    S3_BUCKET             - S3 bucket for report archiving
    AWS_ACCESS_KEY_ID     - AWS credentials
    AWS_SECRET_ACCESS_KEY - AWS credentials
"""

import asyncio
import argparse
import logging
import os
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional, List, Dict, Any, Tuple
from uuid import UUID, uuid4

import boto3
from botocore.exceptions import ClientError
from sqlalchemy import select, insert, delete, func, text
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from sqlalchemy.dialects.postgresql import UUID as PGUUID

# Ensure the log directory exists BEFORE attaching the FileHandler;
# logging.basicConfig would otherwise raise FileNotFoundError on a fresh checkout.
Path("storage/logs").mkdir(parents=True, exist_ok=True)

# Configure logging: mirror everything to stdout and a timestamped log file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(f"storage/logs/archive_{datetime.now():%Y%m%d_%H%M%S}.log"),
    ],
)
logger = logging.getLogger(__name__)

# Database configuration
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql+asyncpg://postgres:postgres@localhost:5432/mockupaws",
)

# Archive configuration — one entry per data class. NOTE(review):
# reports.compress_files is declared but no compression is implemented yet
# (files are uploaded to S3 as-is); TODO confirm whether compression is required.
ARCHIVE_CONFIG = {
    "logs": {
        "table": "scenario_logs",
        "archive_table": "scenario_logs_archive",
        "date_column": "received_at",
        "archive_after_days": 365,
        "batch_size": 10000,
    },
    "metrics": {
        "table": "scenario_metrics",
        "archive_table": "scenario_metrics_archive",
        "date_column": "timestamp",
        "archive_after_days": 730,
        "aggregate_before_archive": True,
        "aggregation_period": "day",
        "batch_size": 5000,
    },
    "reports": {
        "table": "reports",
        "archive_table": "reports_archive",
        "date_column": "created_at",
        "archive_after_days": 180,
        "compress_files": True,
        "s3_bucket": os.getenv("REPORTS_ARCHIVE_BUCKET", "mockupaws-reports-archive"),
        "s3_prefix": "archived-reports/",
        "batch_size": 100,
    },
}


def _utcnow() -> datetime:
    """Return the current naive UTC datetime.

    Replaces the deprecated ``datetime.utcnow()`` (Python 3.12+) while
    preserving the naive-timestamp semantics the DB bindings rely on.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


class ArchiveJob:
    """Data archive job runner.

    Moves stale rows from live tables into ``*_archive`` tables in batches,
    optionally aggregating metrics and uploading report files to S3.
    In dry-run mode the job is strictly read-only: it counts candidate rows
    and touches neither the archive tables nor the ``archive_jobs`` tracker.
    """

    def __init__(self, dry_run: bool = False):
        self.dry_run = dry_run
        self.engine = create_async_engine(DATABASE_URL, echo=False)
        self.session_factory = async_sessionmaker(
            self.engine, class_=AsyncSession, expire_on_commit=False
        )
        # Tracking row id in archive_jobs; stays None for dry runs.
        self.job_id: Optional[UUID] = None
        # Per-type counters folded into the job record and final summary.
        self.stats: Dict[str, Any] = {
            "logs": {"processed": 0, "archived": 0, "deleted": 0, "bytes": 0},
            "metrics": {"processed": 0, "archived": 0, "deleted": 0, "bytes": 0},
            "reports": {"processed": 0, "archived": 0, "deleted": 0, "bytes": 0},
        }

    async def create_job_record(self, job_type: str) -> UUID:
        """Insert an ``archive_jobs`` tracking row and remember its id.

        Args:
            job_type: "logs", "metrics", "reports", or "all".

        Returns:
            The new job's UUID.
        """
        job_id = uuid4()
        async with self.session_factory() as session:
            await session.execute(
                text("""
                    INSERT INTO archive_jobs (id, job_type, status, started_at)
                    VALUES (:id, :type, 'running', NOW())
                """),
                {"id": job_id, "type": job_type},
            )
            await session.commit()
        self.job_id = job_id
        return job_id

    async def update_job_status(self, status: str, error_message: Optional[str] = None):
        """Persist aggregate stats and the given status on the job record.

        No-op when no job record exists (e.g. dry run). ``completed_at`` is
        only set for terminal states ('completed'/'failed').
        """
        if not self.job_id:
            return
        async with self.session_factory() as session:
            total_processed = sum(s["processed"] for s in self.stats.values())
            total_archived = sum(s["archived"] for s in self.stats.values())
            total_deleted = sum(s["deleted"] for s in self.stats.values())
            total_bytes = sum(s["bytes"] for s in self.stats.values())
            await session.execute(
                text("""
                    UPDATE archive_jobs
                    SET status = :status,
                        completed_at = CASE WHEN :status IN ('completed', 'failed')
                                            THEN NOW() ELSE NULL END,
                        records_processed = :processed,
                        records_archived = :archived,
                        records_deleted = :deleted,
                        bytes_archived = :bytes,
                        error_message = :error
                    WHERE id = :id
                """),
                {
                    "id": self.job_id,
                    "status": status,
                    "processed": total_processed,
                    "archived": total_archived,
                    "deleted": total_deleted,
                    "bytes": total_bytes,
                    "error": error_message,
                },
            )
            await session.commit()

    async def archive_logs(self) -> Tuple[int, int, int]:
        """Archive old scenario logs (> 1 year).

        Copies rows into the archive table and deletes them from the live
        table in a single atomic CTE per batch, so a row is never deleted
        without having been archived (or already present in the archive).

        Returns:
            (processed, archived, deleted) row counts.
        """
        logger.info("Starting logs archive job...")
        config = ARCHIVE_CONFIG["logs"]
        cutoff_date = _utcnow() - timedelta(days=config["archive_after_days"])

        async with self.session_factory() as session:
            # Count records to archive
            count_result = await session.execute(
                text(f"""
                    SELECT COUNT(*) FROM {config["table"]}
                    WHERE {config["date_column"]} < :cutoff
                """),
                {"cutoff": cutoff_date},
            )
            total_count = count_result.scalar()

            if total_count == 0:
                logger.info("No logs to archive")
                return 0, 0, 0

            logger.info(
                f"Found {total_count} logs to archive (older than {cutoff_date.date()})"
            )

            if self.dry_run:
                logger.info(f"[DRY RUN] Would archive {total_count} logs")
                # Record the candidate count so the final summary reflects it.
                self.stats["logs"]["processed"] = total_count
                return total_count, 0, 0

            processed = 0
            archived = 0
            deleted = 0

            while processed < total_count:
                # Archive one batch atomically: select ids, insert into the
                # archive (idempotent via ON CONFLICT), then delete originals.
                batch_result = await session.execute(
                    text(f"""
                        WITH batch AS (
                            SELECT id FROM {config["table"]}
                            WHERE {config["date_column"]} < :cutoff
                            LIMIT :batch_size
                        ),
                        archived AS (
                            INSERT INTO {config["archive_table"]}
                                (id, scenario_id, received_at, message_hash,
                                 message_preview, source, size_bytes, has_pii,
                                 token_count, sqs_blocks, archived_at, archive_batch_id)
                            SELECT id, scenario_id, received_at, message_hash,
                                   message_preview, source, size_bytes, has_pii,
                                   token_count, sqs_blocks, NOW(), :job_id
                            FROM {config["table"]}
                            WHERE id IN (SELECT id FROM batch)
                            ON CONFLICT (id) DO NOTHING
                            RETURNING id
                        ),
                        deleted AS (
                            DELETE FROM {config["table"]}
                            WHERE id IN (SELECT id FROM batch)
                            RETURNING id
                        )
                        SELECT
                            (SELECT COUNT(*) FROM batch) as batch_count,
                            (SELECT COUNT(*) FROM archived) as archived_count,
                            (SELECT COUNT(*) FROM deleted) as deleted_count
                    """),
                    {
                        "cutoff": cutoff_date,
                        "batch_size": config["batch_size"],
                        "job_id": self.job_id,
                    },
                )
                row = batch_result.fetchone()
                batch_processed = row.batch_count
                batch_archived = row.archived_count
                batch_deleted = row.deleted_count

                processed += batch_processed
                archived += batch_archived
                deleted += batch_deleted

                logger.info(
                    f"Archived batch: {batch_archived} archived, "
                    f"{batch_deleted} deleted ({processed}/{total_count})"
                )
                await session.commit()

                # Empty batch means concurrent activity drained the backlog.
                if batch_processed == 0:
                    break

            self.stats["logs"]["processed"] = processed
            self.stats["logs"]["archived"] = archived
            self.stats["logs"]["deleted"] = deleted

            logger.info(
                f"Logs archive completed: {archived} archived, {deleted} deleted"
            )
            return processed, archived, deleted

    async def aggregate_metrics(
        self, session: AsyncSession, scenario_id: UUID, cutoff_date: datetime
    ) -> int:
        """Write daily aggregates of a scenario's old metrics into the archive.

        Averages values per (day, metric_type, metric_name, unit) group and
        marks rows ``is_aggregated = true``. The raw rows are left in place —
        the caller archives/deletes them afterwards.

        Returns:
            Always 0 (the aggregate count is not tracked by callers).
        """
        await session.execute(
            text("""
                INSERT INTO scenario_metrics_archive (
                    id, scenario_id, timestamp, metric_type, metric_name,
                    value, unit, extra_data, archived_at, archive_batch_id,
                    is_aggregated, aggregation_period, sample_count
                )
                SELECT
                    uuid_generate_v4(),
                    scenario_id,
                    DATE_TRUNC('day', timestamp) as day,
                    metric_type,
                    metric_name,
                    AVG(value) as avg_value,
                    unit,
                    '{}'::jsonb as extra_data,
                    NOW(),
                    :job_id,
                    true,
                    'day',
                    COUNT(*) as sample_count
                FROM scenario_metrics
                WHERE scenario_id = :scenario_id
                  AND timestamp < :cutoff
                GROUP BY scenario_id, DATE_TRUNC('day', timestamp),
                         metric_type, metric_name, unit
                ON CONFLICT DO NOTHING
            """),
            {"scenario_id": scenario_id, "cutoff": cutoff_date, "job_id": self.job_id},
        )
        return 0

    async def archive_metrics(self) -> Tuple[int, int, int]:
        """Archive old scenario metrics (> 2 years).

        Optionally writes per-day aggregates first (see
        :meth:`aggregate_metrics`), then batch-moves the raw rows into the
        archive table exactly like :meth:`archive_logs`.

        Returns:
            (processed, archived, deleted) row counts.
        """
        logger.info("Starting metrics archive job...")
        config = ARCHIVE_CONFIG["metrics"]
        cutoff_date = _utcnow() - timedelta(days=config["archive_after_days"])

        async with self.session_factory() as session:
            # First, aggregate metrics per scenario before the raw rows go away.
            # Skipped in dry-run mode: aggregation writes to the archive table.
            if config.get("aggregate_before_archive") and not self.dry_run:
                logger.info("Aggregating metrics before archive...")
                # Get distinct scenarios with old metrics
                scenarios_result = await session.execute(
                    text(f"""
                        SELECT DISTINCT scenario_id FROM {config["table"]}
                        WHERE {config["date_column"]} < :cutoff
                    """),
                    {"cutoff": cutoff_date},
                )
                scenarios = [row[0] for row in scenarios_result.fetchall()]

                for scenario_id in scenarios:
                    await self.aggregate_metrics(session, scenario_id, cutoff_date)

                await session.commit()
                logger.info(f"Aggregated metrics for {len(scenarios)} scenarios")

            # Count records to archive (non-aggregated)
            count_result = await session.execute(
                text(f"""
                    SELECT COUNT(*) FROM {config["table"]}
                    WHERE {config["date_column"]} < :cutoff
                """),
                {"cutoff": cutoff_date},
            )
            total_count = count_result.scalar()

            if total_count == 0:
                logger.info("No metrics to archive")
                return 0, 0, 0

            logger.info(
                f"Found {total_count} metrics to archive (older than {cutoff_date.date()})"
            )

            if self.dry_run:
                logger.info(f"[DRY RUN] Would archive {total_count} metrics")
                self.stats["metrics"]["processed"] = total_count
                return total_count, 0, 0

            processed = 0
            archived = 0
            deleted = 0

            while processed < total_count:
                # Archive batch (non-aggregated raw rows)
                batch_result = await session.execute(
                    text(f"""
                        WITH batch AS (
                            SELECT id FROM {config["table"]}
                            WHERE {config["date_column"]} < :cutoff
                            LIMIT :batch_size
                        ),
                        archived AS (
                            INSERT INTO {config["archive_table"]}
                                (id, scenario_id, timestamp, metric_type, metric_name,
                                 value, unit, extra_data, archived_at, archive_batch_id,
                                 is_aggregated, aggregation_period, sample_count)
                            SELECT id, scenario_id, timestamp, metric_type, metric_name,
                                   value, unit, extra_data, NOW(), :job_id,
                                   false, null, null
                            FROM {config["table"]}
                            WHERE id IN (SELECT id FROM batch)
                            ON CONFLICT (id) DO NOTHING
                            RETURNING id
                        ),
                        deleted AS (
                            DELETE FROM {config["table"]}
                            WHERE id IN (SELECT id FROM batch)
                            RETURNING id
                        )
                        SELECT
                            (SELECT COUNT(*) FROM batch) as batch_count,
                            (SELECT COUNT(*) FROM archived) as archived_count,
                            (SELECT COUNT(*) FROM deleted) as deleted_count
                    """),
                    {
                        "cutoff": cutoff_date,
                        "batch_size": config["batch_size"],
                        "job_id": self.job_id,
                    },
                )
                row = batch_result.fetchone()
                batch_processed = row.batch_count
                batch_archived = row.archived_count
                batch_deleted = row.deleted_count

                processed += batch_processed
                archived += batch_archived
                deleted += batch_deleted

                logger.info(
                    f"Archived metrics batch: {batch_archived} archived "
                    f"({processed}/{total_count})"
                )
                await session.commit()

                if batch_processed == 0:
                    break

            self.stats["metrics"]["processed"] = processed
            self.stats["metrics"]["archived"] = archived
            self.stats["metrics"]["deleted"] = deleted

            logger.info(
                f"Metrics archive completed: {archived} archived, {deleted} deleted"
            )
            return processed, archived, deleted

    async def archive_reports(self) -> Tuple[int, int, int]:
        """Archive old reports (> 6 months) to S3.

        For each stale report: upload its file to S3 (if it still exists
        locally), delete the local file, insert an archive row pointing at
        the S3 object, then delete the live row. Per-report failures are
        logged and skipped so one bad file cannot abort the whole batch.

        Returns:
            (processed, archived, deleted) counts; ``deleted`` counts local
            files removed, not DB rows.
        """
        logger.info("Starting reports archive job...")
        config = ARCHIVE_CONFIG["reports"]
        cutoff_date = _utcnow() - timedelta(days=config["archive_after_days"])

        s3_client = None
        if not self.dry_run:
            try:
                s3_client = boto3.client("s3")
            except Exception as e:
                logger.error(f"Failed to initialize S3 client: {e}")
                return 0, 0, 0

        async with self.session_factory() as session:
            # Count records (and total bytes) to archive
            count_result = await session.execute(
                text(f"""
                    SELECT COUNT(*), COALESCE(SUM(file_size_bytes), 0)
                    FROM {config["table"]}
                    WHERE {config["date_column"]} < :cutoff
                """),
                {"cutoff": cutoff_date},
            )
            row = count_result.fetchone()
            total_count = row[0]
            total_bytes = row[1] or 0

            if total_count == 0:
                logger.info("No reports to archive")
                return 0, 0, 0

            logger.info(
                f"Found {total_count} reports to archive "
                f"({total_bytes / 1024 / 1024:.2f} MB)"
            )

            if self.dry_run:
                logger.info(f"[DRY RUN] Would archive {total_count} reports to S3")
                self.stats["reports"]["processed"] = total_count
                return total_count, 0, 0

            processed = 0
            archived = 0
            deleted = 0
            bytes_archived = 0

            while processed < total_count:
                # Get the next batch of stale reports
                batch_result = await session.execute(
                    text(f"""
                        SELECT id, scenario_id, format, file_path, file_size_bytes,
                               generated_by, extra_data, created_at
                        FROM {config["table"]}
                        WHERE {config["date_column"]} < :cutoff
                        LIMIT :batch_size
                    """),
                    {"cutoff": cutoff_date, "batch_size": config["batch_size"]},
                )
                reports = batch_result.fetchall()
                if not reports:
                    break

                for report in reports:
                    try:
                        # Upload to S3 when the file is still present locally.
                        # boto3 is blocking, so run it off the event loop.
                        if os.path.exists(report.file_path):
                            s3_key = (
                                f"{config['s3_prefix']}{report.scenario_id}/"
                                f"{report.id}.{report.format}"
                            )
                            await asyncio.to_thread(
                                s3_client.upload_file,
                                report.file_path,
                                config["s3_bucket"],
                                s3_key,
                            )
                            s3_location = f"s3://{config['s3_bucket']}/{s3_key}"
                            # Delete local file (also blocking I/O)
                            await asyncio.to_thread(os.remove, report.file_path)
                            deleted_files = 1
                        else:
                            # File already gone: archive metadata only.
                            s3_location = None
                            deleted_files = 0

                        # Insert archive record
                        await session.execute(
                            text(f"""
                                INSERT INTO {config["archive_table"]}
                                    (id, scenario_id, format, file_path, file_size_bytes,
                                     generated_by, extra_data, created_at, archived_at,
                                     s3_location, deleted_locally, archive_batch_id)
                                VALUES
                                    (:id, :scenario_id, :format, :file_path, :file_size,
                                     :generated_by, :extra_data, :created_at, NOW(),
                                     :s3_location, true, :job_id)
                                ON CONFLICT (id) DO NOTHING
                            """),
                            {
                                "id": report.id,
                                "scenario_id": report.scenario_id,
                                "format": report.format,
                                "file_path": report.file_path,
                                "file_size": report.file_size_bytes,
                                "generated_by": report.generated_by,
                                "extra_data": report.extra_data,
                                "created_at": report.created_at,
                                "s3_location": s3_location,
                                "job_id": self.job_id,
                            },
                        )

                        # Delete from main table
                        await session.execute(
                            text(f"DELETE FROM {config['table']} WHERE id = :id"),
                            {"id": report.id},
                        )

                        archived += 1
                        deleted += deleted_files
                        bytes_archived += report.file_size_bytes or 0
                    except Exception as e:
                        # Best-effort per report: log and continue with the rest.
                        logger.error(f"Failed to archive report {report.id}: {e}")

                processed += len(reports)
                await session.commit()
                logger.info(
                    f"Archived reports batch: {archived} uploaded "
                    f"({processed}/{total_count})"
                )

            self.stats["reports"]["processed"] = processed
            self.stats["reports"]["archived"] = archived
            self.stats["reports"]["deleted"] = deleted
            self.stats["reports"]["bytes"] = bytes_archived

            logger.info(
                f"Reports archive completed: {archived} archived, "
                f"{bytes_archived / 1024 / 1024:.2f} MB saved"
            )
            return processed, archived, deleted

    async def run(self, archive_types: List[str]):
        """Run the archive job for the requested types and print a summary.

        Dry runs are read-only: no job record is created or updated.
        The DB engine is disposed on all exit paths.
        """
        start_time = _utcnow()

        logger.info("=" * 60)
        logger.info("mockupAWS Data Archive Job v1.0.0")
        logger.info("=" * 60)
        logger.info(f"Mode: {'DRY RUN' if self.dry_run else 'LIVE'}")
        logger.info(f"Archive types: {', '.join(archive_types)}")

        # Create the tracking record only for live runs — a dry run must not
        # write to the database (the original left a permanent 'running' row).
        if not self.dry_run:
            await self.create_job_record(
                "all" if len(archive_types) > 1 else archive_types[0]
            )

        try:
            # Run archive jobs
            if "logs" in archive_types:
                await self.archive_logs()
            if "metrics" in archive_types:
                await self.archive_metrics()
            if "reports" in archive_types:
                await self.archive_reports()

            # Update job status (no-op when job_id is None)
            if not self.dry_run:
                await self.update_job_status("completed")

            # Print summary
            duration = (_utcnow() - start_time).total_seconds()
            total_archived = sum(s["archived"] for s in self.stats.values())
            total_bytes = sum(s["bytes"] for s in self.stats.values())

            logger.info("=" * 60)
            logger.info("Archive Job Summary")
            logger.info("=" * 60)
            logger.info(f"Duration: {duration:.1f} seconds")
            logger.info(f"Total archived: {total_archived} records")
            logger.info(f"Total space saved: {total_bytes / 1024 / 1024:.2f} MB")
            for archive_type, stats in self.stats.items():
                if stats["processed"] > 0:
                    logger.info(
                        f"  {archive_type}: {stats['archived']} archived, "
                        f"{stats['deleted']} deleted"
                    )
            logger.info("=" * 60)
            logger.info(
                "Archive job completed successfully"
                if not self.dry_run
                else "Dry run completed"
            )
        except Exception as e:
            logger.error(f"Archive job failed: {e}")
            if not self.dry_run:
                await self.update_job_status("failed", str(e))
            raise
        finally:
            await self.engine.dispose()


def main():
    """Parse CLI flags and run the archive job."""
    parser = argparse.ArgumentParser(description="mockupAWS Data Archive Job")
    parser.add_argument(
        "--dry-run", action="store_true", help="Preview without archiving"
    )
    parser.add_argument("--logs", action="store_true", help="Archive logs only")
    parser.add_argument("--metrics", action="store_true", help="Archive metrics only")
    parser.add_argument("--reports", action="store_true", help="Archive reports only")
    parser.add_argument(
        "--all", action="store_true", help="Archive all types (default)"
    )
    args = parser.parse_args()

    # Determine which types to archive; no selection (or --all) means all.
    types = []
    if args.logs:
        types.append("logs")
    if args.metrics:
        types.append("metrics")
    if args.reports:
        types.append("reports")
    if not types or args.all:
        types = ["logs", "metrics", "reports"]

    # Run job
    job = ArchiveJob(dry_run=args.dry_run)
    asyncio.run(job.run(types))


if __name__ == "__main__":
    main()