mockupAWS/scripts/archive_job.py

#!/usr/bin/env python3
"""
mockupAWS Data Archive Job v1.0.0

Nightly archive job for old data:
- Scenario logs > 1 year → archive
- Scenario metrics > 2 years → aggregate → archive
- Reports > 6 months → compress → S3

Usage:
    python scripts/archive_job.py --dry-run      # Preview what would be archived
    python scripts/archive_job.py --logs         # Archive logs only
    python scripts/archive_job.py --metrics      # Archive metrics only
    python scripts/archive_job.py --reports      # Archive reports only
    python scripts/archive_job.py --all          # Archive all (default)

Environment:
    DATABASE_URL - PostgreSQL connection string
    S3_BUCKET    - S3 bucket for report archiving
    AWS_ACCESS_KEY_ID - AWS credentials
    AWS_SECRET_ACCESS_KEY - AWS credentials
"""

import asyncio
import argparse
import logging
import os
import sys
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Any, Tuple
from uuid import UUID, uuid4

import boto3
from botocore.exceptions import ClientError
from sqlalchemy import select, insert, delete, func, text
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from sqlalchemy.dialects.postgresql import UUID as PGUUID

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(f"storage/logs/archive_{datetime.now():%Y%m%d_%H%M%S}.log"),
    ],
)
logger = logging.getLogger(__name__)

# Database configuration
DATABASE_URL = os.getenv(
    "DATABASE_URL", "postgresql+asyncpg://postgres:postgres@localhost:5432/mockupaws"
)

# Archive configuration
ARCHIVE_CONFIG = {
    "logs": {
        "table": "scenario_logs",
        "archive_table": "scenario_logs_archive",
        "date_column": "received_at",
        "archive_after_days": 365,
        "batch_size": 10000,
    },
    "metrics": {
        "table": "scenario_metrics",
        "archive_table": "scenario_metrics_archive",
        "date_column": "timestamp",
        "archive_after_days": 730,
        "aggregate_before_archive": True,
        "aggregation_period": "day",
        "batch_size": 5000,
    },
    "reports": {
        "table": "reports",
        "archive_table": "reports_archive",
        "date_column": "created_at",
        "archive_after_days": 180,
        "compress_files": True,
        "s3_bucket": os.getenv("REPORTS_ARCHIVE_BUCKET", "mockupaws-reports-archive"),
        "s3_prefix": "archived-reports/",
        "batch_size": 100,
    },
}


class ArchiveJob:
    """Data archive job runner."""

    def __init__(self, dry_run: bool = False):
        self.dry_run = dry_run
        self.engine = create_async_engine(DATABASE_URL, echo=False)
        self.session_factory = async_sessionmaker(
            self.engine, class_=AsyncSession, expire_on_commit=False
        )
        self.job_id: Optional[UUID] = None
        self.stats: Dict[str, Any] = {
            "logs": {"processed": 0, "archived": 0, "deleted": 0, "bytes": 0},
            "metrics": {"processed": 0, "archived": 0, "deleted": 0, "bytes": 0},
            "reports": {"processed": 0, "archived": 0, "deleted": 0, "bytes": 0},
        }

    async def create_job_record(self, job_type: str) -> UUID:
        """Create archive job tracking record."""
        job_id = uuid4()

        async with self.session_factory() as session:
            await session.execute(
                text("""
                    INSERT INTO archive_jobs (id, job_type, status, started_at)
                    VALUES (:id, :type, 'running', NOW())
                """),
                {"id": job_id, "type": job_type},
            )
            await session.commit()

        self.job_id = job_id
        return job_id

    async def update_job_status(self, status: str, error_message: Optional[str] = None):
        """Update job status in database."""
        if not self.job_id:
            return

        async with self.session_factory() as session:
            total_processed = sum(s["processed"] for s in self.stats.values())
            total_archived = sum(s["archived"] for s in self.stats.values())
            total_deleted = sum(s["deleted"] for s in self.stats.values())
            total_bytes = sum(s["bytes"] for s in self.stats.values())

            await session.execute(
                text("""
                    UPDATE archive_jobs
                    SET status = :status,
                        completed_at = CASE WHEN :status IN ('completed', 'failed') THEN NOW() ELSE NULL END,
                        records_processed = :processed,
                        records_archived = :archived,
                        records_deleted = :deleted,
                        bytes_archived = :bytes,
                        error_message = :error
                    WHERE id = :id
                """),
                {
                    "id": self.job_id,
                    "status": status,
                    "processed": total_processed,
                    "archived": total_archived,
                    "deleted": total_deleted,
                    "bytes": total_bytes,
                    "error": error_message,
                },
            )
            await session.commit()

    async def archive_logs(self) -> Tuple[int, int, int]:
        """Archive old scenario logs (> 1 year)."""
        logger.info("Starting logs archive job...")

        config = ARCHIVE_CONFIG["logs"]
        cutoff_date = datetime.utcnow() - timedelta(days=config["archive_after_days"])

        async with self.session_factory() as session:
            # Count records to archive
            count_result = await session.execute(
                text(f"""
                    SELECT COUNT(*) FROM {config["table"]}
                    WHERE {config["date_column"]} < :cutoff
                """),
                {"cutoff": cutoff_date},
            )
            total_count = count_result.scalar()

            if total_count == 0:
                logger.info("No logs to archive")
                return 0, 0, 0

            logger.info(
                f"Found {total_count} logs to archive (older than {cutoff_date.date()})"
            )

            if self.dry_run:
                logger.info(f"[DRY RUN] Would archive {total_count} logs")
                return total_count, 0, 0

            processed = 0
            archived = 0
            deleted = 0

            while processed < total_count:
                # Archive batch
                batch_result = await session.execute(
                    text(f"""
                        WITH batch AS (
                            SELECT id FROM {config["table"]}
                            WHERE {config["date_column"]} < :cutoff
                            LIMIT :batch_size
                        ),
                        archived AS (
                            INSERT INTO {config["archive_table"]}
                                (id, scenario_id, received_at, message_hash, message_preview,
                                 source, size_bytes, has_pii, token_count, sqs_blocks,
                                 archived_at, archive_batch_id)
                            SELECT
                                id, scenario_id, received_at, message_hash, message_preview,
                                source, size_bytes, has_pii, token_count, sqs_blocks,
                                NOW(), :job_id
                            FROM {config["table"]}
                            WHERE id IN (SELECT id FROM batch)
                            ON CONFLICT (id) DO NOTHING
                            RETURNING id
                        ),
                        deleted AS (
                            DELETE FROM {config["table"]}
                            WHERE id IN (SELECT id FROM batch)
                            RETURNING id
                        )
                        SELECT
                            (SELECT COUNT(*) FROM batch) as batch_count,
                            (SELECT COUNT(*) FROM archived) as archived_count,
                            (SELECT COUNT(*) FROM deleted) as deleted_count
                    """),
                    {
                        "cutoff": cutoff_date,
                        "batch_size": config["batch_size"],
                        "job_id": self.job_id,
                    },
                )

                row = batch_result.fetchone()
                batch_processed = row.batch_count
                batch_archived = row.archived_count
                batch_deleted = row.deleted_count

                processed += batch_processed
                archived += batch_archived
                deleted += batch_deleted

                logger.info(
                    f"Archived batch: {batch_archived} archived, {batch_deleted} deleted ({processed}/{total_count})"
                )

                await session.commit()

                if batch_processed == 0:
                    break

            self.stats["logs"]["processed"] = processed
            self.stats["logs"]["archived"] = archived
            self.stats["logs"]["deleted"] = deleted

            logger.info(
                f"Logs archive completed: {archived} archived, {deleted} deleted"
            )
            return processed, archived, deleted

    async def aggregate_metrics(
        self, session: AsyncSession, scenario_id: UUID, cutoff_date: datetime
    ) -> int:
        """Aggregate metrics before archiving."""
        # Aggregate by day
        await session.execute(
            text("""
                INSERT INTO scenario_metrics_archive (
                    id, scenario_id, timestamp, metric_type, metric_name,
                    value, unit, extra_data, archived_at, archive_batch_id,
                    is_aggregated, aggregation_period, sample_count
                )
                SELECT
                    uuid_generate_v4(),
                    scenario_id,
                    DATE_TRUNC('day', timestamp) as day,
                    metric_type,
                    metric_name,
                    AVG(value) as avg_value,
                    unit,
                    '{}'::jsonb as extra_data,
                    NOW(),
                    :job_id,
                    true,
                    'day',
                    COUNT(*) as sample_count
                FROM scenario_metrics
                WHERE scenario_id = :scenario_id
                  AND timestamp < :cutoff
                GROUP BY scenario_id, DATE_TRUNC('day', timestamp), metric_type, metric_name, unit
                ON CONFLICT DO NOTHING
            """),
            {"scenario_id": scenario_id, "cutoff": cutoff_date, "job_id": self.job_id},
        )

        return 0

    async def archive_metrics(self) -> Tuple[int, int, int]:
        """Archive old scenario metrics (> 2 years)."""
        logger.info("Starting metrics archive job...")

        config = ARCHIVE_CONFIG["metrics"]
        cutoff_date = datetime.utcnow() - timedelta(days=config["archive_after_days"])

        async with self.session_factory() as session:
            # First, aggregate metrics
            if config.get("aggregate_before_archive"):
                logger.info("Aggregating metrics before archive...")

                # Get distinct scenarios with old metrics
                scenarios_result = await session.execute(
                    text(f"""
                        SELECT DISTINCT scenario_id
                        FROM {config["table"]}
                        WHERE {config["date_column"]} < :cutoff
                    """),
                    {"cutoff": cutoff_date},
                )
                scenarios = [row[0] for row in scenarios_result.fetchall()]

                for scenario_id in scenarios:
                    await self.aggregate_metrics(session, scenario_id, cutoff_date)

                await session.commit()
                logger.info(f"Aggregated metrics for {len(scenarios)} scenarios")

            # Count records to archive (non-aggregated)
            count_result = await session.execute(
                text(f"""
                    SELECT COUNT(*) FROM {config["table"]}
                    WHERE {config["date_column"]} < :cutoff
                """),
                {"cutoff": cutoff_date},
            )
            total_count = count_result.scalar()

            if total_count == 0:
                logger.info("No metrics to archive")
                return 0, 0, 0

            logger.info(
                f"Found {total_count} metrics to archive (older than {cutoff_date.date()})"
            )

            if self.dry_run:
                logger.info(f"[DRY RUN] Would archive {total_count} metrics")
                return total_count, 0, 0

            processed = 0
            archived = 0
            deleted = 0

            while processed < total_count:
                # Archive batch (non-aggregated)
                batch_result = await session.execute(
                    text(f"""
                        WITH batch AS (
                            SELECT id FROM {config["table"]}
                            WHERE {config["date_column"]} < :cutoff
                            LIMIT :batch_size
                        ),
                        archived AS (
                            INSERT INTO {config["archive_table"]}
                                (id, scenario_id, timestamp, metric_type, metric_name,
                                 value, unit, extra_data, archived_at, archive_batch_id,
                                 is_aggregated, aggregation_period, sample_count)
                            SELECT
                                id, scenario_id, timestamp, metric_type, metric_name,
                                value, unit, extra_data, NOW(), :job_id,
                                false, null, null
                            FROM {config["table"]}
                            WHERE id IN (SELECT id FROM batch)
                            ON CONFLICT (id) DO NOTHING
                            RETURNING id
                        ),
                        deleted AS (
                            DELETE FROM {config["table"]}
                            WHERE id IN (SELECT id FROM batch)
                            RETURNING id
                        )
                        SELECT
                            (SELECT COUNT(*) FROM batch) as batch_count,
                            (SELECT COUNT(*) FROM archived) as archived_count,
                            (SELECT COUNT(*) FROM deleted) as deleted_count
                    """),
                    {
                        "cutoff": cutoff_date,
                        "batch_size": config["batch_size"],
                        "job_id": self.job_id,
                    },
                )

                row = batch_result.fetchone()
                batch_processed = row.batch_count
                batch_archived = row.archived_count
                batch_deleted = row.deleted_count

                processed += batch_processed
                archived += batch_archived
                deleted += batch_deleted

                logger.info(
                    f"Archived metrics batch: {batch_archived} archived ({processed}/{total_count})"
                )

                await session.commit()

                if batch_processed == 0:
                    break

            self.stats["metrics"]["processed"] = processed
            self.stats["metrics"]["archived"] = archived
            self.stats["metrics"]["deleted"] = deleted

            logger.info(
                f"Metrics archive completed: {archived} archived, {deleted} deleted"
            )
            return processed, archived, deleted

    async def archive_reports(self) -> Tuple[int, int, int]:
        """Archive old reports (> 6 months) to S3."""
        logger.info("Starting reports archive job...")

        config = ARCHIVE_CONFIG["reports"]
        cutoff_date = datetime.utcnow() - timedelta(days=config["archive_after_days"])

        s3_client = None
        if not self.dry_run:
            try:
                s3_client = boto3.client("s3")
            except Exception as e:
                logger.error(f"Failed to initialize S3 client: {e}")
                return 0, 0, 0

        async with self.session_factory() as session:
            # Count records to archive
            count_result = await session.execute(
                text(f"""
                    SELECT COUNT(*), COALESCE(SUM(file_size_bytes), 0)
                    FROM {config["table"]}
                    WHERE {config["date_column"]} < :cutoff
                """),
                {"cutoff": cutoff_date},
            )
            row = count_result.fetchone()
            total_count = row[0]
            total_bytes = row[1] or 0

            if total_count == 0:
                logger.info("No reports to archive")
                return 0, 0, 0

            logger.info(
                f"Found {total_count} reports to archive ({total_bytes / 1024 / 1024:.2f} MB)"
            )

            if self.dry_run:
                logger.info(f"[DRY RUN] Would archive {total_count} reports to S3")
                return total_count, 0, 0

            processed = 0
            archived = 0
            deleted = 0
            bytes_archived = 0

            while processed < total_count:
                # Get batch of reports
                batch_result = await session.execute(
                    text(f"""
                        SELECT id, scenario_id, format, file_path, file_size_bytes,
                               generated_by, extra_data, created_at
                        FROM {config["table"]}
                        WHERE {config["date_column"]} < :cutoff
                        LIMIT :batch_size
                    """),
                    {"cutoff": cutoff_date, "batch_size": config["batch_size"]},
                )

                reports = batch_result.fetchall()
                if not reports:
                    break

                for report in reports:
                    try:
                        # Upload to S3
                        if os.path.exists(report.file_path):
                            s3_key = f"{config['s3_prefix']}{report.scenario_id}/{report.id}.{report.format}"

                            s3_client.upload_file(
                                report.file_path, config["s3_bucket"], s3_key
                            )

                            s3_location = f"s3://{config['s3_bucket']}/{s3_key}"

                            # Delete local file
                            os.remove(report.file_path)
                            deleted_files = 1
                        else:
                            s3_location = None
                            deleted_files = 0

                        # Insert archive record
                        await session.execute(
                            text(f"""
                                INSERT INTO {config["archive_table"]}
                                    (id, scenario_id, format, file_path, file_size_bytes,
                                     generated_by, extra_data, created_at, archived_at,
                                     s3_location, deleted_locally, archive_batch_id)
                                VALUES
                                    (:id, :scenario_id, :format, :file_path, :file_size,
                                     :generated_by, :extra_data, :created_at, NOW(),
                                     :s3_location, true, :job_id)
                                ON CONFLICT (id) DO NOTHING
                            """),
                            {
                                "id": report.id,
                                "scenario_id": report.scenario_id,
                                "format": report.format,
                                "file_path": report.file_path,
                                "file_size": report.file_size_bytes,
                                "generated_by": report.generated_by,
                                "extra_data": report.extra_data,
                                "created_at": report.created_at,
                                "s3_location": s3_location,
                                "job_id": self.job_id,
                            },
                        )

                        # Delete from main table
                        await session.execute(
                            text(f"DELETE FROM {config['table']} WHERE id = :id"),
                            {"id": report.id},
                        )

                        archived += 1
                        deleted += deleted_files
                        bytes_archived += report.file_size_bytes or 0

                    except Exception as e:
                        logger.error(f"Failed to archive report {report.id}: {e}")

                processed += len(reports)
                await session.commit()

                logger.info(
                    f"Archived reports batch: {archived} uploaded ({processed}/{total_count})"
                )

            self.stats["reports"]["processed"] = processed
            self.stats["reports"]["archived"] = archived
            self.stats["reports"]["deleted"] = deleted
            self.stats["reports"]["bytes"] = bytes_archived

            logger.info(
                f"Reports archive completed: {archived} archived, {bytes_archived / 1024 / 1024:.2f} MB saved"
            )
            return processed, archived, deleted

    async def run(self, archive_types: List[str]):
        """Run archive job for specified types."""
        start_time = datetime.utcnow()

        logger.info("=" * 60)
        logger.info("mockupAWS Data Archive Job v1.0.0")
        logger.info("=" * 60)
        logger.info(f"Mode: {'DRY RUN' if self.dry_run else 'LIVE'}")
        logger.info(f"Archive types: {', '.join(archive_types)}")

        # Create job record
        await self.create_job_record(
            "all" if len(archive_types) > 1 else archive_types[0]
        )

        try:
            # Run archive jobs
            if "logs" in archive_types:
                await self.archive_logs()

            if "metrics" in archive_types:
                await self.archive_metrics()

            if "reports" in archive_types:
                await self.archive_reports()

            # Update job status
            if not self.dry_run:
                await self.update_job_status("completed")

            # Print summary
            duration = (datetime.utcnow() - start_time).total_seconds()
            total_archived = sum(s["archived"] for s in self.stats.values())
            total_bytes = sum(s["bytes"] for s in self.stats.values())

            logger.info("=" * 60)
            logger.info("Archive Job Summary")
            logger.info("=" * 60)
            logger.info(f"Duration: {duration:.1f} seconds")
            logger.info(f"Total archived: {total_archived} records")
            logger.info(f"Total space saved: {total_bytes / 1024 / 1024:.2f} MB")

            for archive_type, stats in self.stats.items():
                if stats["processed"] > 0:
                    logger.info(
                        f"  {archive_type}: {stats['archived']} archived, {stats['deleted']} deleted"
                    )

            logger.info("=" * 60)
            logger.info(
                "Archive job completed successfully"
                if not self.dry_run
                else "Dry run completed"
            )

        except Exception as e:
            logger.error(f"Archive job failed: {e}")
            if not self.dry_run:
                await self.update_job_status("failed", str(e))
            raise
        finally:
            await self.engine.dispose()


def main():
    parser = argparse.ArgumentParser(description="mockupAWS Data Archive Job")
    parser.add_argument(
        "--dry-run", action="store_true", help="Preview without archiving"
    )
    parser.add_argument("--logs", action="store_true", help="Archive logs only")
    parser.add_argument("--metrics", action="store_true", help="Archive metrics only")
    parser.add_argument("--reports", action="store_true", help="Archive reports only")
    parser.add_argument(
        "--all", action="store_true", help="Archive all types (default)"
    )

    args = parser.parse_args()

    # Determine which types to archive
    types = []
    if args.logs:
        types.append("logs")
    if args.metrics:
        types.append("metrics")
    if args.reports:
        types.append("reports")

    if not types or args.all:
        types = ["logs", "metrics", "reports"]

    # Run job
    job = ArchiveJob(dry_run=args.dry_run)
    asyncio.run(job.run(types))


if __name__ == "__main__":
    main()