# Big-Link-Man/scripts/add_robots_txt_to_buckets.py
"""
Script to add robots.txt files to all storage buckets (both S3 and Bunny)
This script:
1. Queries the database for all site deployments
2. Generates a standard robots.txt that blocks SEO tools/bad bots while allowing search engines and AI
3. Uploads robots.txt to each bucket using the appropriate storage client
4. Handles both S3 and Bunny storage providers
5. Overwrites existing robots.txt files (idempotent - safe to run multiple times)
6. Continues processing on errors and reports failures at the end
Usage:
python scripts/add_robots_txt_to_buckets.py # Actually upload
python scripts/add_robots_txt_to_buckets.py --dry-run # Preview only
python scripts/add_robots_txt_to_buckets.py --provider s3 # Only S3 buckets
python scripts/add_robots_txt_to_buckets.py --provider bunny # Only Bunny buckets
"""
import argparse
import logging
import os
import sys
from pathlib import Path
from typing import List, Optional, Tuple

# Add project root to path (must happen before the src imports below)
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from src.database.models import SiteDeployment
from src.deployment.storage_factory import create_storage_client
from src.deployment.bunny_storage import UploadResult
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def get_robots_txt_content() -> str:
    """
    Build the robots.txt content served from every bucket.
    Policy encoded here:
    - Search engines (Google, Bing, Yahoo/Slurp, DuckDuckGo, Baidu, Yandex) are allowed
    - AI crawlers (GPT, Claude, Common Crawl, Perplexity, ByteDance) are allowed
    - SEO tools (Ahrefs, Semrush, Moz/DotBot, etc.) are blocked
    - Known scrapers and site-copier bots are blocked
    - Every other user agent falls through to the default Allow
    Returns:
        String content of robots.txt file
    """
    search_engines = (
        "Googlebot", "Bingbot", "Slurp", "DuckDuckBot", "Baiduspider", "YandexBot",
    )
    ai_crawlers = (
        "GPTBot", "ChatGPT-User", "CCBot", "Claude-Web", "anthropic-ai",
        "PerplexityBot", "Bytespider",
    )
    seo_tools = (
        "AhrefsBot", "SemrushBot", "DotBot", "Mj12bot", "BLEXBot",
        "DataForSeoBot", "PetalBot", "SeznamBot",
    )
    bad_bots = (
        "MauiBot", "AlphaBot", "SiteSnagger", "WebStripper", "WebCopier",
        "WebZIP", "Teleport", "TeleportPro", "Wget", "HTTrack",
        "Microsoft.URL", "Xenu", "larbin", "libwww", "ZyBORG", "Download",
    )

    lines = []

    def _section(header, agents, directive):
        # Emit a comment header followed by one User-agent/directive pair per bot.
        lines.append(header)
        for agent in agents:
            lines.append(f"User-agent: {agent}")
            lines.append(directive)

    _section("# Allow legitimate search engines", search_engines, "Allow: /")
    _section("# Allow AI crawlers", ai_crawlers, "Allow: /")
    _section("# Block SEO tools", seo_tools, "Disallow: /")
    _section("# Block common bad bots", bad_bots, "Disallow: /")
    _section(
        "# Default - allow everyone else (mostly for legitimate indexing)",
        ("*",),
        "Allow: /",
    )

    # Trailing newline matches the original hand-written file.
    return "\n".join(lines) + "\n"
def get_database_session() -> Session:
    """
    Open a new SQLAlchemy session against the configured database.
    Loads variables from a .env file (if present) and reads DATABASE_URL.
    Returns:
        SQLAlchemy Session object
    Raises:
        ValueError: If DATABASE_URL is not set
    """
    load_dotenv()
    url = os.getenv("DATABASE_URL")
    if not url:
        raise ValueError("DATABASE_URL environment variable is required")
    # Each call builds a fresh engine and binds a new session to it.
    return Session(create_engine(url))
def get_all_site_deployments(session: Session, provider_filter: Optional[str] = None) -> List[SiteDeployment]:
    """
    Query database for all site deployments.
    Args:
        session: Database session
        provider_filter: Optional filter - 's3', 'bunny', or None for all
    Returns:
        List of SiteDeployment objects
    """
    query = session.query(SiteDeployment)
    # Apply provider filter if specified; any value other than 's3'/'bunny'
    # (including None) returns every deployment.
    if provider_filter == 's3':
        # 's3' and 's3_compatible' are both S3-backed storage providers
        query = query.filter(SiteDeployment.storage_provider.in_(['s3', 's3_compatible']))
    elif provider_filter == 'bunny':
        query = query.filter(SiteDeployment.storage_provider == 'bunny')
    return query.all()
def upload_robots_txt(
    site: SiteDeployment,
    robots_content: str,
    dry_run: bool = False
) -> Tuple[bool, str]:
    """
    Upload robots.txt to a single site's storage bucket.
    Args:
        site: SiteDeployment object with storage configuration
        robots_content: Content of robots.txt file
        dry_run: If True, only log what would be done without uploading
    Returns:
        Tuple of (success: bool, message: str)
    """
    try:
        # Prefer the custom hostname for log output, falling back to the CDN one.
        hostname = site.custom_hostname or site.pull_zone_bcdn_hostname

        # Human-readable storage location suffix for log lines
        if site.storage_provider in ('s3', 's3_compatible'):
            bucket_info = f" (bucket: {site.s3_bucket_name})" if site.s3_bucket_name else ""
        else:
            bucket_info = f" (zone: {site.storage_zone_name})"

        if dry_run:
            logger.info(
                f"[DRY RUN] Would upload robots.txt to {site.storage_provider} - "
                f"{hostname}{bucket_info}"
            )
            return True, f"Dry run - would upload to {hostname}"

        # The factory returns the right client (S3 or Bunny) for this site.
        client = create_storage_client(site)

        # upload_file handles both str and bytes content; 'robots.txt' lands
        # at the bucket root.
        result: UploadResult = client.upload_file(
            site=site,
            file_path='robots.txt',
            content=robots_content
        )

        if not result.success:
            logger.error(
                f"✗ Failed to upload robots.txt to {site.storage_provider} - "
                f"{hostname}{bucket_info}: {result.message}"
            )
            return False, result.message

        logger.info(
            f"✓ Successfully uploaded robots.txt to {site.storage_provider} - "
            f"{hostname}{bucket_info}"
        )
        return True, result.message
    except Exception as e:
        # Boundary catch: one bad site must not abort the whole batch run.
        hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
        error_msg = f"Exception during upload: {str(e)}"
        logger.error(f"✗ Error uploading to {hostname}: {error_msg}")
        return False, error_msg
def main():
    """
    Main script execution.
    Process:
    1. Parse command line arguments
    2. Load robots.txt content
    3. Connect to database and fetch all site deployments
    4. Iterate through each site and upload robots.txt
    5. Track successes and failures
    6. Report summary at the end; exit non-zero if any real upload failed
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Add robots.txt files to all storage buckets'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview what would be done without actually uploading'
    )
    parser.add_argument(
        '--provider',
        choices=['s3', 'bunny'],
        help='Only process specific provider (s3 or bunny). Default: process all'
    )
    args = parser.parse_args()

    # Load robots.txt content (same content for every bucket)
    robots_content = get_robots_txt_content()

    logger.info("=" * 80)
    logger.info("Starting robots.txt upload to all storage buckets")
    if args.dry_run:
        logger.info("DRY RUN MODE - No actual uploads will be performed")
    if args.provider:
        logger.info(f"Provider filter: {args.provider}")
    logger.info("=" * 80)

    # Initialized before the try so the finally clause can always check it,
    # instead of probing locals() for a name that may not exist.
    session = None
    try:
        # Connect to database
        session = get_database_session()
        # Get all site deployments (optionally filtered by provider)
        sites = get_all_site_deployments(session, args.provider)
        if not sites:
            logger.warning("No site deployments found in database")
            return
        logger.info(f"Found {len(sites)} site deployment(s) to process")
        logger.info("")

        # Track results as (site, message) pairs
        successes = []
        failures = []

        # Process each site; upload_robots_txt never raises, so one bad site
        # cannot stop the loop.
        for idx, site in enumerate(sites, 1):
            hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
            logger.info(f"[{idx}/{len(sites)}] Processing {hostname}...")
            success, message = upload_robots_txt(site, robots_content, args.dry_run)
            if success:
                successes.append((site, message))
            else:
                failures.append((site, message))
            logger.info("")  # Blank line for readability

        # Print summary
        logger.info("=" * 80)
        logger.info("SUMMARY")
        logger.info("=" * 80)
        logger.info(f"Total processed: {len(sites)}")
        logger.info(f"Successful: {len(successes)}")
        logger.info(f"Failed: {len(failures)}")

        # Print failures if any
        if failures:
            logger.info("")
            logger.info("FAILURES:")
            for site, error_msg in failures:
                hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
                logger.error(f"{hostname}: {error_msg}")
        logger.info("=" * 80)

        # Non-zero exit signals failures to cron/CI; dry runs always exit 0
        if failures and not args.dry_run:
            sys.exit(1)
    except Exception as e:
        logger.error(f"Fatal error: {str(e)}", exc_info=True)
        sys.exit(1)
    finally:
        # Close database session if one was opened
        if session is not None:
            session.close()
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()