"""
Script to add robots.txt files to all storage buckets (both S3 and Bunny)

This script:
1. Queries the database for all site deployments
2. Generates a standard robots.txt that blocks SEO tools/bad bots while
   allowing search engines and AI
3. Uploads robots.txt to each bucket using the appropriate storage client
4. Handles both S3 and Bunny storage providers
5. Overwrites existing robots.txt files (idempotent - safe to run multiple times)
6. Continues processing on errors and reports failures at the end

Usage:
    python scripts/add_robots_txt_to_buckets.py                    # Actually upload
    python scripts/add_robots_txt_to_buckets.py --dry-run          # Preview only
    python scripts/add_robots_txt_to_buckets.py --provider s3      # Only S3 buckets
    python scripts/add_robots_txt_to_buckets.py --provider bunny   # Only Bunny buckets
"""

import sys
import os
import argparse
import logging
from pathlib import Path
from typing import List, Optional, Tuple

# Add project root to path so the `src.*` imports below resolve when the
# script is run directly from the scripts/ directory.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from dotenv import load_dotenv

from src.database.models import SiteDeployment
from src.deployment.storage_factory import create_storage_client
from src.deployment.bunny_storage import UploadResult

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def get_robots_txt_content() -> str:
    """
    Generate the robots.txt content

    This configuration:
    - Allows legitimate search engines (Google, Bing, Yahoo, DuckDuckGo, Baidu, Yandex)
    - Allows AI crawlers (GPT, Claude, Common Crawl, Perplexity, ByteDance)
    - Blocks SEO tools (Ahrefs, Semrush, Moz, etc.)
    - Blocks bad bots and site scrapers
    - Allows everything else by default

    Returns:
        String content of robots.txt file
    """
    return """# Allow legitimate search engines
User-agent: Googlebot
Allow: /

User-agent: Bingbot
Allow: /

User-agent: Slurp
Allow: /

User-agent: DuckDuckBot
Allow: /

User-agent: Baiduspider
Allow: /

User-agent: YandexBot
Allow: /

# Allow AI crawlers
User-agent: GPTBot
Allow: /

User-agent: ChatGPT-User
Allow: /

User-agent: CCBot
Allow: /

User-agent: Claude-Web
Allow: /

User-agent: anthropic-ai
Allow: /

User-agent: PerplexityBot
Allow: /

User-agent: Bytespider
Allow: /

# Block SEO tools
User-agent: AhrefsBot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: DotBot
Disallow: /

User-agent: Mj12bot
Disallow: /

User-agent: BLEXBot
Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: PetalBot
Disallow: /

User-agent: SeznamBot
Disallow: /

# Block common bad bots
User-agent: MauiBot
Disallow: /

User-agent: AlphaBot
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: Wget
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download
Disallow: /

# Default - allow everyone else (mostly for legitimate indexing)
User-agent: *
Allow: /
"""


def get_database_session() -> Session:
    """
    Create and return a database session

    Loads the .env file, then reads DATABASE_URL from the environment and
    builds a new engine and Session from it. The caller owns the returned
    session and is responsible for closing it.

    Returns:
        SQLAlchemy Session object

    Raises:
        ValueError: If DATABASE_URL is not set
    """
    load_dotenv()

    database_url = os.getenv("DATABASE_URL")
    if not database_url:
        raise ValueError("DATABASE_URL environment variable is required")

    engine = create_engine(database_url)
    return Session(engine)


def get_all_site_deployments(
    session: Session,
    provider_filter: Optional[str] = None
) -> List[SiteDeployment]:
    """
    Query database for all site deployments

    Args:
        session: Database session
        provider_filter: Optional filter - 's3', 'bunny', or None for all.
            Any other value applies no filter.

    Returns:
        List of SiteDeployment objects
    """
    query = session.query(SiteDeployment)

    # Apply provider filter if specified
    if provider_filter == 's3':
        # Include both 's3' and 's3_compatible'
        query = query.filter(
            SiteDeployment.storage_provider.in_(['s3', 's3_compatible'])
        )
    elif provider_filter == 'bunny':
        query = query.filter(SiteDeployment.storage_provider == 'bunny')

    return query.all()


def _site_label(site: SiteDeployment) -> str:
    """Return the hostname used to identify *site* in log output.

    Prefers the custom hostname, then the pull zone hostname; falls back to a
    placeholder so log lines never show a bare ``None``.
    """
    return site.custom_hostname or site.pull_zone_bcdn_hostname or "<no hostname>"


def upload_robots_txt(
    site: SiteDeployment,
    robots_content: str,
    dry_run: bool = False
) -> Tuple[bool, str]:
    """
    Upload robots.txt to a single site's storage bucket

    Never raises: any exception is logged and reported as a failure so the
    caller can keep processing the remaining sites.

    Args:
        site: SiteDeployment object with storage configuration
        robots_content: Content of robots.txt file
        dry_run: If True, only log what would be done without uploading

    Returns:
        Tuple of (success: bool, message: str)
    """
    # Bound before the try so the except handler can always log something,
    # even if reading attributes off `site` is what failed.
    hostname = "<unknown site>"

    try:
        # Determine hostname for logging
        hostname = _site_label(site)

        # For S3, also show bucket name
        if site.storage_provider in ['s3', 's3_compatible']:
            bucket_info = f" (bucket: {site.s3_bucket_name})" if site.s3_bucket_name else ""
        else:
            bucket_info = f" (zone: {site.storage_zone_name})"

        target = f"{site.storage_provider} - {hostname}{bucket_info}"

        if dry_run:
            logger.info(f"[DRY RUN] Would upload robots.txt to {target}")
            return True, f"Dry run - would upload to {hostname}"

        # Create appropriate storage client based on provider
        storage_client = create_storage_client(site)

        # Upload robots.txt file
        # Note: upload_file handles both str and bytes content
        result: UploadResult = storage_client.upload_file(
            site=site,
            file_path='robots.txt',  # Root level file
            content=robots_content
        )

        if result.success:
            logger.info(f"✓ Successfully uploaded robots.txt to {target}")
            return True, result.message

        logger.error(
            f"✗ Failed to upload robots.txt to {target}: {result.message}"
        )
        return False, result.message

    except Exception as e:
        error_msg = f"Exception during upload: {str(e)}"
        logger.error(f"✗ Error uploading to {hostname}: {error_msg}")
        return False, error_msg


def main():
    """
    Main script execution

    Process:
    1. Parse command line arguments
    2. Load robots.txt content
    3. Connect to database and fetch all site deployments
    4. Iterate through each site and upload robots.txt
    5. Track successes and failures
    6. Report summary at the end

    Exits with status 1 on a fatal error, or when any upload failed outside
    of dry-run mode.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Add robots.txt files to all storage buckets'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview what would be done without actually uploading'
    )
    parser.add_argument(
        '--provider',
        choices=['s3', 'bunny'],
        help='Only process specific provider (s3 or bunny). Default: process all'
    )

    args = parser.parse_args()

    # Load robots.txt content
    robots_content = get_robots_txt_content()

    logger.info("=" * 80)
    logger.info("Starting robots.txt upload to all storage buckets")
    if args.dry_run:
        logger.info("DRY RUN MODE - No actual uploads will be performed")
    if args.provider:
        logger.info(f"Provider filter: {args.provider}")
    logger.info("=" * 80)

    # Explicit sentinel so the finally block can tell whether a session was
    # ever opened (connecting may itself raise).
    session = None

    try:
        # Connect to database
        session = get_database_session()

        # Get all site deployments (optionally filtered by provider)
        sites = get_all_site_deployments(session, args.provider)

        if not sites:
            logger.warning("No site deployments found in database")
            return

        logger.info(f"Found {len(sites)} site deployment(s) to process")
        logger.info("")

        # Track results
        successes = []
        failures = []

        # Process each site
        for idx, site in enumerate(sites, 1):
            hostname = _site_label(site)
            logger.info(f"[{idx}/{len(sites)}] Processing {hostname}...")

            success, message = upload_robots_txt(site, robots_content, args.dry_run)

            if success:
                successes.append((site, message))
            else:
                failures.append((site, message))

            logger.info("")  # Blank line for readability

        # Print summary
        logger.info("=" * 80)
        logger.info("SUMMARY")
        logger.info("=" * 80)
        logger.info(f"Total processed: {len(sites)}")
        logger.info(f"Successful: {len(successes)}")
        logger.info(f"Failed: {len(failures)}")

        # Print failures if any
        if failures:
            logger.info("")
            logger.info("FAILURES:")
            for site, error_msg in failures:
                hostname = _site_label(site)
                logger.error(f" ✗ {hostname}: {error_msg}")

        logger.info("=" * 80)

        # Exit with error code if there were failures.
        # SystemExit is not an Exception subclass, so it bypasses the
        # handler below and still runs the finally block.
        if failures and not args.dry_run:
            sys.exit(1)

    except Exception as e:
        # logger.exception logs at ERROR level with the traceback attached.
        logger.exception(f"Fatal error: {str(e)}")
        sys.exit(1)

    finally:
        # Close database session if one was opened
        if session is not None:
            session.close()


if __name__ == "__main__":
    main()