# Big-Link-Man/scripts/add_robots_txt_to_buckets.py
"""
Script to add robots.txt files to all storage buckets (both S3 and Bunny)
This script:
1. Queries the database for all site deployments
2. Generates a standard robots.txt that blocks SEO tools/bad bots while allowing search engines and AI
3. Uploads robots.txt to each bucket using the appropriate storage client
4. Handles both S3 and Bunny storage providers
5. Overwrites existing robots.txt files (idempotent - safe to run multiple times)
6. Continues processing on errors and reports failures at the end
Usage:
python scripts/add_robots_txt_to_buckets.py # Actually upload
python scripts/add_robots_txt_to_buckets.py --dry-run # Preview only
python scripts/add_robots_txt_to_buckets.py --provider s3 # Only S3 buckets
python scripts/add_robots_txt_to_buckets.py --provider bunny # Only Bunny buckets
"""
import argparse
import logging
import os
import sys
from pathlib import Path
from typing import List, Optional, Tuple

# Add project root to path (must happen before the src imports below)
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from src.database.models import SiteDeployment
from src.deployment.storage_factory import create_storage_client
from src.deployment.bunny_storage import UploadResult
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def get_robots_txt_content() -> str:
    """
    Build the robots.txt content served from every bucket.
    Policy encoded here:
    - Search engines (Google, Bing, Yahoo/Slurp, DuckDuckGo, Baidu, Yandex) are allowed
    - AI crawlers (GPT, Claude, Common Crawl, Perplexity, ByteDance) are allowed
    - SEO tools (Ahrefs, Semrush, Moz/DotBot, etc.) are blocked
    - Known scrapers and site-copier bots are blocked
    - Every other user agent falls through to the default Allow
    Returns:
        String content of robots.txt file
    """
    search_engines = (
        "Googlebot", "Bingbot", "Slurp", "DuckDuckBot", "Baiduspider", "YandexBot",
    )
    ai_crawlers = (
        "GPTBot", "ChatGPT-User", "CCBot", "Claude-Web", "anthropic-ai",
        "PerplexityBot", "Bytespider",
    )
    seo_tools = (
        "AhrefsBot", "SemrushBot", "DotBot", "Mj12bot", "BLEXBot",
        "DataForSeoBot", "PetalBot", "SeznamBot",
    )
    bad_bots = (
        "MauiBot", "AlphaBot", "SiteSnagger", "WebStripper", "WebCopier",
        "WebZIP", "Teleport", "TeleportPro", "Wget", "HTTrack",
        "Microsoft.URL", "Xenu", "larbin", "libwww", "ZyBORG", "Download",
    )

    lines = []

    def _section(header, agents, directive):
        # Emit a comment header followed by one User-agent/directive pair per bot.
        lines.append(header)
        for agent in agents:
            lines.append(f"User-agent: {agent}")
            lines.append(directive)

    _section("# Allow legitimate search engines", search_engines, "Allow: /")
    _section("# Allow AI crawlers", ai_crawlers, "Allow: /")
    _section("# Block SEO tools", seo_tools, "Disallow: /")
    _section("# Block common bad bots", bad_bots, "Disallow: /")
    _section(
        "# Default - allow everyone else (mostly for legitimate indexing)",
        ("*",),
        "Allow: /",
    )

    # Trailing newline matches the original hand-written file.
    return "\n".join(lines) + "\n"
def get_database_session() -> Session:
    """
    Open a new SQLAlchemy session against the configured database.
    Loads variables from a .env file (if present) and reads DATABASE_URL.
    Returns:
        SQLAlchemy Session object
    Raises:
        ValueError: If DATABASE_URL is not set
    """
    load_dotenv()
    url = os.getenv("DATABASE_URL")
    if not url:
        raise ValueError("DATABASE_URL environment variable is required")
    # Each call builds a fresh engine and binds a new session to it.
    return Session(create_engine(url))
def get_all_site_deployments(session: Session, provider_filter: Optional[str] = None) -> List[SiteDeployment]:
    """
    Query database for all site deployments.
    Args:
        session: Database session
        provider_filter: Optional filter - 's3', 'bunny', or None for all
    Returns:
        List of SiteDeployment objects
    """
    query = session.query(SiteDeployment)
    # Apply provider filter if specified; any value other than 's3'/'bunny'
    # (including None) returns every deployment.
    if provider_filter == 's3':
        # 's3' and 's3_compatible' are both S3-backed storage providers
        query = query.filter(SiteDeployment.storage_provider.in_(['s3', 's3_compatible']))
    elif provider_filter == 'bunny':
        query = query.filter(SiteDeployment.storage_provider == 'bunny')
    return query.all()
def upload_robots_txt(
    site: SiteDeployment,
    robots_content: str,
    dry_run: bool = False
) -> Tuple[bool, str]:
    """
    Upload robots.txt to a single site's storage bucket.
    Args:
        site: SiteDeployment object with storage configuration
        robots_content: Content of robots.txt file
        dry_run: If True, only log what would be done without uploading
    Returns:
        Tuple of (success: bool, message: str)
    """
    try:
        # Prefer the custom hostname for log output, falling back to the CDN one.
        hostname = site.custom_hostname or site.pull_zone_bcdn_hostname

        # Human-readable storage location suffix for log lines
        if site.storage_provider in ('s3', 's3_compatible'):
            bucket_info = f" (bucket: {site.s3_bucket_name})" if site.s3_bucket_name else ""
        else:
            bucket_info = f" (zone: {site.storage_zone_name})"

        if dry_run:
            logger.info(
                f"[DRY RUN] Would upload robots.txt to {site.storage_provider} - "
                f"{hostname}{bucket_info}"
            )
            return True, f"Dry run - would upload to {hostname}"

        # The factory returns the right client (S3 or Bunny) for this site.
        client = create_storage_client(site)

        # upload_file handles both str and bytes content; 'robots.txt' lands
        # at the bucket root.
        result: UploadResult = client.upload_file(
            site=site,
            file_path='robots.txt',
            content=robots_content
        )

        if not result.success:
            logger.error(
                f"✗ Failed to upload robots.txt to {site.storage_provider} - "
                f"{hostname}{bucket_info}: {result.message}"
            )
            return False, result.message

        logger.info(
            f"✓ Successfully uploaded robots.txt to {site.storage_provider} - "
            f"{hostname}{bucket_info}"
        )
        return True, result.message
    except Exception as e:
        # Boundary catch: one bad site must not abort the whole batch run.
        hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
        error_msg = f"Exception during upload: {str(e)}"
        logger.error(f"✗ Error uploading to {hostname}: {error_msg}")
        return False, error_msg
def main():
    """
    Main script execution.
    Process:
    1. Parse command line arguments
    2. Load robots.txt content
    3. Connect to database and fetch all site deployments
    4. Iterate through each site and upload robots.txt
    5. Track successes and failures
    6. Report summary at the end; exit non-zero if any real upload failed
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Add robots.txt files to all storage buckets'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview what would be done without actually uploading'
    )
    parser.add_argument(
        '--provider',
        choices=['s3', 'bunny'],
        help='Only process specific provider (s3 or bunny). Default: process all'
    )
    args = parser.parse_args()

    # Load robots.txt content (same content for every bucket)
    robots_content = get_robots_txt_content()

    logger.info("=" * 80)
    logger.info("Starting robots.txt upload to all storage buckets")
    if args.dry_run:
        logger.info("DRY RUN MODE - No actual uploads will be performed")
    if args.provider:
        logger.info(f"Provider filter: {args.provider}")
    logger.info("=" * 80)

    # Initialized before the try so the finally clause can always check it,
    # instead of probing locals() for a name that may not exist.
    session = None
    try:
        # Connect to database
        session = get_database_session()
        # Get all site deployments (optionally filtered by provider)
        sites = get_all_site_deployments(session, args.provider)
        if not sites:
            logger.warning("No site deployments found in database")
            return
        logger.info(f"Found {len(sites)} site deployment(s) to process")
        logger.info("")

        # Track results as (site, message) pairs
        successes = []
        failures = []

        # Process each site; upload_robots_txt never raises, so one bad site
        # cannot stop the loop.
        for idx, site in enumerate(sites, 1):
            hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
            logger.info(f"[{idx}/{len(sites)}] Processing {hostname}...")
            success, message = upload_robots_txt(site, robots_content, args.dry_run)
            if success:
                successes.append((site, message))
            else:
                failures.append((site, message))
            logger.info("")  # Blank line for readability

        # Print summary
        logger.info("=" * 80)
        logger.info("SUMMARY")
        logger.info("=" * 80)
        logger.info(f"Total processed: {len(sites)}")
        logger.info(f"Successful: {len(successes)}")
        logger.info(f"Failed: {len(failures)}")

        # Print failures if any
        if failures:
            logger.info("")
            logger.info("FAILURES:")
            for site, error_msg in failures:
                hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
                logger.error(f"{hostname}: {error_msg}")
        logger.info("=" * 80)

        # Non-zero exit signals failures to cron/CI; dry runs always exit 0
        if failures and not args.dry_run:
            sys.exit(1)
    except Exception as e:
        logger.error(f"Fatal error: {str(e)}", exc_info=True)
        sys.exit(1)
    finally:
        # Close database session if one was opened
        if session is not None:
            session.close()
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()