#!/usr/bin/env python3
"""
|
|
Script to add robots.txt files to all storage buckets (both S3 and Bunny)
|
|
|
|
This script:
|
|
1. Queries the database for all site deployments
|
|
2. Generates a standard robots.txt that blocks SEO tools/bad bots while allowing search engines and AI
|
|
3. Uploads robots.txt to each bucket using the appropriate storage client
|
|
4. Handles both S3 and Bunny storage providers
|
|
5. Overwrites existing robots.txt files (idempotent - safe to run multiple times)
|
|
6. Continues processing on errors and reports failures at the end
|
|
|
|
Usage:
|
|
python scripts/add_robots_txt_to_buckets.py # Actually upload
|
|
python scripts/add_robots_txt_to_buckets.py --dry-run # Preview only
|
|
python scripts/add_robots_txt_to_buckets.py --provider s3 # Only S3 buckets
|
|
python scripts/add_robots_txt_to_buckets.py --provider bunny # Only Bunny buckets
|
|
"""
|
|
|
|
import argparse
import logging
import os
import sys
from pathlib import Path
from typing import List, Optional, Tuple
|
|
|
|
# Make the repository root importable so the `src.*` imports below resolve
# when this script is executed directly from the scripts/ directory.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
|
|
|
|
from sqlalchemy import create_engine
|
|
from sqlalchemy.orm import Session
|
|
from dotenv import load_dotenv
|
|
|
|
from src.database.models import SiteDeployment
|
|
from src.deployment.storage_factory import create_storage_client
|
|
from src.deployment.bunny_storage import UploadResult
|
|
|
|
# Configure logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_robots_txt_content() -> str:
    """
    Build the robots.txt payload that is uploaded to every bucket.

    The policy encoded here:
    - Allows legitimate search engines (Google, Bing, Yahoo, DuckDuckGo, Baidu, Yandex)
    - Allows AI crawlers (GPT, Claude, Common Crawl, Perplexity, ByteDance)
    - Blocks SEO tools (Ahrefs, Semrush, Moz, etc.)
    - Blocks bad bots and site scrapers
    - Allows everything else by default

    Returns:
        String content of robots.txt file
    """
    content = """# Allow legitimate search engines
User-agent: Googlebot
Allow: /

User-agent: Bingbot
Allow: /

User-agent: Slurp
Allow: /

User-agent: DuckDuckBot
Allow: /

User-agent: Baiduspider
Allow: /

User-agent: YandexBot
Allow: /

# Allow AI crawlers
User-agent: GPTBot
Allow: /

User-agent: ChatGPT-User
Allow: /

User-agent: CCBot
Allow: /

User-agent: Claude-Web
Allow: /

User-agent: anthropic-ai
Allow: /

User-agent: PerplexityBot
Allow: /

User-agent: Bytespider
Allow: /

# Block SEO tools
User-agent: AhrefsBot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: DotBot
Disallow: /

User-agent: Mj12bot
Disallow: /

User-agent: BLEXBot
Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: PetalBot
Disallow: /

User-agent: SeznamBot
Disallow: /

# Block common bad bots
User-agent: MauiBot
Disallow: /

User-agent: AlphaBot
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: Wget
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download
Disallow: /

# Default - allow everyone else (mostly for legitimate indexing)
User-agent: *
Allow: /
"""
    return content
|
|
|
|
|
|
def get_database_session() -> Session:
    """
    Create and return a database session.

    Reads DATABASE_URL from environment variables (.env file).

    Returns:
        SQLAlchemy Session object

    Raises:
        ValueError: If DATABASE_URL is not set
    """
    load_dotenv()

    url = os.getenv("DATABASE_URL")
    if not url:
        raise ValueError("DATABASE_URL environment variable is required")

    # One engine per invocation is fine for a one-shot script.
    return Session(create_engine(url))
|
|
|
|
|
|
def get_all_site_deployments(session: Session, provider_filter: Optional[str] = None) -> List[SiteDeployment]:
    """
    Query database for all site deployments.

    Args:
        session: Database session
        provider_filter: Optional filter - 's3', 'bunny', or None for all

    Returns:
        List of SiteDeployment objects
    """
    query = session.query(SiteDeployment)

    # Apply provider filter if specified
    if provider_filter == 's3':
        # The 's3' filter also matches 's3_compatible' deployments, since
        # both are served by the same S3 storage client.
        query = query.filter(SiteDeployment.storage_provider.in_(['s3', 's3_compatible']))
    elif provider_filter == 'bunny':
        query = query.filter(SiteDeployment.storage_provider == 'bunny')

    return query.all()
|
|
|
|
|
|
def upload_robots_txt(
    site: SiteDeployment,
    robots_content: str,
    dry_run: bool = False
) -> Tuple[bool, str]:
    """
    Upload robots.txt to a single site's storage bucket.

    Args:
        site: SiteDeployment object with storage configuration
        robots_content: Content of robots.txt file
        dry_run: If True, only log what would be done without uploading

    Returns:
        Tuple of (success: bool, message: str)
    """
    try:
        # Human-readable destination for log messages.
        hostname = site.custom_hostname or site.pull_zone_bcdn_hostname

        # S3 deployments are identified by bucket, Bunny by storage zone.
        if site.storage_provider in ['s3', 's3_compatible']:
            bucket_info = f" (bucket: {site.s3_bucket_name})" if site.s3_bucket_name else ""
        else:
            bucket_info = f" (zone: {site.storage_zone_name})"

        target = f"{site.storage_provider} - {hostname}{bucket_info}"

        if dry_run:
            logger.info(f"[DRY RUN] Would upload robots.txt to {target}")
            return True, f"Dry run - would upload to {hostname}"

        # The factory picks the S3 or Bunny client from the site's config.
        storage_client = create_storage_client(site)

        # upload_file handles both str and bytes content; robots.txt goes
        # at the bucket root so it is served from the site root.
        result: UploadResult = storage_client.upload_file(
            site=site,
            file_path='robots.txt',
            content=robots_content
        )

        if result.success:
            logger.info(f"✓ Successfully uploaded robots.txt to {target}")
            return True, result.message

        logger.error(f"✗ Failed to upload robots.txt to {target}: {result.message}")
        return False, result.message

    except Exception as e:
        # Recompute hostname: the exception may predate the assignment above.
        hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
        error_msg = f"Exception during upload: {str(e)}"
        logger.error(f"✗ Error uploading to {hostname}: {error_msg}")
        return False, error_msg
|
|
|
|
|
|
def main():
    """
    Main script execution.

    Process:
    1. Parse command line arguments
    2. Load robots.txt content
    3. Connect to database and fetch all site deployments
    4. Iterate through each site and upload robots.txt
    5. Track successes and failures
    6. Report summary at the end
    """
    parser = argparse.ArgumentParser(
        description='Add robots.txt files to all storage buckets'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview what would be done without actually uploading'
    )
    parser.add_argument(
        '--provider',
        choices=['s3', 'bunny'],
        help='Only process specific provider (s3 or bunny). Default: process all'
    )
    args = parser.parse_args()

    robots_content = get_robots_txt_content()

    banner = "=" * 80
    logger.info(banner)
    logger.info("Starting robots.txt upload to all storage buckets")
    if args.dry_run:
        logger.info("DRY RUN MODE - No actual uploads will be performed")
    if args.provider:
        logger.info(f"Provider filter: {args.provider}")
    logger.info(banner)

    session = None
    try:
        session = get_database_session()
        sites = get_all_site_deployments(session, args.provider)

        if not sites:
            logger.warning("No site deployments found in database")
            return

        logger.info(f"Found {len(sites)} site deployment(s) to process")
        logger.info("")

        successes = []
        failures = []

        for idx, site in enumerate(sites, 1):
            hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
            logger.info(f"[{idx}/{len(sites)}] Processing {hostname}...")

            success, message = upload_robots_txt(site, robots_content, args.dry_run)
            (successes if success else failures).append((site, message))

            logger.info("")  # Blank line for readability

        # Summary of the whole run.
        logger.info(banner)
        logger.info("SUMMARY")
        logger.info(banner)
        logger.info(f"Total processed: {len(sites)}")
        logger.info(f"Successful: {len(successes)}")
        logger.info(f"Failed: {len(failures)}")

        if failures:
            logger.info("")
            logger.info("FAILURES:")
            for site, error_msg in failures:
                hostname = site.custom_hostname or site.pull_zone_bcdn_hostname
                logger.error(f"  ✗ {hostname}: {error_msg}")

        logger.info(banner)

        # Non-zero exit so CI/cron notice partial failures; SystemExit is
        # not caught by the `except Exception` below.
        if failures and not args.dry_run:
            sys.exit(1)

    except Exception as e:
        logger.error(f"Fatal error: {str(e)}", exc_info=True)
        sys.exit(1)
    finally:
        # Close the session only if it was successfully created.
        if session is not None:
            session.close()


if __name__ == "__main__":
    main()
|