# Source: Big-Link-Man/scripts/discover_s3_buckets.py
# (405 lines, 14 KiB, Python)

"""
S3 Bucket Discovery and Registration Script
Discovers all AWS S3 buckets and allows interactive selection to register them
as SiteDeployment records for use in the site assignment pool.
"""
import os
import sys
import hashlib
import logging
from typing import List, Dict, Optional
from datetime import datetime
import boto3
import click
from botocore.exceptions import ClientError, BotoCoreError, NoCredentialsError
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.database.session import db_manager
from src.database.repositories import SiteDeploymentRepository
from src.deployment.s3_storage import map_aws_region_to_short_code
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class BucketInfo:
    """
    Lightweight record describing one discovered S3 bucket

    Attributes:
        name: Bucket name
        region: AWS region the bucket was resolved to
        creation_date: Creation timestamp, when one was supplied
        is_registered: Registration flag; always starts out False
    """

    def __init__(
        self,
        name: str,
        region: str,
        creation_date: Optional[datetime] = None,
    ) -> None:
        # A freshly built record is never marked as registered; callers set
        # the flag afterwards if they need it.
        self.is_registered = False
        self.creation_date = creation_date
        self.region = region
        self.name = name

    def __repr__(self) -> str:
        return f"BucketInfo(name={self.name}, region={self.region})"
def get_s3_client():
    """
    Create and return a boto3 S3 client

    Credentials are resolved through boto3's standard provider chain
    (environment variables, shared credentials file, IAM role, ...) instead of
    being restricted to the two AWS_* environment variables, so users who
    authenticate through ~/.aws/credentials or an instance role are no longer
    rejected before any AWS call is made.

    Returns:
        boto3 S3 client

    Raises:
        SystemExit: If no AWS credentials can be found or the client cannot
            be created
    """
    try:
        session = boto3.Session()
        # get_credentials() returns None when no provider in the chain
        # yields credentials.
        if session.get_credentials() is None:
            click.echo("Error: AWS credentials not found.", err=True)
            click.echo(
                "Please set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables, "
                "configure ~/.aws/credentials, or attach an IAM role.",
                err=True
            )
            sys.exit(1)
        return session.client('s3')
    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the exit above is not swallowed by this handler.
        click.echo(f"Error creating S3 client: {e}", err=True)
        sys.exit(1)
def _normalize_bucket_region(location: Optional[str]) -> str:
    """Translate a GetBucketLocation LocationConstraint into a region name."""
    # S3 reports us-east-1 as a null/empty constraint.
    if not location:
        return 'us-east-1'
    # Legacy constraint value still returned for some eu-west-1 buckets.
    if location == 'EU':
        return 'eu-west-1'
    return location


def list_all_buckets(s3_client) -> List[BucketInfo]:
    """
    List all S3 buckets and retrieve their metadata

    The region of each bucket is looked up individually. When that lookup
    fails with a ClientError, a warning is logged and the bucket is still
    returned with the default region 'us-east-1'.

    Args:
        s3_client: boto3 S3 client

    Returns:
        List of BucketInfo objects

    Raises:
        SystemExit: If unable to list buckets
    """
    try:
        response = s3_client.list_buckets()
        buckets = []
        for bucket in response.get('Buckets', []):
            bucket_name = bucket['Name']
            # Get bucket region
            try:
                location = s3_client.get_bucket_location(Bucket=bucket_name).get('LocationConstraint')
                region = _normalize_bucket_region(location)
            except ClientError as e:
                error_code = e.response.get('Error', {}).get('Code', '')
                if error_code == 'AccessDenied':
                    logger.warning("Access denied to get region for bucket %s, using default", bucket_name)
                else:
                    logger.warning("Could not get region for bucket %s: %s, using default", bucket_name, e)
                # Best effort: keep the bucket in the listing with the default region.
                region = 'us-east-1'
            buckets.append(BucketInfo(
                name=bucket_name,
                region=region,
                creation_date=bucket.get('CreationDate')
            ))
        return buckets
    except NoCredentialsError:
        click.echo("Error: AWS credentials not found or invalid.", err=True)
        click.echo("Please configure AWS credentials using:", err=True)
        click.echo(" - Environment variables: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY", err=True)
        click.echo(" - AWS credentials file: ~/.aws/credentials", err=True)
        click.echo(" - IAM role (if running on EC2)", err=True)
        sys.exit(1)
    except ClientError as e:
        error = e.response.get('Error', {})
        error_code = error.get('Code', '')
        error_message = error.get('Message', str(e))
        click.echo(f"Error listing buckets: {error_code} - {error_message}", err=True)
        if error_code == 'AccessDenied':
            click.echo("Insufficient permissions. Ensure your AWS credentials have s3:ListAllMyBuckets permission.", err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(f"Unexpected error listing buckets: {e}", err=True)
        sys.exit(1)
def check_existing_deployments(site_repo: SiteDeploymentRepository, bucket_names: List[str]) -> Dict[str, bool]:
    """
    Report, for each bucket name, whether a deployment already uses it

    Only deployments whose storage provider is 's3' or 's3_compatible' and
    that have a non-empty s3_bucket_name are considered.

    Args:
        site_repo: SiteDeploymentRepository instance
        bucket_names: List of bucket names to check

    Returns:
        Dictionary mapping each given bucket name to True when it is
        already registered, False otherwise
    """
    s3_providers = ('s3', 's3_compatible')
    known_buckets = set()
    for deployment in site_repo.get_all():
        if deployment.s3_bucket_name and deployment.storage_provider in s3_providers:
            known_buckets.add(deployment.s3_bucket_name)
    return {name: name in known_buckets for name in bucket_names}
def generate_unique_hostname(bucket_name: str, site_repo: SiteDeploymentRepository, attempt: int = 0) -> str:
    """
    Build a pull_zone_bcdn_hostname value that no existing deployment uses

    Candidates are "s3-<bucket>.b-cdn.net" for attempt 0 and
    "s3-<bucket>-<attempt>.b-cdn.net" afterwards; the attempt number is
    incremented until the repository reports no deployment for the candidate.

    Args:
        bucket_name: S3 bucket name
        site_repo: SiteDeploymentRepository to check for existing hostnames
        attempt: Suffix number to start from (0 means no suffix)

    Returns:
        Unique hostname string
    """
    while True:
        if attempt == 0:
            base_hostname = f"s3-{bucket_name}.b-cdn.net"
        else:
            base_hostname = f"s3-{bucket_name}-{attempt}.b-cdn.net"
        # A missing record means the candidate is free to use.
        if site_repo.get_by_bcdn_hostname(base_hostname) is None:
            return base_hostname
        attempt += 1
def generate_bucket_hash(bucket_name: str) -> int:
    """
    Derive a deterministic numeric placeholder ID from a bucket name

    The MD5 digest of the name is read as an integer and reduced modulo
    1,000,000.

    Args:
        bucket_name: S3 bucket name

    Returns:
        Integer in the range 0..999999
    """
    digest_hex = hashlib.md5(bucket_name.encode()).hexdigest()
    # The digest is a non-negative integer, so the modulo result is already
    # non-negative and needs no abs().
    return int(digest_hex, 16) % 1000000
def register_bucket(
    bucket_info: BucketInfo,
    site_repo: SiteDeploymentRepository,
    site_name: Optional[str] = None,
    custom_domain: Optional[str] = None
) -> bool:
    """
    Create a SiteDeployment record for an S3 bucket

    The bucket is skipped when a deployment with the same bucket name and
    storage provider 's3' already exists. The Bunny.net-specific fields of
    the record are filled with placeholders derived from the bucket name.

    Args:
        bucket_info: BucketInfo object with bucket details
        site_repo: SiteDeploymentRepository instance
        site_name: Optional site name (defaults to bucket name)
        custom_domain: Optional custom domain for S3

    Returns:
        True if a record was created, False if the bucket was skipped or
        creation failed
    """
    bucket_name = bucket_info.name
    bucket_region = bucket_info.region

    # Look for the first deployment that already points at this bucket.
    site = next(
        (
            candidate
            for candidate in site_repo.get_all()
            if candidate.s3_bucket_name == bucket_name and candidate.storage_provider == 's3'
        ),
        None,
    )
    if site is not None:
        click.echo(f" [SKIP] Bucket '{bucket_name}' is already registered (site_id={site.id})")
        return False

    # Placeholder values for the Bunny.net fields of the record.
    placeholder_id = generate_bucket_hash(bucket_name)
    record_fields = {
        'site_name': site_name or bucket_name,
        'storage_provider': 's3',
        'storage_zone_id': placeholder_id,
        'storage_zone_name': f"s3-{bucket_name}",
        'storage_zone_password': "s3-placeholder",
        'storage_zone_region': map_aws_region_to_short_code(bucket_region),
        'pull_zone_id': placeholder_id,
        'pull_zone_bcdn_hostname': generate_unique_hostname(bucket_name, site_repo),
        'custom_hostname': None,
        's3_bucket_name': bucket_name,
        's3_bucket_region': bucket_region,
        's3_custom_domain': custom_domain,
        's3_endpoint_url': None,
    }

    try:
        deployment = site_repo.create(**record_fields)
    except ValueError as e:
        click.echo(f" [ERROR] Failed to register bucket '{bucket_name}': {e}", err=True)
        return False
    except Exception as e:
        click.echo(f" [ERROR] Unexpected error registering bucket '{bucket_name}': {e}", err=True)
        return False

    click.echo(f" [OK] Registered bucket '{bucket_name}' as site_id={deployment.id}")
    return True
def display_buckets(buckets: List[BucketInfo], existing_map: Dict[str, bool]):
    """
    Print a numbered table of buckets with their registration status

    As a side effect, each bucket's is_registered attribute is updated from
    existing_map (buckets missing from the map count as not registered).

    Args:
        buckets: List of BucketInfo objects
        existing_map: Dictionary mapping bucket names to registration status
    """
    heavy_rule = "=" * 80

    click.echo("\n" + heavy_rule)
    click.echo("Available S3 Buckets")
    click.echo(heavy_rule)
    click.echo(f"{'#':<4} {'Bucket Name':<40} {'Region':<15} {'Status':<15}")
    click.echo("-" * 80)

    for idx, bucket in enumerate(buckets, start=1):
        bucket.is_registered = existing_map.get(bucket.name, False)
        if bucket.is_registered:
            status = "[REGISTERED]"
        else:
            status = "[AVAILABLE]"
        click.echo(f"{idx:<4} {bucket.name:<40} {bucket.region:<15} {status:<15}")

    click.echo(heavy_rule)
def _parse_selection(selection_input: str) -> List[int]:
    """
    Parse a comma-separated list of 1-based bucket numbers

    Empty entries (for example from a trailing comma) are ignored and
    duplicates are dropped, keeping the first occurrence, so the same bucket
    is never prompted for twice.

    Args:
        selection_input: Raw text typed by the user (e.g. "1,3,5")

    Returns:
        Unique 0-based indices in the order they were entered

    Raises:
        ValueError: If any non-empty entry is not an integer
    """
    tokens = (token.strip() for token in selection_input.split(','))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(int(token) - 1 for token in tokens if token))


def _prompt_and_register(bucket_info: BucketInfo, site_repo: SiteDeploymentRepository) -> bool:
    """Ask for site name / custom domain for one bucket and register it on confirmation."""
    click.echo(f"\nRegistering bucket: {bucket_info.name}")
    # Prompt for site name
    default_site_name = bucket_info.name
    site_name = click.prompt("Site name", default=default_site_name, type=str).strip()
    if not site_name:
        site_name = default_site_name
    # Prompt for custom domain (optional); an empty answer means "none"
    custom_domain = click.prompt(
        "Custom domain (optional, press Enter to skip)",
        default="",
        type=str
    ).strip() or None
    # Confirm registration
    if not click.confirm(f"Register '{bucket_info.name}' as '{site_name}'?"):
        click.echo(f" [SKIP] Registration cancelled for '{bucket_info.name}'")
        return False
    return register_bucket(bucket_info, site_repo, site_name, custom_domain)


def main():
    """
    Main entry point for the discovery script

    Lists the account's S3 buckets, shows which are already registered, lets
    the user pick buckets by number and registers each confirmed pick. The
    database session is always closed before returning or exiting.

    Raises:
        SystemExit: On database/AWS errors (status 1) or when the user
            cancels with Ctrl+C (status 0)
    """
    click.echo("S3 Bucket Discovery and Registration")
    click.echo("=" * 80)
    # Initialize database
    try:
        db_manager.initialize()
    except Exception as e:
        click.echo(f"Error initializing database: {e}", err=True)
        sys.exit(1)
    session = db_manager.get_session()
    site_repo = SiteDeploymentRepository(session)
    try:
        # Get S3 client
        click.echo("\nConnecting to AWS S3...")
        s3_client = get_s3_client()
        # List all buckets
        click.echo("Discovering S3 buckets...")
        buckets = list_all_buckets(s3_client)
        if not buckets:
            click.echo("No S3 buckets found in your AWS account.")
            return
        # Check which buckets are already registered
        existing_map = check_existing_deployments(site_repo, [b.name for b in buckets])
        # Display buckets
        display_buckets(buckets, existing_map)
        # Filter out already registered buckets
        available_buckets = [b for b in buckets if not existing_map.get(b.name, False)]
        if not available_buckets:
            click.echo("\nAll buckets are already registered.")
            return
        # Prompt for bucket selection
        click.echo(f"\nFound {len(available_buckets)} available bucket(s) to register.")
        click.echo("Enter bucket numbers to register (comma-separated, e.g., 1,3,5):")
        click.echo("Or press Enter to skip registration.")
        selection_input = click.prompt("Selection", default="", type=str).strip()
        if not selection_input:
            click.echo("No buckets selected. Exiting.")
            return
        # Parse selection
        try:
            selected_indices = _parse_selection(selection_input)
        except ValueError:
            click.echo("Error: Invalid selection format. Use comma-separated numbers (e.g., 1,3,5)", err=True)
            return
        # Validate indices; the numbers refer to the full table shown above,
        # which also lists buckets that are already registered.
        available_names = {b.name for b in available_buckets}
        valid_selections = []
        for idx in selected_indices:
            if not 0 <= idx < len(buckets):
                click.echo(f"Warning: Invalid bucket number {idx + 1}, skipping.", err=True)
            elif buckets[idx].name not in available_names:
                click.echo(f"Warning: Bucket #{idx + 1} is already registered, skipping.", err=True)
            else:
                valid_selections.append(buckets[idx])
        if not valid_selections:
            click.echo("No valid buckets selected.")
            return
        # Register selected buckets
        click.echo(f"\nRegistering {len(valid_selections)} bucket(s)...")
        success_count = 0
        for bucket_info in valid_selections:
            if _prompt_and_register(bucket_info, site_repo):
                success_count += 1
        click.echo(f"\n{'=' * 80}")
        click.echo(f"Registration complete: {success_count}/{len(valid_selections)} bucket(s) registered.")
        click.echo("=" * 80)
    except (KeyboardInterrupt, click.Abort):
        # click.prompt/click.confirm convert Ctrl+C and EOF into click.Abort;
        # since main() is not run through a click command, it must be handled
        # here or it would be reported as an unexpected error.
        click.echo("\n\nOperation cancelled by user.")
        sys.exit(0)
    except Exception as e:
        click.echo(f"\nUnexpected error: {e}", err=True)
        logger.exception("Unexpected error in bucket discovery")
        sys.exit(1)
    finally:
        session.close()


if __name__ == "__main__":
    main()