# Big-Link-Man/src/generation/colinkri_processor.py

import os
import re
import random
import requests
from urllib.parse import quote
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


def _unique_destination(directory, source):
    """Return a not-yet-existing path inside *directory* for *source*.

    The original file name is used when it is free; otherwise ``_1``,
    ``_2``, ... is appended to the stem until an unused name is found.
    """
    candidate = directory / source.name
    counter = 1
    while candidate.exists():
        candidate = directory / f"{source.stem}_{counter}{source.suffix}"
        counter += 1
    return candidate


def _move_to_failed(file_path, failed_dir, log_lines=None):
    """Move *file_path* into *failed_dir*, optionally writing an error log.

    When *log_lines* is given, each entry is written on its own line to
    ``<destination stem>_error.log`` next to the moved file. The move is
    best-effort: an ``OSError`` (file already gone, permissions, ...) is
    reported on stdout instead of being raised, so one problematic file
    cannot abort the rest of the batch.
    """
    try:
        destination = _unique_destination(failed_dir, file_path)
        if log_lines:
            error_log = failed_dir / f"{destination.stem}_error.log"
            with open(error_log, 'w', encoding='utf-8') as f:
                f.writelines(f"{line}\n" for line in log_lines)
        file_path.rename(destination)
    except OSError as move_error:
        print(f"  ⚠️  Could not move {file_path.name} to {failed_dir}: {move_error}")


def process_colinkri_urls(dripfeed=7):
    """
    Process URL files and send them to Colinkri API.

    Every file in ``deployment_logs`` named ``YYYY-MM-DD_other_tiers_urls.txt``
    is read (one URL per line, blank lines ignored), its URLs are shuffled,
    joined with ``|`` and POSTed as one campaign named after the file stem.
    Files answered with HTTP 200 are moved to ``deployment_logs/Done``; empty
    files, non-200 responses and files that raise during processing are moved
    to ``deployment_logs/Failed`` (with an ``*_error.log`` for the latter two).

    Args:
        dripfeed (int): Number of days for drip feed. Default is 7.

    Returns:
        dict: Summary of processed, successful, and failed files

    Raises:
        ValueError: If COLINKRI_API_KEY is not set in the environment.
    """
    api_key = os.getenv('COLINKRI_API_KEY')
    if not api_key:
        raise ValueError("COLINKRI_API_KEY not found in environment variables")

    # Setup directories
    base_dir = Path('deployment_logs')
    done_dir = base_dir / 'Done'
    failed_dir = base_dir / 'Failed'

    # Create directories if they don't exist (this also creates base_dir)
    done_dir.mkdir(parents=True, exist_ok=True)
    failed_dir.mkdir(parents=True, exist_ok=True)

    # Pattern to match files: YYYY-MM-DD_other_tiers_urls.txt
    pattern = re.compile(r'^\d{4}-\d{2}-\d{2}_other_tiers_urls\.txt$')

    # Sorted so files are processed in a deterministic (chronological) order;
    # iterdir() itself yields entries in arbitrary order.
    matching_files = sorted(
        f for f in base_dir.iterdir()
        if f.is_file() and pattern.match(f.name)
    )

    if not matching_files:
        print("No matching files found.")
        return {'processed': 0, 'successful': 0, 'failed': 0}

    results = {'processed': 0, 'successful': 0, 'failed': 0}

    api_url = 'https://www.colinkri.com/amember/crawler/api'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded'
    }

    for file_path in matching_files:
        results['processed'] += 1
        campaign_name = file_path.stem  # Filename without .txt

        print(f"\nProcessing: {file_path.name}")

        try:
            # Read URLs from file, skipping blank lines
            with open(file_path, 'r', encoding='utf-8') as f:
                urls = [line.strip() for line in f if line.strip()]

            if not urls:
                print(f"  ⚠️  No URLs found in {file_path.name}")
                _move_to_failed(file_path, failed_dir)
                results['failed'] += 1
                continue

            # Randomize URL order, then join with the pipe separator
            random.shuffle(urls)
            data = {
                'apikey': api_key,
                'campaignname': campaign_name,
                'dripfeed': str(dripfeed),
                'urls': '|'.join(urls)
            }

            # Send request (requests form-encodes the dict for us)
            print(f"  📤 Sending {len(urls)} URLs to Colinkri API...")
            response = requests.post(api_url, data=data, headers=headers, timeout=30)

            if response.status_code == 200:
                print(f"  ✅ Success! Campaign: {campaign_name}")
                file_path.rename(_unique_destination(done_dir, file_path))
                results['successful'] += 1
            else:
                error_msg = f"API returned status code {response.status_code}: {response.text}"
                print(f"  ❌ Failed: {error_msg}")
                _move_to_failed(file_path, failed_dir, [
                    f"Error processing {file_path.name}",
                    f"Status Code: {response.status_code}",
                    f"Response: {response.text}",
                ])
                results['failed'] += 1

        except Exception as e:
            # Per-file boundary: any failure (I/O, network, ...) marks this
            # file as failed and the batch continues with the next one.
            print(f"  ❌ Error: {str(e)}")
            _move_to_failed(file_path, failed_dir, [
                f"Error processing {file_path.name}",
                f"Exception: {str(e)}",
            ])
            results['failed'] += 1

    # Print summary
    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)
    print(f"Files processed:  {results['processed']}")
    print(f"Successful:       {results['successful']}")
    print(f"Failed:           {results['failed']}")
    print("="*50)

    return results


if __name__ == '__main__':
    # Script entry point: run a single batch with a one-week drip feed.
    drip_days = 7
    process_colinkri_urls(dripfeed=drip_days)