# Big-Link-Man/src/generation/colinkri_processor.py
#
# 169 lines
# 6.0 KiB
# Python

import os
import re
import random
import requests
from urllib.parse import quote
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables from a .env file (via python-dotenv) at import
# time, so that os.getenv('COLINKRI_API_KEY') can find the API key.
load_dotenv()
def _unique_destination(directory, file_path):
    """Return a not-yet-existing path in *directory* derived from *file_path*'s name."""
    candidate = directory / file_path.name
    counter = 1
    while candidate.exists():
        # Same name already archived: append _1, _2, ... before the suffix.
        candidate = directory / f"{file_path.stem}_{counter}{file_path.suffix}"
        counter += 1
    return candidate


def _move_file(file_path, directory, log_lines=None):
    """Move *file_path* into *directory*, optionally writing an error log next to it.

    Filesystem errors are reported and swallowed so that one file that cannot
    be moved does not abort the rest of the batch. Returns True when the file
    was moved, False otherwise.
    """
    destination = _unique_destination(directory, file_path)
    try:
        if log_lines:
            # The log is named after the final (collision-free) file name.
            error_log = directory / f"{destination.stem}_error.log"
            error_log.write_text(''.join(log_lines), encoding='utf-8')
        file_path.rename(destination)
    except OSError as move_error:
        print(f" ⚠️ Could not move {file_path.name} to {directory}: {move_error}")
        return False
    return True


def process_colinkri_urls(dripfeed: int = 7) -> dict:
    """
    Process URL files and send them to the Colinkri API.

    Every file in ``deployment_logs`` whose name matches
    ``YYYY-MM-DD_other_tiers_urls.txt`` is read (one URL per line, blank lines
    ignored), its URLs are shuffled and posted as a single campaign named
    after the file stem. Files are then moved to ``deployment_logs/Done`` on
    HTTP 200, or to ``deployment_logs/Failed`` (with an ``*_error.log`` for
    HTTP/exception failures) otherwise. Files without URLs go to ``Failed``
    without an error log.

    Args:
        dripfeed (int): Number of days for drip feed. Default is 7.

    Returns:
        dict: Counts under the keys 'processed', 'successful' and 'failed'.

    Raises:
        ValueError: If COLINKRI_API_KEY is not set in the environment.
    """
    api_key = os.getenv('COLINKRI_API_KEY')
    if not api_key:
        raise ValueError("COLINKRI_API_KEY not found in environment variables")

    # Setup directories (created on demand).
    base_dir = Path('deployment_logs')
    done_dir = base_dir / 'Done'
    failed_dir = base_dir / 'Failed'
    done_dir.mkdir(parents=True, exist_ok=True)
    failed_dir.mkdir(parents=True, exist_ok=True)

    # Pattern to match files: YYYY-MM-DD_other_tiers_urls.txt
    pattern = re.compile(r'^\d{4}-\d{2}-\d{2}_other_tiers_urls\.txt$')

    # iterdir() order is arbitrary; sort so dated files are handled
    # deterministically, oldest first.
    matching_files = sorted(
        f for f in base_dir.iterdir() if f.is_file() and pattern.match(f.name)
    )

    results = {'processed': 0, 'successful': 0, 'failed': 0}
    if not matching_files:
        print("No matching files found.")
        return results

    api_url = 'https://www.colinkri.com/amember/crawler/api'
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}

    for file_path in matching_files:
        results['processed'] += 1
        campaign_name = file_path.stem  # Filename without .txt
        print(f"\nProcessing: {file_path.name}")

        try:
            # utf-8-sig strips a leading BOM so it is not glued to the first
            # URL (and a BOM-only file is correctly seen as empty).
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                urls = [line.strip() for line in f if line.strip()]

            if not urls:
                print(f" ⚠️ No URLs found in {file_path.name}")
                _move_file(file_path, failed_dir)
                results['failed'] += 1
                continue

            # Randomize URL order, then join with the pipe separator.
            random.shuffle(urls)
            data = {
                'apikey': api_key,
                'campaignname': campaign_name,
                'dripfeed': str(dripfeed),
                'urls': '|'.join(urls),
            }

            print(f" 📤 Sending {len(urls)} URLs to Colinkri API...")
            # requests form-encodes `data` itself; no manual quoting needed.
            response = requests.post(api_url, data=data, headers=headers, timeout=30)
        except Exception as e:
            # Per-file boundary: any read/network error fails this file only.
            print(f" ❌ Error: {str(e)}")
            _move_file(
                file_path,
                failed_dir,
                [
                    f"Error processing {file_path.name}\n",
                    f"Exception: {str(e)}\n",
                ],
            )
            results['failed'] += 1
            continue

        # The archive moves happen outside the try block on purpose: a
        # filesystem error after a successful submission must not reclassify
        # the campaign as failed.
        if response.status_code == 200:
            print(f" ✅ Success! Campaign: {campaign_name}")
            _move_file(file_path, done_dir)
            results['successful'] += 1
        else:
            error_msg = f"API returned status code {response.status_code}: {response.text}"
            print(f" ❌ Failed: {error_msg}")
            _move_file(
                file_path,
                failed_dir,
                [
                    f"Error processing {file_path.name}\n",
                    f"Status Code: {response.status_code}\n",
                    f"Response: {response.text}\n",
                ],
            )
            results['failed'] += 1

    # Print summary
    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Files processed: {results['processed']}")
    print(f"Successful: {results['successful']}")
    print(f"Failed: {results['failed']}")
    print("=" * 50)
    return results
if __name__ == "__main__":
    # Script entry point: run a single batch using a seven-day drip feed.
    process_colinkri_urls(dripfeed=7)