# CheddahBot/cheddahbot/tools/autocora.py

"""AutoCora job submission and result polling tools.
Submits Cora SEO report jobs to a shared folder queue and polls for results.
Jobs are JSON files written to a network share; a worker on another machine
picks them up, runs Cora, and writes result files back.
"""
from __future__ import annotations
import json
import logging
import re
import time
from datetime import UTC, datetime
from pathlib import Path
from . import tool
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _slugify(text: str) -> str:
"""Convert text to a filesystem-safe slug."""
text = text.lower().strip()
text = re.sub(r"[^\w\s-]", "", text)
text = re.sub(r"[\s_]+", "-", text)
return re.sub(r"-+", "-", text).strip("-")[:80]
def _make_job_id(keyword: str) -> str:
    """Create a unique job ID from the keyword plus a millisecond timestamp."""
    millis = int(time.time() * 1000)
    return f"job-{millis}-{_slugify(keyword)}"
def _get_clickup_client(ctx: dict):
    """Build a ClickUp client from the config stored in *ctx*."""
    from ..clickup import ClickUpClient

    clickup_cfg = ctx["config"].clickup
    return ClickUpClient(
        api_token=clickup_cfg.api_token,
        workspace_id=clickup_cfg.workspace_id,
        task_type_field_name=clickup_cfg.task_type_field_name,
    )
def _find_qualifying_tasks(client, config, target_date: str, categories: list[str]):
"""Find 'to do' tasks in cora_categories due on target_date (single day).
Used when target_date is explicitly provided.
Returns list of ClickUpTask objects.
"""
space_id = config.clickup.space_id
if not space_id:
return []
try:
dt = datetime.strptime(target_date, "%Y-%m-%d").replace(tzinfo=UTC)
except ValueError:
log.warning("Invalid target_date format: %s", target_date)
return []
day_start_ms = int(dt.timestamp() * 1000)
day_end_ms = day_start_ms + 24 * 60 * 60 * 1000
tasks = client.get_tasks_from_space(
space_id,
statuses=["to do"],
due_date_lt=day_end_ms,
)
qualifying = []
for task in tasks:
if task.task_type not in categories:
continue
if not task.due_date:
continue
try:
task_due_ms = int(task.due_date)
except (ValueError, TypeError):
continue
if task_due_ms < day_start_ms or task_due_ms >= day_end_ms:
continue
qualifying.append(task)
return qualifying
def _find_qualifying_tasks_sweep(client, config, categories: list[str]):
"""Multi-pass sweep for qualifying tasks when no explicit date is given.
Pass 1: Tasks due today
Pass 2: Overdue tasks tagged with current month (e.g. "feb26")
Pass 3: Tasks tagged with last month (e.g. "jan26"), still "to do"
Pass 4: Tasks due in next 2 days (look-ahead)
Deduplicates across passes by task ID.
Returns list of ClickUpTask objects.
"""
space_id = config.clickup.space_id
if not space_id:
return []
now = datetime.now(UTC)
today_start_ms = int(
now.replace(hour=0, minute=0, second=0, microsecond=0).timestamp() * 1000
)
today_end_ms = today_start_ms + 24 * 60 * 60 * 1000
lookahead_end_ms = today_start_ms + 3 * 24 * 60 * 60 * 1000 # +2 days
# Current and last month tags (e.g. "feb26", "jan26")
current_month_tag = now.strftime("%b%y").lower()
# Go back one month
if now.month == 1:
last_month = now.replace(year=now.year - 1, month=12)
else:
last_month = now.replace(month=now.month - 1)
last_month_tag = last_month.strftime("%b%y").lower()
# Fetch all "to do" tasks with due dates up to lookahead
all_tasks = client.get_tasks_from_space(
space_id,
statuses=["to do"],
due_date_lt=lookahead_end_ms,
)
# Filter to cora categories
cora_tasks = [t for t in all_tasks if t.task_type in categories]
seen_ids: set[str] = set()
qualifying: list = []
def _add(task):
if task.id not in seen_ids:
seen_ids.add(task.id)
qualifying.append(task)
# Pass 1: Due today
for task in cora_tasks:
if not task.due_date:
continue
try:
due_ms = int(task.due_date)
except (ValueError, TypeError):
continue
if today_start_ms <= due_ms < today_end_ms:
_add(task)
# Pass 2: Overdue + tagged with current month
for task in cora_tasks:
if not task.due_date:
continue
try:
due_ms = int(task.due_date)
except (ValueError, TypeError):
continue
if due_ms < today_start_ms and current_month_tag in task.tags:
_add(task)
# Pass 3: Tagged with last month, still "to do"
for task in cora_tasks:
if last_month_tag in task.tags:
_add(task)
# Pass 4: Look-ahead (due in next 2 days, excluding today which was pass 1)
for task in cora_tasks:
if not task.due_date:
continue
try:
due_ms = int(task.due_date)
except (ValueError, TypeError):
continue
if today_end_ms <= due_ms < lookahead_end_ms:
_add(task)
log.info(
"AutoCora sweep: %d qualifying tasks "
"(today=%d, overdue+month=%d, last_month=%d, lookahead=%d)",
len(qualifying),
sum(1 for t in qualifying if _is_due_today(t, today_start_ms, today_end_ms)),
sum(1 for t in qualifying if _is_overdue_with_tag(t, today_start_ms, current_month_tag)),
sum(1 for t in qualifying if last_month_tag in t.tags),
sum(1 for t in qualifying if _is_lookahead(t, today_end_ms, lookahead_end_ms)),
)
return qualifying
def _is_due_today(task, start_ms, end_ms) -> bool:
try:
due = int(task.due_date)
return start_ms <= due < end_ms
except (ValueError, TypeError):
return False
def _is_overdue_with_tag(task, today_start_ms, tag) -> bool:
try:
due = int(task.due_date)
return due < today_start_ms and tag in task.tags
except (ValueError, TypeError):
return False
def _is_lookahead(task, today_end_ms, lookahead_end_ms) -> bool:
try:
due = int(task.due_date)
return today_end_ms <= due < lookahead_end_ms
except (ValueError, TypeError):
return False
def _group_by_keyword(tasks, all_tasks):
"""Group tasks by normalized keyword, pulling in sibling tasks from all_tasks.
Returns dict: {keyword_lower: {"keyword": str, "url": str, "task_ids": [str]}}
Alerts list for tasks missing Keyword or IMSURL.
"""
alerts = []
groups: dict[str, dict] = {}
# Index all tasks by keyword for sibling lookup
all_by_keyword: dict[str, list] = {}
for t in all_tasks:
kw = t.custom_fields.get("Keyword", "") or ""
kw = str(kw).strip()
if kw:
all_by_keyword.setdefault(kw.lower(), []).append(t)
for task in tasks:
keyword = task.custom_fields.get("Keyword", "") or ""
keyword = str(keyword).strip()
if not keyword:
alerts.append(f"Task '{task.name}' (id={task.id}) missing Keyword field")
continue
url = task.custom_fields.get("IMSURL", "") or ""
url = str(url).strip()
if not url:
url = "https://seotoollab.com/blank.html"
kw_lower = keyword.lower()
if kw_lower not in groups:
# Collect ALL task IDs sharing this keyword
sibling_ids = set()
for sibling in all_by_keyword.get(kw_lower, []):
sibling_ids.add(sibling.id)
sibling_ids.add(task.id)
groups[kw_lower] = {
"keyword": keyword,
"url": url,
"task_ids": sorted(sibling_ids),
}
else:
# Add this task's ID if not already there
if task.id not in groups[kw_lower]["task_ids"]:
groups[kw_lower]["task_ids"].append(task.id)
groups[kw_lower]["task_ids"].sort()
return groups, alerts
# ---------------------------------------------------------------------------
# Tools
# ---------------------------------------------------------------------------
@tool(
    "submit_autocora_jobs",
    "Submit Cora SEO report jobs for ClickUp tasks. Uses a multi-pass sweep "
    "(today, overdue, last month, look-ahead) unless a specific date is given. "
    "Writes job JSON files to the AutoCora shared folder queue.",
    category="autocora",
)
def submit_autocora_jobs(target_date: str = "", ctx: dict | None = None) -> str:
    """Submit AutoCora jobs for qualifying ClickUp tasks.

    Args:
        target_date: Date to check (YYYY-MM-DD). Empty = multi-pass sweep.
        ctx: Injected context with config, db, etc.

    Returns:
        Human-readable summary of submitted keywords, skipped keywords
        (job file already queued), and alerts for unusable tasks.
    """
    if not ctx:
        return "Error: context not available"
    config = ctx["config"]
    autocora = config.autocora
    if not autocora.enabled:
        return "AutoCora is disabled in config."
    if not config.clickup.api_token:
        return "Error: ClickUp API token not configured"
    client = _get_clickup_client(ctx)
    # Find qualifying tasks — sweep or single-day
    if target_date:
        qualifying = _find_qualifying_tasks(client, config, target_date, autocora.cora_categories)
        label = target_date
    else:
        qualifying = _find_qualifying_tasks_sweep(client, config, autocora.cora_categories)
        label = "sweep"
    if not qualifying:
        return f"No qualifying tasks found ({label})."
    # Group by keyword — only siblings that also passed the sweep qualify
    groups, alerts = _group_by_keyword(qualifying, qualifying)
    if not groups and alerts:
        return "No jobs submitted.\n\n" + "\n".join(f"- {a}" for a in alerts)
    # Ensure jobs directory exists
    jobs_dir = Path(autocora.jobs_dir)
    jobs_dir.mkdir(parents=True, exist_ok=True)
    submitted = []
    skipped = []
    for kw_lower, group in groups.items():
        # Dedup by existing job file. BUGFIX: the old pattern
        # "job-*-{slug}*.json" also matched longer slugs sharing this
        # prefix (e.g. "seo" matched a queued "seo-tools" job), wrongly
        # skipping the shorter keyword. Match the slug exactly instead.
        slug = _slugify(group["keyword"])
        existing_jobs = list(jobs_dir.glob(f"job-*-{slug}.json"))
        if existing_jobs:
            skipped.append(group["keyword"])
            continue
        # Write job file (contains task_ids for the result poller)
        job_id = _make_job_id(group["keyword"])
        job_data = {
            "keyword": group["keyword"],
            "url": group["url"],
            "task_ids": group["task_ids"],
        }
        job_path = jobs_dir / f"{job_id}.json"
        job_path.write_text(json.dumps(job_data, indent=2), encoding="utf-8")
        # Move ClickUp tasks to "automation underway" so they aren't re-picked
        for tid in group["task_ids"]:
            client.update_task_status(tid, "automation underway")
        submitted.append(group["keyword"])
        log.info("Submitted AutoCora job: %s -> %s", group["keyword"], job_id)
    # Build response
    lines = [f"AutoCora submission ({label}):"]
    if submitted:
        lines.append(f"\nSubmitted {len(submitted)} job(s):")
        for kw in submitted:
            lines.append(f" - {kw}")
    if skipped:
        lines.append(f"\nSkipped {len(skipped)} (job file already exists):")
        for kw in skipped:
            lines.append(f" - {kw}")
    if alerts:
        lines.append(f"\nAlerts ({len(alerts)}):")
        for a in alerts:
            lines.append(f" - {a}")
    return "\n".join(lines)
@tool(
    "poll_autocora_results",
    "Poll the AutoCora results folder for completed Cora SEO report jobs. "
    "Scans for .result files, reads task_ids from the JSON, updates ClickUp, "
    "then moves the result file to a processed/ subfolder.",
    category="autocora",
)
def poll_autocora_results(ctx: dict | None = None) -> str:
    """Poll for AutoCora results and update ClickUp tasks.

    Scans the results folder for .result files. Each result file is JSON
    containing {status, task_ids, keyword, ...}. After processing, the
    result file is moved to results/processed/ to avoid re-processing.

    Returns:
        Human-readable summary of processed results.
    """
    if not ctx:
        return "Error: context not available"
    config = ctx["config"]
    autocora = config.autocora
    if not autocora.enabled:
        return "AutoCora is disabled in config."
    results_dir = Path(autocora.results_dir)
    if not results_dir.exists():
        return f"Results directory does not exist: {results_dir}"
    # Scan for .result files
    result_files = list(results_dir.glob("*.result"))
    if not result_files:
        return "No result files found in results folder."
    # ClickUp updates are best-effort: without a token we still drain files.
    client = None
    if config.clickup.api_token:
        client = _get_clickup_client(ctx)
    processed_dir = results_dir / "processed"
    processed = []
    for result_path in result_files:
        # ROBUSTNESS: the results folder lives on a network share, so a
        # read can fail transiently. Skip the file (leaving it in place)
        # and let the next poll retry it instead of crashing the run.
        try:
            raw = result_path.read_text(encoding="utf-8").strip()
        except OSError as e:
            log.warning("Could not read result file %s: %s", result_path.name, e)
            continue
        result_data = _parse_result(raw)
        task_ids = result_data.get("task_ids", [])
        status = result_data.get("status", "UNKNOWN")
        keyword = result_data.get("keyword", result_path.stem)
        if status == "SUCCESS":
            if client and task_ids:
                for tid in task_ids:
                    client.update_task_status(tid, autocora.success_status)
                    client.add_comment(tid, f"Cora report generated for \"{keyword}\" — ready for you to look at it.")
            processed.append(f"SUCCESS: {keyword}")
            log.info("AutoCora SUCCESS: %s", keyword)
        elif status == "FAILURE":
            reason = result_data.get("reason", "unknown error")
            if client and task_ids:
                for tid in task_ids:
                    client.update_task_status(tid, autocora.error_status)
                    client.add_comment(
                        tid, f"Cora report failed for keyword: {keyword}\nReason: {reason}"
                    )
            processed.append(f"FAILURE: {keyword} ({reason})")
            # BUGFIX: format string was "%s%s", which glued keyword and
            # reason together in the log with no separator.
            log.info("AutoCora FAILURE: %s: %s", keyword, reason)
        else:
            processed.append(f"UNKNOWN: {keyword} (status={status})")
        # Move result file to processed/ so it's not re-processed
        processed_dir.mkdir(exist_ok=True)
        try:
            result_path.rename(processed_dir / result_path.name)
        except OSError as e:
            log.warning("Could not move result file %s: %s", result_path.name, e)
    # Build response
    lines = ["AutoCora poll results:"]
    if processed:
        lines.append(f"\nProcessed {len(processed)} result(s):")
        for p in processed:
            lines.append(f" - {p}")
    return "\n".join(lines)
def _parse_result(raw: str) -> dict:
"""Parse a result file — JSON format or legacy plain text."""
# Try JSON first
try:
data = json.loads(raw)
if isinstance(data, dict):
return data
except json.JSONDecodeError:
pass
# Legacy plain text: "SUCCESS" or "FAILURE: reason"
if raw.startswith("SUCCESS"):
return {"status": "SUCCESS"}
if raw.startswith("FAILURE"):
reason = raw.split(":", 1)[1].strip() if ":" in raw else "unknown"
return {"status": "FAILURE", "reason": reason}
return {"status": "UNKNOWN", "raw": raw}