# CheddahBot/cheddahbot/tools/autocora.py

"""AutoCora job submission and result polling tools.
Submits Cora SEO report jobs to a shared folder queue and polls for results.
Jobs are JSON files written to a network share; a worker on another machine
picks them up, runs Cora, and writes result files back.
"""
from __future__ import annotations
import json
import logging
import re
import time
from datetime import UTC, datetime
from pathlib import Path
from . import tool
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _slugify(text: str) -> str:
"""Convert text to a filesystem-safe slug."""
text = text.lower().strip()
text = re.sub(r"[^\w\s-]", "", text)
text = re.sub(r"[\s_]+", "-", text)
return re.sub(r"-+", "-", text).strip("-")[:80]
def _make_job_id(keyword: str) -> str:
    """Create a unique job ID from the keyword plus a millisecond timestamp."""
    millis = int(time.time() * 1000)
    return f"job-{millis}-{_slugify(keyword)}"
def _get_clickup_client(ctx: dict):
    """Build a ClickUp client from the config stored in *ctx*."""
    from ..clickup import ClickUpClient

    clickup_cfg = ctx["config"].clickup
    return ClickUpClient(
        api_token=clickup_cfg.api_token,
        workspace_id=clickup_cfg.workspace_id,
        task_type_field_name=clickup_cfg.task_type_field_name,
    )
def _find_qualifying_tasks(client, config, target_date: str, categories: list[str]):
"""Find 'to do' tasks in cora_categories due on target_date (single day).
Used when target_date is explicitly provided.
Returns list of ClickUpTask objects.
"""
space_id = config.clickup.space_id
if not space_id:
return []
try:
dt = datetime.strptime(target_date, "%Y-%m-%d").replace(tzinfo=UTC)
except ValueError:
log.warning("Invalid target_date format: %s", target_date)
return []
day_start_ms = int(dt.timestamp() * 1000)
day_end_ms = day_start_ms + 24 * 60 * 60 * 1000
tasks = client.get_tasks_from_space(
space_id,
statuses=["to do"],
due_date_lt=day_end_ms,
)
qualifying = []
for task in tasks:
if task.task_type not in categories:
continue
if not task.due_date:
continue
try:
task_due_ms = int(task.due_date)
except (ValueError, TypeError):
continue
if task_due_ms < day_start_ms or task_due_ms >= day_end_ms:
continue
qualifying.append(task)
return qualifying
def _find_qualifying_tasks_sweep(client, config, categories: list[str]):
"""Multi-pass sweep for qualifying tasks when no explicit date is given.
Pass 1: Tasks due today
Pass 2: Overdue tasks tagged with current month (e.g. "feb26")
Pass 3: Tasks tagged with last month (e.g. "jan26"), still "to do"
Pass 4: Tasks due in next 2 days (look-ahead)
Deduplicates across passes by task ID.
Returns list of ClickUpTask objects.
"""
space_id = config.clickup.space_id
if not space_id:
return []
now = datetime.now(UTC)
today_start_ms = int(
now.replace(hour=0, minute=0, second=0, microsecond=0).timestamp() * 1000
)
today_end_ms = today_start_ms + 24 * 60 * 60 * 1000
lookahead_end_ms = today_start_ms + 3 * 24 * 60 * 60 * 1000 # +2 days
# Current and last month tags (e.g. "feb26", "jan26")
current_month_tag = now.strftime("%b%y").lower()
# Go back one month
if now.month == 1:
last_month = now.replace(year=now.year - 1, month=12)
else:
last_month = now.replace(month=now.month - 1)
last_month_tag = last_month.strftime("%b%y").lower()
# Fetch all "to do" tasks with due dates up to lookahead
all_tasks = client.get_tasks_from_space(
space_id,
statuses=["to do"],
due_date_lt=lookahead_end_ms,
)
# Filter to cora categories
cora_tasks = [t for t in all_tasks if t.task_type in categories]
seen_ids: set[str] = set()
qualifying: list = []
def _add(task):
if task.id not in seen_ids:
seen_ids.add(task.id)
qualifying.append(task)
# Pass 1: Due today
for task in cora_tasks:
if not task.due_date:
continue
try:
due_ms = int(task.due_date)
except (ValueError, TypeError):
continue
if today_start_ms <= due_ms < today_end_ms:
_add(task)
# Pass 2: Overdue + tagged with current month
for task in cora_tasks:
if not task.due_date:
continue
try:
due_ms = int(task.due_date)
except (ValueError, TypeError):
continue
if due_ms < today_start_ms and current_month_tag in task.tags:
_add(task)
# Pass 3: Tagged with last month, still "to do"
for task in cora_tasks:
if last_month_tag in task.tags:
_add(task)
# Pass 4: Look-ahead (due in next 2 days, excluding today which was pass 1)
for task in cora_tasks:
if not task.due_date:
continue
try:
due_ms = int(task.due_date)
except (ValueError, TypeError):
continue
if today_end_ms <= due_ms < lookahead_end_ms:
_add(task)
log.info(
"AutoCora sweep: %d qualifying tasks "
"(today=%d, overdue+month=%d, last_month=%d, lookahead=%d)",
len(qualifying),
sum(1 for t in qualifying if _is_due_today(t, today_start_ms, today_end_ms)),
sum(1 for t in qualifying if _is_overdue_with_tag(t, today_start_ms, current_month_tag)),
sum(1 for t in qualifying if last_month_tag in t.tags),
sum(1 for t in qualifying if _is_lookahead(t, today_end_ms, lookahead_end_ms)),
)
return qualifying
def _is_due_today(task, start_ms, end_ms) -> bool:
try:
due = int(task.due_date)
return start_ms <= due < end_ms
except (ValueError, TypeError):
return False
def _is_overdue_with_tag(task, today_start_ms, tag) -> bool:
try:
due = int(task.due_date)
return due < today_start_ms and tag in task.tags
except (ValueError, TypeError):
return False
def _is_lookahead(task, today_end_ms, lookahead_end_ms) -> bool:
try:
due = int(task.due_date)
return today_end_ms <= due < lookahead_end_ms
except (ValueError, TypeError):
return False
def _group_by_keyword(tasks, all_tasks):
"""Group tasks by normalized keyword, pulling in sibling tasks from all_tasks.
Returns dict: {keyword_lower: {"keyword": str, "url": str, "task_ids": [str]}}
Alerts list for tasks missing Keyword or IMSURL.
"""
alerts = []
groups: dict[str, dict] = {}
# Index all tasks by keyword for sibling lookup
all_by_keyword: dict[str, list] = {}
for t in all_tasks:
kw = t.custom_fields.get("Keyword", "") or ""
kw = str(kw).strip()
if kw:
all_by_keyword.setdefault(kw.lower(), []).append(t)
for task in tasks:
keyword = task.custom_fields.get("Keyword", "") or ""
keyword = str(keyword).strip()
if not keyword:
alerts.append(f"Task '{task.name}' (id={task.id}) missing Keyword field")
continue
url = task.custom_fields.get("IMSURL", "") or ""
url = str(url).strip()
if not url:
url = "https://seotoollab.com/blank.html"
kw_lower = keyword.lower()
if kw_lower not in groups:
# Collect ALL task IDs sharing this keyword
sibling_ids = set()
for sibling in all_by_keyword.get(kw_lower, []):
sibling_ids.add(sibling.id)
sibling_ids.add(task.id)
groups[kw_lower] = {
"keyword": keyword,
"url": url,
"task_ids": sorted(sibling_ids),
}
else:
# Add this task's ID if not already there
if task.id not in groups[kw_lower]["task_ids"]:
groups[kw_lower]["task_ids"].append(task.id)
groups[kw_lower]["task_ids"].sort()
return groups, alerts
# ---------------------------------------------------------------------------
# Tools
# ---------------------------------------------------------------------------
@tool(
    "submit_autocora_jobs",
    "Submit Cora SEO report jobs for ClickUp tasks. Uses a multi-pass sweep "
    "(today, overdue, last month, look-ahead) unless a specific date is given. "
    "Writes job JSON files to the AutoCora shared folder queue.",
    category="autocora",
)
def submit_autocora_jobs(target_date: str = "", ctx: dict | None = None) -> str:
    """Submit AutoCora jobs for qualifying ClickUp tasks.

    Args:
        target_date: Date to check (YYYY-MM-DD). Empty = multi-pass sweep.
        ctx: Injected context with config, db, etc.

    Returns:
        Human-readable summary of submitted keywords, skipped keywords
        (job file already queued), and alerts for unusable tasks.
    """
    if not ctx:
        return "Error: context not available"
    config = ctx["config"]
    autocora = config.autocora
    if not autocora.enabled:
        return "AutoCora is disabled in config."
    if not config.clickup.api_token:
        return "Error: ClickUp API token not configured"
    client = _get_clickup_client(ctx)
    # Find qualifying tasks — sweep or single-day
    if target_date:
        qualifying = _find_qualifying_tasks(client, config, target_date, autocora.cora_categories)
        label = target_date
    else:
        qualifying = _find_qualifying_tasks_sweep(client, config, autocora.cora_categories)
        label = "sweep"
    if not qualifying:
        return f"No qualifying tasks found ({label})."
    # Group by keyword — only siblings that also passed the sweep qualify
    groups, alerts = _group_by_keyword(qualifying, qualifying)
    if not groups and alerts:
        return "No jobs submitted.\n\n" + "\n".join(f"- {a}" for a in alerts)
    # Ensure jobs directory exists
    jobs_dir = Path(autocora.jobs_dir)
    jobs_dir.mkdir(parents=True, exist_ok=True)
    submitted = []
    skipped = []
    for kw_lower, group in groups.items():
        # Dedup by existing job file. BUGFIX: the old pattern
        # "job-*-{slug}*.json" also matched longer slugs sharing this
        # prefix (e.g. "seo" matched a queued "seo-tools" job), wrongly
        # skipping the shorter keyword. Match the slug exactly instead.
        slug = _slugify(group["keyword"])
        existing_jobs = list(jobs_dir.glob(f"job-*-{slug}.json"))
        if existing_jobs:
            skipped.append(group["keyword"])
            continue
        # Write job file (contains task_ids for the result poller)
        job_id = _make_job_id(group["keyword"])
        job_data = {
            "keyword": group["keyword"],
            "url": group["url"],
            "task_ids": group["task_ids"],
        }
        job_path = jobs_dir / f"{job_id}.json"
        job_path.write_text(json.dumps(job_data, indent=2), encoding="utf-8")
        # Move ClickUp tasks to "automation underway" so they aren't re-picked
        for tid in group["task_ids"]:
            client.update_task_status(tid, "automation underway")
        submitted.append(group["keyword"])
        log.info("Submitted AutoCora job: %s -> %s", group["keyword"], job_id)
    # Build response
    lines = [f"AutoCora submission ({label}):"]
    if submitted:
        lines.append(f"\nSubmitted {len(submitted)} job(s):")
        for kw in submitted:
            lines.append(f" - {kw}")
    if skipped:
        lines.append(f"\nSkipped {len(skipped)} (job file already exists):")
        for kw in skipped:
            lines.append(f" - {kw}")
    if alerts:
        lines.append(f"\nAlerts ({len(alerts)}):")
        for a in alerts:
            lines.append(f" - {a}")
    return "\n".join(lines)
@tool(
    "poll_autocora_results",
    "Poll the AutoCora results folder for completed Cora SEO report jobs. "
    "Scans for .result files, reads task_ids from the JSON, updates ClickUp, "
    "then moves the result file to a processed/ subfolder.",
    category="autocora",
)
def poll_autocora_results(ctx: dict | None = None) -> str:
    """Poll for AutoCora results and update ClickUp tasks.

    Scans the results folder for .result files. Each result file is JSON
    containing {status, task_ids, keyword, ...}. After processing, the
    result file is moved to results/processed/ to avoid re-processing.

    Returns:
        Human-readable summary of processed results.
    """
    if not ctx:
        return "Error: context not available"
    config = ctx["config"]
    autocora = config.autocora
    if not autocora.enabled:
        return "AutoCora is disabled in config."
    results_dir = Path(autocora.results_dir)
    if not results_dir.exists():
        return f"Results directory does not exist: {results_dir}"
    # Scan for .result files
    result_files = list(results_dir.glob("*.result"))
    if not result_files:
        return "No result files found in results folder."
    # ClickUp updates are best-effort: without a token we still drain files.
    client = None
    if config.clickup.api_token:
        client = _get_clickup_client(ctx)
    processed_dir = results_dir / "processed"
    processed = []
    for result_path in result_files:
        # ROBUSTNESS: the results folder lives on a network share, so a
        # read can fail transiently. Skip the file (leaving it in place)
        # and let the next poll retry it instead of crashing the run.
        try:
            raw = result_path.read_text(encoding="utf-8").strip()
        except OSError as e:
            log.warning("Could not read result file %s: %s", result_path.name, e)
            continue
        result_data = _parse_result(raw)
        task_ids = result_data.get("task_ids", [])
        status = result_data.get("status", "UNKNOWN")
        keyword = result_data.get("keyword", result_path.stem)
        if status == "SUCCESS":
            if client and task_ids:
                for tid in task_ids:
                    client.update_task_status(tid, autocora.success_status)
                    client.add_comment(tid, f"Cora report generated for \"{keyword}\" — ready for you to look at it.")
            processed.append(f"SUCCESS: {keyword}")
            log.info("AutoCora SUCCESS: %s", keyword)
        elif status == "FAILURE":
            reason = result_data.get("reason", "unknown error")
            if client and task_ids:
                for tid in task_ids:
                    client.update_task_status(tid, autocora.error_status)
                    client.add_comment(
                        tid, f"Cora report failed for keyword: {keyword}\nReason: {reason}"
                    )
            processed.append(f"FAILURE: {keyword} ({reason})")
            # BUGFIX: format string was "%s%s", which glued keyword and
            # reason together in the log with no separator.
            log.info("AutoCora FAILURE: %s: %s", keyword, reason)
        else:
            processed.append(f"UNKNOWN: {keyword} (status={status})")
        # Move result file to processed/ so it's not re-processed
        processed_dir.mkdir(exist_ok=True)
        try:
            result_path.rename(processed_dir / result_path.name)
        except OSError as e:
            log.warning("Could not move result file %s: %s", result_path.name, e)
    # Build response
    lines = ["AutoCora poll results:"]
    if processed:
        lines.append(f"\nProcessed {len(processed)} result(s):")
        for p in processed:
            lines.append(f" - {p}")
    return "\n".join(lines)
def _parse_result(raw: str) -> dict:
"""Parse a result file — JSON format or legacy plain text."""
# Try JSON first
try:
data = json.loads(raw)
if isinstance(data, dict):
return data
except json.JSONDecodeError:
pass
# Legacy plain text: "SUCCESS" or "FAILURE: reason"
if raw.startswith("SUCCESS"):
return {"status": "SUCCESS"}
if raw.startswith("FAILURE"):
reason = raw.split(":", 1)[1].strip() if ":" in raw else "unknown"
return {"status": "FAILURE", "reason": reason}
return {"status": "UNKNOWN", "raw": raw}