Fix 3: Add staleness recovery for stuck automation tasks

Add a date_updated field to the ClickUpTask dataclass. Add _recover_stale_tasks() to the scheduler, which resets tasks that have sat in "automation underway" for more than 2 hours (measured from the task's date_updated) back to the first configured poll status (usually "to do") and adds an explanatory comment. This prevents tasks from being permanently stuck if CheddahBot crashes mid-execution.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 15:52:04 -06:00 · 2026-02-27 15:52:04 -06:00 · 7d44014d7a
parent 41487c8d6b
commit 7d44014d7a
2 changed files with 61 additions and 0 deletions
--- a/cheddahbot/clickup.py
+++ b/cheddahbot/clickup.py
@ -31,6 +31,7 @@ class ClickUpTask:
    list_name: str = ""
    tags: list[str] = field(default_factory=list)
    date_done: str = ""
+    date_updated: str = ""

    @classmethod
    def from_api(cls, data: dict, task_type_field_name: str = "Task Type") -> ClickUpTask:
@ -67,6 +68,9 @@ class ClickUpTask:
        raw_done = data.get("date_done") or data.get("date_closed")
        date_done = str(raw_done) if raw_done else ""

+        raw_updated = data.get("date_updated")
+        date_updated = str(raw_updated) if raw_updated else ""
+
        return cls(
            id=data["id"],
            name=data.get("name", ""),
@ -80,6 +84,7 @@ class ClickUpTask:
            list_name=data.get("list", {}).get("name", ""),
            tags=tags,
            date_done=date_done,
+            date_updated=date_updated,
        )


--- a/cheddahbot/scheduler.py
+++ b/cheddahbot/scheduler.py
@ -267,6 +267,9 @@ class Scheduler:
            )
        return self._clickup_client

+    # Maximum time a task can stay in "automation underway" before recovery (seconds)
+    STALE_TASK_THRESHOLD_SECONDS = 2 * 60 * 60  # 2 hours
+
    def _clickup_loop(self):
        """Poll ClickUp for tasks on a regular interval."""
        interval = self.config.clickup.poll_interval_minutes * 60
@ -277,6 +280,7 @@ class Scheduler:
        while not self._stop_event.is_set():
            try:
                self._poll_clickup()
+                self._recover_stale_tasks()
                self.db.kv_set(
                    "system:loop:clickup:last_run", datetime.now(UTC).isoformat()
                )
@ -516,6 +520,58 @@ class Scheduler:
            )
            log.error("ClickUp task failed: %s — %s", task.name, e)

+    def _recover_stale_tasks(self):
+        """Reset tasks stuck in the automation status for too long.
+
+        Queries the configured space for tasks currently in
+        ``config.clickup.automation_status``. Any task whose ``date_updated``
+        (parsed as an integer and compared against the current time in epoch
+        milliseconds) is older than STALE_TASK_THRESHOLD_SECONDS is moved back
+        to the first configured poll status ("to do" when none are
+        configured) so it gets retried. A comment explaining the reset is
+        added to the task and a notification is sent.
+
+        Failures are isolated per task: if resetting one task fails, the
+        error is logged and the remaining tasks are still processed. A failed
+        comment is logged but does not suppress the notification, because the
+        status reset itself already happened.
+
+        Tasks with a missing or non-numeric ``date_updated`` are skipped.
+
+        NOTE(review): staleness is measured from the task's last update, not
+        from when it entered the automation status — presumably any edit to
+        the task refreshes ``date_updated``; confirm against the ClickUp API.
+        """
+        client = self._get_clickup_client()
+        space_id = self.config.clickup.space_id
+        if not space_id:
+            return
+
+        automation_status = self.config.clickup.automation_status
+        try:
+            stale_tasks = client.get_tasks_from_space(
+                space_id, statuses=[automation_status]
+            )
+        except Exception as e:
+            log.warning("Failed to query stale tasks: %s", e)
+            return
+
+        # The target status is the same for every task, so resolve it once.
+        poll_statuses = self.config.clickup.poll_statuses
+        reset_status = poll_statuses[0] if poll_statuses else "to do"
+
+        now_ms = int(datetime.now(UTC).timestamp() * 1000)
+        threshold_ms = self.STALE_TASK_THRESHOLD_SECONDS * 1000
+
+        for task in stale_tasks:
+            if not task.date_updated:
+                continue
+            try:
+                updated_ms = int(task.date_updated)
+            except (ValueError, TypeError):
+                continue
+
+            age_ms = now_ms - updated_ms
+            if age_ms <= threshold_ms:
+                continue
+
+            age_hours = age_ms / 3_600_000
+            log.warning(
+                "Recovering stale task %s (%s) — stuck in '%s' for %.1f hours",
+                task.id, task.name, automation_status, age_hours,
+            )
+
+            # One task failing to reset must not block recovery of the rest.
+            try:
+                client.update_task_status(task.id, reset_status)
+            except Exception as e:
+                log.error(
+                    "Failed to reset stale task %s (%s): %s",
+                    task.id, task.name, e,
+                )
+                continue
+
+            # The comment is informational; the reset has already happened,
+            # so a comment failure is logged and the notification still goes out.
+            try:
+                client.add_comment(
+                    task.id,
+                    f"⚠️ CheddahBot auto-recovered this task. It was stuck in "
+                    f"'{automation_status}' for {age_hours:.1f} hours. "
+                    f"Reset to '{reset_status}' for retry.",
+                )
+            except Exception as e:
+                log.warning(
+                    "Reset stale task %s but failed to add recovery comment: %s",
+                    task.id, e,
+                )
+
+            self._notify(
+                f"Recovered stale task: **{task.name}** — "
+                f"reset from '{automation_status}' to '{reset_status}'",
+                category="clickup",
+            )
+
+
    def _build_tool_args(self, state: dict) -> dict:
        """Build tool arguments from ClickUp task fields using the field mapping."""
        skill_map = self.config.clickup.skill_map