Fix 3: Add staleness recovery for stuck automation tasks
Add a date_updated field to the ClickUpTask dataclass. Add _recover_stale_tasks() to the scheduler, which resets tasks stuck in "automation underway" for more than 2 hours back to "to do" and posts an explanatory comment. This prevents tasks from staying permanently stuck if CheddahBot crashes mid-execution.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Branch: fix/customer-field-migration
parent
41487c8d6b
commit
7d44014d7a
|
|
@ -31,6 +31,7 @@ class ClickUpTask:
|
||||||
list_name: str = ""
|
list_name: str = ""
|
||||||
tags: list[str] = field(default_factory=list)
|
tags: list[str] = field(default_factory=list)
|
||||||
date_done: str = ""
|
date_done: str = ""
|
||||||
|
date_updated: str = ""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_api(cls, data: dict, task_type_field_name: str = "Task Type") -> ClickUpTask:
|
def from_api(cls, data: dict, task_type_field_name: str = "Task Type") -> ClickUpTask:
|
||||||
|
|
@ -67,6 +68,9 @@ class ClickUpTask:
|
||||||
raw_done = data.get("date_done") or data.get("date_closed")
|
raw_done = data.get("date_done") or data.get("date_closed")
|
||||||
date_done = str(raw_done) if raw_done else ""
|
date_done = str(raw_done) if raw_done else ""
|
||||||
|
|
||||||
|
raw_updated = data.get("date_updated")
|
||||||
|
date_updated = str(raw_updated) if raw_updated else ""
|
||||||
|
|
||||||
return cls(
|
return cls(
|
||||||
id=data["id"],
|
id=data["id"],
|
||||||
name=data.get("name", ""),
|
name=data.get("name", ""),
|
||||||
|
|
@ -80,6 +84,7 @@ class ClickUpTask:
|
||||||
list_name=data.get("list", {}).get("name", ""),
|
list_name=data.get("list", {}).get("name", ""),
|
||||||
tags=tags,
|
tags=tags,
|
||||||
date_done=date_done,
|
date_done=date_done,
|
||||||
|
date_updated=date_updated,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -267,6 +267,9 @@ class Scheduler:
|
||||||
)
|
)
|
||||||
return self._clickup_client
|
return self._clickup_client
|
||||||
|
|
||||||
|
# Maximum time a task can stay in "automation underway" before recovery (seconds)
|
||||||
|
STALE_TASK_THRESHOLD_SECONDS = 2 * 60 * 60 # 2 hours
|
||||||
|
|
||||||
def _clickup_loop(self):
|
def _clickup_loop(self):
|
||||||
"""Poll ClickUp for tasks on a regular interval."""
|
"""Poll ClickUp for tasks on a regular interval."""
|
||||||
interval = self.config.clickup.poll_interval_minutes * 60
|
interval = self.config.clickup.poll_interval_minutes * 60
|
||||||
|
|
@ -277,6 +280,7 @@ class Scheduler:
|
||||||
while not self._stop_event.is_set():
|
while not self._stop_event.is_set():
|
||||||
try:
|
try:
|
||||||
self._poll_clickup()
|
self._poll_clickup()
|
||||||
|
self._recover_stale_tasks()
|
||||||
self.db.kv_set(
|
self.db.kv_set(
|
||||||
"system:loop:clickup:last_run", datetime.now(UTC).isoformat()
|
"system:loop:clickup:last_run", datetime.now(UTC).isoformat()
|
||||||
)
|
)
|
||||||
|
|
@ -516,6 +520,58 @@ class Scheduler:
|
||||||
)
|
)
|
||||||
log.error("ClickUp task failed: %s — %s", task.name, e)
|
log.error("ClickUp task failed: %s — %s", task.name, e)
|
||||||
|
|
||||||
|
def _recover_stale_tasks(self):
    """Reset tasks stuck in the automation status for too long.

    Queries the configured ClickUp space for tasks currently in
    ``config.clickup.automation_status``. Any task whose ``date_updated``
    (parsed as an integer millisecond timestamp) is older than
    STALE_TASK_THRESHOLD_SECONDS is moved back to the first configured
    poll status (falling back to 'to do'), gets an explanatory comment,
    and triggers a notification.

    Failures are logged and never raised: a failed query ends the pass,
    and a failure while resetting one task does not stop the remaining
    tasks from being recovered.
    """
    client = self._get_clickup_client()
    space_id = self.config.clickup.space_id
    if not space_id:
        return

    automation_status = self.config.clickup.automation_status
    try:
        stale_tasks = client.get_tasks_from_space(
            space_id, statuses=[automation_status]
        )
    except Exception as e:
        log.warning("Failed to query stale tasks: %s", e)
        return

    now_ms = int(datetime.now(UTC).timestamp() * 1000)
    threshold_ms = self.STALE_TASK_THRESHOLD_SECONDS * 1000

    # Loop-invariant: the status every recovered task is reset to.
    poll_statuses = self.config.clickup.poll_statuses
    reset_status = poll_statuses[0] if poll_statuses else "to do"

    for task in stale_tasks:
        # Without a usable timestamp the task's age cannot be judged.
        if not task.date_updated:
            continue
        try:
            updated_ms = int(task.date_updated)
        except (ValueError, TypeError):
            continue

        age_ms = now_ms - updated_ms
        if age_ms <= threshold_ms:
            continue

        age_hours = age_ms / 3_600_000
        log.warning(
            "Recovering stale task %s (%s) — stuck in '%s' for %.1f hours",
            task.id, task.name, automation_status, age_hours,
        )
        # Isolate each task: one failing API call must not abort the
        # recovery of the other stale tasks in this pass.
        try:
            client.update_task_status(task.id, reset_status)
            client.add_comment(
                task.id,
                f"⚠️ CheddahBot auto-recovered this task. It was stuck in "
                f"'{automation_status}' for {age_hours:.1f} hours. "
                f"Reset to '{reset_status}' for retry.",
            )
            self._notify(
                f"Recovered stale task: **{task.name}** — "
                f"reset from '{automation_status}' to '{reset_status}'",
                category="clickup",
            )
        except Exception as e:
            log.warning("Failed to recover stale task %s: %s", task.id, e)
|
||||||
|
|
||||||
def _build_tool_args(self, state: dict) -> dict:
|
def _build_tool_args(self, state: dict) -> dict:
|
||||||
"""Build tool arguments from ClickUp task fields using the field mapping."""
|
"""Build tool arguments from ClickUp task fields using the field mapping."""
|
||||||
skill_map = self.config.clickup.skill_map
|
skill_map = self.config.clickup.skill_map
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue