Add adversarial fact-check step to press release pipeline
Sonnet + WebSearch reviews each PR between the generation and schema steps. Returns [NO_ERRORS] or [CORRECTED] with a change log; rewrites that shift the word count by more than 15% are rejected. Fact-check failures are graceful -- the PR still ships, with a ClickUp note that manual review is recommended. Wired into both the legacy pipeline and the headless clickup_runner. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
clickup-runner
parent
38a88987a0
commit
abb6e1841b
|
|
@ -4,8 +4,9 @@ Autonomous workflow:
|
||||||
1. Generate 7 compliant headlines (chat brain)
|
1. Generate 7 compliant headlines (chat brain)
|
||||||
2. AI judge picks the 2 best (chat brain)
|
2. AI judge picks the 2 best (chat brain)
|
||||||
3. Write 2 full press releases (execution brain x 2)
|
3. Write 2 full press releases (execution brain x 2)
|
||||||
|
3.5. Adversarial fact-check (Sonnet + WebSearch, graceful failure)
|
||||||
4. Generate 2 JSON-LD schemas (execution brain x 2, Sonnet + WebSearch)
|
4. Generate 2 JSON-LD schemas (execution brain x 2, Sonnet + WebSearch)
|
||||||
5. Save 4 files, return cost summary
|
5. Save files, return cost summary
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
@ -35,6 +36,7 @@ _COMPANIES_FILE = _SKILLS_DIR / "companies.md"
|
||||||
_HEADLINES_FILE = _SKILLS_DIR / "headlines.md"
|
_HEADLINES_FILE = _SKILLS_DIR / "headlines.md"
|
||||||
|
|
||||||
SONNET_CLI_MODEL = "sonnet"
|
SONNET_CLI_MODEL = "sonnet"
|
||||||
|
FACT_CHECK_MODEL = "sonnet"
|
||||||
|
|
||||||
|
|
||||||
def _set_status(ctx: dict | None, message: str) -> None:
|
def _set_status(ctx: dict | None, message: str) -> None:
|
||||||
|
|
@ -524,6 +526,103 @@ def _build_schema_prompt(pr_text: str, company_name: str, url: str, skill_text:
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
|
|
||||||
|
def _build_fact_check_prompt(
|
||||||
|
pr_text: str,
|
||||||
|
company_name: str,
|
||||||
|
url: str,
|
||||||
|
topic: str,
|
||||||
|
keyword: str,
|
||||||
|
) -> str:
|
||||||
|
"""Build the prompt for the adversarial fact-checker step."""
|
||||||
|
return (
|
||||||
|
"You are a factual accuracy reviewer for press releases. Your ONLY job is to "
|
||||||
|
"find and correct statements that are factually wrong. You are NOT an editor.\n\n"
|
||||||
|
"GROUND TRUTH -- the following data was provided by the client and is correct "
|
||||||
|
"by definition. Do NOT change, question, or 'correct' any of it, even if your "
|
||||||
|
"web search suggests something different:\n"
|
||||||
|
f" - Company name: {company_name}\n"
|
||||||
|
f" - Target URL: {url}\n"
|
||||||
|
f" - Topic: {topic}\n"
|
||||||
|
f" - Keyword: {keyword}\n"
|
||||||
|
" - Any person names, titles, quotes, or contact details in the PR\n"
|
||||||
|
" - Any product names, service names, or brand names\n"
|
||||||
|
" - The overall framing, angle, and tone of the PR\n\n"
|
||||||
|
"WHAT TO CHECK (use WebSearch/WebFetch to verify):\n"
|
||||||
|
" - Industry statistics or market size claims\n"
|
||||||
|
" - Historical dates or facts\n"
|
||||||
|
" - Technical specifications not sourced from the client data\n"
|
||||||
|
" - General knowledge claims (e.g. 'X is the leading cause of Y')\n"
|
||||||
|
" - Geographic or regulatory facts\n\n"
|
||||||
|
"RULES:\n"
|
||||||
|
" - ONLY fix actual factual errors -- wrong numbers, wrong dates, wrong facts\n"
|
||||||
|
" - Do NOT add content, remove content, restructure, or 'improve' anything\n"
|
||||||
|
" - Do NOT change tone, style, word choice, or sentence structure\n"
|
||||||
|
" - Do NOT suggest additions or enhancements\n"
|
||||||
|
" - Make the MINIMUM change needed to fix each error\n"
|
||||||
|
" - Preserve the exact formatting, paragraph breaks, and headline\n\n"
|
||||||
|
"OUTPUT FORMAT:\n"
|
||||||
|
" - If you find NO factual errors: output exactly [NO_ERRORS] and nothing else\n"
|
||||||
|
" - If you find errors: output [CORRECTED] on the first line, then the full "
|
||||||
|
"corrected PR text (preserving all formatting), then a blank line, then "
|
||||||
|
"CHANGES: followed by a numbered list of what you changed and why\n\n"
|
||||||
|
"Press release to review:\n"
|
||||||
|
"---\n"
|
||||||
|
f"{pr_text}\n"
|
||||||
|
"---"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_fact_check(
|
||||||
|
raw_output: str, original_text: str
|
||||||
|
) -> tuple[str, str, str]:
|
||||||
|
"""Parse fact-checker output. Returns (text, status, changes).
|
||||||
|
|
||||||
|
status is one of: "clean", "corrected", "skipped"
|
||||||
|
On any parse failure or suspect rewrite, returns original text unchanged.
|
||||||
|
"""
|
||||||
|
if not raw_output or not raw_output.strip():
|
||||||
|
return original_text, "skipped", ""
|
||||||
|
|
||||||
|
stripped = raw_output.strip()
|
||||||
|
|
||||||
|
# No errors found
|
||||||
|
if stripped.startswith("[NO_ERRORS]"):
|
||||||
|
return original_text, "clean", ""
|
||||||
|
|
||||||
|
# Corrections found
|
||||||
|
if stripped.startswith("[CORRECTED]"):
|
||||||
|
# Split off the [CORRECTED] prefix
|
||||||
|
body = stripped[len("[CORRECTED]"):].strip()
|
||||||
|
|
||||||
|
# Split into corrected text and change log
|
||||||
|
changes = ""
|
||||||
|
if "\nCHANGES:" in body:
|
||||||
|
text_part, changes = body.split("\nCHANGES:", 1)
|
||||||
|
corrected = text_part.strip()
|
||||||
|
changes = changes.strip()
|
||||||
|
else:
|
||||||
|
corrected = body
|
||||||
|
|
||||||
|
if not corrected:
|
||||||
|
return original_text, "skipped", ""
|
||||||
|
|
||||||
|
# Safety: reject if word count differs by more than 15%
|
||||||
|
orig_wc = _word_count(original_text)
|
||||||
|
new_wc = _word_count(corrected)
|
||||||
|
if orig_wc > 0 and abs(new_wc - orig_wc) / orig_wc > 0.15:
|
||||||
|
log.warning(
|
||||||
|
"Fact-check rejected: word count changed too much "
|
||||||
|
"(%d -> %d, %.0f%% delta)",
|
||||||
|
orig_wc, new_wc, abs(new_wc - orig_wc) / orig_wc * 100,
|
||||||
|
)
|
||||||
|
return original_text, "skipped", "rejected -- word count delta too large"
|
||||||
|
|
||||||
|
return corrected, "corrected", changes
|
||||||
|
|
||||||
|
# Unparseable output
|
||||||
|
return original_text, "skipped", ""
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Main tool
|
# Main tool
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -601,8 +700,8 @@ def write_press_releases(
|
||||||
cost_log: list[dict] = []
|
cost_log: list[dict] = []
|
||||||
|
|
||||||
# ── Step 1: Generate 7 headlines (chat brain) ─────────────────────────
|
# ── Step 1: Generate 7 headlines (chat brain) ─────────────────────────
|
||||||
log.info("[PR Pipeline] Step 1/4: Generating 7 headlines for %s...", company_name)
|
log.info("[PR Pipeline] Step 1/5: Generating 7 headlines for %s...", company_name)
|
||||||
_set_status(ctx, f"Step 1/4: Generating 7 headlines for {company_name}...")
|
_set_status(ctx, f"Step 1/5: Generating 7 headlines for {company_name}...")
|
||||||
step_start = time.time()
|
step_start = time.time()
|
||||||
headline_prompt = _build_headline_prompt(topic, company_name, url, lsi_terms, headlines_ref)
|
headline_prompt = _build_headline_prompt(topic, company_name, url, lsi_terms, headlines_ref)
|
||||||
messages = [
|
messages = [
|
||||||
|
|
@ -627,8 +726,8 @@ def write_press_releases(
|
||||||
headlines_file.write_text(headlines_raw.strip(), encoding="utf-8")
|
headlines_file.write_text(headlines_raw.strip(), encoding="utf-8")
|
||||||
|
|
||||||
# ── Step 2: AI judge picks best 2 (chat brain) ───────────────────────
|
# ── Step 2: AI judge picks best 2 (chat brain) ───────────────────────
|
||||||
log.info("[PR Pipeline] Step 2/4: AI judge selecting best 2 headlines...")
|
log.info("[PR Pipeline] Step 2/5: AI judge selecting best 2 headlines...")
|
||||||
_set_status(ctx, "Step 2/4: AI judge selecting best 2 headlines...")
|
_set_status(ctx, "Step 2/5: AI judge selecting best 2 headlines...")
|
||||||
step_start = time.time()
|
step_start = time.time()
|
||||||
judge_prompt = _build_judge_prompt(headlines_raw, headlines_ref, topic)
|
judge_prompt = _build_judge_prompt(headlines_raw, headlines_ref, topic)
|
||||||
messages = [
|
messages = [
|
||||||
|
|
@ -666,7 +765,7 @@ def write_press_releases(
|
||||||
winners = winners[:2]
|
winners = winners[:2]
|
||||||
|
|
||||||
# ── Step 3: Write 2 press releases (execution brain x 2) ─────────────
|
# ── Step 3: Write 2 press releases (execution brain x 2) ─────────────
|
||||||
log.info("[PR Pipeline] Step 3/4: Writing 2 press releases...")
|
log.info("[PR Pipeline] Step 3/5: Writing 2 press releases...")
|
||||||
anchor_phrase = _derive_anchor_phrase(company_name, keyword) if keyword else ""
|
anchor_phrase = _derive_anchor_phrase(company_name, keyword) if keyword else ""
|
||||||
pr_texts: list[str] = []
|
pr_texts: list[str] = []
|
||||||
pr_files: list[str] = []
|
pr_files: list[str] = []
|
||||||
|
|
@ -674,7 +773,7 @@ def write_press_releases(
|
||||||
anchor_warnings: list[str] = []
|
anchor_warnings: list[str] = []
|
||||||
for i, headline in enumerate(winners):
|
for i, headline in enumerate(winners):
|
||||||
log.info("[PR Pipeline] Writing PR %d/2: %s", i + 1, headline[:60])
|
log.info("[PR Pipeline] Writing PR %d/2: %s", i + 1, headline[:60])
|
||||||
_set_status(ctx, f"Step 3/4: Writing press release {i + 1}/2 — {headline[:60]}...")
|
_set_status(ctx, f"Step 3/5: Writing press release {i + 1}/2 — {headline[:60]}...")
|
||||||
step_start = time.time()
|
step_start = time.time()
|
||||||
pr_prompt = _build_pr_prompt(
|
pr_prompt = _build_pr_prompt(
|
||||||
headline,
|
headline,
|
||||||
|
|
@ -737,6 +836,65 @@ def write_press_releases(
|
||||||
text_to_docx(clean_result, docx_path)
|
text_to_docx(clean_result, docx_path)
|
||||||
docx_files.append(str(docx_path))
|
docx_files.append(str(docx_path))
|
||||||
|
|
||||||
|
# ── Step 3.5: Adversarial fact-check (Sonnet + WebSearch) ───────────
|
||||||
|
log.info("[PR Pipeline] Step 3.5/5: Running adversarial fact-check...")
|
||||||
|
fact_check_statuses: list[str] = [] # per-PR: "clean", "corrected", "skipped"
|
||||||
|
fact_check_changes: list[str] = [] # per-PR change log (empty if clean/skipped)
|
||||||
|
fact_check_failed = False
|
||||||
|
for i, pr_text in enumerate(pr_texts):
|
||||||
|
log.info("[PR Pipeline] Fact-checking PR %d/2...", i + 1)
|
||||||
|
_set_status(ctx, f"Step 3.5/5: Fact-checking PR {i + 1}/2...")
|
||||||
|
step_start = time.time()
|
||||||
|
try:
|
||||||
|
fc_prompt = _build_fact_check_prompt(
|
||||||
|
pr_text, company_name, url, topic, keyword
|
||||||
|
)
|
||||||
|
fc_result = agent.execute_task(
|
||||||
|
fc_prompt, tools="WebSearch,WebFetch", model=FACT_CHECK_MODEL
|
||||||
|
)
|
||||||
|
corrected, status, changes = _apply_fact_check(fc_result, pr_text)
|
||||||
|
fact_check_statuses.append(status)
|
||||||
|
fact_check_changes.append(changes)
|
||||||
|
|
||||||
|
if status == "corrected":
|
||||||
|
pr_texts[i] = corrected
|
||||||
|
# Re-write files with corrected text
|
||||||
|
Path(pr_files[i]).write_text(corrected, encoding="utf-8")
|
||||||
|
text_to_docx(corrected, Path(docx_files[i]))
|
||||||
|
log.info(
|
||||||
|
"[PR Pipeline] PR %d: %d correction(s) applied",
|
||||||
|
i + 1, changes.count("\n") + 1 if changes else 1,
|
||||||
|
)
|
||||||
|
elif status == "clean":
|
||||||
|
log.info("[PR Pipeline] PR %d: no factual errors found", i + 1)
|
||||||
|
else:
|
||||||
|
log.warning("[PR Pipeline] PR %d: fact-check skipped (unparseable output)", i + 1)
|
||||||
|
|
||||||
|
elapsed = round(time.time() - step_start, 1)
|
||||||
|
cost_log.append(
|
||||||
|
{
|
||||||
|
"step": f"3.5{chr(97 + i)}. Fact-check PR {i + 1}",
|
||||||
|
"model": FACT_CHECK_MODEL,
|
||||||
|
"elapsed_s": elapsed,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
fact_check_failed = True
|
||||||
|
fact_check_statuses.append("skipped")
|
||||||
|
fact_check_changes.append("")
|
||||||
|
log.warning("[PR Pipeline] PR %d fact-check failed: %s", i + 1, e)
|
||||||
|
|
||||||
|
# Notify ClickUp if fact-check could not run at all
|
||||||
|
if fact_check_failed and clickup_task_id and cu_client:
|
||||||
|
try:
|
||||||
|
cu_client.add_comment(
|
||||||
|
clickup_task_id,
|
||||||
|
"Note: factual accuracy check could not be run on this PR. "
|
||||||
|
"Manual review recommended.",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
log.warning("ClickUp fact-check warning failed for %s: %s", clickup_task_id, e)
|
||||||
|
|
||||||
# ── ClickUp: upload docx attachments + comment ─────────────────────
|
# ── ClickUp: upload docx attachments + comment ─────────────────────
|
||||||
uploaded_count = 0
|
uploaded_count = 0
|
||||||
failed_uploads: list[str] = []
|
failed_uploads: list[str] = []
|
||||||
|
|
@ -755,11 +913,27 @@ def write_press_releases(
|
||||||
f"\n[WARNING]Warning: {len(failed_uploads)} attachment(s) failed to upload. "
|
f"\n[WARNING]Warning: {len(failed_uploads)} attachment(s) failed to upload. "
|
||||||
f"Files saved locally at:\n{paths_list}"
|
f"Files saved locally at:\n{paths_list}"
|
||||||
)
|
)
|
||||||
|
# Build fact-check summary for comment
|
||||||
|
fc_summary = ""
|
||||||
|
for fi, fc_status in enumerate(fact_check_statuses):
|
||||||
|
label = f"PR {chr(65 + fi)}"
|
||||||
|
if fc_status == "corrected":
|
||||||
|
fc_summary += f"\nFact-check {label}: corrections applied"
|
||||||
|
if fact_check_changes[fi]:
|
||||||
|
fc_summary += f"\n {fact_check_changes[fi]}"
|
||||||
|
elif fc_status == "clean":
|
||||||
|
fc_summary += f"\nFact-check {label}: no errors found"
|
||||||
|
else:
|
||||||
|
fc_summary += (
|
||||||
|
f"\nFact-check {label}: could not run -- manual review recommended"
|
||||||
|
)
|
||||||
|
|
||||||
cu_client.add_comment(
|
cu_client.add_comment(
|
||||||
clickup_task_id,
|
clickup_task_id,
|
||||||
f"📎 Saved {len(docx_files)} press release(s). "
|
f"Saved {len(docx_files)} press release(s). "
|
||||||
f"{uploaded_count} file(s) attached.\n"
|
f"{uploaded_count} file(s) attached.\n"
|
||||||
f"Generating JSON-LD schemas next...{upload_warning}",
|
f"Generating JSON-LD schemas next...{upload_warning}"
|
||||||
|
f"{fc_summary}",
|
||||||
)
|
)
|
||||||
log.info(
|
log.info(
|
||||||
"ClickUp: uploaded %d attachments for task %s", uploaded_count, clickup_task_id
|
"ClickUp: uploaded %d attachments for task %s", uploaded_count, clickup_task_id
|
||||||
|
|
@ -768,12 +942,12 @@ def write_press_releases(
|
||||||
log.warning("ClickUp attachment upload failed for %s: %s", clickup_task_id, e)
|
log.warning("ClickUp attachment upload failed for %s: %s", clickup_task_id, e)
|
||||||
|
|
||||||
# ── Step 4: Generate 2 JSON-LD schemas (Sonnet + WebSearch) ───────────
|
# ── Step 4: Generate 2 JSON-LD schemas (Sonnet + WebSearch) ───────────
|
||||||
log.info("[PR Pipeline] Step 4/4: Generating 2 JSON-LD schemas...")
|
log.info("[PR Pipeline] Step 4/5: Generating 2 JSON-LD schemas...")
|
||||||
schema_texts: list[str] = []
|
schema_texts: list[str] = []
|
||||||
schema_files: list[str] = []
|
schema_files: list[str] = []
|
||||||
for i, pr_text in enumerate(pr_texts):
|
for i, pr_text in enumerate(pr_texts):
|
||||||
log.info("[PR Pipeline] Schema %d/2 for: %s", i + 1, winners[i][:60])
|
log.info("[PR Pipeline] Schema %d/2 for: %s", i + 1, winners[i][:60])
|
||||||
_set_status(ctx, f"Step 4/4: Generating schema {i + 1}/2...")
|
_set_status(ctx, f"Step 4/5: Generating schema {i + 1}/2...")
|
||||||
step_start = time.time()
|
step_start = time.time()
|
||||||
schema_prompt = _build_schema_prompt(pr_text, company_name, url, schema_skill)
|
schema_prompt = _build_schema_prompt(pr_text, company_name, url, schema_skill)
|
||||||
exec_tools = "WebSearch,WebFetch"
|
exec_tools = "WebSearch,WebFetch"
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,7 @@ from pathlib import Path
|
||||||
|
|
||||||
from .autocora import archive_result, scan_results, submit_job
|
from .autocora import archive_result, scan_results, submit_job
|
||||||
from .blm import find_cora_xlsx, run_generate, run_ingest
|
from .blm import find_cora_xlsx, run_generate, run_ingest
|
||||||
|
from .fact_check import fact_check_pr_files
|
||||||
from .claude_runner import (
|
from .claude_runner import (
|
||||||
RunResult,
|
RunResult,
|
||||||
build_prompt,
|
build_prompt,
|
||||||
|
|
@ -632,6 +633,24 @@ def _dispatch_claude(
|
||||||
_cleanup_work_dir(result.work_dir)
|
_cleanup_work_dir(result.work_dir)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# 5b. Fact-check PR files (Press Release only, graceful failure)
|
||||||
|
fc_status_lines: list[str] = []
|
||||||
|
if task.task_type == "Press Release":
|
||||||
|
log.info("Running adversarial fact-check for task %s", task.id)
|
||||||
|
company = task.get_field_value("Client") or ""
|
||||||
|
pr_topic = task.get_field_value("PR Topic") or ""
|
||||||
|
pr_keyword = task.get_field_value("Keyword") or ""
|
||||||
|
pr_url = task.get_field_value("IMSURL") or ""
|
||||||
|
fc_status_lines, fc_failed = fact_check_pr_files(
|
||||||
|
result.output_files,
|
||||||
|
company_name=company,
|
||||||
|
url=pr_url,
|
||||||
|
topic=pr_topic,
|
||||||
|
keyword=pr_keyword,
|
||||||
|
)
|
||||||
|
if fc_failed:
|
||||||
|
log.warning("Fact-check had failures for task %s", task.id)
|
||||||
|
|
||||||
# 6. Upload output files to ClickUp
|
# 6. Upload output files to ClickUp
|
||||||
uploaded = 0
|
uploaded = 0
|
||||||
for f in result.output_files:
|
for f in result.output_files:
|
||||||
|
|
@ -651,6 +670,8 @@ def _dispatch_claude(
|
||||||
|
|
||||||
# 9. Post success comment
|
# 9. Post success comment
|
||||||
summary = "Stage complete. %d file(s) attached." % uploaded
|
summary = "Stage complete. %d file(s) attached." % uploaded
|
||||||
|
if fc_status_lines:
|
||||||
|
summary += "\n" + "\n".join(fc_status_lines)
|
||||||
if result.output:
|
if result.output:
|
||||||
# Include first 500 chars of Claude's output as context
|
# Include first 500 chars of Claude's output as context
|
||||||
truncated = result.output[:500]
|
truncated = result.output[:500]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,220 @@
|
||||||
|
"""Adversarial fact-checker for press release outputs.
|
||||||
|
|
||||||
|
Runs a second Claude Code pass on generated PR text files to catch
|
||||||
|
factual errors. Treats all client-provided data (company name, titles,
|
||||||
|
URLs, topic) as ground truth and only corrects claims the PR inferred
|
||||||
|
or fabricated beyond what was given.
|
||||||
|
|
||||||
|
Graceful failure: any error returns the original text untouched.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
FACT_CHECK_MODEL = "sonnet"
|
||||||
|
FACT_CHECK_TIMEOUT = 300 # 5 minutes per PR
|
||||||
|
|
||||||
|
|
||||||
|
def build_fact_check_prompt(
    pr_text: str,
    company_name: str,
    url: str,
    topic: str,
    keyword: str,
) -> str:
    """Build the prompt for the adversarial fact-checker.

    Client-provided fields are framed as ground truth so the reviewer never
    "corrects" them based on web results; only external claims get verified.
    """
    sections = [
        "You are a factual accuracy reviewer for press releases. Your ONLY job is to "
        "find and correct statements that are factually wrong. You are NOT an editor.\n\n",
        # Ground-truth block: client data is correct by definition.
        "GROUND TRUTH -- the following data was provided by the client and is correct "
        "by definition. Do NOT change, question, or 'correct' any of it, even if your "
        "web search suggests something different:\n"
        f" - Company name: {company_name}\n"
        f" - Target URL: {url}\n"
        f" - Topic: {topic}\n"
        f" - Keyword: {keyword}\n"
        " - Any person names, titles, quotes, or contact details in the PR\n"
        " - Any product names, service names, or brand names\n"
        " - The overall framing, angle, and tone of the PR\n\n",
        "WHAT TO CHECK (use WebSearch/WebFetch to verify):\n"
        " - Industry statistics or market size claims\n"
        " - Historical dates or facts\n"
        " - Technical specifications not sourced from the client data\n"
        " - General knowledge claims (e.g. 'X is the leading cause of Y')\n"
        " - Geographic or regulatory facts\n\n",
        "RULES:\n"
        " - ONLY fix actual factual errors -- wrong numbers, wrong dates, wrong facts\n"
        " - Do NOT add content, remove content, restructure, or 'improve' anything\n"
        " - Do NOT change tone, style, word choice, or sentence structure\n"
        " - Do NOT suggest additions or enhancements\n"
        " - Make the MINIMUM change needed to fix each error\n"
        " - Preserve the exact formatting, paragraph breaks, and headline\n\n",
        "OUTPUT FORMAT:\n"
        " - If you find NO factual errors: output exactly [NO_ERRORS] and nothing else\n"
        " - If you find errors: output [CORRECTED] on the first line, then the full "
        "corrected PR text (preserving all formatting), then a blank line, then "
        "CHANGES: followed by a numbered list of what you changed and why\n\n",
        # The PR body is fenced with --- to mark exact boundaries.
        f"Press release to review:\n---\n{pr_text}\n---",
    ]
    return "".join(sections)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_fact_check(raw_output: str, original_text: str) -> tuple[str, str, str]:
    """Parse fact-checker output. Returns (text, status, changes).

    status is one of: "clean", "corrected", "skipped"
    On any parse failure or suspect rewrite, returns original text unchanged.
    """
    cleaned = raw_output.strip() if raw_output else ""
    if not cleaned:
        # Nothing came back from the reviewer -> leave the PR untouched.
        return original_text, "skipped", ""

    if cleaned.startswith("[NO_ERRORS]"):
        return original_text, "clean", ""

    if not cleaned.startswith("[CORRECTED]"):
        # No recognized sentinel -> unparseable output.
        return original_text, "skipped", ""

    body = cleaned[len("[CORRECTED]"):].strip()

    # Split the corrected PR text from the optional CHANGES: trailer.
    pr_part, sep, trailer = body.partition("\nCHANGES:")
    corrected = pr_part.strip()
    changes = trailer.strip() if sep else ""

    if not corrected:
        return original_text, "skipped", ""

    # Safety valve: >15% word-count drift means a rewrite, not a correction.
    orig_wc = len(original_text.split())
    new_wc = len(corrected.split())
    if orig_wc > 0 and abs(new_wc - orig_wc) / orig_wc > 0.15:
        log.warning(
            "Fact-check rejected: word count changed too much "
            "(%d -> %d, %.0f%% delta)",
            orig_wc, new_wc, abs(new_wc - orig_wc) / orig_wc * 100,
        )
        return original_text, "skipped", "rejected -- word count delta too large"

    return corrected, "corrected", changes
|
||||||
|
|
||||||
|
|
||||||
|
def fact_check_pr_files(
    output_files: list[Path],
    company_name: str,
    url: str,
    topic: str,
    keyword: str,
    timeout: int = FACT_CHECK_TIMEOUT,
) -> tuple[list[str], bool]:
    """Run fact-check on .txt PR files in the output list.

    Returns:
        (status_lines, any_failed) where status_lines is a list of
        human-readable results per PR, and any_failed is True if the
        fact-checker could not run on at least one PR.
    """
    claude_bin = shutil.which("claude")
    if claude_bin is None:
        log.warning("Fact-check: claude CLI not found, skipping")
        return ["Fact-check: claude CLI not found, skipped"], True

    # PR files are the .txt outputs (the actual press releases); this also
    # skips non-PR files like "Headlines Evaluation.md".
    pr_paths = [f for f in output_files if f.suffix == ".txt"]
    if not pr_paths:
        return [], False

    lines: list[str] = []
    failed = False

    for idx, path in enumerate(pr_paths):
        label = f"PR {chr(65 + idx)}"  # PR A, PR B, etc.
        try:
            source_text = path.read_text(encoding="utf-8")
            if not source_text.strip():
                # Empty file -- nothing to review.
                continue

            prompt = build_fact_check_prompt(
                source_text, company_name, url, topic, keyword
            )

            log.info("Fact-checking %s: %s", label, path.name)
            proc = subprocess.run(
                [
                    claude_bin,
                    "-p", prompt,
                    "--output-format", "text",
                    "--permission-mode", "bypassPermissions",
                    "--allowedTools", "WebSearch,WebFetch",
                    "--max-turns", "10",
                    "--model", FACT_CHECK_MODEL,
                ],
                capture_output=True,
                text=True,
                timeout=timeout,
                cwd=str(path.parent),
            )

            if proc.returncode != 0:
                log.warning(
                    "Fact-check %s failed (exit %d): %s",
                    label, proc.returncode, (proc.stderr or "")[:500],
                )
                lines.append(
                    f"Fact-check {label}: could not run -- manual review recommended"
                )
                failed = True
                continue

            new_text, status, changes = apply_fact_check(proc.stdout, source_text)

            if status == "corrected":
                # Persist the corrected PR in place of the original.
                path.write_text(new_text, encoding="utf-8")
                log.info("Fact-check %s: corrections applied", label)
                entry = f"Fact-check {label}: corrections applied"
                if changes:
                    entry += f"\n {changes}"
                lines.append(entry)
            elif status == "clean":
                log.info("Fact-check %s: no errors found", label)
                lines.append(f"Fact-check {label}: no errors found")
            else:
                log.warning("Fact-check %s: skipped (unparseable output)", label)
                lines.append(
                    f"Fact-check {label}: could not run -- manual review recommended"
                )
                failed = True

        except subprocess.TimeoutExpired:
            log.warning("Fact-check %s timed out after %ds", label, timeout)
            lines.append(
                f"Fact-check {label}: timed out -- manual review recommended"
            )
            failed = True
        except Exception as e:
            log.warning("Fact-check %s error: %s", label, e)
            lines.append(
                f"Fact-check {label}: could not run -- manual review recommended"
            )
            failed = True

    return lines, failed
|
||||||
|
|
@ -0,0 +1,126 @@
|
||||||
|
"""Tests for the adversarial fact-checker helpers in press_release.py."""
|
||||||
|
|
||||||
|
from cheddahbot.tools.press_release import _apply_fact_check, _build_fact_check_prompt
|
||||||
|
|
||||||
|
|
||||||
|
class TestApplyFactCheck:
    """Tests for _apply_fact_check output parsing."""

    # Baseline PR text used by every case; contains a verifiable numeric
    # claim ("500 widget variants") for the correction tests to target.
    ORIGINAL = (
        "Acme Corp Delivers Advanced Widget Solutions\n\n"
        "Acme Corp, a leading manufacturer of widgets, today highlighted "
        "its expanded product line. The company, based in Milwaukee, Wisconsin, "
        "produces over 500 widget variants for industrial applications."
    )

    def test_no_errors_returns_original(self):
        result = _apply_fact_check("[NO_ERRORS]", self.ORIGINAL)
        assert result == (self.ORIGINAL, "clean", "")

    def test_no_errors_with_trailing_whitespace(self):
        text, status, _ = _apply_fact_check("[NO_ERRORS] \n", self.ORIGINAL)
        assert status == "clean"
        assert text == self.ORIGINAL

    def test_corrected_with_changes(self):
        patched = self.ORIGINAL.replace("500 widget", "300 widget")
        reviewer_output = (
            f"[CORRECTED]\n{patched}\n\n"
            "CHANGES:\n1. Changed '500 widget variants' to '300 widget variants' "
            "-- company website lists 300."
        )
        text, status, changes = _apply_fact_check(reviewer_output, self.ORIGINAL)
        assert status == "corrected"
        assert "300 widget" in text
        assert "500" not in text
        assert "300 widget variants" in changes

    def test_corrected_without_changes_section(self):
        patched = self.ORIGINAL.replace("500", "300")
        text, status, changes = _apply_fact_check(
            f"[CORRECTED]\n{patched}", self.ORIGINAL
        )
        assert status == "corrected"
        assert "300" in text
        assert changes == ""

    def test_empty_output_returns_skipped(self):
        text, status, _ = _apply_fact_check("", self.ORIGINAL)
        assert status == "skipped"
        assert text == self.ORIGINAL

    def test_none_like_output_returns_skipped(self):
        text, status, _ = _apply_fact_check(" \n ", self.ORIGINAL)
        assert status == "skipped"
        assert text == self.ORIGINAL

    def test_garbage_output_returns_skipped(self):
        text, status, _ = _apply_fact_check(
            "I reviewed the press release and it looks good overall.", self.ORIGINAL
        )
        assert status == "skipped"
        assert text == self.ORIGINAL

    def test_rejects_oversized_rewrite(self):
        """If fact-checker rewrites too much (>15% word count delta), reject."""
        # Double the content -- far beyond the 15% tolerance.
        bloated = self.ORIGINAL + "\n\n" + self.ORIGINAL + "\n\nExtra content here."
        reviewer_output = f"[CORRECTED]\n{bloated}\n\nCHANGES:\n1. Added more detail."
        text, status, changes = _apply_fact_check(reviewer_output, self.ORIGINAL)
        assert status == "skipped"
        assert text == self.ORIGINAL
        assert "word count delta" in changes

    def test_accepts_minor_word_count_change(self):
        """Small changes (within 15%) should be accepted."""
        # One numeric tweak -- word count is unchanged.
        tweaked = self.ORIGINAL.replace("500 widget variants", "480 widget variants")
        reviewer_output = (
            f"[CORRECTED]\n{tweaked}\n\n"
            "CHANGES:\n1. Corrected variant count from 500 to 480."
        )
        text, status, _ = _apply_fact_check(reviewer_output, self.ORIGINAL)
        assert status == "corrected"
        assert "480" in text

    def test_corrected_but_empty_body_returns_skipped(self):
        text, status, _ = _apply_fact_check("[CORRECTED]\n", self.ORIGINAL)
        assert status == "skipped"
        assert text == self.ORIGINAL
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildFactCheckPrompt:
    """Tests for _build_fact_check_prompt structure."""

    def test_includes_ground_truth_data(self):
        # Every client-supplied field must appear verbatim in the prompt.
        prompt = _build_fact_check_prompt(
            "Some PR text here.",
            company_name="Acme Corp",
            url="https://acme.com",
            topic="widgets",
            keyword="industrial widgets",
        )
        for expected in ("Acme Corp", "https://acme.com", "widgets", "industrial widgets"):
            assert expected in prompt
        assert "ground truth" in prompt.lower() or "GROUND TRUTH" in prompt

    def test_includes_pr_text(self):
        prompt = _build_fact_check_prompt(
            "The quick brown fox.",
            company_name="Test",
            url="https://test.com",
            topic="foxes",
            keyword="brown fox",
        )
        assert "The quick brown fox." in prompt

    def test_output_format_instructions(self):
        # The prompt must spell out both sentinels and the change-log marker.
        prompt = _build_fact_check_prompt(
            "Text.", company_name="X", url="u", topic="t", keyword="k"
        )
        for marker in ("[NO_ERRORS]", "[CORRECTED]", "CHANGES:"):
            assert marker in prompt
|
||||||
|
|
@ -0,0 +1,122 @@
|
||||||
|
"""Tests for clickup_runner.fact_check module."""
|
||||||
|
|
||||||
|
from clickup_runner.fact_check import apply_fact_check, build_fact_check_prompt
|
||||||
|
|
||||||
|
|
||||||
|
class TestApplyFactCheck:
    """Tests for apply_fact_check output parsing."""

    ORIGINAL = (
        "Acme Corp Delivers Advanced Widget Solutions\n\n"
        "Acme Corp, a leading manufacturer of widgets, today highlighted "
        "its expanded product line. The company, based in Milwaukee, Wisconsin, "
        "produces over 500 widget variants for industrial applications."
    )

    def test_no_errors_returns_original(self):
        """A bare [NO_ERRORS] verdict keeps the PR byte-for-byte intact."""
        out_text, verdict, change_log = apply_fact_check("[NO_ERRORS]", self.ORIGINAL)
        assert verdict == "clean"
        assert out_text == self.ORIGINAL
        assert change_log == ""

    def test_no_errors_with_trailing_whitespace(self):
        """Trailing whitespace around the tag does not break recognition."""
        out_text, verdict, _ = apply_fact_check("[NO_ERRORS] \n", self.ORIGINAL)
        assert verdict == "clean"
        assert out_text == self.ORIGINAL

    def test_corrected_with_changes(self):
        """[CORRECTED] output swaps in the fixed body and surfaces the log."""
        fixed_pr = self.ORIGINAL.replace("500 widget", "300 widget")
        raw_output = (
            f"[CORRECTED]\n{fixed_pr}\n\n"
            "CHANGES:\n1. Changed '500 widget variants' to '300 widget variants' "
            "-- company website lists 300."
        )
        out_text, verdict, change_log = apply_fact_check(raw_output, self.ORIGINAL)
        assert verdict == "corrected"
        assert "300 widget" in out_text
        assert "500" not in out_text
        assert "300 widget variants" in change_log

    def test_corrected_without_changes_section(self):
        """A missing CHANGES: section yields an empty change log."""
        fixed_pr = self.ORIGINAL.replace("500", "300")
        out_text, verdict, change_log = apply_fact_check(
            f"[CORRECTED]\n{fixed_pr}", self.ORIGINAL
        )
        assert verdict == "corrected"
        assert "300" in out_text
        assert change_log == ""

    def test_empty_output_returns_skipped(self):
        """An empty model response is treated as a skipped check."""
        out_text, verdict, _ = apply_fact_check("", self.ORIGINAL)
        assert verdict == "skipped"
        assert out_text == self.ORIGINAL

    def test_whitespace_only_returns_skipped(self):
        """Whitespace-only output is equivalent to no output at all."""
        out_text, verdict, _ = apply_fact_check(" \n ", self.ORIGINAL)
        assert verdict == "skipped"
        assert out_text == self.ORIGINAL

    def test_garbage_output_returns_skipped(self):
        """Prose without a recognized verdict tag is ignored."""
        out_text, verdict, _ = apply_fact_check(
            "I reviewed the press release and it looks good overall.", self.ORIGINAL
        )
        assert verdict == "skipped"
        assert out_text == self.ORIGINAL

    def test_rejects_oversized_rewrite(self):
        """Rewrites beyond the 15% word-count tolerance are rejected."""
        oversized = "\n\n".join([self.ORIGINAL, self.ORIGINAL, "Extra content."])
        raw_output = f"[CORRECTED]\n{oversized}\n\nCHANGES:\n1. Added more detail."
        out_text, verdict, change_log = apply_fact_check(raw_output, self.ORIGINAL)
        assert verdict == "skipped"
        assert out_text == self.ORIGINAL
        assert "word count delta" in change_log

    def test_accepts_minor_word_count_change(self):
        """Small in-place edits stay within tolerance and are accepted."""
        edited = self.ORIGINAL.replace("500 widget variants", "480 widget variants")
        raw_output = (
            f"[CORRECTED]\n{edited}\n\n"
            "CHANGES:\n1. Corrected variant count from 500 to 480."
        )
        out_text, verdict, _ = apply_fact_check(raw_output, self.ORIGINAL)
        assert verdict == "corrected"
        assert "480" in out_text

    def test_corrected_empty_body_returns_skipped(self):
        """A [CORRECTED] tag with no body falls back to the original PR."""
        out_text, verdict, _ = apply_fact_check("[CORRECTED]\n", self.ORIGINAL)
        assert verdict == "skipped"
        assert out_text == self.ORIGINAL
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildFactCheckPrompt:
    """Tests for build_fact_check_prompt structure."""

    def test_includes_ground_truth_data(self):
        """Every caller-supplied fact appears verbatim in the prompt."""
        built = build_fact_check_prompt(
            "Some PR text.",
            company_name="Acme Corp",
            url="https://acme.com",
            topic="widgets",
            keyword="industrial widgets",
        )
        for expected in (
            "Acme Corp",
            "https://acme.com",
            "widgets",
            "industrial widgets",
            "GROUND TRUTH",
        ):
            assert expected in built

    def test_includes_pr_text(self):
        """The PR body under review is embedded in the prompt."""
        built = build_fact_check_prompt(
            "The quick brown fox.",
            company_name="Test",
            url="https://test.com",
            topic="foxes",
            keyword="brown fox",
        )
        assert "The quick brown fox." in built

    def test_output_format_instructions(self):
        """The prompt states the [NO_ERRORS]/[CORRECTED] output contract."""
        built = build_fact_check_prompt(
            "Text.", company_name="X", url="u", topic="t", keyword="k"
        )
        assert "[NO_ERRORS]" in built
        assert "[CORRECTED]" in built
        assert "CHANGES:" in built
|
||||||
Loading…
Reference in New Issue