From bdb4c9490a323e627ceb27c1c6c72399240e003a Mon Sep 17 00:00:00 2001 From: PeninsulaInd Date: Fri, 20 Mar 2026 12:38:32 -0500 Subject: [PATCH] Filter Cora entities by Best of Both correlation threshold (<= -0.19) Entities without a strong enough ranking correlation were being included in outlines and optimization. Now cora_parser.get_entities() filters out entities with Best of Both > -0.19 (or None). Threshold is configurable in OPTIMIZATION_RULES and documented in skill.md optimization rules table. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../content-researcher/scripts/cora_parser.py | 18 +++++++++++++++++- .claude/skills/content-researcher/skill.md | 1 + 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.claude/skills/content-researcher/scripts/cora_parser.py b/.claude/skills/content-researcher/scripts/cora_parser.py index 4952e57..7e2a482 100644 --- a/.claude/skills/content-researcher/scripts/cora_parser.py +++ b/.claude/skills/content-researcher/scripts/cora_parser.py @@ -59,6 +59,13 @@ OPTIMIZATION_RULES = { "exclude_measurement_entities": True, # Ignore measurements (dimensions, tolerances) as entities "allow_organization_entities": True, # Organizations like ISO, ANSI, etc. are OK "never_mention_competitors": True, # Never mention competitors by name in content + + # Entity correlation threshold + # Best of Both = lower of Spearman's or Pearson's correlation. + # Measures correlation to ranking position (1=top, 100=bottom), so negative = better ranking. + # Only include entities with Best of Both <= this value. + # Set to None to disable filtering. + "entity_correlation_threshold": -0.19, } @@ -195,6 +202,15 @@ class CoraReport: if name.startswith("critical") or name.startswith("http"): continue + correlation = _safe_float(row, col_map.get("Best of Both")) + + # Filter by Best of Both correlation threshold. + # Lower (more negative) = stronger ranking signal (correlates with + # position 1 vs 100). Only keep entities at or below the threshold. + threshold = OPTIMIZATION_RULES.get("entity_correlation_threshold") + if threshold is not None and (correlation is None or correlation > threshold): + continue + entity = { "name": name, "freebase_id": _safe_str(row, col_map.get("Freebase ID")), @@ -203,7 +219,7 @@ class CoraReport: "relevance": _safe_float(row, col_map.get("Relevance")), "confidence": _safe_float(row, col_map.get("Confidence")), "type": _safe_str(row, col_map.get("Type")), - "correlation": _safe_float(row, col_map.get("Best of Both")), + "correlation": correlation, "current_count": _safe_int(row, site_col_idx), "max_count": _safe_int(row, col_map.get("Max")), "deficit": _safe_int(row, col_map.get("Deficit")), diff --git a/.claude/skills/content-researcher/skill.md b/.claude/skills/content-researcher/skill.md index 2c00dd4..9adabbc 100644 --- a/.claude/skills/content-researcher/skill.md +++ b/.claude/skills/content-researcher/skill.md @@ -492,6 +492,7 @@ These override any data from the Cora report: | Competitor names | NEVER use competitor company names as entities or LSI keywords. Do not mention competitors by name in content. | | Measurement entities | Ignore measurements (dimensions, tolerances, etc.) as entities — skip these in entity optimization | | Organization entities | Organizations like ISO, ANSI, ASTM are fine — keep these as entities | +| Entity correlation filter | Only entities with Best of Both <= -0.19 are included. Best of Both is the lower of Spearman's or Pearson's correlation to ranking position (1=top, 100=bottom), so more negative = stronger ranking signal. This filter is applied in `cora_parser.py` and affects all downstream consumers. To disable, set `entity_correlation_threshold` to `None` in `OPTIMIZATION_RULES`. Added 2026-03-20 — revert if entity coverage feels too thin. | ---