Filter Cora entities by Best of Both correlation threshold (<= -0.19)
Entities without a strong enough ranking correlation were being included in outlines and optimization. Now cora_parser.get_entities() filters out entities whose Best of Both is > -0.19 (or None). The threshold is configurable in OPTIMIZATION_RULES and documented in the skill.md optimization rules table.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
master
parent
c12c655220
commit
bdb4c9490a
|
|
@ -59,6 +59,13 @@ OPTIMIZATION_RULES = {
|
||||||
"exclude_measurement_entities": True, # Ignore measurements (dimensions, tolerances) as entities
|
"exclude_measurement_entities": True, # Ignore measurements (dimensions, tolerances) as entities
|
||||||
"allow_organization_entities": True, # Organizations like ISO, ANSI, etc. are OK
|
"allow_organization_entities": True, # Organizations like ISO, ANSI, etc. are OK
|
||||||
"never_mention_competitors": True, # Never mention competitors by name in content
|
"never_mention_competitors": True, # Never mention competitors by name in content
|
||||||
|
|
||||||
|
# Entity correlation threshold
|
||||||
|
# Best of Both = the lower of the Spearman and Pearson correlations.
|
||||||
|
# Measures correlation to ranking position (1=top, 100=bottom), so negative = better ranking.
|
||||||
|
# Only include entities with Best of Both <= this value.
|
||||||
|
# Set to None to disable filtering.
|
||||||
|
"entity_correlation_threshold": -0.19,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -195,6 +202,15 @@ class CoraReport:
|
||||||
if name.startswith("critical") or name.startswith("http"):
|
if name.startswith("critical") or name.startswith("http"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
correlation = _safe_float(row, col_map.get("Best of Both"))
|
||||||
|
|
||||||
|
# Filter by Best of Both correlation threshold.
|
||||||
|
# Lower (more negative) = stronger ranking signal (correlates with
|
||||||
|
# position 1 vs 100). Only keep entities at or below the threshold.
|
||||||
|
threshold = OPTIMIZATION_RULES.get("entity_correlation_threshold")
|
||||||
|
if threshold is not None and (correlation is None or correlation > threshold):
|
||||||
|
continue
|
||||||
|
|
||||||
entity = {
|
entity = {
|
||||||
"name": name,
|
"name": name,
|
||||||
"freebase_id": _safe_str(row, col_map.get("Freebase ID")),
|
"freebase_id": _safe_str(row, col_map.get("Freebase ID")),
|
||||||
|
|
@ -203,7 +219,7 @@ class CoraReport:
|
||||||
"relevance": _safe_float(row, col_map.get("Relevance")),
|
"relevance": _safe_float(row, col_map.get("Relevance")),
|
||||||
"confidence": _safe_float(row, col_map.get("Confidence")),
|
"confidence": _safe_float(row, col_map.get("Confidence")),
|
||||||
"type": _safe_str(row, col_map.get("Type")),
|
"type": _safe_str(row, col_map.get("Type")),
|
||||||
"correlation": _safe_float(row, col_map.get("Best of Both")),
|
"correlation": correlation,
|
||||||
"current_count": _safe_int(row, site_col_idx),
|
"current_count": _safe_int(row, site_col_idx),
|
||||||
"max_count": _safe_int(row, col_map.get("Max")),
|
"max_count": _safe_int(row, col_map.get("Max")),
|
||||||
"deficit": _safe_int(row, col_map.get("Deficit")),
|
"deficit": _safe_int(row, col_map.get("Deficit")),
|
||||||
|
|
|
||||||
|
|
@ -492,6 +492,7 @@ These override any data from the Cora report:
|
||||||
| Competitor names | NEVER use competitor company names as entities or LSI keywords. Do not mention competitors by name in content. |
|
| Competitor names | NEVER use competitor company names as entities or LSI keywords. Do not mention competitors by name in content. |
|
||||||
| Measurement entities | Ignore measurements (dimensions, tolerances, etc.) as entities — skip these in entity optimization |
|
| Measurement entities | Ignore measurements (dimensions, tolerances, etc.) as entities — skip these in entity optimization |
|
||||||
| Organization entities | Organizations like ISO, ANSI, ASTM are fine — keep these as entities |
|
| Organization entities | Organizations like ISO, ANSI, ASTM are fine — keep these as entities |
|
||||||
|
| Entity correlation filter | Only entities with Best of Both <= -0.19 are included. Best of Both is the lower of the Spearman and Pearson correlations with ranking position (1=top, 100=bottom), so more negative = stronger ranking signal. Entities with no Best of Both value are also excluded. This filter is applied in `cora_parser.py` and affects all downstream consumers. To disable, set `entity_correlation_threshold` to `None` in `OPTIMIZATION_RULES`. Added 2026-03-20 — revert if entity coverage feels too thin. |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue