diff --git a/STORY_3.2_IMPLEMENTATION_SUMMARY.md b/STORY_3.2_IMPLEMENTATION_SUMMARY.md index 607dff8..a955eae 100644 --- a/STORY_3.2_IMPLEMENTATION_SUMMARY.md +++ b/STORY_3.2_IMPLEMENTATION_SUMMARY.md @@ -126,6 +126,7 @@ CREATE TABLE article_links ( from_content_id INTEGER NOT NULL, to_content_id INTEGER NULL, to_url TEXT NULL, + anchor_text TEXT NULL, -- Added in Story 4.5 link_type VARCHAR(20) NOT NULL, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (from_content_id) REFERENCES generated_content(id) ON DELETE CASCADE, diff --git a/STORY_4.5_IMPLEMENTATION_SUMMARY.md b/STORY_4.5_IMPLEMENTATION_SUMMARY.md new file mode 100644 index 0000000..40974f2 --- /dev/null +++ b/STORY_4.5_IMPLEMENTATION_SUMMARY.md @@ -0,0 +1,172 @@ +# Story 4.5: Create URL and Link Reporting Script - Implementation Summary + +**Status:** ✅ COMPLETE +**Story Points:** 3 +**Date Completed:** October 22, 2025 + +## Overview +Implemented a CLI command to export article URLs with optional link details (anchor text and destination URLs) based on project and tier filters. Additionally enhanced the data model to store anchor text directly in the database for better performance and data integrity. + +## Implementation + +### Core Features Implemented + +1. **CLI Command: `get-links`** + - Location: `src/cli/commands.py` + - Exports article URLs in CSV format + - Required arguments: + - `--project-id` / `-p`: Project ID to filter + - `--tier` / `-t`: Tier filter (supports "1", "2", or "2+" for ranges) + - Optional flags: + - `--with-anchor-text`: Include anchor text used for tiered links + - `--with-destination-url`: Include destination URL that the article links to + - Output: CSV to stdout (can be redirected to file) + +2. 
**Database Enhancement: anchor_text Field** + - Added `anchor_text` column to `article_links` table + - Migration script: `scripts/migrate_add_anchor_text.py` + - Updated `ArticleLink` model with new field + - Updated `ArticleLinkRepository.create()` to accept anchor_text parameter + +3. **Content Injection Updates** + - Modified `src/interlinking/content_injection.py` to capture and store actual anchor text used + - Updated `_try_inject_link()` to return the anchor text that was successfully injected + - All link creation calls now include anchor_text: + - Tiered links (money site and lower tier) + - Homepage links + - See Also section links + +## Files Modified + +### Database Layer +- `src/database/models.py` - Added `anchor_text` field to ArticleLink model +- `src/database/repositories.py` - Updated ArticleLinkRepository.create() +- `scripts/migrate_add_anchor_text.py` - New migration script + +### Business Logic +- `src/interlinking/content_injection.py`: + - Modified `_try_inject_link()` signature to return anchor text + - Updated `_inject_tiered_links()` to capture anchor text + - Updated `_inject_homepage_link()` to capture anchor text + - Updated `_inject_see_also_section()` to store article titles as anchor text + +### CLI +- `src/cli/commands.py`: + - Added `get-links` command + - Simplified implementation (no HTML parsing needed) + - Direct database read for anchor text + +### Tests +- `tests/integration/test_get_links_command.py` - New comprehensive test suite (9 tests) + +### Documentation +- `docs/prd/epic-4-deployment.md` - Updated Story 4.5 status to COMPLETE +- `docs/stories/story-3.2-find-tiered-links.md` - Updated ArticleLink schema to include anchor_text field +- `docs/architecture/data-models.md` - Added ArticleLink model documentation with anchor_text field +- `STORY_3.2_IMPLEMENTATION_SUMMARY.md` - Updated schema to include anchor_text field + +## Usage Examples + +### Basic usage - get all tier 1 URLs +```bash +python main.py get-links 
--project-id 1 --tier 1 +``` + +### Get tier 2 and above with anchor text and destinations +```bash +python main.py get-links --project-id 1 --tier 2+ --with-anchor-text --with-destination-url +``` + +### Export to file +```bash +python main.py get-links --project-id 1 --tier 1 --with-anchor-text > tier1_links.csv +``` + +## CSV Output Format + +**Basic (no flags):** +```csv +article_url,tier,title +https://example.com/article1.html,tier1,Article Title 1 +``` + +**With anchor text:** +```csv +article_url,tier,title,anchor_text +https://example.com/article1.html,tier1,Article Title 1,expert services +``` + +**With destination URL:** +```csv +article_url,tier,title,destination_url +https://example.com/article1.html,tier1,Article Title 1,https://www.moneysite.com +``` + +**With both flags:** +```csv +article_url,tier,title,anchor_text,destination_url +https://example.com/article1.html,tier1,Article Title 1,expert services,https://www.moneysite.com +``` + +## Testing + +**Test Coverage:** 9 integration tests, all passing + +**Test Cases:** +1. Basic tier 1 export (no optional flags) +2. Tier range filter (2+) +3. Export with anchor text +4. Export with destination URL +5. Export with both flags +6. Tier 2 resolves to_content_id to deployed URL +7. Error handling - invalid project +8. Error handling - invalid tier format +9. Error handling - no deployed articles + +## Database Enhancement Benefits + +The addition of the `anchor_text` field to the `article_links` table provides: + +1. **Performance**: No HTML parsing required - direct database read +2. **Data Integrity**: Know exactly what anchor text was used for each link +3. **Auditability**: Track link relationships and their anchor text +4. 
**Simplicity**: Cleaner code without BeautifulSoup HTML parsing in CLI + +## Migration + +To apply the database changes to existing databases: +```bash +python scripts/migrate_add_anchor_text.py +``` + +To rollback: +```bash +python scripts/migrate_add_anchor_text.py rollback +``` + +**Note:** Existing links will have NULL anchor_text. Re-run content injection to populate this field for existing content. + +## Acceptance Criteria - Verification + +✅ A new CLI command `get-links` is created +✅ The script accepts a mandatory `project_id` +✅ The script accepts a `tier` specifier supporting single tier and ranges (e.g., "2+") +✅ Optional flag `--with-anchor-text` includes the anchor text +✅ Optional flag `--with-destination-url` includes the destination URL +✅ The script queries the database to retrieve link information +✅ The output is well-formatted CSV printed to stdout + +## Known Limitations + +- Only reports tiered links (excludes homepage and see also links) +- Existing article_links records created before migration will have NULL anchor_text +- CSV output goes to stdout only (user must redirect to file) + +## Future Enhancements + +Potential improvements for future stories: +- Add `--link-type` flag to filter by link type (tiered, homepage, wheel_see_also) +- Add `--output` flag to write directly to file +- Add JSON output format option +- Add summary statistics (total links, link types breakdown) + diff --git a/docs/architecture/data-models.md b/docs/architecture/data-models.md index 408b7c7..0f4bdf2 100644 --- a/docs/architecture/data-models.md +++ b/docs/architecture/data-models.md @@ -51,7 +51,35 @@ The following data models will be implemented using SQLAlchemy. - `augmented`: Content was below minimum and was augmented - `failed`: Generation failed (error details in outline JSON) -## 4. FqdnMapping +## 4. ArticleLink + +**Purpose**: Tracks link relationships between articles for interlinking (tiered links, wheel links, homepage links). 
+ +**Key Attributes**: +- `id`: Integer, Primary Key, Auto-increment +- `from_content_id`: Integer, Foreign Key to GeneratedContent, Not Null, Indexed +- `to_content_id`: Integer, Foreign Key to GeneratedContent, Nullable, Indexed +- `to_url`: Text, Nullable (for external links like money site) +- `anchor_text`: Text, Nullable (actual anchor text used for the link, added in Story 4.5) +- `link_type`: String(20), Not Null, Indexed (tiered, wheel_next, wheel_prev, homepage, wheel_see_also) +- `created_at`: DateTime, Not Null + +**Relationships**: +- Belongs to one GeneratedContent (source) +- Optionally belongs to another GeneratedContent (target) + +**Link Types**: +- `tiered`: Link from tier N article to tier N-1 article (or money site for tier 1) +- `wheel_next`: Link to next article in batch wheel +- `wheel_prev`: Link to previous article in batch wheel +- `wheel_see_also`: Link in "See Also" section +- `homepage`: Link to site homepage + +**Constraints**: +- Either `to_content_id` OR `to_url` must be set (not both) +- Unique constraint on (from_content_id, to_content_id, link_type) + +## 5. FqdnMapping **Purpose**: Maps cloud storage buckets to fully qualified domain names for URL generation. diff --git a/docs/prd/epic-4-deployment.md b/docs/prd/epic-4-deployment.md index e352dbf..2d84a5b 100644 --- a/docs/prd/epic-4-deployment.md +++ b/docs/prd/epic-4-deployment.md @@ -8,7 +8,7 @@ To deploy all finalized HTML content (articles and boilerplate pages) for a batc - **Story 4.2**: ✅ COMPLETE (implemented in Story 4.1) - **Story 4.3**: ✅ COMPLETE (implemented in Story 4.1) - **Story 4.4**: ✅ COMPLETE (5 story points) -- **Story 4.5**: Not started +- **Story 4.5**: ✅ COMPLETE (3 story points) ## Stories @@ -78,7 +78,7 @@ To deploy all finalized HTML content (articles and boilerplate pages) for a batc * Can be run manually after deployment or integrated into auto-deploy workflow. 
### Story 4.5: Create URL and Link Reporting Script -**Status:** Not Started +**Status:** ✅ COMPLETE **As a user**, I want a script to generate custom lists of URLs based on project and tier, with optional link details, so that I can easily export data for analysis or external tools. @@ -93,14 +93,9 @@ To deploy all finalized HTML content (articles and boilerplate pages) for a batc * The script queries the database to retrieve the required link and content information. * The output is a well-formatted list (e.g., CSV or plain text) printed to the console. -## Technical Debt -- Multi-cloud support (AWS S3, Azure, DigitalOcean, etc.) - deferred from Story 4.1 -- CDN cache purging after deployment -- Boilerplate page storage optimization (regenerate on-the-fly vs storing HTML) -- Homepage (`index.html`) generation for sites - ## Notes - Story 4.1 is the primary deployment story and includes core functionality from Stories 4.2 and 4.3 - Auto-deploy is enabled by default to streamline workflow - All cloud provider credentials come from `.env` file only - Story 4.4 and 4.5 are independent utilities that can be implemented as needed +- Technical debt items tracked in [technical-debt.md](../technical-debt.md#epic-4-cloud-deployment) diff --git a/docs/prd/epic-5-maintenance.md b/docs/prd/epic-5-maintenance.md index 053ffac..330666f 100644 --- a/docs/prd/epic-5-maintenance.md +++ b/docs/prd/epic-5-maintenance.md @@ -4,7 +4,7 @@ To automate recurring site-level maintenance tasks that occur post-deployment, ensuring sites remain current and well-maintained without manual intervention. ## Rationale -After initial content deployment, sites require ongoing maintenance tasks such as updating homepages with new articles, refreshing navigation, and managing site-level pages. These tasks are: +After initial content deployment, sites require ongoing maintenance tasks. These tasks are: - **Recurring**: Need to run regularly (daily, weekly, etc.) 
- **Post-Deployment**: Occur after articles are published - **Site-Level Scope**: Operate on the entire site rather than individual articles diff --git a/docs/stories/story-3.2-find-tiered-links.md b/docs/stories/story-3.2-find-tiered-links.md index ac3fd0c..c1a7b1b 100644 --- a/docs/stories/story-3.2-find-tiered-links.md +++ b/docs/stories/story-3.2-find-tiered-links.md @@ -159,6 +159,7 @@ CREATE TABLE article_links ( from_content_id INTEGER NOT NULL, to_content_id INTEGER NULL, to_url TEXT NULL, + anchor_text TEXT NULL, link_type VARCHAR(20) NOT NULL, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (from_content_id) REFERENCES generated_content(id) ON DELETE CASCADE, @@ -172,6 +173,8 @@ CREATE INDEX idx_article_links_to ON article_links(to_content_id); CREATE INDEX idx_article_links_type ON article_links(link_type); ``` +**Note:** The `anchor_text` field was added in Story 4.5 to store the actual anchor text used for each link, improving query performance and data integrity. 
+ **Link Types:** - `tiered`: Link from tier N article to tier N-1 article (or money site for tier 1) - `wheel_next`: Link to next article in batch wheel @@ -202,6 +205,7 @@ class ArticleLink(Base): index=True ) to_url: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + anchor_text: Mapped[Optional[str]] = mapped_column(Text, nullable=True) # Added in Story 4.5 link_type: Mapped[str] = mapped_column(String(20), nullable=False, index=True) created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False) ``` @@ -227,6 +231,7 @@ link_repo.create( from_content_id=article_a.id, to_content_id=article_b.id, to_url=None, + anchor_text="Next Article", link_type="wheel_next" ) @@ -235,6 +240,7 @@ link_repo.create( from_content_id=tier1_article.id, to_content_id=None, to_url="https://www.moneysite.com", + anchor_text="expert services", # Added in Story 4.5 link_type="tiered" ) @@ -243,6 +249,7 @@ link_repo.create( from_content_id=tier2_article.id, to_content_id=tier1_article.id, to_url=None, + anchor_text="learn more", # Added in Story 4.5 link_type="tiered" ) diff --git a/docs/technical-debt.md b/docs/technical-debt.md index 0bd89d3..dc8e197 100644 --- a/docs/technical-debt.md +++ b/docs/technical-debt.md @@ -493,6 +493,116 @@ See full specification: `docs/stories/story-3.4-boilerplate-site-pages.md` --- +## Epic 4: Cloud Deployment + +### Multi-Cloud Storage Support + +**Priority**: Low +**Epic**: Epic 4 (Deployment) +**Estimated Effort**: Medium (5-8 story points) +**Status**: Deferred from Story 4.1 + +#### Problem +Story 4.1 implements deployment to Bunny.net storage only. Support for other cloud providers (AWS S3, Azure Blob Storage, DigitalOcean Spaces, Backblaze B2, etc.) was deferred. 
+ +#### Impact +- Limited flexibility for users who prefer or require other providers +- Cannot leverage existing infrastructure on other platforms +- Vendor lock-in to Bunny.net + +#### Solution +Implement a storage provider abstraction layer with pluggable backends: +- Abstract `StorageClient` interface +- Provider-specific implementations (S3Client, AzureClient, etc.) +- Provider selection via site deployment configuration +- All credentials via `.env` file + +**Dependencies**: None (can be implemented anytime) + +--- + +### CDN Cache Purging After Deployment + +**Priority**: Medium +**Epic**: Epic 4 (Deployment) +**Estimated Effort**: Small (2-3 story points) + +#### Problem +After deploying updated content, old versions may remain cached in CDN, causing users to see stale content until cache naturally expires. + +#### Impact +- Content updates not immediately visible +- Confusing for testing/verification +- May take hours for changes to propagate + +#### Solution +Add cache purging step after successful deployment: +- Bunny.net: Use Pull Zone purge API +- Purge specific URLs or entire zone +- Optional flag to skip purging (for performance) +- Report purge status in deployment summary + +**Dependencies**: Story 4.1 (deployment must work first) + +--- + +### Boilerplate Page Storage Optimization + +**Priority**: Low +**Epic**: Epic 3/4 (Pre-deployment/Deployment) +**Estimated Effort**: Small (2-3 story points) + +#### Problem +Story 3.4 stores full HTML for boilerplate pages (about, contact, privacy) in the database. This is inefficient and creates consistency issues if templates change. 
+ +#### Impact +- Database bloat (HTML is large) +- Template changes don't retroactively apply to existing pages +- Difficult to update content across all sites + +#### Solution +Store only metadata, regenerate HTML on-the-fly during deployment: +- Database: Store only `page_type` marker (not full HTML) +- Deployment: Generate HTML using current template at deploy time +- Ensures consistency with latest templates +- Reduces storage requirements + +**Alternative**: Keep current approach if regeneration adds too much complexity. + +**Dependencies**: Story 3.4 and 4.1 (both must exist first) + +--- + +### Homepage (index.html) Generation + +**Priority**: Medium +**Epic**: Epic 3 (Pre-deployment) or Epic 4 (Deployment) +**Estimated Effort**: Medium (5-8 story points) + +#### Problem +Sites have navigation with `/index.html` link, but no homepage exists. Users landing on root domain see 404 or directory listing. + +#### Impact +- Poor user experience for site visitors +- Unprofessional appearance +- Lost SEO opportunity (homepage is important) + +#### Solution +Generate `index.html` for each site with: +- List of recent articles (with links) +- Site branding/header +- Brief description +- Professional layout using same template system + +**Options:** +1. Static page generated once during site creation +2. Dynamic listing updated after each deployment +3. Simple redirect to first article + +**Dependencies**: Story 3.4 (boilerplate page infrastructure) + +--- + ## Future Sections Add new technical debt items below as they're identified during development. 
def _anchor_text_column_exists(session) -> bool:
    """Return True when article_links already has an anchor_text column.

    Uses the SQLAlchemy inspector on the session's bind, so the check is
    independent of the SQL dialect in use.
    """
    # Local import: the script's top-level imports only bring in `text`.
    from sqlalchemy import inspect

    columns = inspect(session.get_bind()).get_columns("article_links")
    return any(column["name"] == "anchor_text" for column in columns)


def migrate():
    """Add anchor_text field to article_links table.

    Safe to run repeatedly: if the column is already present the function
    reports that and returns without touching the schema.

    Raises:
        Exception: any error raised while inspecting or altering the table is
            re-raised after the session has been rolled back.
    """

    session = db_manager.get_session()

    try:
        print("Starting migration: Add anchor_text field to article_links...")

        # Re-running ALTER TABLE ... ADD COLUMN would fail on a duplicate
        # column, so check first and make the script idempotent.
        if _anchor_text_column_exists(session):
            print("  anchor_text column already exists - nothing to do.")
            return

        print("  Adding anchor_text column...")
        session.execute(text("""
            ALTER TABLE article_links
            ADD COLUMN anchor_text TEXT NULL
        """))

        session.commit()

        print("Migration completed successfully!")
        print("\nNew field added:")
        print("  - anchor_text (TEXT, nullable)")
        print("\nNote: Existing links will have NULL anchor_text.")
        print("      Re-run content injection to populate this field for existing content.")

    except Exception as e:
        session.rollback()
        print(f"Migration failed: {e}")
        raise

    finally:
        session.close()


def rollback():
    """Rollback migration (remove anchor_text field).

    Safe to run repeatedly: if the column is not present the function reports
    that and returns without touching the schema.

    Raises:
        Exception: any error raised while inspecting or altering the table is
            re-raised after the session has been rolled back.
    """

    session = db_manager.get_session()

    try:
        print("Rolling back migration: Remove anchor_text field from article_links...")

        # Dropping a column that was never added would raise; skip instead.
        if not _anchor_text_column_exists(session):
            print("  anchor_text column not present - nothing to do.")
            return

        print("  Removing anchor_text column...")
        # NOTE(review): if the backing database is SQLite, DROP COLUMN needs
        # SQLite 3.35.0 or newer - confirm the deployed version.
        session.execute(text("""
            ALTER TABLE article_links
            DROP COLUMN anchor_text
        """))

        session.commit()

        print("Rollback completed successfully!")

    except Exception as e:
        session.rollback()
        print(f"Rollback failed: {e}")
        raise

    finally:
        session.close()


if __name__ == "__main__":
    # `python scripts/migrate_add_anchor_text.py rollback` undoes the change;
    # any other invocation applies it.
    if len(sys.argv) > 1 and sys.argv[1] == "rollback":
        rollback()
    else:
        migrate()
@app.command("get-links")
@click.option('--project-id', '-p', required=True, type=int, help='Project ID to get links for')
@click.option('--tier', '-t', required=True, help='Tier to filter (e.g., "1" or "2+" for tier 2 and above)')
@click.option('--with-anchor-text', is_flag=True, help='Include anchor text used for tiered links')
@click.option('--with-destination-url', is_flag=True, help='Include destination URL that the article links to')
def get_links(project_id: int, tier: str, with_anchor_text: bool, with_destination_url: bool):
    """Export article URLs with optional link details for a project and tier"""
    # NOTE: the docstring above doubles as the command's --help text, so the
    # detailed notes live in comments.
    #
    # Output is CSV on stdout with columns article_url, tier, title, followed
    # by anchor_text and/or destination_url when the matching flag is given.
    # With either flag set, an article produces one row per link of type
    # 'tiered', or a single row padded with empty cells when it has none.
    # Every failure is reported on stderr and ends in click.Abort.
    import csv
    import sys
    from src.database.repositories import ArticleLinkRepository

    try:
        session = db_manager.get_session()

        try:
            content_repo = GeneratedContentRepository(session)
            project_repo = ProjectRepository(session)
            link_repo = ArticleLinkRepository(session)

            project = project_repo.get_by_id(project_id)
            if not project:
                click.echo(f"Error: Project {project_id} not found", err=True)
                raise click.Abort()

            # A trailing '+' means "this tier and every tier above it".
            tier_range_mode = tier.endswith('+')
            try:
                min_tier = int(tier[:-1] if tier_range_mode else tier)
            except ValueError:
                click.echo(f"Error: Invalid tier format '{tier}'. Use '1', '2', or '2+'", err=True)
                raise click.Abort()

            all_articles = content_repo.get_by_project_id(project_id)

            # Only articles that have actually been deployed are exported.
            if tier_range_mode:
                articles = [
                    a for a in all_articles
                    if a.deployed_url and int(a.tier.replace('tier', '')) >= min_tier
                ]
            else:
                tier_str = f"tier{min_tier}"
                articles = [a for a in all_articles if a.deployed_url and a.tier == tier_str]

            if not articles:
                click.echo(f"No deployed articles found for project {project_id} with tier filter '{tier}'", err=True)
                raise click.Abort()

            csv_writer = csv.writer(sys.stdout)

            header = ['article_url', 'tier', 'title']
            if with_anchor_text:
                header.append('anchor_text')
            if with_destination_url:
                header.append('destination_url')

            csv_writer.writerow(header)

            # Many links may point at the same target article; remember each
            # resolved URL so every target is fetched at most once.
            resolved_urls = {}

            def destination_for(link):
                """Return the URL a link points to, or '' when it cannot be resolved."""
                if link.to_url:
                    return link.to_url
                if not link.to_content_id:
                    return ''
                target_id = link.to_content_id
                if target_id not in resolved_urls:
                    target_article = content_repo.get_by_id(target_id)
                    resolved_urls[target_id] = (
                        target_article.deployed_url
                        if target_article and target_article.deployed_url
                        else ''
                    )
                return resolved_urls[target_id]

            include_link_columns = with_anchor_text or with_destination_url

            for article in articles:
                base_row = [article.deployed_url, article.tier, article.title]

                if not include_link_columns:
                    csv_writer.writerow(base_row)
                    continue

                tiered_links = [
                    link for link in link_repo.get_by_source_article(article.id)
                    if link.link_type == 'tiered'
                ]

                if not tiered_links:
                    # Keep the row aligned with the header: one empty cell per
                    # requested link column.
                    csv_writer.writerow(base_row + [''] * (len(header) - len(base_row)))
                    continue

                for link in tiered_links:
                    row_with_link = list(base_row)
                    if with_anchor_text:
                        row_with_link.append(link.anchor_text or '')
                    if with_destination_url:
                        row_with_link.append(destination_for(link))
                    csv_writer.writerow(row_with_link)

        finally:
            session.close()

    except click.Abort:
        # click.Abort is a RuntimeError subclass; the specific error was
        # already printed where it was raised, so pass it through instead of
        # letting the generic handler print an empty "Error getting links: ".
        raise
    except Exception as e:
        click.echo(f"Error getting links: {e}", err=True)
        raise click.Abort()
b/src/database/models.py @@ -170,6 +170,7 @@ class ArticleLink(Base): index=True ) to_url: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + anchor_text: Mapped[Optional[str]] = mapped_column(Text, nullable=True) link_type: Mapped[str] = mapped_column(String(20), nullable=False, index=True) created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, nullable=False) diff --git a/src/database/repositories.py b/src/database/repositories.py index 82bea6a..13e3de2 100644 --- a/src/database/repositories.py +++ b/src/database/repositories.py @@ -560,6 +560,7 @@ class ArticleLinkRepository(IArticleLinkRepository): from_content_id: int, to_content_id: Optional[int] = None, to_url: Optional[str] = None, + anchor_text: Optional[str] = None, link_type: str = "tiered" ) -> ArticleLink: """ @@ -569,6 +570,7 @@ class ArticleLinkRepository(IArticleLinkRepository): from_content_id: Source article ID to_content_id: Target article ID (for internal links) to_url: Target URL (for external links like money site) + anchor_text: The anchor text used for the link link_type: Type of link (tiered, wheel_next, wheel_prev, homepage) Returns: @@ -584,6 +586,7 @@ class ArticleLinkRepository(IArticleLinkRepository): from_content_id=from_content_id, to_content_id=to_content_id, to_url=to_url, + anchor_text=anchor_text, link_type=link_type ) diff --git a/src/interlinking/content_injection.py b/src/interlinking/content_injection.py index b181d2c..f8118aa 100644 --- a/src/interlinking/content_injection.py +++ b/src/interlinking/content_injection.py @@ -106,14 +106,15 @@ def _inject_tiered_links( anchor_texts = _get_anchor_texts_for_tier("tier1", project, job_config) # Try to inject link - html, link_injected = _try_inject_link(html, anchor_texts, target_url) + html, link_injected, anchor_text_used = _try_inject_link(html, anchor_texts, target_url) if link_injected: - # Record link + # Record link with anchor text link_repo.create( from_content_id=content.id, 
to_content_id=None, to_url=target_url, + anchor_text=anchor_text_used, link_type="tiered" ) logger.info(f"Injected money site link for content {content.id}") @@ -139,14 +140,15 @@ def _inject_tiered_links( continue # Try to inject link - html, link_injected = _try_inject_link(html, [anchor_text], target_url) + html, link_injected, anchor_text_used = _try_inject_link(html, [anchor_text], target_url) if link_injected: - # Record link + # Record link with anchor text link_repo.create( from_content_id=content.id, to_content_id=None, to_url=target_url, + anchor_text=anchor_text_used, link_type="tiered" ) logger.info(f"Injected lower tier link to {target_url} for content {content.id}") @@ -177,7 +179,7 @@ def _inject_homepage_link( anchor_text = "Home" # Try to inject link (will search article content only, not nav) - html, link_injected = _try_inject_link(html, [anchor_text], homepage_url) + html, link_injected, anchor_text_used = _try_inject_link(html, [anchor_text], homepage_url) if link_injected: # Record link @@ -185,6 +187,7 @@ def _inject_homepage_link( from_content_id=content.id, to_content_id=None, to_url=homepage_url, + anchor_text=anchor_text_used, link_type="homepage" ) logger.info(f"Injected homepage link for content {content.id}") @@ -221,6 +224,7 @@ def _inject_see_also_section( from_content_id=content.id, to_content_id=article['content_id'], to_url=None, + anchor_text=article['title'], link_type="wheel_see_also" ) @@ -259,25 +263,25 @@ def _get_anchor_texts_for_tier( return default_anchors -def _try_inject_link(html: str, anchor_texts: List[str], target_url: str) -> Tuple[str, bool]: +def _try_inject_link(html: str, anchor_texts: List[str], target_url: str) -> Tuple[str, bool, Optional[str]]: """ Try to inject a link with anchor text into HTML - Returns (updated_html, link_injected) + Returns (updated_html, link_injected, anchor_text_used) """ for anchor_text in anchor_texts: # Try to find and wrap anchor text in content updated_html, found = 
_find_and_wrap_anchor_text(html, anchor_text, target_url) if found: - return updated_html, True + return updated_html, True, anchor_text # Fallback: insert anchor text + link into random paragraph if anchor_texts: anchor_text = anchor_texts[0] updated_html = _insert_link_into_random_paragraph(html, anchor_text, target_url) - return updated_html, True + return updated_html, True, anchor_text - return html, False + return html, False, None def _find_and_wrap_anchor_text(html: str, anchor_text: str, target_url: str) -> Tuple[str, bool]: diff --git a/tests/integration/test_get_links_command.py b/tests/integration/test_get_links_command.py new file mode 100644 index 0000000..2de192f --- /dev/null +++ b/tests/integration/test_get_links_command.py @@ -0,0 +1,298 @@ +"""Integration tests for get-links CLI command""" + +import pytest +from click.testing import CliRunner +from src.cli.commands import app +from src.database.models import Project, GeneratedContent, ArticleLink +from src.database.session import db_manager + + +@pytest.fixture +def cli_runner(): + """Create Click CLI runner""" + return CliRunner() + + +@pytest.fixture +def test_project_with_deployed_content(): + """Create a test project with deployed articles and tiered links""" + session = db_manager.get_session() + try: + project = Project( + name="Test Link Export Project", + main_keyword="test keyword", + user_id=1, + money_site_url="https://www.moneysite.com" + ) + session.add(project) + session.flush() + + tier1_html = ''' + +

Test Article

+

This is content about test keyword services.

+ + ''' + + tier2_html = ''' + +

Test Tier 2

+

Check out our expert guide for more.

+ + ''' + + article1 = GeneratedContent( + project_id=project.id, + tier="tier1", + keyword="test keyword 1", + title="Test Article 1 - Tier 1", + outline={"sections": []}, + content="

Test content

", + word_count=500, + status="deployed", + formatted_html=tier1_html, + deployed_url="https://cdn1.example.com/article1.html" + ) + session.add(article1) + + article2 = GeneratedContent( + project_id=project.id, + tier="tier1", + keyword="test keyword 2", + title="Test Article 2 - Tier 1", + outline={"sections": []}, + content="

Test content 2

", + word_count=600, + status="deployed", + formatted_html=tier1_html, + deployed_url="https://cdn1.example.com/article2.html" + ) + session.add(article2) + + article3 = GeneratedContent( + project_id=project.id, + tier="tier2", + keyword="test keyword 3", + title="Test Article 3 - Tier 2", + outline={"sections": []}, + content="

Test content 3

", + word_count=700, + status="deployed", + formatted_html=tier2_html, + deployed_url="https://cdn2.example.com/article3.html" + ) + session.add(article3) + + article4 = GeneratedContent( + project_id=project.id, + tier="tier3", + keyword="test keyword 4", + title="Test Article 4 - Tier 3", + outline={"sections": []}, + content="

Test content 4

class TestGetLinksCommand:
    """Test suite for get-links command"""

    # Deployed URLs created by the test_project_with_deployed_content fixture.
    ARTICLE1_URL = 'https://cdn1.example.com/article1.html'
    ARTICLE2_URL = 'https://cdn1.example.com/article2.html'
    ARTICLE3_URL = 'https://cdn2.example.com/article3.html'
    ARTICLE4_URL = 'https://cdn3.example.com/article4.html'
    MONEY_SITE_URL = 'https://www.moneysite.com'

    @staticmethod
    def _export(cli_runner, project_id, tier, *flags):
        """Invoke get-links for a project/tier with any extra flags and return the result."""
        argv = ['get-links', '--project-id', str(project_id), '--tier', tier, *flags]
        return cli_runner.invoke(app, argv)

    def test_get_links_basic_tier1(self, cli_runner, test_project_with_deployed_content):
        """Test basic tier 1 export without optional flags"""
        project, _ = test_project_with_deployed_content

        result = self._export(cli_runner, project.id, '1')

        assert result.exit_code == 0
        output = result.output

        assert 'article_url,tier,title' in output
        assert self.ARTICLE1_URL in output
        assert self.ARTICLE2_URL in output
        assert 'tier1' in output
        assert self.ARTICLE3_URL not in output

    def test_get_links_tier_range(self, cli_runner, test_project_with_deployed_content):
        """Test tier range filter (2+)"""
        project, _ = test_project_with_deployed_content

        result = self._export(cli_runner, project.id, '2+')

        assert result.exit_code == 0
        output = result.output

        assert self.ARTICLE3_URL in output
        assert self.ARTICLE4_URL in output
        assert 'tier2' in output
        assert 'tier3' in output
        assert self.ARTICLE1_URL not in output

    def test_get_links_with_anchor_text(self, cli_runner, test_project_with_deployed_content):
        """Test export with the anchor text column"""
        project, _ = test_project_with_deployed_content

        result = self._export(cli_runner, project.id, '1', '--with-anchor-text')

        assert result.exit_code == 0
        output = result.output

        assert 'article_url,tier,title,anchor_text' in output
        assert 'test keyword' in output

    def test_get_links_with_destination_url(self, cli_runner, test_project_with_deployed_content):
        """Test export with destination URLs"""
        project, _ = test_project_with_deployed_content

        result = self._export(cli_runner, project.id, '1', '--with-destination-url')

        assert result.exit_code == 0
        output = result.output

        assert 'article_url,tier,title,destination_url' in output
        assert self.MONEY_SITE_URL in output

    def test_get_links_with_both_flags(self, cli_runner, test_project_with_deployed_content):
        """Test export with both anchor text and destination URL"""
        project, _ = test_project_with_deployed_content

        result = self._export(
            cli_runner, project.id, '1', '--with-anchor-text', '--with-destination-url'
        )

        assert result.exit_code == 0
        output = result.output

        assert 'article_url,tier,title,anchor_text,destination_url' in output
        assert 'test keyword' in output
        assert self.MONEY_SITE_URL in output

    def test_get_links_tier2_resolves_content_id(self, cli_runner, test_project_with_deployed_content):
        """Test that tier 2 links resolve to_content_id to deployed URL"""
        project, _ = test_project_with_deployed_content

        result = self._export(cli_runner, project.id, '2', '--with-destination-url')

        assert result.exit_code == 0
        output = result.output

        # The tier 2 article's own URL plus the tier 1 URL its link resolves to.
        assert self.ARTICLE3_URL in output
        assert self.ARTICLE1_URL in output

    def test_get_links_invalid_project(self, cli_runner):
        """Test error handling for non-existent project"""
        result = self._export(cli_runner, 99999, '1')

        assert result.exit_code == 1
        assert 'Error: Project 99999 not found' in result.output

    def test_get_links_invalid_tier_format(self, cli_runner, test_project_with_deployed_content):
        """Test error handling for invalid tier format"""
        project, _ = test_project_with_deployed_content

        result = self._export(cli_runner, project.id, 'invalid')

        assert result.exit_code == 1
        assert 'Invalid tier format' in result.output

    def test_get_links_no_deployed_articles(self, cli_runner):
        """Test error handling when no deployed articles exist"""
        session = db_manager.get_session()
        try:
            # A project with no content at all, hence nothing deployed.
            empty_project = Project(
                name="Empty Project",
                main_keyword="empty",
                user_id=1
            )
            session.add(empty_project)
            session.commit()

            result = self._export(cli_runner, empty_project.id, '1')

            assert result.exit_code == 1
            assert 'No deployed articles found' in result.output
        finally:
            session.close()