From 04f10d6d26f57adb8cf41b83ffa761e9e17aae83 Mon Sep 17 00:00:00 2001 From: PeninsulaInd Date: Mon, 3 Nov 2025 10:07:35 -0600 Subject: [PATCH] Adds ability to quickly import without CORA- version 1.2 --- .../story-2.8-simple-spreadsheet-ingestion.md | 209 ++++++++++++++++++ docs/technical-debt.md | 43 ++++ requirements.txt | 2 +- src/auth/service.py | 12 + src/cli/commands.py | 129 ++++++++++- src/ingestion/parser.py | 157 +++++++++++++ 6 files changed, 548 insertions(+), 4 deletions(-) create mode 100644 docs/stories/story-2.8-simple-spreadsheet-ingestion.md diff --git a/docs/stories/story-2.8-simple-spreadsheet-ingestion.md b/docs/stories/story-2.8-simple-spreadsheet-ingestion.md new file mode 100644 index 0000000..164eb9b --- /dev/null +++ b/docs/stories/story-2.8-simple-spreadsheet-ingestion.md @@ -0,0 +1,209 @@ +# Story 2.8: Simple Spreadsheet Ingestion + +## Overview +Implement a simplified spreadsheet ingestion path that allows users to quickly create projects from basic data without requiring a full CORA report. This addresses the need for faster project setup when a full CORA run (20-25 minutes) is unnecessary. + +## Story Details +**As a User**, I want to ingest a simple spreadsheet with minimal required data, so that I can quickly create a project for content generation without waiting for a full CORA analysis. + +## Context +A full CORA run takes 20-25 minutes and includes extensive metrics. Sometimes users only need to add information from a few cells they pasted into a spreadsheet. Eventually this will be entered via a webform, but for now a simpler spreadsheet format is needed. + +## Acceptance Criteria + +### 1. 
CLI Command to Ingest Simple Spreadsheets +**Status:** PENDING + +A CLI command exists to accept simple .xlsx file paths: +- Command: `ingest-simple` +- Options: `--file`, `--name` (optional, overrides spreadsheet), `--money-site-url`, `--username`, `--password` +- Requires user authentication (any authenticated user can create projects) +- Returns success message with project details + +### 2. Spreadsheet Format +**Status:** PENDING + +The parser accepts a simple single-sheet spreadsheet format: +- **First row**: Headers (column names) +- **Second row**: Data values + +**Required columns:** +- `main_keyword`: Single phrase keyword (e.g., "shaft machining") +- `project_name`: Name for the project +- `related_searches`: Comma-delimited list (e.g., "term1, term2, term3"); an empty cell yields an empty array +- `entities`: Comma-delimited list (e.g., "entity1, entity2, entity3"); an empty cell yields an empty array + +**Optional columns:** +- `word_count`: Integer (default: 1500) +- `term_frequency`: Integer (default: 3) + +### 3. Data Parsing +**Status:** PENDING + +The parser correctly extracts and processes data: +- Parses comma-delimited `related_searches` into array +- Parses comma-delimited `entities` into array +- Applies defaults for optional fields (word_count=1500, term_frequency=3) +- Sets all structure metrics (title_exact_match, h1_exact, h2_total, etc.) to `None` +- Validates that `main_keyword` and `project_name` are present and non-empty + +### 4. Database Storage +**Status:** PENDING + +Project records are created with all data: +- User association (user_id foreign key) +- Main keyword and project name +- Word count and term frequency (with defaults) +- Entities and related searches as JSON arrays +- Structure metrics as `NULL` (not required for simple ingestion) +- Money site URL (prompted if not provided) +- Timestamps (created_at, updated_at) + +### 5. 
Error Handling +**Status:** PENDING + +Graceful error handling for: +- File not found errors +- Invalid Excel file format +- Missing required columns (main_keyword, project_name) +- Empty or invalid comma-delimited lists (treated as empty arrays) +- Authentication failures +- Database errors + +## Implementation Details + +### Files to Create/Modify + +#### 1. `src/ingestion/parser.py` - UPDATED +Add `SimpleSpreadsheetParser` class: +```python +class SimpleSpreadsheetParser: + """Parser for simple single-sheet spreadsheets with basic project data""" + + def __init__(self, file_path: str) + def _parse_comma_delimited(self, value: Any) -> List[str] + def parse(self) -> Dict[str, Any] +``` + +**Key Features:** +- Reads first sheet of workbook +- First row as headers (case-insensitive) +- Second row as data values +- Parses comma-delimited strings into arrays +- Applies defaults for optional fields +- Returns data structure compatible with `ProjectRepository.create()` + +#### 2. `src/cli/commands.py` - UPDATED +Add `ingest-simple` command: +```python +@app.command() +@click.option('--file', '-f', required=True) +@click.option('--name', '-n', help='Override project_name from spreadsheet') +@click.option('--money-site-url', '-m') +@click.option('--username', '-u') +@click.option('--password', '-p') +def ingest_simple(...) +``` + +**Features:** +- Authenticate user +- Parse simple spreadsheet +- Display parsed data summary +- Prompt for money_site_url if not provided +- Create project via ProjectRepository +- Show success summary + +### Data Model + +Uses existing `Project` model - no database changes required. Structure metrics will be `NULL` for simple ingestion projects. 
+ +### Spreadsheet Example + +**Simple Format:** +| main_keyword | project_name | related_searches | entities | word_count | term_frequency | +|-------------|--------------|------------------|----------|------------|----------------| +| best coffee makers | Coffee Project | best espresso machines, coffee maker reviews, top coffee makers | coffee, espresso, brewing | 1500 | 3 | + +**Minimal Format (uses defaults):** +| main_keyword | project_name | related_searches | entities | +|-------------|--------------|------------------|----------| +| shaft machining | Machining Project | CNC machining, precision machining | machining, lathe, milling | + +## CLI Usage + +**Basic:** +```bash +python main.py ingest-simple \ + --file simple_project.xlsx \ + --username admin \ + --password pass +``` + +**With Overrides:** +```bash +python main.py ingest-simple \ + --file simple_project.xlsx \ + --name "Custom Project Name" \ + --money-site-url https://example.com \ + --username admin \ + --password pass +``` + +**Expected Output:** +``` +Authenticated as: admin (Admin) + +Parsing simple spreadsheet: simple_project.xlsx +Main Keyword: best coffee makers +Project Name: Coffee Project +Word Count: 1500 +Term Frequency: 3 +Entities: 3 +Related Searches: 3 + Entities: coffee, espresso, brewing + Related Searches: best espresso machines, coffee maker reviews, top coffee makers + +Enter money site URL (required for tiered linking): https://moneysite.com + +Creating project: Coffee Project +Money Site URL: https://moneysite.com + +Success: Project 'Coffee Project' created (ID: 1) +Main Keyword: best coffee makers +Money Site URL: https://moneysite.com +Word Count: 1500 +Term Frequency: 3 +Entities: 3 +Related Searches: 3 +``` + +## Error Handling Examples + +**Missing Required Column:** +``` +Error parsing spreadsheet: Required field 'main_keyword' not found +``` + +**Invalid File:** +``` +Error parsing spreadsheet: Failed to open Excel file: [details] +``` + +**Empty Spreadsheet:** +``` 
+Error parsing spreadsheet: No headers found in spreadsheet +``` + +## Dependencies + +- Story 2.1 (CORA ingestion) - Reuses ProjectRepository and Project model +- Existing authentication system +- Existing database models + +## Future Enhancements + +- Support for multiple projects per spreadsheet (multiple data rows) +- CSV format support (in addition to Excel) +- Web form interface (deferred to future story) +- Validation of comma-delimited format with better error messages + diff --git a/docs/technical-debt.md b/docs/technical-debt.md index 08f1fab..bbf1b39 100644 --- a/docs/technical-debt.md +++ b/docs/technical-debt.md @@ -457,6 +457,49 @@ This would still provide value with much less complexity (2-3 story points inste ## Story 3.3: Content Interlinking Injection +### Anchor Text Variation Insertion + +**Priority**: Medium +**Epic Suggestion**: Epic 3 (Pre-deployment) - Enhancement +**Estimated Effort**: Small (1-2 story points) + +#### Problem +Currently, when anchor text (main keyword or variations) is not found in the generated article content, the system falls back to inserting only the main keyword. The system searches for variations like "learn about {keyword}" and "{keyword} guide", but these variations almost never exist in the AI-generated content. This means we always end up inserting the exact same anchor text (the main keyword), reducing anchor text diversity. + +#### Current Behavior +In `src/interlinking/content_injection.py`, the `_try_inject_link()` function: +1. Searches for anchor text variations in content (main keyword first, then variations) +2. If found, wraps that text with a link +3. **If not found, only inserts the first anchor text (main keyword) into content** + +Example for "shaft machining": +- Searches for: "shaft machining", "learn about shaft machining", "shaft machining guide", etc. 
+- Variations are almost never in the content +- Always falls back to inserting just "shaft machining" + +#### Proposed Solution +When anchor text is not found in content, randomly select from ALL available anchor text variations (not just the first one) for insertion: + +**Change in `_try_inject_link()`:** +```python +# Current: Always inserts anchor_texts[0] (main keyword) +# Proposed: Randomly select from all anchor_texts for insertion +if anchor_texts: + anchor_text = random.choice(anchor_texts) # Random variation instead of [0] + updated_html = _insert_link_into_random_paragraph(html, anchor_text, target_url) +``` + +#### Impact +- Improved anchor text diversity +- More natural linking patterns +- Better SEO through varied anchor text +- Leverages all generated variations instead of just one + +#### Dependencies +None - can be implemented immediately. + +--- + ### Boilerplate Site Pages (About, Contact, Privacy) **Priority**: High diff --git a/requirements.txt b/requirements.txt index 32fbacf..4162c00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ alembic==1.17.0 # Authentication passlib[bcrypt]==1.7.4 -bcrypt==4.0.1 +bcrypt==3.2.2 python-jose==3.5.0 cryptography==46.0.3 diff --git a/src/auth/service.py b/src/auth/service.py index a73ce66..2b12f16 100644 --- a/src/auth/service.py +++ b/src/auth/service.py @@ -34,7 +34,13 @@ class AuthService: Returns: The hashed password string + + Note: + bcrypt has a 72-byte limit, so passwords longer than 72 bytes are truncated """ + password_bytes = password.encode('utf-8') + if len(password_bytes) > 72: + password = password_bytes[:72].decode('utf-8', errors='ignore') return pwd_context.hash(password) @staticmethod @@ -48,7 +54,13 @@ class AuthService: Returns: True if password matches, False otherwise + + Note: + bcrypt has a 72-byte limit, so passwords longer than 72 bytes are truncated """ + password_bytes = plain_password.encode('utf-8') + if len(password_bytes) > 72: + plain_password = 
password_bytes[:72].decode('utf-8', errors='ignore') return pwd_context.verify(plain_password, hashed_password) def authenticate_user(self, username: str, password: str) -> Optional[User]: diff --git a/src/cli/commands.py b/src/cli/commands.py index a0e88ed..b9547fa 100644 --- a/src/cli/commands.py +++ b/src/cli/commands.py @@ -15,7 +15,7 @@ from src.deployment.bunnynet import ( BunnyNetAuthError, BunnyNetResourceConflictError ) -from src.ingestion.parser import CORAParser, CORAParseError +from src.ingestion.parser import CORAParser, CORAParseError, SimpleSpreadsheetParser from src.generation.ai_client import AIClient, PromptManager from src.generation.service import ContentGenerator from src.generation.batch_processor import BatchProcessor @@ -23,10 +23,14 @@ from src.database.repositories import GeneratedContentRepository, SitePageReposi from src.deployment.bunny_storage import BunnyStorageClient, BunnyStorageError from src.deployment.deployment_service import DeploymentService from src.deployment.url_logger import URLLogger +from dotenv import load_dotenv import os import requests import random +# Load .env file at module level +load_dotenv() + def authenticate_admin(username: str, password: str) -> Optional[User]: """ @@ -55,13 +59,22 @@ def authenticate_admin(username: str, password: str) -> Optional[User]: def prompt_admin_credentials() -> tuple[str, str]: """ Prompt for admin username and password + Checks environment variables CLIENT_USERNAME and CLIENT_PASSWORD first Returns: Tuple of (username, password) """ + username = os.getenv("CLIENT_USERNAME") + password = os.getenv("CLIENT_PASSWORD") + + if username and password: + return username, password + click.echo("Admin authentication required") - username = click.prompt("Username", type=str) - password = click.prompt("Password", type=str, hide_input=True) + if not username: + username = click.prompt("Username", type=str) + if not password: + password = click.prompt("Password", type=str, hide_input=True) 
return username, password @@ -861,6 +874,116 @@ def ingest_cora(file_path: str, name: str, money_site_url: Optional[str], custom raise click.Abort() +@app.command() +@click.option('--file', '-f', 'file_path', required=True, type=click.Path(exists=True), help='Path to simple .xlsx spreadsheet file') +@click.option('--name', '-n', help='Project name (overrides project_name from spreadsheet if provided)') +@click.option('--money-site-url', '-m', help='Money site URL (e.g., https://example.com)') +@click.option('--username', '-u', help='Username for authentication') +@click.option('--password', '-p', help='Password for authentication') +def ingest_simple(file_path: str, name: Optional[str], money_site_url: Optional[str], username: Optional[str], password: Optional[str]): + """Ingest a simple spreadsheet and create a new project + + Expected spreadsheet format: + - First row: Headers (main_keyword, project_name, related_searches, entities) + - Second row: Data values + + Required columns: main_keyword, project_name, related_searches, entities + - main_keyword: Single phrase keyword + - project_name: Name for the project + - related_searches: Comma-delimited list (e.g., "term1, term2, term3") + - entities: Comma-delimited list (e.g., "entity1, entity2, entity3") + + Optional columns (with defaults): + - word_count: Default 1500 + - term_frequency: Default 3 + """ + try: + if not username or not password: + username, password = prompt_admin_credentials() + + session = db_manager.get_session() + try: + user_repo = UserRepository(session) + auth_service = AuthService(user_repo) + + user = auth_service.authenticate_user(username, password) + if not user: + click.echo("Error: Authentication failed", err=True) + raise click.Abort() + + click.echo(f"Authenticated as: {user.username} ({user.role})") + + click.echo(f"\nParsing simple spreadsheet: {file_path}") + + parser = SimpleSpreadsheetParser(file_path) + data = parser.parse() + + project_name = name or 
data.get("project_name") + if not project_name: + click.echo("Error: Project name is required (provide via --name or in spreadsheet)", err=True) + raise click.Abort() + + click.echo(f"Main Keyword: {data['main_keyword']}") + click.echo(f"Project Name: {project_name}") + click.echo(f"Word Count: {data['word_count']}") + click.echo(f"Term Frequency: {data['term_frequency']}") + click.echo(f"Entities: {len(data['entities'])}") + click.echo(f"Related Searches: {len(data['related_searches'])}") + + if data['entities']: + click.echo(f" Entities: {', '.join(data['entities'][:5])}" + (f" ... (+{len(data['entities']) - 5} more)" if len(data['entities']) > 5 else "")) + + if data['related_searches']: + click.echo(f" Related Searches: {', '.join(data['related_searches'][:5])}" + (f" ... (+{len(data['related_searches']) - 5} more)" if len(data['related_searches']) > 5 else "")) + + if not money_site_url: + money_site_url = click.prompt( + "\nEnter money site URL (required for tiered linking)", + type=str + ) + + if not money_site_url.startswith('http://') and not money_site_url.startswith('https://'): + click.echo("Error: Money site URL must start with http:// or https://", err=True) + raise click.Abort() + + money_site_url = money_site_url.rstrip('/') + + click.echo(f"\nCreating project: {project_name}") + click.echo(f"Money Site URL: {money_site_url}") + + data['money_site_url'] = money_site_url + + project_data = {k: v for k, v in data.items() if k != 'project_name'} + + project_repo = ProjectRepository(session) + project = project_repo.create( + user_id=user.id, + name=project_name, + data=project_data + ) + + click.echo(f"\nSuccess: Project '{project.name}' created (ID: {project.id})") + click.echo(f"Main Keyword: {project.main_keyword}") + click.echo(f"Money Site URL: {project.money_site_url}") + click.echo(f"Word Count: {project.word_count}") + click.echo(f"Term Frequency: {project.term_frequency}") + click.echo(f"Entities: {len(project.entities or [])}") + 
click.echo(f"Related Searches: {len(project.related_searches or [])}") + + except CORAParseError as e: + click.echo(f"Error parsing spreadsheet: {e}", err=True) + raise click.Abort() + except ValueError as e: + click.echo(f"Error creating project: {e}", err=True) + raise click.Abort() + finally: + session.close() + + except Exception as e: + click.echo(f"Error ingesting spreadsheet: {e}", err=True) + raise click.Abort() + + @app.command() @click.option('--username', '-u', help='Username for authentication') @click.option('--password', '-p', help='Password for authentication') diff --git a/src/ingestion/parser.py b/src/ingestion/parser.py index 517d7ea..12080c7 100644 --- a/src/ingestion/parser.py +++ b/src/ingestion/parser.py @@ -258,3 +258,160 @@ class CORAParser: raise CORAParseError(f"Unexpected error during parsing: {e}") finally: self.workbook.close() + + +class SimpleSpreadsheetParser: + """Parser for simple single-sheet spreadsheets with basic project data""" + + def __init__(self, file_path: str): + """ + Initialize parser with file path + + Args: + file_path: Path to .xlsx file + + Raises: + CORAParseError: If file doesn't exist or can't be opened + """ + self.file_path = Path(file_path) + if not self.file_path.exists(): + raise CORAParseError(f"File not found: {file_path}") + + try: + self.workbook = openpyxl.load_workbook(self.file_path, data_only=True) + except Exception as e: + raise CORAParseError(f"Failed to open Excel file: {e}") + + def _parse_comma_delimited(self, value: Any) -> List[str]: + """ + Parse comma-delimited string into list + + Args: + value: String value or None + + Returns: + List of trimmed strings + """ + if not value: + return [] + + value_str = str(value).strip() + if not value_str: + return [] + + return [item.strip() for item in value_str.split(',') if item.strip()] + + def parse(self) -> Dict[str, Any]: + """ + Parse simple spreadsheet and return project data + + Expected format: + - First row: headers (main_keyword, 
project_name, related_searches, entities, etc.) + - Second row: data values + + Required columns: main_keyword, project_name, related_searches, entities + + Returns: + Dictionary with project data (same structure as CORAParser.parse()) + + Raises: + CORAParseError: If parsing fails + """ + try: + if not self.workbook.sheetnames: + raise CORAParseError("Spreadsheet has no sheets") + + sheet = self.workbook[self.workbook.sheetnames[0]] + + headers = [] + data_row = None + + for row_idx, row in enumerate(sheet.iter_rows(values_only=True), start=1): + if row_idx == 1: + headers = [str(cell).strip().lower() if cell else "" for cell in row] + continue + elif row_idx == 2: + data_row = list(row) + break + + if not headers: + raise CORAParseError("No headers found in spreadsheet") + + if not data_row: + raise CORAParseError("No data row found in spreadsheet") + + header_to_value = {} + for idx, header in enumerate(headers): + if idx < len(data_row): + header_to_value[header] = data_row[idx] + + main_keyword = header_to_value.get("main_keyword") or header_to_value.get("keyword") + project_name = header_to_value.get("project_name") or header_to_value.get("name") + related_searches_str = header_to_value.get("related_searches") or header_to_value.get("related_search") + entities_str = header_to_value.get("entities") or header_to_value.get("entity") + + if not main_keyword: + raise CORAParseError("Required field 'main_keyword' not found") + + if not project_name: + raise CORAParseError("Required field 'project_name' not found") + + related_searches = self._parse_comma_delimited(related_searches_str) + entities = self._parse_comma_delimited(entities_str) + + word_count = header_to_value.get("word_count") + if word_count: + try: + word_count = int(word_count) + except (ValueError, TypeError): + word_count = 1500 + else: + word_count = 1500 + + term_frequency = header_to_value.get("term_frequency") + if term_frequency: + try: + term_frequency = int(term_frequency) + except 
(ValueError, TypeError): + term_frequency = 3 + else: + term_frequency = 3 + + return { + "main_keyword": str(main_keyword).strip(), + "project_name": str(project_name).strip(), + "word_count": word_count, + "term_frequency": term_frequency, + "related_search_density": None, + "entity_density": None, + "lsi_density": None, + "spintax_related_search_terms": None, + "title_exact_match": None, + "title_related_search": None, + "meta_exact_match": None, + "meta_related_search": None, + "meta_entities": None, + "h1_exact": None, + "h1_related_search": None, + "h1_entities": None, + "h1_lsi": None, + "h2_total": None, + "h2_exact": None, + "h2_related_search": None, + "h2_entities": None, + "h2_lsi": None, + "h3_total": None, + "h3_exact": None, + "h3_related_search": None, + "h3_entities": None, + "h3_lsi": None, + "entities": entities, + "related_searches": related_searches, + "custom_anchor_text": [], + } + except CORAParseError: + raise + except Exception as e: + raise CORAParseError(f"Unexpected error during parsing: {e}") + finally: + self.workbook.close() \ No newline at end of file