local db works with limited reporting

main
Bryan Bigari 2025-06-13 11:14:42 -05:00
commit 3b99e345ed
4 changed files with 701 additions and 0 deletions

link_tracker.db 100644 (binary)

Binary file not shown.

main.py 100644 (502 additions)

@@ -0,0 +1,502 @@
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import PlainTextResponse
from pydantic import BaseModel, HttpUrl
from typing import List, Optional
from datetime import datetime
import sqlite3
import json
from pathlib import Path

app = FastAPI(title="Link Tracker API", version="1.0.0")

# Enable CORS for your Chrome extension
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, restrict this to specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Database setup
DB_PATH = "link_tracker.db"


def init_db():
    """Initialize the database with required tables"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    # Pages table - stores captured page information
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS pages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE NOT NULL,
            title TEXT NOT NULL,
            keywords TEXT,           -- JSON array of keywords
            timestamp DATETIME NOT NULL,
            detected_clients TEXT,   -- JSON array of detected clients
            total_links INTEGER NOT NULL,
            linked_to TEXT,          -- JSON array of client URLs this page links to
            colinkiri BOOLEAN DEFAULT FALSE,
            indexer BOOLEAN DEFAULT FALSE,
            t2 BOOLEAN DEFAULT FALSE,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # For existing databases, add the new columns if they don't exist
    try:
        cursor.execute("ALTER TABLE pages ADD COLUMN colinkiri BOOLEAN DEFAULT FALSE")
    except sqlite3.OperationalError:
        pass  # Column already exists
    try:
        cursor.execute("ALTER TABLE pages ADD COLUMN indexer BOOLEAN DEFAULT FALSE")
    except sqlite3.OperationalError:
        pass  # Column already exists
    try:
        cursor.execute("ALTER TABLE pages ADD COLUMN t2 BOOLEAN DEFAULT FALSE")
    except sqlite3.OperationalError:
        pass  # Column already exists
    try:
        cursor.execute("ALTER TABLE pages ADD COLUMN linked_to TEXT")
    except sqlite3.OperationalError:
        pass  # Column already exists

    # Links table - stores all external links found on pages
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS links (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            page_id INTEGER,
            href TEXT NOT NULL,
            anchor_text TEXT,
            title_attr TEXT,
            domain TEXT NOT NULL,
            is_client_link BOOLEAN DEFAULT FALSE,
            client_domain TEXT,
            client_name TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (page_id) REFERENCES pages (id)
        )
    """)

    # Create indexes for better query performance
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_pages_url ON pages (url)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_links_domain ON links (domain)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_links_client_domain ON links (client_domain)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_links_page_id ON links (page_id)")

    conn.commit()
    conn.close()

# Pydantic models for API requests
class LinkData(BaseModel):
    href: str
    text: Optional[str] = ""
    title: Optional[str] = ""


class DetectedClient(BaseModel):
    domain: str
    name: str


class PageCaptureRequest(BaseModel):
    url: str
    title: str
    timestamp: str
    keywords: List[str]
    detectedClients: List[DetectedClient]
    totalLinks: int
    links: List[LinkData]
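
# Example of the JSON body the Chrome extension is expected to POST to
# /capture-page. The field names mirror PageCaptureRequest above; the values
# are purely illustrative, not real captured data:
# {
#     "url": "https://example.com/article",
#     "title": "Example Article",
#     "timestamp": "2025-06-13T16:14:42Z",
#     "keywords": ["example", "article"],
#     "detectedClients": [{"domain": "client.com", "name": "Example Client"}],
#     "totalLinks": 2,
#     "links": [
#         {"href": "https://client.com/page", "text": "Client page", "title": ""},
#         {"href": "https://other.com/", "text": "Other site", "title": ""}
#     ]
# }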

# API Response models
class PageSummary(BaseModel):
    id: int
    url: str
    title: str
    timestamp: str
    detected_clients: List[str]
    total_links: int
    client_links_count: int


class LinkSummary(BaseModel):
    href: str
    anchor_text: str
    domain: str
    is_client_link: bool
    client_name: Optional[str] = None


@app.on_event("startup")
async def startup_event():
    """Initialize database on startup"""
    init_db()


@app.get("/")
async def root():
    """Health check endpoint"""
    return {"message": "Link Tracker API is running"}
@app.post("/capture-page")
async def capture_page(data: PageCaptureRequest):
"""Capture page data and links from Chrome extension"""
try:
print(f"Received data: {data}") # Debug logging
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Check if page already exists
cursor.execute("SELECT id FROM pages WHERE url = ?", (data.url,))
existing_page = cursor.fetchone()
# Get client domains for faster lookup
client_domains = {c.domain: c.name for c in data.detectedClients}
# Collect client URLs for the linked_to field
client_urls = []
if existing_page:
# Update existing page
page_id = existing_page[0]
cursor.execute("""
UPDATE pages
SET title = ?, keywords = ?, timestamp = ?,
detected_clients = ?, total_links = ?, linked_to = ?
WHERE id = ?
""", (
data.title,
json.dumps(data.keywords),
data.timestamp,
json.dumps([{"domain": c.domain, "name": c.name} for c in data.detectedClients]),
data.totalLinks,
json.dumps([]), # Will be populated below
page_id
))
# Delete existing links for this page
cursor.execute("DELETE FROM links WHERE page_id = ?", (page_id,))
else:
# Insert new page
cursor.execute("""
INSERT INTO pages (url, title, keywords, timestamp, detected_clients, total_links, linked_to)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (
data.url,
data.title,
json.dumps(data.keywords),
data.timestamp,
json.dumps([{"domain": c.domain, "name": c.name} for c in data.detectedClients]),
data.totalLinks,
json.dumps([]) # Will be populated below
))
page_id = cursor.lastrowid
# Insert links
for link in data.links:
try:
from urllib.parse import urlparse
parsed_url = urlparse(link.href)
domain = parsed_url.netloc.replace('www.', '')
# Check if this is a client link
is_client_link = domain in client_domains
client_name = client_domains.get(domain) if is_client_link else None
# If it's a client link, add to linked_to array
if is_client_link:
client_urls.append(link.href)
cursor.execute("""
INSERT INTO links (page_id, href, anchor_text, title_attr, domain,
is_client_link, client_domain, client_name)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (
page_id,
link.href,
link.text or "",
link.title or "",
domain,
is_client_link,
domain if is_client_link else None,
client_name
))
except Exception as e:
print(f"Error processing link {link.href}: {e}")
continue
# Update the linked_to field with collected client URLs
cursor.execute("UPDATE pages SET linked_to = ? WHERE id = ?", (json.dumps(client_urls), page_id))
conn.commit()
conn.close()
return {
"success": True,
"message": f"Captured {data.totalLinks} links from {data.url}",
"page_id": page_id,
"detected_clients": len(data.detectedClients)
}
except Exception as e:
print(f"Error details: {e}") # Debug logging
import traceback
traceback.print_exc()
raise HTTPException(status_code=500, detail=f"Error capturing page data: {str(e)}")
@app.get("/pages", response_model=List[PageSummary])
async def get_pages(limit: int = 50, offset: int = 0):
"""Get list of captured pages"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
SELECT p.id, p.url, p.title, p.timestamp, p.detected_clients, p.total_links,
COUNT(l.id) as client_links_count
FROM pages p
LEFT JOIN links l ON p.id = l.page_id AND l.is_client_link = 1
GROUP BY p.id
ORDER BY p.created_at DESC
LIMIT ? OFFSET ?
""", (limit, offset))
pages = []
for row in cursor.fetchall():
detected_clients_data = json.loads(row[4]) if row[4] else []
client_names = [c["name"] for c in detected_clients_data]
pages.append(PageSummary(
id=row[0],
url=row[1],
title=row[2],
timestamp=row[3],
detected_clients=client_names,
total_links=row[5],
client_links_count=row[6]
))
conn.close()
return pages
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error fetching pages: {str(e)}")
@app.get("/pages/{page_id}/links", response_model=List[LinkSummary])
async def get_page_links(page_id: int):
"""Get all links for a specific page"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
SELECT href, anchor_text, domain, is_client_link, client_name
FROM links
WHERE page_id = ?
ORDER BY is_client_link DESC, domain ASC
""", (page_id,))
links = []
for row in cursor.fetchall():
links.append(LinkSummary(
href=row[0],
anchor_text=row[1],
domain=row[2],
is_client_link=bool(row[3]),
client_name=row[4]
))
conn.close()
return links
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error fetching links: {str(e)}")
@app.get("/clients/{client_domain}/links")
async def get_client_links(client_domain: str, limit: int = 100):
"""Get all links pointing to a specific client domain"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
SELECT l.href, l.anchor_text, p.url as source_page, p.title as source_title,
l.client_name, p.timestamp
FROM links l
JOIN pages p ON l.page_id = p.id
WHERE l.client_domain = ?
ORDER BY p.timestamp DESC
LIMIT ?
""", (client_domain, limit))
links = []
for row in cursor.fetchall():
links.append({
"target_url": row[0],
"anchor_text": row[1],
"source_page": row[2],
"source_title": row[3],
"client_name": row[4],
"timestamp": row[5]
})
conn.close()
return {"client_domain": client_domain, "links": links}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error fetching client links: {str(e)}")
@app.get("/urls/for-colinkiri", response_class=PlainTextResponse)
async def get_urls_for_colinkiri():
"""Get all URLs where colinkiri=false and mark them as processed"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Get unprocessed URLs
cursor.execute("""
SELECT id, url
FROM pages
WHERE colinkiri = FALSE
ORDER BY created_at ASC
""")
page_ids = []
url_list = []
for row in cursor.fetchall():
page_ids.append(row[0])
url_list.append(row[1])
# Mark them as processed
if page_ids:
placeholders = ','.join(['?'] * len(page_ids))
cursor.execute(f"UPDATE pages SET colinkiri = TRUE WHERE id IN ({placeholders})", page_ids)
conn.commit()
conn.close()
# Return URLs as plain text, one per line
return '\n'.join(url_list)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error getting URLs for colinkiri: {str(e)}")
@app.get("/urls/for-indexer", response_class=PlainTextResponse)
async def get_urls_for_indexer():
"""Get all URLs where indexer=false and mark them as processed"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Get unprocessed URLs
cursor.execute("""
SELECT id, url
FROM pages
WHERE indexer = FALSE
ORDER BY created_at ASC
""")
page_ids = []
url_list = []
for row in cursor.fetchall():
page_ids.append(row[0])
url_list.append(row[1])
# Mark them as processed
if page_ids:
placeholders = ','.join(['?'] * len(page_ids))
cursor.execute(f"UPDATE pages SET indexer = TRUE WHERE id IN ({placeholders})", page_ids)
conn.commit()
conn.close()
# Return URLs as plain text, one per line
return '\n'.join(url_list)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error getting URLs for indexer: {str(e)}")
@app.get("/search/linking-to")
async def search_pages_linking_to(target_url: str):
"""Find all pages that link to a specific URL"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
SELECT id, url, title, timestamp, linked_to
FROM pages
WHERE linked_to LIKE ?
ORDER BY created_at DESC
""", (f'%{target_url}%',))
pages = []
for row in cursor.fetchall():
linked_to = json.loads(row[4]) if row[4] else []
# Verify the exact URL is in the linked_to array
if target_url in linked_to:
pages.append({
"id": row[0],
"url": row[1],
"title": row[2],
"timestamp": row[3],
"linked_to": linked_to
})
conn.close()
return {
"target_url": target_url,
"pages": pages,
"count": len(pages)
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error searching for pages linking to URL: {str(e)}")
@app.get("/stats")
async def get_stats():
"""Get overall statistics"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Total pages captured
cursor.execute("SELECT COUNT(*) FROM pages")
total_pages = cursor.fetchone()[0]
# Total links captured
cursor.execute("SELECT COUNT(*) FROM links")
total_links = cursor.fetchone()[0]
# Total client links
cursor.execute("SELECT COUNT(*) FROM links WHERE is_client_link = 1")
client_links = cursor.fetchone()[0]
# Links by client
cursor.execute("""
SELECT client_name, COUNT(*) as link_count
FROM links
WHERE is_client_link = 1
GROUP BY client_name
ORDER BY link_count DESC
""")
client_stats = [{"client": row[0], "links": row[1]} for row in cursor.fetchall()]
conn.close()
return {
"total_pages": total_pages,
"total_links": total_links,
"client_links": client_links,
"other_links": total_links - client_links,
"client_breakdown": client_stats
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error fetching stats: {str(e)}")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

readme_md.md 100644 (196 additions)

@@ -0,0 +1,196 @@
# Link Tracker API
A simple web service that tracks links to your client websites and provides reports for SEO indexing services.
## What This Does
This tool helps you:
- Track which pages link to your client websites
- Generate lists of URLs to submit to indexing services like colinkiri
- Keep track of which URLs you've already submitted
## Installation
### Step 1: Install Python
**On Mac:**
1. Open Terminal (press `Cmd + Space`, type "Terminal", press Enter)
2. Install Homebrew if you don't have it:
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
```
3. Install Python:
```bash
brew install python
```
**On Windows:**
1. Go to https://python.org/downloads/
2. Download the latest Python version
3. Run the installer
4. **Important:** Check "Add Python to PATH" during installation
5. Open Command Prompt (press `Win + R`, type "cmd", press Enter)
### Step 2: Download the Project
1. Open Terminal (Mac) or Command Prompt (Windows)
2. Navigate to where you want to install the project:
```bash
cd Desktop
```
3. Clone the project from the Git server into a folder named `link-tracker`:
```bash
git clone https://git.peninsulaindustries.com/bryanb/Link-Tracker-Server.git link-tracker
cd link-tracker
```
### Step 3: Install Required Packages
**On Mac (Terminal):**
```bash
cd ~/Desktop/link-tracker
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
**On Windows (Command Prompt):**
```bash
cd %USERPROFILE%\Desktop\link-tracker
python -m venv venv
venv\Scripts\activate
pip install -r requirements.txt
```
## Running the Server
**On Mac:**
```bash
cd ~/Desktop/link-tracker
source venv/bin/activate
python main.py
```
**On Windows:**
```bash
cd %USERPROFILE%\Desktop\link-tracker
venv\Scripts\activate
python main.py
```
You should see something like:
```
INFO: Started server process [12345]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8000
```
**The server is now running!** Keep this window open while you use the system.
## Getting Your URL Reports
### For Colinkiri Indexing Service
Open your web browser and go to:
```
http://localhost:8000/urls/for-colinkiri
```
This will show you a list of URLs, one per line, that you can copy and paste into colinkiri. **Important:** After you visit this URL, those pages are marked as "submitted" so they won't appear in future reports.
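If you'd rather script this step than use the browser, here is a minimal Python sketch (standard library only, server assumed to be running on `localhost:8000`; the output filename is just an example). Fetching the list marks those URLs as submitted, exactly as visiting it in the browser does; swap the path for `/urls/for-indexer` to do the same for the other service:
```python
# Sketch: fetch the pending colinkiri URLs and save them to a text file.
from urllib.request import urlopen

with urlopen("http://localhost:8000/urls/for-colinkiri") as response:
    urls = response.read().decode("utf-8")

with open("colinkiri_urls.txt", "w") as f:  # example output filename
    f.write(urls)

print(f"Saved {len(urls.splitlines())} URLs to colinkiri_urls.txt")
```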
### For Other Indexing Services
Open your web browser and go to:
```
http://localhost:8000/urls/for-indexer
```
Same as above, but for a different indexing service.
## Updating to the Latest Version
When updates are available, you can easily update the project:
1. Stop the server (press `Ctrl + C` in the terminal)
2. Update the code:
```bash
git pull origin main
```
3. Update any new dependencies:
```bash
pip install -r requirements.txt
```
4. Restart the server:
```bash
python main.py
```
**Note:** Your data (link_tracker.db file) will not be affected by updates.
## Stopping the Server
To stop the server, go back to your Terminal/Command Prompt window and press `Ctrl + C`.
## Troubleshooting
**"Command not found" errors:**
- Make sure Python is installed and added to your PATH
- Try using `python3` instead of `python` on Mac
- Try using `py` instead of `python` on Windows
**"Port already in use" error:**
- Another program is using port 8000
- Try changing the port in main.py (last line): `uvicorn.run(app, host="0.0.0.0", port=8001)`
- Then use `http://localhost:8001` instead of `http://localhost:8000`
**Can't access the URLs:**
- Make sure the server is running (you should see the "Uvicorn running" message)
- Check that you're using the correct URL: `http://localhost:8000`
- Try refreshing your browser
## Other Available Endpoints
These are for advanced users or developers:
### View All Captured Pages
```
http://localhost:8000/pages
```
Shows all pages that have been captured with link information.
### View Statistics
```
http://localhost:8000/stats
```
Shows overall statistics about captured pages and links.
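For a quick summary without reading raw JSON in the browser, a small Python sketch like this (standard library only, server assumed on `localhost:8000`) prints a per-client breakdown using the fields the endpoint returns:
```python
# Sketch: fetch overall stats and print a per-client breakdown.
import json
from urllib.request import urlopen

with urlopen("http://localhost:8000/stats") as response:
    stats = json.load(response)

print(f"Pages captured: {stats['total_pages']}")
print(f"Client links: {stats['client_links']} of {stats['total_links']} total links")
for entry in stats["client_breakdown"]:
    print(f"  {entry['client']}: {entry['links']} links")
```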
### Search for Pages Linking to Specific URL
```
http://localhost:8000/search/linking-to?target_url=https://example.com
```
Find all pages that link to a specific URL.
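Because `target_url` is passed as a query parameter, it should be URL-encoded if you call this endpoint from code. A minimal Python sketch (standard library only; the target URL is illustrative):
```python
# Sketch: find every captured page that links to a given URL.
import json
from urllib.parse import urlencode
from urllib.request import urlopen

target = "https://example.com"  # illustrative; use one of your client URLs
query = urlencode({"target_url": target})

with urlopen(f"http://localhost:8000/search/linking-to?{query}") as response:
    data = json.load(response)

print(f"{data['count']} page(s) link to {target}")
for page in data["pages"]:
    print(f"- {page['url']} ({page['title']})")
```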
### View Links for Specific Client
```
http://localhost:8000/clients/clientdomain.com/links
```
See all links pointing to a specific client domain.
### API Documentation
```
http://localhost:8000/docs
```
Interactive API documentation (for developers).
## Data Storage
The system automatically creates a file called `link_tracker.db` in the same folder as `main.py`. This file contains all your captured data. **Don't delete this file** unless you want to lose all your data.
## Support
If you run into issues:
1. Make sure Python is properly installed
2. Make sure you're in the correct folder when running commands
3. Check that the server is running before trying to access URLs
4. Try restarting the server if something seems stuck

requirements.txt 100644 (3 additions)

@@ -0,0 +1,3 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.8.2