local db works with limited reporting

main
Bryan Bigari 2025-06-13 11:14:42 -05:00
commit 3b99e345ed
4 changed files with 701 additions and 0 deletions

link_tracker.db 100644 (binary)

Binary file not shown.

main.py 100644 (502 additions)

@@ -0,0 +1,502 @@
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import PlainTextResponse
from pydantic import BaseModel, HttpUrl
from typing import List, Optional
from datetime import datetime
import sqlite3
import json
from pathlib import Path

app = FastAPI(title="Link Tracker API", version="1.0.0")

# Enable CORS for your Chrome extension
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, restrict this to specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Database setup
DB_PATH = "link_tracker.db"


def init_db():
    """Initialize the database with required tables"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    # Pages table - stores captured page information
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS pages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE NOT NULL,
            title TEXT NOT NULL,
            keywords TEXT,           -- JSON array of keywords
            timestamp DATETIME NOT NULL,
            detected_clients TEXT,   -- JSON array of detected clients
            total_links INTEGER NOT NULL,
            linked_to TEXT,          -- JSON array of client URLs this page links to
            colinkiri BOOLEAN DEFAULT FALSE,
            indexer BOOLEAN DEFAULT FALSE,
            t2 BOOLEAN DEFAULT FALSE,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # For existing databases, add the new columns if they don't exist
    try:
        cursor.execute("ALTER TABLE pages ADD COLUMN colinkiri BOOLEAN DEFAULT FALSE")
    except sqlite3.OperationalError:
        pass  # Column already exists
    try:
        cursor.execute("ALTER TABLE pages ADD COLUMN indexer BOOLEAN DEFAULT FALSE")
    except sqlite3.OperationalError:
        pass  # Column already exists
    try:
        cursor.execute("ALTER TABLE pages ADD COLUMN t2 BOOLEAN DEFAULT FALSE")
    except sqlite3.OperationalError:
        pass  # Column already exists
    try:
        cursor.execute("ALTER TABLE pages ADD COLUMN linked_to TEXT")
    except sqlite3.OperationalError:
        pass  # Column already exists

    # Links table - stores all external links found on pages
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS links (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            page_id INTEGER,
            href TEXT NOT NULL,
            anchor_text TEXT,
            title_attr TEXT,
            domain TEXT NOT NULL,
            is_client_link BOOLEAN DEFAULT FALSE,
            client_domain TEXT,
            client_name TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (page_id) REFERENCES pages (id)
        )
    """)

    # Create indexes for better query performance
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_pages_url ON pages (url)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_links_domain ON links (domain)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_links_client_domain ON links (client_domain)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_links_page_id ON links (page_id)")

    conn.commit()
    conn.close()

# Pydantic models for API requests
class LinkData(BaseModel):
    href: str
    text: Optional[str] = ""
    title: Optional[str] = ""


class DetectedClient(BaseModel):
    domain: str
    name: str


class PageCaptureRequest(BaseModel):
    url: str
    title: str
    timestamp: str
    keywords: List[str]
    detectedClients: List[DetectedClient]
    totalLinks: int
    links: List[LinkData]
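
# Example of the JSON body the Chrome extension is expected to POST to
# /capture-page. The field names mirror PageCaptureRequest above; the values
# are purely illustrative, not real captured data:
# {
#     "url": "https://example.com/article",
#     "title": "Example Article",
#     "timestamp": "2025-06-13T16:14:42Z",
#     "keywords": ["example", "article"],
#     "detectedClients": [{"domain": "client.com", "name": "Example Client"}],
#     "totalLinks": 2,
#     "links": [
#         {"href": "https://client.com/page", "text": "Client page", "title": ""},
#         {"href": "https://other.com/", "text": "Other site", "title": ""}
#     ]
# }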

# API Response models
class PageSummary(BaseModel):
    id: int
    url: str
    title: str
    timestamp: str
    detected_clients: List[str]
    total_links: int
    client_links_count: int


class LinkSummary(BaseModel):
    href: str
    anchor_text: str
    domain: str
    is_client_link: bool
    client_name: Optional[str] = None


@app.on_event("startup")
async def startup_event():
    """Initialize database on startup"""
    init_db()


@app.get("/")
async def root():
    """Health check endpoint"""
    return {"message": "Link Tracker API is running"}
@app.post("/capture-page")
async def capture_page(data: PageCaptureRequest):
"""Capture page data and links from Chrome extension"""
try:
print(f"Received data: {data}") # Debug logging
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Check if page already exists
cursor.execute("SELECT id FROM pages WHERE url = ?", (data.url,))
existing_page = cursor.fetchone()
# Get client domains for faster lookup
client_domains = {c.domain: c.name for c in data.detectedClients}
# Collect client URLs for the linked_to field
client_urls = []
if existing_page:
# Update existing page
page_id = existing_page[0]
cursor.execute("""
UPDATE pages
SET title = ?, keywords = ?, timestamp = ?,
detected_clients = ?, total_links = ?, linked_to = ?
WHERE id = ?
""", (
data.title,
json.dumps(data.keywords),
data.timestamp,
json.dumps([{"domain": c.domain, "name": c.name} for c in data.detectedClients]),
data.totalLinks,
json.dumps([]), # Will be populated below
page_id
))
# Delete existing links for this page
cursor.execute("DELETE FROM links WHERE page_id = ?", (page_id,))
else:
# Insert new page
cursor.execute("""
INSERT INTO pages (url, title, keywords, timestamp, detected_clients, total_links, linked_to)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (
data.url,
data.title,
json.dumps(data.keywords),
data.timestamp,
json.dumps([{"domain": c.domain, "name": c.name} for c in data.detectedClients]),
data.totalLinks,
json.dumps([]) # Will be populated below
))
page_id = cursor.lastrowid
# Insert links
for link in data.links:
try:
from urllib.parse import urlparse
parsed_url = urlparse(link.href)
domain = parsed_url.netloc.replace('www.', '')
# Check if this is a client link
is_client_link = domain in client_domains
client_name = client_domains.get(domain) if is_client_link else None
# If it's a client link, add to linked_to array
if is_client_link:
client_urls.append(link.href)
cursor.execute("""
INSERT INTO links (page_id, href, anchor_text, title_attr, domain,
is_client_link, client_domain, client_name)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""", (
page_id,
link.href,
link.text or "",
link.title or "",
domain,
is_client_link,
domain if is_client_link else None,
client_name
))
except Exception as e:
print(f"Error processing link {link.href}: {e}")
continue
# Update the linked_to field with collected client URLs
cursor.execute("UPDATE pages SET linked_to = ? WHERE id = ?", (json.dumps(client_urls), page_id))
conn.commit()
conn.close()
return {
"success": True,
"message": f"Captured {data.totalLinks} links from {data.url}",
"page_id": page_id,
"detected_clients": len(data.detectedClients)
}
except Exception as e:
print(f"Error details: {e}") # Debug logging
import traceback
traceback.print_exc()
raise HTTPException(status_code=500, detail=f"Error capturing page data: {str(e)}")
@app.get("/pages", response_model=List[PageSummary])
async def get_pages(limit: int = 50, offset: int = 0):
"""Get list of captured pages"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
SELECT p.id, p.url, p.title, p.timestamp, p.detected_clients, p.total_links,
COUNT(l.id) as client_links_count
FROM pages p
LEFT JOIN links l ON p.id = l.page_id AND l.is_client_link = 1
GROUP BY p.id
ORDER BY p.created_at DESC
LIMIT ? OFFSET ?
""", (limit, offset))
pages = []
for row in cursor.fetchall():
detected_clients_data = json.loads(row[4]) if row[4] else []
client_names = [c["name"] for c in detected_clients_data]
pages.append(PageSummary(
id=row[0],
url=row[1],
title=row[2],
timestamp=row[3],
detected_clients=client_names,
total_links=row[5],
client_links_count=row[6]
))
conn.close()
return pages
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error fetching pages: {str(e)}")
@app.get("/pages/{page_id}/links", response_model=List[LinkSummary])
async def get_page_links(page_id: int):
"""Get all links for a specific page"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
SELECT href, anchor_text, domain, is_client_link, client_name
FROM links
WHERE page_id = ?
ORDER BY is_client_link DESC, domain ASC
""", (page_id,))
links = []
for row in cursor.fetchall():
links.append(LinkSummary(
href=row[0],
anchor_text=row[1],
domain=row[2],
is_client_link=bool(row[3]),
client_name=row[4]
))
conn.close()
return links
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error fetching links: {str(e)}")
@app.get("/clients/{client_domain}/links")
async def get_client_links(client_domain: str, limit: int = 100):
"""Get all links pointing to a specific client domain"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
SELECT l.href, l.anchor_text, p.url as source_page, p.title as source_title,
l.client_name, p.timestamp
FROM links l
JOIN pages p ON l.page_id = p.id
WHERE l.client_domain = ?
ORDER BY p.timestamp DESC
LIMIT ?
""", (client_domain, limit))
links = []
for row in cursor.fetchall():
links.append({
"target_url": row[0],
"anchor_text": row[1],
"source_page": row[2],
"source_title": row[3],
"client_name": row[4],
"timestamp": row[5]
})
conn.close()
return {"client_domain": client_domain, "links": links}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error fetching client links: {str(e)}")
@app.get("/urls/for-colinkiri", response_class=PlainTextResponse)
async def get_urls_for_colinkiri():
"""Get all URLs where colinkiri=false and mark them as processed"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Get unprocessed URLs
cursor.execute("""
SELECT id, url
FROM pages
WHERE colinkiri = FALSE
ORDER BY created_at ASC
""")
page_ids = []
url_list = []
for row in cursor.fetchall():
page_ids.append(row[0])
url_list.append(row[1])
# Mark them as processed
if page_ids:
placeholders = ','.join(['?'] * len(page_ids))
cursor.execute(f"UPDATE pages SET colinkiri = TRUE WHERE id IN ({placeholders})", page_ids)
conn.commit()
conn.close()
# Return URLs as plain text, one per line
return '\n'.join(url_list)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error getting URLs for colinkiri: {str(e)}")
@app.get("/urls/for-indexer", response_class=PlainTextResponse)
async def get_urls_for_indexer():
"""Get all URLs where indexer=false and mark them as processed"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Get unprocessed URLs
cursor.execute("""
SELECT id, url
FROM pages
WHERE indexer = FALSE
ORDER BY created_at ASC
""")
page_ids = []
url_list = []
for row in cursor.fetchall():
page_ids.append(row[0])
url_list.append(row[1])
# Mark them as processed
if page_ids:
placeholders = ','.join(['?'] * len(page_ids))
cursor.execute(f"UPDATE pages SET indexer = TRUE WHERE id IN ({placeholders})", page_ids)
conn.commit()
conn.close()
# Return URLs as plain text, one per line
return '\n'.join(url_list)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error getting URLs for indexer: {str(e)}")
@app.get("/search/linking-to")
async def search_pages_linking_to(target_url: str):
"""Find all pages that link to a specific URL"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("""
SELECT id, url, title, timestamp, linked_to
FROM pages
WHERE linked_to LIKE ?
ORDER BY created_at DESC
""", (f'%{target_url}%',))
pages = []
for row in cursor.fetchall():
linked_to = json.loads(row[4]) if row[4] else []
# Verify the exact URL is in the linked_to array
if target_url in linked_to:
pages.append({
"id": row[0],
"url": row[1],
"title": row[2],
"timestamp": row[3],
"linked_to": linked_to
})
conn.close()
return {
"target_url": target_url,
"pages": pages,
"count": len(pages)
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error searching for pages linking to URL: {str(e)}")
@app.get("/stats")
async def get_stats():
"""Get overall statistics"""
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Total pages captured
cursor.execute("SELECT COUNT(*) FROM pages")
total_pages = cursor.fetchone()[0]
# Total links captured
cursor.execute("SELECT COUNT(*) FROM links")
total_links = cursor.fetchone()[0]
# Total client links
cursor.execute("SELECT COUNT(*) FROM links WHERE is_client_link = 1")
client_links = cursor.fetchone()[0]
# Links by client
cursor.execute("""
SELECT client_name, COUNT(*) as link_count
FROM links
WHERE is_client_link = 1
GROUP BY client_name
ORDER BY link_count DESC
""")
client_stats = [{"client": row[0], "links": row[1]} for row in cursor.fetchall()]
conn.close()
return {
"total_pages": total_pages,
"total_links": total_links,
"client_links": client_links,
"other_links": total_links - client_links,
"client_breakdown": client_stats
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Error fetching stats: {str(e)}")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

readme_md.md 100644 (196 additions)

@@ -0,0 +1,196 @@
# Link Tracker API
A simple web service that tracks links to your client websites and provides reports for SEO indexing services.
## What This Does
This tool helps you:
- Track which pages link to your client websites
- Generate lists of URLs to submit to indexing services like colinkiri
- Keep track of which URLs you've already submitted
## Installation
### Step 1: Install Python
**On Mac:**
1. Open Terminal (press `Cmd + Space`, type "Terminal", press Enter)
2. Install Homebrew if you don't have it:
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
```
3. Install Python:
```bash
brew install python
```
**On Windows:**
1. Go to https://python.org/downloads/
2. Download the latest Python version
3. Run the installer
4. **Important:** Check "Add Python to PATH" during installation
5. Open Command Prompt (press `Win + R`, type "cmd", press Enter)
### Step 2: Download the Project
1. Open Terminal (Mac) or Command Prompt (Windows)
2. Navigate to where you want to install the project:
```bash
cd Desktop
```
3. Clone the project from the Git server into a folder named `link-tracker`:
```bash
git clone https://git.peninsulaindustries.com/bryanb/Link-Tracker-Server.git link-tracker
cd link-tracker
```
### Step 3: Install Required Packages
**On Mac (Terminal):**
```bash
cd ~/Desktop/link-tracker
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
**On Windows (Command Prompt):**
```bash
cd %USERPROFILE%\Desktop\link-tracker
python -m venv venv
venv\Scripts\activate
pip install -r requirements.txt
```
## Running the Server
**On Mac:**
```bash
cd ~/Desktop/link-tracker
source venv/bin/activate
python main.py
```
**On Windows:**
```bash
cd %USERPROFILE%\Desktop\link-tracker
venv\Scripts\activate
python main.py
```
You should see something like:
```
INFO: Started server process [12345]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8000
```
**The server is now running!** Keep this window open while you use the system.
## Getting Your URL Reports
### For Colinkiri Indexing Service
Open your web browser and go to:
```
http://localhost:8000/urls/for-colinkiri
```
This will show you a list of URLs, one per line, that you can copy and paste into colinkiri. **Important:** After you visit this URL, those pages are marked as "submitted" so they won't appear in future reports.
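If you'd rather script this step than use the browser, here is a minimal Python sketch (standard library only, server assumed to be running on `localhost:8000`; the output filename is just an example). Fetching the list marks those URLs as submitted, exactly as visiting it in the browser does; swap the path for `/urls/for-indexer` to do the same for the other service:
```python
# Sketch: fetch the pending colinkiri URLs and save them to a text file.
from urllib.request import urlopen

with urlopen("http://localhost:8000/urls/for-colinkiri") as response:
    urls = response.read().decode("utf-8")

with open("colinkiri_urls.txt", "w") as f:  # example output filename
    f.write(urls)

print(f"Saved {len(urls.splitlines())} URLs to colinkiri_urls.txt")
```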
### For Other Indexing Services
Open your web browser and go to:
```
http://localhost:8000/urls/for-indexer
```
Same as above, but for a different indexing service.
## Updating to the Latest Version
When updates are available, you can easily update the project:
1. Stop the server (press `Ctrl + C` in the terminal)
2. Update the code:
```bash
git pull origin main
```
3. Update any new dependencies:
```bash
pip install -r requirements.txt
```
4. Restart the server:
```bash
python main.py
```
**Note:** Your data (link_tracker.db file) will not be affected by updates.
## Stopping the Server
To stop the server, go back to your Terminal/Command Prompt window and press `Ctrl + C`.
## Troubleshooting
**"Command not found" errors:**
- Make sure Python is installed and added to your PATH
- Try using `python3` instead of `python` on Mac
- Try using `py` instead of `python` on Windows
**"Port already in use" error:**
- Another program is using port 8000
- Try changing the port in main.py (last line): `uvicorn.run(app, host="0.0.0.0", port=8001)`
- Then use `http://localhost:8001` instead of `http://localhost:8000`
**Can't access the URLs:**
- Make sure the server is running (you should see the "Uvicorn running" message)
- Check that you're using the correct URL: `http://localhost:8000`
- Try refreshing your browser
## Other Available Endpoints
These are for advanced users or developers:
### View All Captured Pages
```
http://localhost:8000/pages
```
Shows all pages that have been captured with link information.
### View Statistics
```
http://localhost:8000/stats
```
Shows overall statistics about captured pages and links.
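For a quick summary without reading raw JSON in the browser, a small Python sketch like this (standard library only, server assumed on `localhost:8000`) prints a per-client breakdown using the fields the endpoint returns:
```python
# Sketch: fetch overall stats and print a per-client breakdown.
import json
from urllib.request import urlopen

with urlopen("http://localhost:8000/stats") as response:
    stats = json.load(response)

print(f"Pages captured: {stats['total_pages']}")
print(f"Client links: {stats['client_links']} of {stats['total_links']} total links")
for entry in stats["client_breakdown"]:
    print(f"  {entry['client']}: {entry['links']} links")
```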
### Search for Pages Linking to Specific URL
```
http://localhost:8000/search/linking-to?target_url=https://example.com
```
Find all pages that link to a specific URL.
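Because `target_url` is passed as a query parameter, it should be URL-encoded if you call this endpoint from code. A minimal Python sketch (standard library only; the target URL is illustrative):
```python
# Sketch: find every captured page that links to a given URL.
import json
from urllib.parse import urlencode
from urllib.request import urlopen

target = "https://example.com"  # illustrative; use one of your client URLs
query = urlencode({"target_url": target})

with urlopen(f"http://localhost:8000/search/linking-to?{query}") as response:
    data = json.load(response)

print(f"{data['count']} page(s) link to {target}")
for page in data["pages"]:
    print(f"- {page['url']} ({page['title']})")
```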
### View Links for Specific Client
```
http://localhost:8000/clients/clientdomain.com/links
```
See all links pointing to a specific client domain.
### API Documentation
```
http://localhost:8000/docs
```
Interactive API documentation (for developers).
## Data Storage
The system automatically creates a file called `link_tracker.db` in the same folder as `main.py`. This file contains all your captured data. **Don't delete this file** unless you want to lose all your data.
## Support
If you run into issues:
1. Make sure Python is properly installed
2. Make sure you're in the correct folder when running commands
3. Check that the server is running before trying to access URLs
4. Try restarting the server if something seems stuck

requirements.txt 100644 (3 additions)

@@ -0,0 +1,3 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.8.2