# 1_website_scraper.py

import os
import json
import argparse
from pathlib import Path
from urllib.parse import urljoin, urlparse, parse_qs, urlencode
import datetime
import mimetypes
import re
import asyncio
import platform
import shutil
from typing import Dict, List, Any, Set, Tuple, Optional
from bs4 import BeautifulSoup, Comment
import httpx
import xml.etree.ElementTree as ET

import vertexai
from vertexai.generative_models import GenerativeModel, GenerationResponse

# Environment detection
def detect_cpanel_environment():
    """Detect if we're running on cPanel shared hosting."""
    cpanel_indicators = [
        os.path.exists('/usr/local/cpanel'),
        'cpanel' in platform.node().lower(),
        os.getenv('CPANEL_USER') is not None,
        os.path.exists('/home') and not os.path.exists('/Applications'),  # Linux without macOS
    ]
    return any(cpanel_indicators)

def setup_gcp_credentials():
    """Auto-setup GCP credentials from default location or environment."""
    # Default paths to check for credentials (several possible project layouts)
    credential_paths = [
        Path("../data/credentials/gcp_key.json"),           # Development: from dashboard/scripts
        Path("./data/credentials/gcp_key.json"),            # Alternative: from project root
        Path("../dashboard/data/credentials/gcp_key.json"), # Production with dashboard: from /test/scripts
        Path("dashboard/data/credentials/gcp_key.json"),    # Alternative production with dashboard
        Path("data/credentials/gcp_key.json")               # Fallback
    ]
    
    # Check if GOOGLE_APPLICATION_CREDENTIALS is already set
    if not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        # Try to find credentials file in order of preference
        key_path = None
        for path in credential_paths:
            if path.exists():
                key_path = path
                break
        
        if key_path:
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(key_path.absolute())
            print(f"🔑 Auto-detected GCP key: {key_path}")
    
    # Check if GCLOUD_PROJECT is already set
    if not os.getenv("GCLOUD_PROJECT"):
        # Try to read project ID from the key file
        creds_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
        if creds_path and Path(creds_path).exists():
            try:
                with open(creds_path, 'r') as f:
                    key_data = json.load(f)
                    project_id = key_data.get("project_id")
                    if project_id:
                        os.environ["GCLOUD_PROJECT"] = project_id
                        print(f"🌐 Auto-detected GCP project: {project_id}")
            except Exception as e:
                print(f"⚠️ Could not read project ID from key file: {e}")

IS_CPANEL = detect_cpanel_environment()

# Setup GCP credentials automatically
setup_gcp_credentials()

# Try to import Playwright, fall back to httpx-only mode if not available
try:
    from playwright.async_api import async_playwright, Page, BrowserContext
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    print("🔄 Playwright not available, using httpx-only mode (cPanel compatible)")
    # Define dummy types for type hints when Playwright isn't available
    Page = None
    BrowserContext = None

# Force httpx mode always (for consistent behavior)
PLAYWRIGHT_AVAILABLE = False
print("🌍 Using httpx-only mode for consistent behavior across all environments")

from tqdm import tqdm
from dotenv import load_dotenv

# --- Configuration ---
PAGE_LOAD_TIMEOUT = 60000
ASSET_TIMEOUT = 20000
RENDER_DELAY = 3000
MAX_RETRIES = 3
RETRY_DELAYS = [5, 10, 20]  # Seconds to wait between retries
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
VIEWPORT_SIZE = {"width": 1920, "height": 1080}
OUTPUT_DIR = Path("../data/website_data")
MAX_CRAWL_DEPTH = 3
MAX_URLS_PER_DOMAIN = 500

# --- AI Configuration with Pricing ---
MODELS_CONFIG = [
    {
        "name": "Gemini 2.5 Pro (GA)", 
        "model_id": "gemini-2.5-pro", 
        "location": "us-east1",
        "pricing": {"input": 1.25 / 1_000_000, "output": 10.00 / 1_000_000}
    },
    {
        "name": "Gemini 2.5 Flash (GA)", 
        "model_id": "gemini-2.5-flash", 
        "location": "us-east1",
        "pricing": {"input": 0.30 / 1_000_000, "output": 2.50 / 1_000_000}
    },
    {
        "name": "Gemini 2.5 Flash-Lite (Preview)", 
        "model_id": "gemini-2.5-flash-lite-preview-06-17", 
        "location": "global",
        "pricing": {"input": 0.10 / 1_000_000, "output": 0.40 / 1_000_000}
    }
]
DEFAULT_PLANNER_MODEL_INDEX = 1
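
# Illustrative cost math (comment-only sketch, not executed): with the Flash pricing above,
# a request using 50,000 input tokens and 2,000 output tokens would cost roughly
#   50_000 * (0.30 / 1_000_000) + 2_000 * (2.50 / 1_000_000) ≈ $0.020,
# which mirrors how ai_filter_urls_for_redesign() derives usage_data["cost"].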

# --- Helper Functions ---

def sanitize_filename(url: str) -> str:
    """Cleans a URL path to be a valid filename."""
    url_path = urlparse(url).path
    if not url_path or url_path == '/':
        return "index.html"
    filename = url_path.strip('/').replace('/', '_')
    if not Path(filename).suffix:
        return f"{filename}.html"
    return filename
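
# Illustrative behaviour (comment-only examples, not executed):
#   sanitize_filename("https://example.com/")            -> "index.html"
#   sanitize_filename("https://example.com/about/team")  -> "about_team.html"
#   sanitize_filename("https://example.com/menu.pdf")    -> "menu.pdf"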

def normalize_url(url: str) -> str:
    """Normalize URL by removing fragments, sorting query params, etc."""
    parsed = urlparse(url)
    # Remove common tracking parameters
    query_params = parse_qs(parsed.query)
    filtered_params = {k: v for k, v in query_params.items() 
                      if k not in ['utm_source', 'utm_medium', 'utm_campaign', 'fbclid', 'gclid']}
    
    # Reconstruct URL without fragment and with sorted params
    clean_query = urlencode(sorted(filtered_params.items()), doseq=True)
    
    return f"{parsed.scheme}://{parsed.netloc}{parsed.path}" + (f"?{clean_query}" if clean_query else "")

def is_valid_internal_url(url: str, base_netloc: str) -> bool:
    """Checks if URL is valid, internal, and not a file/anchor."""
    try:
        parsed_url = urlparse(url)
        url_netloc_norm = parsed_url.netloc.replace('www.', '')
        base_netloc_norm = base_netloc.replace('www.', '')
        
        # Skip obvious non-content URLs
        skip_patterns = [
            r'\.pdf$', r'\.zip$', r'\.mp4$', r'\.mov$', r'\.jpg$', r'\.png$', r'\.gif$',
            r'/wp-admin/', r'/admin/', r'/login', r'/logout', r'/register',
            r'mailto:', r'tel:', r'#', r'javascript:'
        ]
        
        for pattern in skip_patterns:
            if re.search(pattern, url, re.IGNORECASE):
                return False
        
        return (
            parsed_url.scheme in ['http', 'https'] and
            url_netloc_norm == base_netloc_norm and
            len(parsed_url.path) < 200  # Avoid extremely long URLs
        )
    except Exception:
        return False
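
# Illustrative behaviour (comment-only examples, hypothetical domain):
#   is_valid_internal_url("https://www.example.com/about", "example.com")    -> True
#   is_valid_internal_url("https://other-site.com/about", "example.com")     -> False (external domain)
#   is_valid_internal_url("https://example.com/brochure.pdf", "example.com") -> False (skip pattern)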

async def retry_with_backoff(func, *args, max_retries=MAX_RETRIES, delays=RETRY_DELAYS, **kwargs):
    """Retry a function with exponential backoff on connection errors."""
    last_exception = None
    
    for attempt in range(max_retries + 1):
        try:
            return await func(*args, **kwargs)
        except Exception as e:
            last_exception = e
            error_msg = str(e).lower()
            
            # Check if this is a retryable error
            retryable_errors = [
                'connection_timed_out', 'net::err_connection_timed_out',
                'net::err_connection_refused', 'net::err_name_not_resolved',
                'timeout', 'connection refused', 'connection reset'
            ]
            
            is_retryable = any(error in error_msg for error in retryable_errors)
            
            if not is_retryable or attempt == max_retries:
                # Don't retry on non-connection errors or if max retries reached
                raise e
            
            delay = delays[min(attempt, len(delays) - 1)]
            print(f"     🔄 Connection error (attempt {attempt + 1}/{max_retries + 1}): {str(e)[:100]}...")
            print(f"     ⏳ Retrying in {delay} seconds...")
            await asyncio.sleep(delay)
    
    # This shouldn't be reached, but just in case
    raise last_exception
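
# Illustrative usage (sketch only; assumes `page` and `url` exist, as in the crawl functions below):
#
#   async def _load():
#       return await page.goto(url, wait_until="domcontentloaded", timeout=PAGE_LOAD_TIMEOUT)
#
#   response = await retry_with_backoff(_load)
#
# Non-connection errors are re-raised immediately; only connection-style failures are retried.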

def classify_iframe_url(src: str, title: str = "", aria_label: str = "", class_name: str = "") -> dict:
    """Classify an iframe URL to determine if it's likely to contain useful content."""
    context = f"{title} {aria_label} {class_name}".lower()
    
    # Skip obvious tracking/social media iframes
    skip_patterns = [
        r'google.*analytics', r'facebook\.com/plugins', r'twitter\.com', r'instagram\.com',
        r'youtube\.com/embed', r'maps\.google', r'googletagmanager',
        r'doubleclick\.net', r'googlesyndication', r'amazon-adsystem'
    ]
    
    if any(re.search(pattern, src, re.IGNORECASE) for pattern in skip_patterns):
        return {"should_scrape": False, "reason": "tracking/ads/social", "priority": 0}
    
    # High priority content indicators
    high_priority_patterns = [
        (r'menu', r'menu|food|dining|restaurant'),
        (r'shop|store|product', r'shop|store|product|catalog|ecommerce'),
        (r'book|reservation', r'book|reservation|appointment|schedule'),
        (r'gallery|photo', r'gallery|photo|image|portfolio'),
        (r'form|contact', r'form|contact|inquiry|message'),
        (r'event|calendar', r'event|calendar|schedule|booking')
    ]
    
    # Medium priority content indicators  
    medium_priority_patterns = [
        (r'app\.|widget', r'app|widget|tool|calculator'),
        (r'embed', r'embed|content|article'),
        (r'player|media', r'player|media|audio|video')
    ]
    
    priority = 0
    content_type = "unknown"
    
    # Check for high priority content
    for url_pattern, context_pattern in high_priority_patterns:
        if re.search(url_pattern, src, re.IGNORECASE) or re.search(context_pattern, context):
            priority = 3
            content_type = url_pattern.split('|')[0]
            break
    
    # Check for medium priority content
    if priority == 0:
        for url_pattern, context_pattern in medium_priority_patterns:
            if re.search(url_pattern, src, re.IGNORECASE) or re.search(context_pattern, context):
                priority = 2
                content_type = url_pattern.split('|')[0] 
                break
    
    # Default low priority for other external content
    if priority == 0 and not src.startswith('data:'):
        priority = 1
        content_type = "external_content"
    
    return {
        "should_scrape": priority >= 2,  # Only scrape medium+ priority
        "reason": content_type,
        "priority": priority,
        "context": context.strip()
    }
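
# Illustrative behaviour (comment-only examples, hypothetical iframe URLs):
#   classify_iframe_url("https://widget.example.com/menu", title="Our menu")
#       -> should_scrape=True,  reason="menu", priority=3
#   classify_iframe_url("https://www.youtube.com/embed/abc123")
#       -> should_scrape=False, reason="tracking/ads/social", priority=0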

async def discover_iframe_urls(page, page_url: str) -> list[dict]:
    """Discover iframe URLs on a page and classify them for potential scraping."""
    iframe_discoveries = []
    
    try:
        # Get all iframe elements with their attributes
        iframes_data = await page.evaluate('''() => {
            const iframes = Array.from(document.querySelectorAll('iframe'));
            return iframes.map((iframe, index) => ({
                src: iframe.src,
                title: iframe.title || '',
                ariaLabel: iframe.getAttribute('aria-label') || '',
                className: iframe.className || '',
                id: iframe.id || '',
                width: iframe.width || '',
                height: iframe.height || ''
            }));
        }''')
        
        if not iframes_data:
            return iframe_discoveries
        
        print(f"    🔍 Found {len(iframes_data)} iframes, analyzing for content...")
        
        for i, iframe_data in enumerate(iframes_data):
            src = iframe_data.get('src', '')
            if not src or src.startswith('data:'):
                continue
            
            # Make URL absolute
            absolute_src = urljoin(page_url, src)
            
            # Classify the iframe
            classification = classify_iframe_url(
                absolute_src,
                iframe_data.get('title', ''),
                iframe_data.get('ariaLabel', ''),
                iframe_data.get('className', '')
            )
            
            discovery = {
                "index": i,
                "src": absolute_src,
                "found_on_page": page_url,
                "title": iframe_data.get('title', ''),
                "aria_label": iframe_data.get('ariaLabel', ''),
                "class_name": iframe_data.get('className', ''),
                "classification": classification
            }
            
            iframe_discoveries.append(discovery)
            
            # Log the decision
            status = "✅ INCLUDE" if classification["should_scrape"] else "⏭️ SKIP"
            print(f"      {status} Iframe {i}: {classification['reason']} (priority: {classification['priority']}) - {absolute_src[:80]}...")
            if classification.get("context"):
                print(f"        Context: {classification['context'][:100]}")
    
    except Exception as e:
        print(f"    ❌ Iframe discovery error: {str(e)[:100]}")
    
    return iframe_discoveries

async def fetch_sitemap_urls(base_url: str) -> set[str]:
    """Fetch URLs from sitemap.xml and robots.txt."""
    found_urls = set()
    
    print("🗺️  Phase 1: Checking for sitemaps...")
    
    async with httpx.AsyncClient(timeout=30.0) as client:
        # Check common sitemap locations
        sitemap_urls = [
            f"{base_url}/sitemap.xml",
            f"{base_url}/sitemap_index.xml",
            f"{base_url}/sitemap.txt",
            f"{base_url}/robots.txt"
        ]
        
        # Check robots.txt for sitemap references
        try:
            robots_response = await client.get(f"{base_url}/robots.txt")
            if robots_response.status_code == 200:
                robots_content = robots_response.text
                sitemap_matches = re.findall(r'Sitemap:\s*(.+)', robots_content, re.IGNORECASE)
                sitemap_urls.extend(sitemap_matches)
                print(f"   - Found {len(sitemap_matches)} sitemaps in robots.txt")
        except Exception:
            pass
        
        # Process all sitemap URLs
        for sitemap_url in set(sitemap_urls):
            try:
                response = await client.get(sitemap_url)
                if response.status_code == 200:
                    content = response.text
                    
                    if sitemap_url.endswith('.xml'):
                        # Parse XML sitemap
                        try:
                            root = ET.fromstring(content)
                            # Handle different sitemap namespaces
                            for url_elem in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
                                loc_elem = url_elem.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
                                if loc_elem is not None:
                                    found_urls.add(loc_elem.text.strip())
                            
                            # Handle sitemap index files
                            for sitemap_elem in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap'):
                                loc_elem = sitemap_elem.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
                                if loc_elem is not None:
                                    # Recursively fetch sub-sitemaps
                                    sub_response = await client.get(loc_elem.text.strip())
                                    if sub_response.status_code == 200:
                                        sub_root = ET.fromstring(sub_response.text)
                                        for url_elem in sub_root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
                                            loc_elem = url_elem.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
                                            if loc_elem is not None:
                                                found_urls.add(loc_elem.text.strip())
                                                
                        except ET.ParseError:
                            pass
                    
                    elif sitemap_url.endswith('.txt'):
                        # Parse text sitemap
                        for line in content.split('\n'):
                            line = line.strip()
                            if line and line.startswith('http'):
                                found_urls.add(line)
                                
            except Exception as e:
                print(f"   - Could not fetch {sitemap_url}: {e}")
    
    print(f"   - Found {len(found_urls)} URLs in sitemaps")
    return found_urls
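
# For reference, the XML parsed above follows the standard sitemap protocol (sketch):
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <url><loc>https://example.com/about</loc></url>
#   </urlset>
# while sitemap index files wrap <sitemap><loc>...</loc></sitemap> entries, which are fetched recursively.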

async def crawl_site_manually(base_url: str, context) -> tuple[set[str], list[dict]]:
    """Manual crawl to discover additional URLs and iframe content."""
    print("🕷️  Phase 2: Manual site crawling with iframe discovery...")
    
    found_urls = set()
    visited_urls = set()
    urls_to_visit = {base_url}
    base_netloc = urlparse(base_url).netloc
    all_iframe_discoveries = []
    
    depth = 0
    while urls_to_visit and depth < MAX_CRAWL_DEPTH and len(found_urls) < MAX_URLS_PER_DOMAIN:
        current_level_urls = urls_to_visit.copy()
        urls_to_visit.clear()
        depth += 1
        
        print(f"   - Crawling depth {depth}: {len(current_level_urls)} URLs")
        
        for url in current_level_urls:
            if url in visited_urls:
                continue
                
            visited_urls.add(url)
            page = None
            
            try:
                page = await context.new_page()
                
                # Navigate with retry logic
                async def navigate_crawl_page():
                    return await page.goto(url, wait_until="domcontentloaded", timeout=PAGE_LOAD_TIMEOUT)
                
                await retry_with_backoff(navigate_crawl_page)
                await page.wait_for_timeout(2000)  # Brief wait for dynamic content
                
                # Discover iframes on this page
                iframe_discoveries = await discover_iframe_urls(page, url)
                all_iframe_discoveries.extend(iframe_discoveries)
                
                # Extract all links from the page including SPA navigation
                links = await page.evaluate("""
                    () => {
                        const links = new Set();
                        
                        // Standard links
                        document.querySelectorAll('a[href]').forEach(link => {
                            const href = link.href;
                            if (href && !href.startsWith('javascript:') && !href.startsWith('mailto:') && !href.startsWith('tel:') && !href.startsWith('#')) {
                                links.add(href);
                            }
                        });
                        
                        // Navigation and footer links
                        document.querySelectorAll('nav a, .menu a, .navigation a, footer a, .footer a, header a').forEach(link => {
                            const href = link.href;
                            if (href && !href.startsWith('javascript:') && !href.startsWith('mailto:') && !href.startsWith('tel:') && !href.startsWith('#')) {
                                links.add(href);
                            }
                        });
                        
                        // Look for SPA-style navigation by extracting href attributes that might be relative paths
                        document.querySelectorAll('a[href^="/"], a[href^="./"], a[href^="../"]').forEach(link => {
                            const href = link.getAttribute('href');
                            if (href && !href.startsWith('#') && !href.includes('javascript:')) {
                                // Convert relative URLs to absolute
                                const absoluteUrl = new URL(href, window.location.origin).href;
                                links.add(absoluteUrl);
                            }
                        });
                        
                        // Also check for common navigation patterns in text content
                        const navTexts = ['about', 'contact', 'services', 'products', 'menu', 'gallery', 'team', 'blog', 'news'];
                        document.querySelectorAll('a').forEach(link => {
                            const text = link.textContent.toLowerCase().trim();
                            const href = link.getAttribute('href');
                            if (href && navTexts.some(navText => text.includes(navText))) {
                                if (href.startsWith('/') || href.startsWith('./') || href.startsWith('../')) {
                                    const absoluteUrl = new URL(href, window.location.origin).href;
                                    links.add(absoluteUrl);
                                } else if (href.startsWith('http')) {
                                    links.add(href);
                                }
                            }
                        });
                        
                        return Array.from(links);
                    }
                """)
                
                for link in links:
                    normalized_link = normalize_url(link)
                    if is_valid_internal_url(normalized_link, base_netloc):
                        found_urls.add(normalized_link)
                        if normalized_link not in visited_urls and depth < MAX_CRAWL_DEPTH:
                            urls_to_visit.add(normalized_link)
                            
            except Exception as e:
                print(f"   - Failed to crawl {url}: {str(e)[:100]}...")
            finally:
                if page:
                    await page.close()
    
    # Add iframe URLs that should be scraped
    iframe_urls_to_scrape = [
        discovery["src"] for discovery in all_iframe_discoveries 
        if discovery["classification"]["should_scrape"]
    ]
    
    found_urls.update(iframe_urls_to_scrape)
    
    print(f"   - Manual crawl found {len(found_urls)} total URLs")
    print(f"   - Including {len(iframe_urls_to_scrape)} iframe content URLs")
    
    return found_urls, all_iframe_discoveries

async def get_gemini_model(model_index: int):
    """Initialize and return Gemini model."""
    try:
        # Auto-setup credentials if not already configured
        setup_gcp_credentials()
        
        model_config = MODELS_CONFIG[model_index]
        gcloud_project = os.getenv("GCLOUD_PROJECT")
        if not gcloud_project:
            print("🔴 Error: GCLOUD_PROJECT not found. Please check your GCP setup.")
            return None
        
        vertexai.init(project=gcloud_project, location=model_config["location"])
        return GenerativeModel(model_config["model_id"])
    except Exception as e:
        print(f"🔴 Error initializing Vertex AI: {e}")
        return None

async def ai_filter_urls_for_redesign(all_urls: set[str], iframe_discoveries: list[dict], site_dir: Path, model_index: int) -> tuple[dict, dict]:
    """Use AI to intelligently select URLs most important for website redesign, including iframe context."""
    
    model = await get_gemini_model(model_index)
    if not model:
        raise ConnectionError("Could not initialize AI model for URL filtering.")
    
    # Convert URLs to list and prepare iframe context
    url_list = sorted(list(all_urls))
    
    # Create iframe context for AI
    iframe_context = {}
    for discovery in iframe_discoveries:
        if discovery["classification"]["should_scrape"]:
            page_url = discovery["found_on_page"]
            if page_url not in iframe_context:
                iframe_context[page_url] = []
            iframe_context[page_url].append({
                "iframe_url": discovery["src"],
                "title": discovery["title"],
                "type": discovery["classification"]["reason"],
                "context": discovery["classification"]["context"]
            })
    
    print(f"🧠 Phase 3: AI filtering {len(all_urls)} URLs for redesign relevance...")
    
    prompt = f"""
You are an expert web designer planning a complete website redesign. Your task is to select the 5-10 MOST IMPORTANT pages from this website that will give you everything needed to understand the business and create an amazing new design.

CRITICAL SELECTION RULES:
1. **MAXIMUM 5-10 PAGES**: You must be highly selective and choose only the most essential pages
2. **ALWAYS INCLUDE**: Homepage (highest priority), About, Contact, Main Services/Menu page
3. **BE SELECTIVE**: Only include pages that provide unique, essential information about the business
4. **CONTACT IS ESSENTIAL**: Any page that could be contact info (even with "copy-of" in URL) should be included
5. **IFRAME CONTENT**: Some pages have important content in iframes (like menus, shops, booking forms) - these are valuable!
6. **AVOID DUPLICATES**: Skip similar/repetitive pages - choose only the best representative example
7. **FOCUS ON CORE BUSINESS**: Prioritize pages that show what the company does and how to contact them

IFRAME CONTENT DISCOVERED:
{json.dumps(iframe_context, indent=2)}

DISCOVERED URLs ({len(url_list)} total):
{json.dumps(url_list, indent=2)}

INSTRUCTIONS:
Analyze each URL carefully and select ONLY the 5-10 most essential pages for a complete website redesign. Be ruthless in your selection - quality over quantity. Pay special attention to:
- Homepage (MUST include)
- About/Company information (MUST include if exists)
- Contact information (MUST include if exists)
- Main services/products/menu (select ONE best page)
- Pages with iframe content (especially menus, shops, booking forms)
- One example of each unique content type (not multiple similar pages)

Return a JSON object with:
- "page_limit": integer (5-10 maximum, be conservative)
- "selection_reasoning": string explaining your strategy for choosing these specific pages
- "included_pages": array of selected URLs (5-10 pages maximum)
- "excluded_reasoning": string explaining what you skipped and why (focus on avoiding duplicates)

REMEMBER: Less is more! Choose only the pages absolutely essential to understand the business and its offerings.
"""
    
    response = await model.generate_content_async(prompt)
    
    # Calculate usage with proper pricing
    usage_data = {"cost": 0, "tokens": 0}
    try:
        usage = response.usage_metadata
        model_config = MODELS_CONFIG[model_index]
        pricing = model_config.get("pricing", {"input": 0, "output": 0})
        
        input_tokens = usage.prompt_token_count
        output_tokens = usage.candidates_token_count
        
        input_cost = input_tokens * pricing["input"]
        output_cost = output_tokens * pricing["output"]
        
        usage_data["cost"] = input_cost + output_cost
        usage_data["tokens"] = input_tokens + output_tokens
        
        print(f"   - AI Usage: {usage_data['tokens']:,} tokens, ${usage_data['cost']:.6f}")
        
    except Exception as e:
        print(f"   - Cost calculation error: {e}")
    
    # Parse JSON response
    json_match = re.search(r'\{.*\}', response.text, re.DOTALL)
    if not json_match:
        raise ValueError("AI did not return valid JSON response.")
    
    plan = json.loads(json_match.group(0))
    
    # Save the comprehensive analysis
    analysis_data = {
        "total_urls_found": len(all_urls),
        "all_discovered_urls": url_list,
        "iframe_discoveries": iframe_discoveries,
        "iframe_context": iframe_context,
        "ai_selection_plan": plan,
        "excluded_pages": list(set(all_urls) - set(plan.get("included_pages", [])))
    }
    
    analysis_path = site_dir / "url_analysis.json"
    try:
        with open(analysis_path, "w", encoding='utf-8') as f:
            json.dump(analysis_data, f, indent=4)
        print(f"📊 Full URL analysis saved to: {analysis_path}")
    except Exception as e:
        print(f"⚠️ Warning: Could not save URL analysis: {e}")
    
    print(f"✅ AI selected {len(plan.get('included_pages', []))} pages for redesign analysis")
    
    return plan, usage_data

async def discover_and_download_assets(page, base_url: str, assets_dir: Path, downloaded_urls: set) -> tuple[BeautifulSoup, set[str]]:
    """Enhanced asset discovery and download."""
    image_dir, css_dir = assets_dir / "images", assets_dir / "css"
    images_on_page = set()
    
    # Get all asset URLs from the page
    asset_urls = await page.evaluate('''() => {
        const urls = new Set();
        
        // CSS files
        document.querySelectorAll('link[rel="stylesheet"]').forEach(l => urls.add(l.href));
        document.querySelectorAll('style[data-href]').forEach(s => urls.add(s.dataset.href));
        
        // Images - including lazy loaded and responsive images
        document.querySelectorAll('img, wow-image, picture source').forEach(i => {
            if (i.src && !i.src.startsWith('data:')) urls.add(i.src);
            if (i.dataset.src && !i.dataset.src.startsWith('data:')) urls.add(i.dataset.src);
            if (i.srcset) {
                i.srcset.split(',').forEach(p => {
                    const u = p.trim().split(' ')[0];
                    if (u && !u.startsWith('data:')) urls.add(u);
                });
            }
        });
        
        // Background images from CSS
        document.querySelectorAll('*').forEach(e => {
            const styles = window.getComputedStyle(e);
            const bgImage = styles.backgroundImage;
            if (bgImage && bgImage !== 'none') {
                const matches = bgImage.match(/url\\("?([^")]+)"?\\)/g);
                if (matches) {
                    matches.forEach(match => {
                        const url = match.match(/url\\("?([^")]+)"?\\)/)[1];
                        if (!url.startsWith('data:')) urls.add(url);
                    });
                }
            }
        });
        
        return Array.from(urls);
    }''')
    
    # Download assets
    for asset_url in asset_urls:
        if not asset_url or asset_url in downloaded_urls:
            continue
            
        asset_page = None
        try:
            absolute_asset_url = urljoin(base_url, asset_url)
            if absolute_asset_url.startswith('data:'):
                continue
            
            asset_page = await page.context.new_page()
            response = await asset_page.goto(absolute_asset_url, timeout=ASSET_TIMEOUT)
            
            if response and response.ok:
                content_type = response.headers.get('content-type', '')
                body = await response.body()
                filename_str = Path(urlparse(absolute_asset_url).path).name
                
                if not filename_str:
                    continue
                
                if 'css' in content_type:
                    (css_dir / filename_str).write_text(body.decode('utf-8', errors='ignore'))
                elif 'image' in content_type:
                    ext = mimetypes.guess_extension(content_type) or ''
                    filename = Path(f"{Path(filename_str).stem}{ext}" if not Path(filename_str).suffix and ext else filename_str)
                    (image_dir / filename).write_bytes(body)
                    images_on_page.add(filename.name)
                
                downloaded_urls.add(asset_url)
                
        except Exception:
            pass
        finally:
            if asset_page:
                await asset_page.close()
    
    # Update HTML with local asset paths
    soup = BeautifulSoup(await page.content(), 'html.parser')
    
    # Update CSS links
    for tag in soup.find_all(True, href=True):
        if tag.has_attr('href'):
            filename = Path(urlparse(tag['href']).path).name
            if (css_dir / filename).exists():
                tag['href'] = f'../assets/css/{filename}'
    
    # Update image sources
    for tag in soup.find_all(True, src=True):
        if tag.has_attr('src'):
            filename = Path(urlparse(tag['src']).path).name
            if (image_dir / filename).exists():
                tag['src'] = f'../assets/images/{filename}'
    
    # Update srcset attributes
    for tag in soup.find_all(True, srcset=True):
        if tag.has_attr('srcset'):
            new_srcset = []
            for part in tag['srcset'].split(','):
                url_part = part.strip().split(' ')[0]
                filename = Path(urlparse(url_part).path).name
                if (image_dir / filename).exists():
                    size_part = part.strip().split(' ')[1] if ' ' in part.strip() else ''
                    new_srcset.append(f"../assets/images/{filename} {size_part}".strip())
            if new_srcset:
                tag['srcset'] = ', '.join(new_srcset)
    
    # Update inline styles with background images
    for element in soup.find_all(style=re.compile(r'background-image')):
        if element.has_attr('style'):
            style = element['style']
            matches = re.findall(r'url\("?([^")]+)"?\)', style)
            for match in matches:
                filename = Path(urlparse(match).path).name
                if (image_dir / filename).exists():
                    style = style.replace(match, f'../assets/images/{filename}')
            element['style'] = style
    
    return soup, images_on_page

async def crawl_site_manually_httpx(base_url: str) -> set[str]:
    """Deep crawling with httpx to discover URLs by following links of links."""
    print("🕷️  Phase 2: Deep crawling with httpx (links of links)...")
    
    found_urls = set()
    visited_urls = set()
    urls_to_visit = {base_url}
    base_netloc = urlparse(base_url).netloc
    
    depth = 0
    async with httpx.AsyncClient(
        timeout=30.0, 
        headers={'User-Agent': USER_AGENT}, 
        follow_redirects=True
    ) as client:
        
        while urls_to_visit and depth < MAX_CRAWL_DEPTH and len(found_urls) < MAX_URLS_PER_DOMAIN:
            current_level_urls = urls_to_visit.copy()
            urls_to_visit.clear()
            depth += 1
            
            print(f"   - Crawling depth {depth}: {len(current_level_urls)} URLs")
            
            for url in current_level_urls:
                if url in visited_urls:
                    continue
                    
                visited_urls.add(url)
                found_urls.add(url)
                
                try:
                    response = await client.get(url)
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, 'html.parser')
                        
                        # Extract all links from the current page
                        page_links = set()
                        
                        # Standard links
                        for link in soup.find_all('a', href=True):
                            href = link['href']
                            if href and not href.startswith(('javascript:', 'mailto:', 'tel:', '#')):
                                absolute_url = urljoin(url, href)
                                if is_valid_internal_url(absolute_url, base_netloc):
                                    page_links.add(normalize_url(absolute_url))
                        
                        # Navigation and footer links (higher priority)
                        for nav_selector in ['nav a', '.menu a', '.navigation a', 'footer a', '.footer a', 'header a']:
                            for link in soup.select(nav_selector):
                                href = link.get('href')
                                if href and not href.startswith(('javascript:', 'mailto:', 'tel:', '#')):
                                    absolute_url = urljoin(url, href)
                                    if is_valid_internal_url(absolute_url, base_netloc):
                                        page_links.add(normalize_url(absolute_url))
                        
                        # Look for common navigation patterns in text content
                        nav_texts = ['about', 'contact', 'services', 'products', 'menu', 'gallery', 'team', 'blog', 'news', 'portfolio']
                        for link in soup.find_all('a', href=True):
                            text = link.get_text().lower().strip()
                            href = link['href']
                            if href and any(nav_text in text for nav_text in nav_texts):
                                if not href.startswith(('javascript:', 'mailto:', 'tel:', '#')):
                                    absolute_url = urljoin(url, href)
                                    if is_valid_internal_url(absolute_url, base_netloc):
                                        page_links.add(normalize_url(absolute_url))
                        
                        # Look for SPA/React Router patterns in JavaScript
                        html_content = response.text
                        
                        # Check for React Router routes in script tags or data attributes
                        
                        # Look for route patterns like "/about", "/contact", etc.
                        route_patterns = [
                            r'["\']\/([a-zA-Z0-9\-_]+)["\']',  # "/about", "/contact"
                            r'to=["\']\/([a-zA-Z0-9\-_]+)["\']',  # React Router Link to="/about"
                            r'href=["\']\/([a-zA-Z0-9\-_]+)["\']',  # href="/about"
                            r'path=["\']\/([a-zA-Z0-9\-_]+)["\']',  # Route path="/about"
                        ]
                        
                        for pattern in route_patterns:
                            matches = re.findall(pattern, html_content)
                            for match in matches:
                                # Skip common non-page routes
                                if match not in ['api', 'assets', 'static', 'js', 'css', 'images', 'img', 'fonts', 'favicon']:
                                    potential_url = f"{base_url.rstrip('/')}/{match}"
                                    if is_valid_internal_url(potential_url, base_netloc):
                                        page_links.add(normalize_url(potential_url))
                        
                        # Look for JavaScript bundles (Vite, Webpack, etc.) and analyze them
                        js_files = []
                        for script in soup.find_all('script', src=True):
                            js_src = script.get('src')
                            if js_src and ('.js' in js_src):
                                js_url = urljoin(url, js_src)
                                js_files.append(js_url)
                        
                        # Analyze JavaScript bundles for routes
                        for js_url in js_files[:3]:  # Limit to first 3 JS files to avoid too many requests
                            try:
                                js_response = await client.get(js_url)
                                if js_response.status_code == 200:
                                    js_content = js_response.text
                                    
                                    # Look for React Router patterns in JS bundles
                                    js_route_patterns = [
                                        r'path:\s*["\']\/([a-zA-Z0-9\-_]+)["\']',  # path: "/about"
                                        r'["\']\/([a-zA-Z0-9\-_]+)["\'](?=\s*[,}])',  # "/about" followed by comma or }
                                        r'to=["\']\/([a-zA-Z0-9\-_]+)["\']',  # to="/about"
                                        r'href=["\']\/([a-zA-Z0-9\-_]+)["\']',  # href="/about"
                                        r'navigate\(["\']\/([a-zA-Z0-9\-_]+)["\']',  # navigate("/about")
                                    ]
                                    
                                    js_routes_found = set()
                                    for pattern in js_route_patterns:
                                        matches = re.findall(pattern, js_content)
                                        for match in matches:
                                            # Skip common non-page routes
                                            if match not in ['api', 'assets', 'static', 'js', 'css', 'images', 'img', 'fonts', 'favicon', 'login', 'logout', 'admin']:
                                                js_routes_found.add(match)
                                    
                                    # Add found routes to page_links
                                    for route in js_routes_found:
                                        potential_url = f"{base_url.rstrip('/')}/{route}"
                                        if is_valid_internal_url(potential_url, base_netloc):
                                            page_links.add(normalize_url(potential_url))
                                    
                                    if js_routes_found:
                                        print(f"     - Found {len(js_routes_found)} routes in JS bundle: {', '.join(sorted(js_routes_found))}")
                                        
                            except Exception as e:
                                print(f"     - Could not analyze JS bundle {js_url}: {str(e)[:50]}...")
                        
                        # Look for menu/navigation data in JSON-LD or data attributes
                        for script in soup.find_all('script', type='application/json'):
                            try:
                                data = json.loads(script.string or '{}')
                                # Look for navigation items in JSON data
                                def extract_urls_from_json(obj, prefix=''):
                                    urls = set()
                                    if isinstance(obj, dict):
                                        for key, value in obj.items():
                                            if key in ['href', 'url', 'link', 'path'] and isinstance(value, str):
                                                if value.startswith('/'):
                                                    full_url = f"{base_url.rstrip('/')}{value}"
                                                    if is_valid_internal_url(full_url, base_netloc):
                                                        urls.add(normalize_url(full_url))
                                            elif isinstance(value, (dict, list)):
                                                urls.update(extract_urls_from_json(value, prefix))
                                    elif isinstance(obj, list):
                                        for item in obj:
                                            if isinstance(item, (dict, list)):
                                                urls.update(extract_urls_from_json(item, prefix))
                                    return urls
                                
                                json_urls = extract_urls_from_json(data)
                                page_links.update(json_urls)
                            except Exception:
                                pass
                        
                        # Add new URLs to visit in next depth level
                        for new_url in page_links:
                            if new_url not in visited_urls and depth < MAX_CRAWL_DEPTH:
                                urls_to_visit.add(new_url)
                        
                        print(f"     - Found {len(page_links)} links on {urlparse(url).path or '/'}")
                        
                    else:
                        print(f"     - HTTP {response.status_code} for {urlparse(url).path or '/'}")
                        
                except Exception as e:
                    print(f"     - Error crawling {urlparse(url).path or '/'}: {str(e)[:50]}...")
    
    print(f"   - Deep crawling complete: {len(found_urls)} unique URLs discovered")
    return found_urls

async def run_enhanced_scraper_httpx(base_url: str, limit_override: int | None, model_index: int):
    """Simplified scraper using httpx only (cPanel compatible)."""
    load_dotenv()
    
    # Initialize SPA extractor (SPAContentExtractor is assumed to be defined or imported elsewhere in this project)
    spa_extractor = SPAContentExtractor()
    
    if not base_url.startswith(('http://', 'https://')):
        base_url = 'https://' + base_url
    
    print(f"🚀 [START] httpx-only scraping (cPanel mode): {base_url}")
    domain = urlparse(base_url).netloc.replace('www.', '')
    site_output_dir = OUTPUT_DIR / domain
    
    # Create directory structure
    pages_dir = site_output_dir / "pages"
    assets_dir = site_output_dir / "assets"
    screenshots_dir = site_output_dir / "screenshots"
    
    for d in [pages_dir, assets_dir / "images", assets_dir / "css", screenshots_dir]:
        d.mkdir(parents=True, exist_ok=True)
    
    manifest = {"site_name": domain, "site_url": base_url, "pages": []}
    downloaded_urls = set()
    
    # Phase 1: Sitemap discovery
    sitemap_urls = await fetch_sitemap_urls(base_url)
    
    # Phase 2: Deep crawling with httpx (following links recursively)
    discovered_urls = await crawl_site_manually_httpx(base_url)
    
    # Combine all discovered URLs
    all_urls = sitemap_urls.union(discovered_urls)
    if not all_urls:
        # Fallback to just the homepage if no links found
        all_urls = {base_url}
    
    print(f"📊 Total unique URLs discovered: {len(all_urls)}")
    
    # Phase 3: AI filtering for redesign relevance
    usage_data = {"cost": 0, "tokens": 0}
    try:
        print(f"🧠 Phase 3: AI filtering {len(all_urls)} URLs for redesign relevance...")
        ai_plan, ai_usage = await ai_filter_urls_for_redesign(all_urls, [], site_output_dir, model_index)
        usage_data["cost"] += ai_usage.get("cost", 0)
        usage_data["tokens"] += ai_usage.get("tokens", 0)
        
        limit = limit_override if limit_override is not None else ai_plan.get("page_limit", 10)
        urls_to_process = ai_plan.get("included_pages", [])[:limit]
        print(f"   - AI Usage: {ai_usage.get('tokens', 0)} tokens, ${ai_usage.get('cost', 0):.6f}")
        print(f"✅ AI selected {len(urls_to_process)} pages for redesign analysis")
    except Exception as e:
        print(f"❌ AI filtering failed: {e}. Using top discovered URLs.")
        urls_to_process = list(all_urls)[:limit_override or 10]
    
    if not urls_to_process:
        urls_to_process = [base_url]
    
    print(f"📥 Processing {len(urls_to_process)} pages with httpx...")
    
    async with httpx.AsyncClient(
        timeout=30.0,
        headers={'User-Agent': USER_AGENT},
        follow_redirects=True
    ) as client:
        
        with tqdm(total=len(urls_to_process), desc=f"[{domain}] Pages") as pbar:
            for url in urls_to_process:
                try:
                    pbar.write(f"  -> Scraping: {url}")
                    
                    response = await client.get(url)
                    
                    if response.status_code >= 400:
                        pbar.write(f"     ⚠️  HTTP {response.status_code} error")
                        continue
                    
                    html_content = response.text
                    soup = BeautifulSoup(html_content, 'html.parser')
                    
                    # Enhanced SPA detection and content extraction
                    spa_info = spa_extractor.detect_spa(html_content, url)
                    enhanced_content = html_content
                    
                    if spa_info['is_spa']:
                        pbar.write(f"     🎯 SPA detected: {spa_info['framework']} - enhancing content...")
                        
                        # Try to fetch and analyze JavaScript bundles for routes
                        js_routes = []
                        for script in soup.find_all('script', src=True):
                            js_src = script.get('src')
                            if js_src and any(keyword in js_src.lower() for keyword in ['app', 'main', 'bundle', 'chunk']):
                                try:
                                    js_url = urljoin(url, js_src)
                                    js_response = await client.get(js_url)
                                    if js_response.status_code == 200:
                                        js_content = js_response.text
                                        routes = spa_extractor.extract_spa_routes_from_js(js_content)
                                        js_routes.extend(routes)
                                        pbar.write(f"       📄 Found {len(routes)} routes in {js_src}")
                                except Exception:
                                    pass
                        
                        # Generate enhanced content for SPAs
                        fallback_content = spa_extractor.generate_spa_content_fallback(spa_info, url)
                        
                        # Add enhanced content to the HTML
                        if fallback_content:
                            enhanced_section = soup.new_tag('div', id='spa-enhanced-content', style='display:none;')
                            enhanced_section.string = fallback_content
                            if soup.body:
                                soup.body.append(enhanced_section)
                        
                        # Add discovered routes as meta information
                        if js_routes:
                            routes_meta = soup.new_tag('meta')
                            routes_meta['name'] = 'spa-discovered-routes'
                            routes_meta['content'] = ','.join(js_routes[:10])  # Limit to avoid huge meta tags
                            if soup.head:
                                soup.head.append(routes_meta)
                            pbar.write(f"       🔗 Added {len(js_routes)} discovered routes to page metadata")
                    
                    # Basic asset download
                    images_on_page = set()
                    for img in soup.find_all(['img']):
                        src = img.get('src')
                        if src and not src.startswith('data:'):
                            try:
                                absolute_url = urljoin(url, src)
                                asset_response = await client.get(absolute_url)
                                if asset_response.status_code == 200:
                                    filename = Path(urlparse(src).path).name
                                    if filename and filename.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
                                        (assets_dir / "images" / filename).write_bytes(asset_response.content)
                                        images_on_page.add(filename)
                                        img['src'] = f'../assets/images/{filename}'
                            except Exception:
                                pass
                    
                    # Save enhanced HTML
                    page_filename = sanitize_filename(url)
                    (pages_dir / page_filename).write_text(str(soup.prettify()), encoding='utf-8')
                    
                    # Create placeholder screenshot
                    screenshot_filename = f"{Path(page_filename).stem}.png"
                    screenshot_path = screenshots_dir / screenshot_filename
                    
                    # Simple placeholder file
                    try:
                        from PIL import Image, ImageDraw
                        img = Image.new('RGB', (1200, 800), color='white')
                        draw = ImageDraw.Draw(img)
                        draw.text((50, 50), f"Screenshot placeholder\n{url}", fill='black')
                        img.save(screenshot_path)
                    except Exception:
                        screenshot_path.touch()
                    
                    # Add to manifest
                    page_info = {
                        "url": url,
                        "final_url": str(response.url),
                        "local_path": f"pages/{page_filename}",
                        "screenshot": f"screenshots/{screenshot_filename}",
                        "title": soup.title.string if soup.title else "",
                        "images": sorted(list(images_on_page)),
                        "is_iframe_content": False,
                        "status_code": response.status_code,
                        "scrape_method": "httpx_enhanced"
                    }
                    
                    # Add SPA information if detected
                    if spa_info['is_spa']:
                        page_info["spa_info"] = {
                            "framework": spa_info['framework'],
                            "has_ssr": spa_info['has_ssr'],
                            "routes_discovered": len(js_routes) if 'js_routes' in locals() else 0,
                            "enhanced_content_added": bool(fallback_content)
                        }
                    
                    manifest["pages"].append(page_info)
                    
                except Exception as e:
                    pbar.write(f"❌ Failed: {url} - {str(e)[:100]}...")
                
                pbar.update(1)
    
    # Save manifest if we got any pages
    if manifest["pages"]:
        manifest['assets'] = {
            'images': sorted([p.name for p in (assets_dir / "images").iterdir()]),
            'css': []
        }
        manifest['scrape_date'] = datetime.datetime.now().isoformat()
        manifest['total_urls_discovered'] = len(all_urls)
        manifest['iframe_urls_included'] = 0
        manifest['ai_selection_used'] = usage_data['cost'] > 0
        
        (site_output_dir / "manifest.json").write_text(
            json.dumps(manifest, indent=4), encoding='utf-8'
        )
        
        print(f"✅ [DONE] httpx scraping complete: {site_output_dir}")
        print(f"📊 Scraped {len(manifest['pages'])} pages")
        print(f"   💰 Step cost: ${usage_data['cost']:.6f} ({usage_data['tokens']} tokens)")
    else:
        print("❌ No pages were successfully scraped")
        raise Exception("No pages scraped")

async def run_enhanced_scraper_playwright(base_url: str, limit_override: int | None, model_index: int):
    """Enhanced scraper with iframe URL discovery and smart AI filtering (Playwright version)."""
    load_dotenv()
    
    if not base_url.startswith(('http://', 'https://')):
        base_url = 'https://' + base_url
    
    print(f"🚀 [START] Enhanced scraping with iframe URL discovery: {base_url}")
    domain = urlparse(base_url).netloc.replace('www.', '')
    site_output_dir = OUTPUT_DIR / domain
    
    # Create directory structure
    pages_dir = site_output_dir / "pages"
    assets_dir = site_output_dir / "assets"
    screenshots_dir = site_output_dir / "screenshots"
    
    for d in [pages_dir, assets_dir / "images", assets_dir / "css", screenshots_dir]:
        d.mkdir(parents=True, exist_ok=True)
    
    manifest = {"site_name": domain, "site_url": base_url, "pages": []}
    downloaded_urls = set()
    usage_data = {"cost": 0, "tokens": 0}
    
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=USER_AGENT, viewport=VIEWPORT_SIZE)
        
        # Phase 1: Sitemap discovery
        sitemap_urls = await fetch_sitemap_urls(base_url)
        
        # Phase 2: Manual crawling with iframe discovery
        crawled_urls, iframe_discoveries = await crawl_site_manually(base_url, context)
        
        # Combine all discovered URLs
        all_urls = sitemap_urls.union(crawled_urls)
        base_netloc = urlparse(base_url).netloc
        
        # Filter to only valid URLs (but keep iframe URLs even if external)
        iframe_urls = {discovery["src"] for discovery in iframe_discoveries if discovery["classification"]["should_scrape"]}
        
        valid_urls = {normalize_url(url) for url in all_urls 
                     if is_valid_internal_url(url, base_netloc) or url in iframe_urls}
        
        print(f"📊 Total unique URLs discovered: {len(valid_urls)}")
        print(f"    - Including {len(iframe_urls)} iframe content URLs")
        
        # Special handling for known SPA sites like fullgen.ai
        if 'fullgen.ai' in base_url:
            print(f"   🎯 Detected SPA site (fullgen.ai) - adding known navigation pages")
            spa_pages = [
                f"{base_url.rstrip('/')}/",
                f"{base_url.rstrip('/')}/about",
                f"{base_url.rstrip('/')}/services", 
                f"{base_url.rstrip('/')}/contact"
            ]
            valid_urls.update(spa_pages)
            print(f"   📄 Added SPA pages: {len(spa_pages)} pages")
        
        # Phase 3: AI filtering for redesign relevance
        try:
            ai_plan, ai_usage = await ai_filter_urls_for_redesign(valid_urls, iframe_discoveries, site_output_dir, model_index)
            usage_data["cost"] += ai_usage.get("cost", 0)
            usage_data["tokens"] += ai_usage.get("tokens", 0)
            
            limit = limit_override if limit_override is not None else ai_plan.get("page_limit", 25)
            urls_to_process = ai_plan.get("included_pages", [])[:limit]
        except Exception as e:
            print(f"❌ AI filtering failed: {e}. Using top discovered URLs.")
            urls_to_process = list(valid_urls)[:25]
        
        if not urls_to_process:
            urls_to_process = [base_url]
        
        print(f"\n📥 Phase 4: Processing {len(urls_to_process)} selected pages...")
        
        # Process selected pages
        with tqdm(total=len(urls_to_process), desc=f"[{domain}] Pages") as pbar:
            for url in urls_to_process:
                page = None
                # Pre-initialize so the except handler below can reference these
                # even if the try body fails before they are assigned.
                is_iframe_url = False
                iframe_context = None
                try:
                    # Determine if this is an iframe URL
                    is_iframe_url = url in iframe_urls
                    iframe_context = None
                    
                    if is_iframe_url:
                        # Find the iframe context
                        for discovery in iframe_discoveries:
                            if discovery["src"] == url:
                                iframe_context = discovery
                                break
                    
                    pbar.write(f"  -> Scraping: {url[:100]}{'...' if len(url) > 100 else ''}")
                    if iframe_context:
                        pbar.write(f"     📱 Iframe content: {iframe_context['classification']['reason']} from {iframe_context['found_on_page']}")
                    
                    page = await context.new_page()
                    
                    # Special handling for iframe URLs
                    if is_iframe_url and iframe_context:
                        parent_page_url = iframe_context['found_on_page']
                        pbar.write(f"     🎯 Setting iframe-specific headers and referrer...")
                        
                        # Set headers that mimic iframe embedding
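                        # Sec-Fetch-Dest/Mode/Site describe the request as a cross-site
                        # iframe navigation, and Referer points at the embedding page;
                        # some embed providers refuse to serve content without these hints.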
                        await page.set_extra_http_headers({
                            'Referer': parent_page_url,
                            'Sec-Fetch-Dest': 'iframe',
                            'Sec-Fetch-Mode': 'navigate',
                            'Sec-Fetch-Site': 'cross-site',
                            'X-Requested-With': 'iframe'
                        })
                    
                    # Navigate to the page with retry logic
                    async def navigate_to_page():
                        return await page.goto(url, wait_until="domcontentloaded", timeout=PAGE_LOAD_TIMEOUT)
                    
                    response = await retry_with_backoff(navigate_to_page)
                    
                    # Check if we got redirected or got an error
                    if response:
                        final_url = page.url
                        status_code = response.status
                        
                        if final_url != url:
                            pbar.write(f"     🔄 Redirected from {url[:80]}... to {final_url[:80]}...")
                        
                        if status_code >= 400:
                            pbar.write(f"     ⚠️  HTTP {status_code} error for {url}")
                            continue
                        
                        # For iframe URLs, log what we actually got
                        if is_iframe_url:
                            page_title = await page.title()
                            page_content_sample = await page.evaluate('() => document.body ? document.body.innerText.substring(0, 200) : "No body content"')
                            pbar.write(f"     📄 Page title: '{page_title}'")
                            pbar.write(f"     📝 Content preview: {page_content_sample[:100]}...")
                            
                            # Check if this looks like the expected content
                            if iframe_context['classification']['reason'] == 'menu':
                                # Look for menu-like content
                                has_menu_content = await page.evaluate('''() => {
                                    const text = document.body ? document.body.innerText.toLowerCase() : '';
                                    const menuKeywords = ['menu', 'food', 'price', 'appetizer', 'entree', 'dessert', 'drink', 'beverage', '$'];
                                    return menuKeywords.some(keyword => text.includes(keyword));
                                }''')
                                
                                if not has_menu_content:
                                    pbar.write(f"     ⚠️  Warning: Expected menu content but didn't find menu keywords")
                    
                    await page.wait_for_timeout(RENDER_DELAY)
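                    # The delay above gives client-rendered content a moment to settle
                    # before assets are harvested and the screenshot is taken.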
                    
                    # Download assets and update HTML
                    final_soup, images_on_page = await discover_and_download_assets(page, url, assets_dir, downloaded_urls)
                    
                    # Save HTML
                    page_filename = sanitize_filename(url)
                    (pages_dir / page_filename).write_text(str(final_soup.prettify()), encoding='utf-8')
                    
                    # Take screenshot
                    screenshot_filename = f"{Path(page_filename).stem}.png"
                    await page.screenshot(path=screenshots_dir / screenshot_filename, full_page=True)
                    
                    # Add to manifest
                    page_info = {
                        "url": url,
                        "final_url": page.url if response else url,  # Track redirects
                        "local_path": f"pages/{page_filename}",
                        "screenshot": f"screenshots/{screenshot_filename}",
                        "title": final_soup.title.string if final_soup.title else "",
                        "images": sorted(list(images_on_page)),
                        "is_iframe_content": is_iframe_url,
                        "status_code": response.status if response else None
                    }
                    
                    if iframe_context:
                        page_info["iframe_context"] = {
                            "found_on_page": iframe_context["found_on_page"],
                            "content_type": iframe_context["classification"]["reason"],
                            "title": iframe_context["title"],
                            "aria_label": iframe_context["aria_label"]
                        }
                    
                    # Add SPA information if detected (spa_info is only set when SPA
                    # detection ran for this page, so guard against it being absent)
                    if 'spa_info' in locals() and spa_info.get('is_spa'):
                        page_info["spa_info"] = {
                            "framework": spa_info['framework'],
                            "has_ssr": spa_info['has_ssr'],
                            "routes_discovered": len(js_routes) if 'js_routes' in locals() else 0,
                            "enhanced_content_added": bool(fallback_content) if 'fallback_content' in locals() else False
                        }
                    
                    manifest["pages"].append(page_info)
                    
                except Exception as e:
                    pbar.write(f"\n❌ [{domain}] Failed: {url} - {str(e)[:150]}...")
                    
                    # For iframe URLs, try an alternative approach
                    if is_iframe_url and iframe_context:
                        pbar.write(f"     🔄 Trying alternative approach for iframe URL...")
                        try:
                            # Try with different user agent or headers
                            alt_page = await context.new_page()
                            await alt_page.set_extra_http_headers({
                                'Referer': iframe_context['found_on_page'],
                                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
                            })
                            
                            alt_response = await alt_page.goto(url, wait_until="networkidle", timeout=PAGE_LOAD_TIMEOUT)
                            if alt_response and alt_response.status < 400:
                                pbar.write(f"     ✅ Alternative approach worked!")
                                
                                # Process this page
                                await alt_page.wait_for_timeout(RENDER_DELAY)
                                final_soup, images_on_page = await discover_and_download_assets(alt_page, url, assets_dir, downloaded_urls)
                                
                                page_filename = sanitize_filename(url)
                                (pages_dir / page_filename).write_text(str(final_soup.prettify()), encoding='utf-8')
                                
                                screenshot_filename = f"{Path(page_filename).stem}.png"
                                await alt_page.screenshot(path=screenshots_dir / screenshot_filename, full_page=True)
                                
                                page_info = {
                                    "url": url,
                                    "final_url": alt_page.url,
                                    "local_path": f"pages/{page_filename}",
                                    "screenshot": f"screenshots/{screenshot_filename}",
                                    "title": final_soup.title.string if final_soup.title else "",
                                    "images": sorted(list(images_on_page)),
                                    "is_iframe_content": True,
                                    "status_code": alt_response.status,
                                    "iframe_context": {
                                        "found_on_page": iframe_context["found_on_page"],
                                        "content_type": iframe_context["classification"]["reason"],
                                        "title": iframe_context["title"],
                                        "aria_label": iframe_context["aria_label"]
                                    }
                                }
                                manifest["pages"].append(page_info)
                                
                            await alt_page.close()
                        except Exception as alt_e:
                            pbar.write(f"     ❌ Alternative approach also failed: {str(alt_e)[:100]}...")
                finally:
                    if page:
                        await page.close()
                    # Update progress here so an early `continue` still advances the bar
                    pbar.update(1)
        
        # Only save manifest if we successfully scraped at least one page
        if manifest["pages"]:
            # Finalize manifest
            manifest['assets'] = {
                'images': sorted([p.name for p in (assets_dir / "images").iterdir()]),
                'css': sorted([p.name for p in (assets_dir / "css").iterdir()])
            }
            manifest['scrape_date'] = datetime.datetime.now().isoformat()
            manifest['total_urls_discovered'] = len(all_urls)
            manifest['iframe_urls_included'] = len([p for p in manifest["pages"] if p.get("is_iframe_content")])
            manifest['ai_selection_used'] = True
            
            # Save manifest
            (site_output_dir / "manifest.json").write_text(
                json.dumps(manifest, indent=4), encoding='utf-8'
            )
        else:
            print("❌ No pages were successfully scraped")
            print("🔄 Attempting alternative scraping approaches...")
            
            # Try alternative approaches for stubborn sites
            alternative_success = False
            
            # Approach 1: Try with different browser settings
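            # Relaxed settings: a plain desktop UA, a smaller viewport, and
            # ignore_https_errors=True help sites that reject the default
            # automation fingerprint or serve misconfigured certificates.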
            try:
                print("   🔄 Trying with relaxed browser settings...")
                alt_context = await browser.new_context(
                    user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    viewport={"width": 1280, "height": 720},
                    ignore_https_errors=True,
                    java_script_enabled=True
                )
                
                alt_page = await alt_context.new_page()
                
                # Try a very simple navigation with longer timeout
                try:
                    await alt_page.goto(base_url, wait_until="load", timeout=90000)
                    await alt_page.wait_for_timeout(5000)
                    
                    # If we get here, try to extract basic content
                    title = await alt_page.title()
                    content = await alt_page.content()
                    
                    if title and len(content) > 1000:
                        print(f"   ✅ Alternative approach successful! Got page with title: '{title}'")
                        
                        # Save basic content
                        soup = BeautifulSoup(content, 'html.parser')
                        (pages_dir / "index.html").write_text(str(soup.prettify()), encoding='utf-8')
                        
                        # Take screenshot
                        await alt_page.screenshot(path=screenshots_dir / "index.png", full_page=True)
                        
                        # Create minimal manifest
                        manifest["pages"] = [{
                            "url": base_url,
                            "final_url": alt_page.url,
                            "local_path": "pages/index.html",
                            "screenshot": "screenshots/index.png",
                            "title": title,
                            "images": [],
                            "is_iframe_content": False,
                            "status_code": 200,
                            "scrape_method": "alternative_browser_settings"
                        }]
                        
                        manifest['assets'] = {'images': [], 'css': []}
                        manifest['scrape_date'] = datetime.datetime.now().isoformat()
                        manifest['total_urls_discovered'] = 1
                        manifest['iframe_urls_included'] = 0
                        manifest['ai_selection_used'] = False
                        manifest['scrape_notes'] = "Used alternative browser settings due to connection issues"
                        
                        (site_output_dir / "manifest.json").write_text(
                            json.dumps(manifest, indent=4), encoding='utf-8'
                        )
                        
                        alternative_success = True
                        
                except Exception as alt_e:
                    print(f"   ❌ Alternative browser settings failed: {str(alt_e)[:100]}...")
                
                await alt_context.close()
                
            except Exception as e:
                print(f"   ❌ Alternative approach failed: {str(e)[:100]}...")
            
            if not alternative_success:
                print("❌ All scraping approaches failed - not saving manifest.json")
                # Clean up empty directories if no content was scraped
                if site_output_dir.exists():
                    shutil.rmtree(site_output_dir)
                raise Exception("No pages were successfully scraped")
        
        await browser.close()
    
    print(f"✅ [DONE] Enhanced scraping complete: {site_output_dir}")
    print(json.dumps(usage_data))
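
# Both scraper variants write <site_output_dir>/manifest.json. A minimal sketch of its
# shape (values are illustrative, and the per-page fields vary slightly between the
# httpx and Playwright paths):
#   {
#     "site_name": "example.com",
#     "site_url": "https://example.com",
#     "pages": [{"url": "...", "final_url": "...", "local_path": "pages/index.html",
#                "screenshot": "screenshots/index.png", "title": "...", "images": [],
#                "is_iframe_content": false, "status_code": 200}],
#     "assets": {"images": [], "css": []},
#     "scrape_date": "...", "total_urls_discovered": 0,
#     "iframe_urls_included": 0, "ai_selection_used": true
#   }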

async def run_enhanced_scraper(base_url: str, limit_override: int | None, model_index: int):
    """Main scraper function that always uses httpx mode for consistent behavior."""
    print("🌐 Using httpx mode (forced for consistent behavior)")
    await run_enhanced_scraper_httpx(base_url, limit_override, model_index)

# Enhanced SPA Detection and Content Extraction
class SPAContentExtractor:
    """Enhanced content extractor for Single Page Applications"""
    
    def __init__(self):
        self.spa_indicators = [
            'react', 'vue', 'angular', 'next.js', 'nuxt', 'gatsby',
            'app.js', 'main.js', 'bundle.js', 'chunk.js',
            # Lowercase so the check against the lowercased HTML can actually match
            'data-reactroot', 'ng-app', 'v-app', '__next_data__'
        ]
        
    def detect_spa(self, html_content: str, url: str) -> Dict[str, Any]:
        """Detect if this is a Single Page Application and extract what we can"""
        soup = BeautifulSoup(html_content, 'html.parser')
        
        spa_info = {
            'is_spa': False,
            'framework': None,
            'has_ssr': False,
            'meta_content': {},
            'structured_data': [],
            'fallback_content': '',
            'navigation_hints': []
        }
        
        # Check for SPA indicators
        html_text = html_content.lower()
        for indicator in self.spa_indicators:
            if indicator in html_text:
                spa_info['is_spa'] = True
                if indicator in ['react', 'data-reactroot', '__next_data__']:
                    spa_info['framework'] = 'React/Next.js'
                elif indicator in ['vue', 'v-app', 'nuxt']:
                    spa_info['framework'] = 'Vue/Nuxt'
                elif indicator in ['angular', 'ng-app']:
                    spa_info['framework'] = 'Angular'
                break
        
        # Extract meta information (often available even in SPAs)
        meta_tags = soup.find_all('meta')
        for meta in meta_tags:
            if meta.get('name'):
                spa_info['meta_content'][meta.get('name')] = meta.get('content', '')
            elif meta.get('property'):
                spa_info['meta_content'][meta.get('property')] = meta.get('content', '')
        
        # Look for structured data (JSON-LD)
        json_scripts = soup.find_all('script', type='application/ld+json')
        for script in json_scripts:
            try:
                data = json.loads(script.string)
                spa_info['structured_data'].append(data)
            except (json.JSONDecodeError, TypeError):
                # Skip malformed or empty JSON-LD blocks
                pass
        
        # Check for Server-Side Rendering hints
        if soup.find(string=re.compile(r'window\.__INITIAL_STATE__|window\.__PRELOADED_STATE__|__NEXT_DATA__')):
            spa_info['has_ssr'] = True
        
        # Extract any visible text content (fallback)
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        
        # Get text content
        text_content = soup.get_text()
        # Clean up whitespace
        lines = (line.strip() for line in text_content.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        spa_info['fallback_content'] = ' '.join(chunk for chunk in chunks if chunk)
        
        # Look for navigation hints in comments or data attributes
        nav_hints = []
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            if any(word in comment.lower() for word in ['nav', 'menu', 'route', 'page']):
                nav_hints.append(comment.strip())
        
        # Look for data attributes that might indicate routes
        for element in soup.find_all(attrs={"data-route": True}):
            nav_hints.append(element.get('data-route'))
        
        spa_info['navigation_hints'] = nav_hints
        
        return spa_info
    
    def extract_spa_routes_from_js(self, js_content: str) -> List[str]:
        """Extract route patterns from JavaScript bundles"""
        routes = []
        
        # Common route patterns (each capture group holds the path portion).
        # Component-name references are deliberately not matched: they are not URL paths.
        route_patterns = [
            r'["\']/([\w\-/]+)["\']',          # "/path/to/route"
            r'path:\s*["\']/([\w\-/]*)["\']',  # path: "/route"
            r'route:\s*["\']/([\w\-/]*)["\']', # route: "/route"
        ]
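        # Illustrative example: js_content containing `path: "/pricing"` and the string
        # literal "/blog/post" would yield ["/pricing", "/blog/post"] (deduplicated, unordered).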
        
        for pattern in route_patterns:
            matches = re.findall(pattern, js_content, re.IGNORECASE)
            for match in matches:
                if match and len(match) > 0 and not any(skip in match.lower() for skip in ['http', 'api', 'cdn', '.js', '.css']):
                    routes.append(f"/{match.strip('/')}")
        
        return list(set(routes))
    
    def generate_spa_content_fallback(self, spa_info: Dict[str, Any], url: str) -> str:
        """Generate meaningful content from SPA analysis"""
        content_parts = []
        
        # Use meta content
        if spa_info['meta_content']:
            if 'description' in spa_info['meta_content']:
                content_parts.append(f"Description: {spa_info['meta_content']['description']}")
            if 'og:title' in spa_info['meta_content']:
                content_parts.append(f"Title: {spa_info['meta_content']['og:title']}")
            if 'og:description' in spa_info['meta_content']:
                content_parts.append(f"About: {spa_info['meta_content']['og:description']}")
        
        # Use structured data
        for data in spa_info['structured_data']:
            if isinstance(data, dict):
                if 'name' in data:
                    content_parts.append(f"Business: {data['name']}")
                if 'description' in data:
                    content_parts.append(f"Description: {data['description']}")
        
        # Use fallback content if substantial
        if len(spa_info['fallback_content']) > 100:
            content_parts.append(f"Content: {spa_info['fallback_content'][:500]}...")
        
        # Add framework info
        if spa_info['framework']:
            content_parts.append(f"Framework: {spa_info['framework']}")
        
        return '\n'.join(content_parts) if content_parts else f"Single Page Application detected at {url}"

# Initialize the SPA extractor
spa_extractor = SPAContentExtractor()
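
# Example use (sketch; HTML and URL are illustrative):
#   info = spa_extractor.detect_spa("<html><head><script src='main.js'></script></head>"
#                                   "<body>Hello</body></html>", "https://example.com")
#   if info['is_spa']:
#       summary = spa_extractor.generate_spa_content_fallback(info, "https://example.com")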

if __name__ == "__main__":
    model_options_text = ', '.join([f"{i}='{config['name']}'" for i, config in enumerate(MODELS_CONFIG)])
    
    parser = argparse.ArgumentParser(description="Enhanced website scraper with iframe URL discovery and AI-powered page selection.")
    parser.add_argument("url", type=str, help="Base URL to scrape")
    parser.add_argument("--limit", type=int, help="Override AI-suggested page limit")
    parser.add_argument("--planner-model-index", type=int, default=DEFAULT_PLANNER_MODEL_INDEX, 
                       choices=range(len(MODELS_CONFIG)), 
                       help=f"AI model for planning ({model_options_text})")
    
    args = parser.parse_args()
    
    if not args.url:
        print("❌ URL required")
        parser.print_help()
    else:
        asyncio.run(run_enhanced_scraper(args.url, args.limit, args.planner_model_index))
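
# Example invocation (domain is illustrative):
#   python 1_website_scraper.py example.com --limit 10 --planner-model-index 0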