#!/usr/bin/env python3
"""E-Commerce Daily Report Scraper"""
import requests
import re
import json
import sys
from datetime import datetime
from html.parser import HTMLParser

# Browser-like request headers.  NOTE(review): presumably intended to
# reduce bot-detection blocking by Amazon — not verified here.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
}

# Report section key -> Amazon Best Sellers page URL to scrape.
# Keys are snake_case and are prettified for display in format_report().
CATEGORIES = {
    'electronics': 'https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/',
    'computers': 'https://www.amazon.com/Best-Sellers-Computers-Accessories/zgbs/computers/',
    'home_kitchen': 'https://www.amazon.com/Best-Sellers-Kitchen-Small-Appliances/zgbs/kitchen/',
    'smart_home': 'https://www.amazon.com/Best-Sellers-Smart-Home-Discovery/zgbs/industrial/',
}

class AmazonParser(HTMLParser):
    """Streaming parser that pulls product titles from an Amazon
    bestsellers page.

    A product record is opened when a tag carries one of the known
    product-container CSS classes; the first plausible text node inside
    it becomes the title; the record is stored when a container-like end
    tag is seen.  At most 20 products are collected.
    """

    def __init__(self):
        super().__init__()
        self.products = []          # finished product dicts, capped at 20
        self.in_product = False     # True while inside a product container
        self.current_product = {}   # record currently being assembled
        self.skip_tags = ['style', 'script', 'noscript', 'iframe']
        self.in_skip = False        # True while inside a non-content tag
        self.tag_stack = []         # open tags, popped on matching end tags

    def handle_starttag(self, tag, attrs):
        if tag in self.skip_tags:
            self.in_skip = True
            return
        self.tag_stack.append(tag)
        attrs_dict = dict(attrs)
        if 'class' in attrs_dict:
            cls = attrs_dict['class'].lower()
            # Amazon product container classes.  NOTE(review): 'a-section'
            # and 'a-spacing' are generic layout classes, so false positives
            # are likely; the length filter in handle_data partially
            # compensates.
            if any(x in cls for x in ['zg-item-immersion', 'p13n-asin', 'a-section', 'a-spacing']):
                if not self.in_product:
                    self.in_product = True
                    self.current_product = {}

    def handle_endtag(self, tag):
        if tag in self.skip_tags:
            self.in_skip = False
            return
        if self.tag_stack and self.tag_stack[-1] == tag:
            self.tag_stack.pop()
        # Closing any container-ish tag finishes the current product.
        if self.in_product and tag in ['div', 'li', 'span']:
            if self.current_product and len(self.products) < 20:
                self.products.append(self.current_product)
            self.in_product = False

    def handle_data(self, data):
        if self.in_skip or not self.in_product:
            return
        data = data.strip()
        # Plausible title: non-trivial but not a whole paragraph.
        if 2 < len(data) < 200:
            # BUG FIX: the original guard `'current_product' in dir()` was
            # always False (dir() with no args lists the *local* names of
            # this method), so no title was ever recorded.  Keep only the
            # first text node seen inside the container.
            if 'title' not in self.current_product:
                self.current_product['title'] = data

def parse_amazon_page(html):
    """Extract up to 20 products from an Amazon bestsellers page.

    Runs independent regex scans for titles, prices, ranks and links,
    then pairs them up by position.  Positional pairing can mis-align
    fields when the page omits a price/rank for some items — accepted
    as best-effort for a daily report.

    Args:
        html: Raw page HTML as a string.

    Returns:
        list[dict]: products with 'rank', 'title', 'price' and 'link'
        keys; entries without a title are dropped.
    """
    # NOTE: the original also scanned `data-asin` attributes and a
    # `window.DSPData` JSON blob but never used either result; that dead
    # code has been removed.
    titles = re.findall(r'class="p13n-asin-title[^"]*"[^>]*>([^<]+)<', html)
    prices = re.findall(r'class="p13n-asin-price[^"]*"[^>]*>([^<]+)<', html)
    ranks = re.findall(r'class="zg-item-number[^"]*">#(\d+)', html)
    links = re.findall(r'href="(/dp/[^"?]+)', html)

    products = []
    for i in range(min(len(titles), 20)):
        p = {
            # Fall back to the list position when no explicit rank parsed.
            'rank': ranks[i] if i < len(ranks) else str(i + 1),
            'title': titles[i].strip(),
            'price': prices[i].strip() if i < len(prices) else 'N/A',
            'link': f"https://www.amazon.com{links[i]}" if i < len(links) else '',
        }
        if p['title']:
            products.append(p)

    return products[:20]

def scrape_amazon_category(url, category_name):
    """Scrape a single Amazon bestsellers category page.

    Args:
        url: Category bestsellers URL.
        category_name: Key used to label the result.

    Returns:
        dict: on success — 'category', 'url', 'products', 'success'
        (True iff at least one product parsed) and 'count'; on failure —
        'category', 'url', 'error', 'products' (empty) and 'success'
        (False).  Never raises.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=15)
        # Surface HTTP errors (403/503 bot walls are common) instead of
        # silently parsing an error page into zero products with no
        # 'error' key recorded.
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        products = parse_amazon_page(resp.text)
        # NOTE: the original also collected <script type="application/json">
        # bodies into an unused local; that dead code has been removed.
        return {
            'category': category_name,
            'url': url,
            'products': products,
            'success': len(products) > 0,
            'count': len(products),
        }
    except Exception as e:  # boundary: report network/HTTP/parse failures as data
        return {'category': category_name, 'url': url, 'error': str(e),
                'products': [], 'success': False}

def get_trending_searches():
    """Return up to 10 trending search phrases from Amazon's
    autocomplete endpoint (best-effort alternative data source).

    Returns:
        list[str]: suggestion strings, or [] on any failure.
    """
    url = ('https://completion.amazon.com/search/complete'
           '?method=1&marketplace=ATVPDKIKX0DER&query=trending+products'
           '&searchalias=aps&client=amazon-search-ui&x-asrc=search')
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        data = resp.json()
        # Each suggestion is expected as a [phrase, ...] entry under
        # 'results' — TODO confirm against the live endpoint's schema.
        return [s[0] for s in data.get('results', [])[:10]]
    except Exception:  # was a bare `except:` that also swallowed KeyboardInterrupt
        return []

def format_report(results):
    """Render category scrape results into a markdown daily report.

    Args:
        results: List of dicts as returned by scrape_amazon_category();
            failed entries (success falsy) are skipped.

    Returns:
        str: Markdown report.  NOTE: also performs a network call via
        get_trending_searches() for the trending section.
    """
    today = datetime.now().strftime('%Y-%m-%d')
    now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    md = f"""## 🛒 E-Commerce Daily Report — {today}

**Sources: Amazon Best Sellers, eBay Deals**

---

## 📦 Amazon Best Sellers by Category

"""
    for r in results:
        if not r.get('success'):
            continue
        md += f"""### {r['category'].replace('_', ' ').title()}

| # | Product | Price |
|--|---------|-------|
"""
        # Top 10 only, titles truncated to keep table rows readable.
        for p in r.get('products', [])[:10]:
            title = p.get('title', 'N/A')[:60]
            price = p.get('price', 'N/A')
            rank = p.get('rank', '-')
            md += f"| {rank} | {title} | {price} |\n"
        md += "\n"

    md += """
## 🔥 Trending Products (Amazon)

"""
    trending = get_trending_searches()
    if trending:
        for i, t in enumerate(trending[:10], 1):
            md += f"{i}. {t}\n"
    else:
        md += "_No data available_\n"

    # FIX: the notes previously omitted Smart Home, which IS scraped
    # (see the 'smart_home' entry in CATEGORIES).
    md += f"""
---

## 💡 Report Notes

- Data collected from Amazon Best Sellers pages
- Categories: Electronics, Computers, Home & Kitchen Small Appliances, Smart Home
- Updated daily

---
*Report generated: {now}*
"""
    return md

def main():
    """Scrape every configured category, then print the full report."""
    print("Scraping Amazon categories...")
    results = []
    for name, cat_url in CATEGORIES.items():
        print(f"  - {name}...", end=" ")
        outcome = scrape_amazon_category(cat_url, name)
        results.append(outcome)
        if outcome.get('success'):
            print(f"✓ {outcome.get('count', 0)} products")
        else:
            print(f"✗ {outcome.get('error', 'failed')}")

    report = format_report(results)
    print(f"\nReport length: {len(report)} chars")
    print(report)

if __name__ == '__main__':
    main()
