#!/usr/bin/env python3
"""Scrape halo.gocat.top articles and generate static blog"""

import html
import os
import re
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Site being mirrored and the local directory the static pages land in.
BASE_URL = "https://halo.gocat.top"
ARTICLES_DIR = "/tmp/halo_articles"
# NOTE: created at import time as a module-level side effect.
os.makedirs(ARTICLES_DIR, exist_ok=True)

# Browser-like User-Agent so the server doesn't reject the scraper as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}

def get_sitemap():
    """Fetch the sitemap and extract all article URLs.

    Returns:
        list[str]: absolute URLs under /archives/, in sitemap order.

    Raises:
        requests.HTTPError: if the sitemap request fails.
    """
    resp = requests.get(f"{BASE_URL}/sitemap.xml", headers=headers, timeout=30)
    # BUG FIX: fail loudly on HTTP errors instead of silently returning []
    # because an error body contains no <loc> entries.
    resp.raise_for_status()
    return re.findall(
        r'<loc>(https://halo\.gocat\.top/archives/[^<]+)</loc>', resp.text
    )

def scrape_article(url):
    """Scrape a single article page.

    Args:
        url: absolute URL of the article.

    Returns:
        Tuple ``(title, content, date)`` on success, where ``content`` is an
        HTML fragment and ``date`` is 'YYYY-MM-DD' or "". On any failure the
        error is printed and ``(None, None, None)`` is returned.
    """
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        # BUG FIX: don't archive HTTP error pages as if they were articles.
        resp.raise_for_status()
        # BUG FIX: parse the raw bytes so BeautifulSoup detects the page's
        # declared charset itself; requests' header-based guess falls back
        # to ISO-8859-1 for text/html and garbles Chinese text.
        soup = BeautifulSoup(resp.content, 'html.parser')

        # Title: prefer the article <h1>, fall back to the page <title>.
        title_tag = soup.find('h1') or soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else "Untitled"

        # Date: first YYYY-MM-DD path segment anywhere in the page source.
        date_match = re.search(r'/(\d{4}-\d{2}-\d{2})/', resp.text)
        date = date_match.group(1) if date_match else ""

        # Main content: try common article containers, most specific first.
        article = (soup.find('article') or
                   soup.find('div', class_=re.compile('content|article|post')) or
                   soup.find('main') or
                   soup.find('div', id='content'))

        if article:
            # Strip scripts, styles and page chrome from the fragment.
            for tag in article.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                tag.decompose()
            content = str(article)
        else:
            # No recognizable container: keep the whole page as-is.
            content = resp.text

        return title, content, date
    except Exception as e:
        # Boundary handler: one bad article must not abort the whole crawl.
        print(f"Error scraping {url}: {e}")
        return None, None, None

def generate_index(articles):
    """Generate the index.html listing page.

    Args:
        articles: list of dicts with 'title', 'url' and 'date' keys.

    Returns:
        str: a complete HTML document listing the articles newest-first.
    """
    # Sort a copy so the caller's list is not reordered as a side effect.
    ordered = sorted(articles, key=lambda a: a['date'], reverse=True)

    # BUG FIX: the original ran this template through str.format(), which
    # raises KeyError on the literal '{' braces in the inline CSS. Use a
    # plain replace() for the single {count} placeholder instead.
    head = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Cat's Blog - 备份自 halo.gocat.top</title>
    <style>
        body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 900px; margin: 0 auto; padding: 20px; background: #f5f5f5; }
        h1 { color: #333; border-bottom: 2px solid #007bff; padding-bottom: 10px; }
        .article-list { list-style: none; padding: 0; }
        .article-item { background: white; margin: 10px 0; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        .article-item h2 { margin: 0 0 10px 0; font-size: 1.2em; }
        .article-item h2 a { color: #007bff; text-decoration: none; }
        .article-item h2 a:hover { text-decoration: underline; }
        .article-date { color: #666; font-size: 0.9em; }
        .article-link { color: #28a745; text-decoration: none; }
        .header { background: #007bff; color: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
        .header h1 { color: white; border: none; margin: 0; }
        .header p { margin: 10px 0 0 0; opacity: 0.8; }
    </style>
</head>
<body>
    <div class="header">
        <h1>🐱 Cat's Blog</h1>
        <p>备份自 halo.gocat.top | 共 {count} 篇文章</p>
    </div>
    <ul class="article-list">
""".replace("{count}", str(len(ordered)))

    parts = [head]
    for art in ordered:
        slug = art['url'].split('/')[-1]
        # BUG FIX: escape scraped titles so '<' / '&' cannot break the markup.
        title = html.escape(art['title'])
        parts.append(f"""
        <li class="article-item">
            <h2><a href="/blog/{slug}.html">{title}</a></h2>
            <p class="article-date">📅 {art['date']}</p>
            <a class="article-link" href="/blog/{slug}.html">阅读全文 →</a>
        </li>
""")

    parts.append("""    </ul>
</body>
</html>""")
    return "".join(parts)

def generate_article_page(title, content, url, date):
    """Generate a standalone HTML page for one article.

    Args:
        title: article title (plain text; escaped here before embedding).
        content: article body as an HTML fragment (inserted verbatim).
        url: original article URL, shown as the source link.
        date: publication date string ('' when unknown).

    Returns:
        str: a complete HTML document.
    """
    # Rewrite root-relative src/href ("/x" but not protocol-relative "//x")
    # to absolute URLs on the origin site so assets keep working offline.
    content = re.sub(r'(src|href)="/(?!/)', r'\1="https://halo.gocat.top/', content)

    # BUG FIX: escape the scraped title so '<' / '&' cannot break the markup.
    safe_title = html.escape(title)

    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title} - Cat's Blog</title>
    <style>
        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 900px; margin: 0 auto; padding: 20px; background: #f5f5f5; }}
        .article {{ background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
        h1 {{ color: #333; margin-top: 0; }}
        .meta {{ color: #666; font-size: 0.9em; margin-bottom: 20px; }}
        .back {{ display: inline-block; margin-bottom: 20px; color: #007bff; text-decoration: none; }}
        .back:hover {{ text-decoration: underline; }}
        img {{ max-width: 100%; height: auto; }}
        pre {{ background: #f4f4f4; padding: 15px; overflow-x: auto; border-radius: 4px; }}
        code {{ background: #f4f4f4; padding: 2px 6px; border-radius: 3px; }}
    </style>
</head>
<body>
    <a class="back" href="/blog/">← 返回列表</a>
    <article class="article">
        <h1>{safe_title}</h1>
        <p class="meta">📅 {date} | <a href="{url}">原文链接</a></p>
        <hr style="border: none; border-top: 1px solid #eee; margin: 20px 0;">
        <div class="content">
        {content}
        </div>
    </article>
</body>
</html>"""
    return page

def main():
    """Mirror every sitemap article into ARTICLES_DIR and build an index.

    Already-downloaded articles are reused from disk (title/date recovered
    from the saved HTML) so the script is safe to re-run incrementally.
    """
    print("📥 获取文章列表...")
    urls = get_sitemap()
    urls = [u for u in urls if '/archives/' in u]  # Filter only articles
    print(f"找到 {len(urls)} 篇文章")

    articles = []
    for i, url in enumerate(urls):
        slug = url.split('/')[-1]
        html_file = os.path.join(ARTICLES_DIR, f"{slug}.html")

        print(f"[{i+1}/{len(urls)}] 抓取: {slug}")

        if os.path.exists(html_file):
            # Cached: recover title/date from the previously saved page.
            print(f"  ✓ 已存在，跳过")
            # BUG FIX: explicit UTF-8 — the platform default codec (e.g.
            # cp936/cp1252 on Windows) corrupts or rejects the Chinese text.
            with open(html_file, 'r', encoding='utf-8') as f:
                content = f.read()
            title_match = re.search(r'<title>([^<]+)</title>', content)
            title = title_match.group(1).replace(' - Cat\'s Blog', '') if title_match else slug
            date_match = re.search(r'📅 ([\d-]+)', content)
            date = date_match.group(1) if date_match else ""
        else:
            title, content, date = scrape_article(url)
            if title:
                page_html = generate_article_page(title, content, url, date)
                # BUG FIX: explicit UTF-8 (see read above).
                with open(html_file, 'w', encoding='utf-8') as f:
                    f.write(page_html)
                print(f"  ✓ 保存: {title[:50]}")
            else:
                print(f"  ✗ 失败")
                continue
            # Be polite — but only rate-limit when we actually hit the
            # network; cached files above don't need the delay.
            time.sleep(0.5)

        articles.append({'title': title, 'url': url, 'date': date, 'slug': slug})

    # Generate index
    print("\n📝 生成索引页...")
    index_html = generate_index(articles)
    with open(os.path.join(ARTICLES_DIR, "index.html"), 'w', encoding='utf-8') as f:
        f.write(index_html)

    print(f"\n✅ 完成! 共处理 {len(articles)} 篇文章")
    print(f"📁 文件保存在: {ARTICLES_DIR}")

# Entry point: run the crawl only when executed directly, not on import.
if __name__ == "__main__":
    main()
