#!/usr/bin/env python3
import html
import os
import re
import time

import requests
from bs4 import BeautifulSoup

# Base URL of the Halo blog being mirrored.
BASE_URL = 'https://halo.gocat.top'
# Local directory where the scraped article pages and index.html are written.
ARTICLES_DIR = '/tmp/halo_articles'
os.makedirs(ARTICLES_DIR, exist_ok=True)
# Minimal browser-like User-Agent so the server does not reject the requests.
headers = {'User-Agent': 'Mozilla/5.0'}

def get_sitemap():
    """Fetch the blog sitemap and return all article (/archives/) URLs.

    Returns:
        list[str]: absolute article URLs extracted from ``<loc>`` entries.

    Raises:
        requests.HTTPError: if the sitemap request returns an error status.
    """
    resp = requests.get(BASE_URL + '/sitemap.xml', headers=headers, timeout=30)
    # Previously a 4xx/5xx response was parsed silently and yielded an empty
    # list; fail loudly instead so the caller knows the sitemap fetch broke.
    resp.raise_for_status()
    return re.findall(r'<loc>(https://halo\.gocat\.top/archives/[^<]+)</loc>', resp.text)

def scrape_article(url):
    """Download one article page and extract its title, body HTML, and date.

    Args:
        url: Absolute URL of the article page.

    Returns:
        (title, content, date) on success, (None, None, None) on any failure.
        ``content`` is the article's HTML fragment (or the full page as a
        fallback when no recognizable container is found); ``date`` is a
        YYYY-MM-DD string or '' when not found.
    """
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        # Don't save a 404/500 error page as if it were article content.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        title_tag = soup.find('h1') or soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else 'Untitled'
        # Halo permalinks embed the publish date as /YYYY-MM-DD/ in the page.
        date_match = re.search(r'/(\d{4}-\d{2}-\d{2})/', resp.text)
        date = date_match.group(1) if date_match else ''
        # Try increasingly generic containers for the article body.
        article = (soup.find('article')
                   or soup.find('div', class_=re.compile('content|article|post'))
                   or soup.find('main')
                   or soup.find('div', id='content'))
        if article:
            # Strip scripts, styling, and page chrome from the saved copy.
            for tag in article.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                tag.decompose()
            content = str(article)
        else:
            content = resp.text
        return title, content, date
    except Exception as e:
        # Best-effort scrape: report which URL failed so the caller can skip it.
        print(f'Error scraping {url}: {e}')
        return None, None, None

# Shared stylesheet embedded into every generated page.
# NOTE: this string is interpolated with f-strings (not %-formatting), so '%'
# must appear literally; the previous 'max-width: 100%%' emitted invalid CSS.
CSS = '''
body { font-family: -apple-system, BlinkMacSystemFont, sans-serif; max-width: 900px; margin: 0 auto; padding: 20px; background: #f5f5f5; }
h1 { color: #333; border-bottom: 2px solid #007bff; padding-bottom: 10px; }
.article-list { list-style: none; padding: 0; }
.article-item { background: white; margin: 10px 0; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.article-item h2 { margin: 0 0 10px 0; font-size: 1.2em; }
.article-item h2 a { color: #007bff; text-decoration: none; }
.article-date { color: #666; font-size: 0.9em; }
.header { background: #007bff; color: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
.header h1 { color: white; border: none; margin: 0; }
.back { display: inline-block; margin-bottom: 20px; color: #007bff; text-decoration: none; }
.article { background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
img { max-width: 100%; height: auto; }
pre { background: #f4f4f4; padding: 15px; overflow-x: auto; border-radius: 4px; }
code { background: #f4f4f4; padding: 2px 6px; border-radius: 3px; }
'''

def generate_index(articles):
    """Render the blog index page listing all articles, newest first.

    Args:
        articles: list of dicts with 'title', 'url', and 'date' keys.
                  Sorted in place by date, descending.

    Returns:
        Complete HTML document as a string.
    """
    # Newest first; ISO YYYY-MM-DD dates sort correctly as plain strings.
    articles.sort(key=lambda x: x['date'], reverse=True)
    items = ''
    for art in articles:
        slug = art['url'].split('/')[-1]
        # Escape the scraped title so characters like '<' or '&' cannot
        # break (or inject into) the generated markup.
        title = html.escape(art['title'])
        items += f'''<li class="article-item"><h2><a href="/blog/{slug}.html">{title}</a></h2><p class="article-date">📅 {art['date']}</p><a href="/blog/{slug}.html">阅读全文 →</a></li>'''
    return f'''<!DOCTYPE html><html><head><meta charset="UTF-8"><title>Cat's Blog</title><style>{CSS}</style></head><body><div class="header"><h1>🐱 Cat's Blog</h1><p>备份自 halo.gocat.top | {len(articles)} 篇文章</p></div><ul class="article-list">{items}</ul></body></html>'''

def generate_page(title, content, url, date):
    """Render a standalone HTML page for one scraped article.

    Args:
        title: Article title (plain text; escaped before embedding).
        content: Article body as an HTML fragment (embedded as-is).
        url: Original article URL, linked back from the page.
        date: Publish date string (may be '').

    Returns:
        Complete HTML document as a string.
    """
    # Rewrite root-relative src/href attributes (but not protocol-relative
    # '//...' ones) to absolute URLs on the original host so images and
    # links keep working in the offline copy.
    content = re.sub(r'(src|href)="/(?!/)', r'\1="https://halo.gocat.top/', content)
    # Escape the plain-text title; the body is already HTML and stays raw.
    safe_title = html.escape(title)
    return f'''<!DOCTYPE html><html><head><meta charset="UTF-8"><title>{safe_title}</title><style>{CSS}</style></head><body><a class="back" href="/blog/">← 返回列表</a><article class="article"><h1>{safe_title}</h1><p class="article-date">📅 {date} | <a href="{url}">原文链接</a></p><hr style="border:none;border-top:1px solid #eee;margin:20px 0"><div>{content}</div></article></body></html>'''

def main():
    """Mirror every blog article into ARTICLES_DIR and rebuild the index page."""
    print('Getting article list...')
    urls = [u for u in get_sitemap() if '/archives/' in u]
    print(f'Found {len(urls)} articles')
    articles = []
    for i, url in enumerate(urls):
        slug = url.split('/')[-1]
        html_file = ARTICLES_DIR + '/' + slug + '.html'
        print(f'[{i+1}/{len(urls)}] {slug}')
        if os.path.exists(html_file):
            # Cached copy: recover title/date from the saved page rather than
            # re-fetching.  UTF-8 is explicit because the pages contain CJK
            # text and emoji, and the platform default encoding may not.
            with open(html_file, encoding='utf-8') as f:
                h = f.read()
            t = re.search(r'<title>([^<]+)</title>', h)
            d = re.search(r'📅 ([\d-]+)', h)
            title = t.group(1).replace(' - Cat\'s Blog', '') if t else slug
            date = d.group(1) if d else ''
        else:
            title, content, date = scrape_article(url)
            # scrape_article signals failure with None (an empty title would
            # be a falsy success value, so compare against None explicitly).
            if title is None:
                continue
            with open(html_file, 'w', encoding='utf-8') as f:
                f.write(generate_page(title, content, url, date))
            # Be polite to the server; only throttle after a real fetch,
            # not when serving from the local cache.
            time.sleep(0.3)
        articles.append({'title': title, 'url': url, 'date': date, 'slug': slug})
    with open(ARTICLES_DIR + '/index.html', 'w', encoding='utf-8') as f:
        f.write(generate_index(articles))
    print(f'Done! {len(articles)} articles')

# Guard the entry point so importing this module (e.g. from a test or another
# script) does not immediately kick off a full scrape.
if __name__ == '__main__':
    main()
