#!/usr/bin/env python3
"""Scrape halo.gocat.top articles and generate static blog"""

import html
import os
import re
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Site being mirrored and the local directory the static pages land in.
BASE_URL = "https://halo.gocat.top"
ARTICLES_DIR = "/tmp/halo_articles"
# NOTE: created at import time as a module-level side effect.
os.makedirs(ARTICLES_DIR, exist_ok=True)

# Browser-like User-Agent so the server doesn't reject the scraper as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}

def get_sitemap():
    """Fetch the sitemap and extract all article URLs.

    Returns:
        list[str]: absolute URLs under /archives/, in sitemap order.

    Raises:
        requests.HTTPError: if the sitemap request fails.
    """
    resp = requests.get(f"{BASE_URL}/sitemap.xml", headers=headers, timeout=30)
    # BUG FIX: fail loudly on HTTP errors instead of silently returning []
    # because an error body contains no <loc> entries.
    resp.raise_for_status()
    return re.findall(
        r'<loc>(https://halo\.gocat\.top/archives/[^<]+)</loc>', resp.text
    )

def scrape_article(url):
    """Scrape a single article page.

    Args:
        url: absolute URL of the article.

    Returns:
        Tuple ``(title, content, date)`` on success, where ``content`` is an
        HTML fragment and ``date`` is 'YYYY-MM-DD' or "". On any failure the
        error is printed and ``(None, None, None)`` is returned.
    """
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        # BUG FIX: don't archive HTTP error pages as if they were articles.
        resp.raise_for_status()
        # BUG FIX: parse the raw bytes so BeautifulSoup detects the page's
        # declared charset itself; requests' header-based guess falls back
        # to ISO-8859-1 for text/html and garbles Chinese text.
        soup = BeautifulSoup(resp.content, 'html.parser')

        # Title: prefer the article <h1>, fall back to the page <title>.
        title_tag = soup.find('h1') or soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else "Untitled"

        # Date: first YYYY-MM-DD path segment anywhere in the page source.
        date_match = re.search(r'/(\d{4}-\d{2}-\d{2})/', resp.text)
        date = date_match.group(1) if date_match else ""

        # Main content: try common article containers, most specific first.
        article = (soup.find('article') or
                   soup.find('div', class_=re.compile('content|article|post')) or
                   soup.find('main') or
                   soup.find('div', id='content'))

        if article:
            # Strip scripts, styles and page chrome from the fragment.
            for tag in article.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                tag.decompose()
            content = str(article)
        else:
            # No recognizable container: keep the whole page as-is.
            content = resp.text

        return title, content, date
    except Exception as e:
        # Boundary handler: one bad article must not abort the whole crawl.
        print(f"Error scraping {url}: {e}")
        return None, None, None

def generate_index(articles):
    """Generate the index.html listing page.

    Args:
        articles: list of dicts with 'title', 'url' and 'date' keys.

    Returns:
        str: a complete HTML document listing the articles newest-first.
    """
    # Sort a copy so the caller's list is not reordered as a side effect.
    ordered = sorted(articles, key=lambda a: a['date'], reverse=True)

    # BUG FIX: the original ran this template through str.format(), which
    # raises KeyError on the literal '{' braces in the inline CSS. Use a
    # plain replace() for the single {count} placeholder instead.
    head = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Cat's Blog - 备份自 halo.gocat.top</title>
    <style>
        body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 900px; margin: 0 auto; padding: 20px; background: #f5f5f5; }
        h1 { color: #333; border-bottom: 2px solid #007bff; padding-bottom: 10px; }
        .article-list { list-style: none; padding: 0; }
        .article-item { background: white; margin: 10px 0; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
        .article-item h2 { margin: 0 0 10px 0; font-size: 1.2em; }
        .article-item h2 a { color: #007bff; text-decoration: none; }
        .article-item h2 a:hover { text-decoration: underline; }
        .article-date { color: #666; font-size: 0.9em; }
        .article-link { color: #28a745; text-decoration: none; }
        .header { background: #007bff; color: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
        .header h1 { color: white; border: none; margin: 0; }
        .header p { margin: 10px 0 0 0; opacity: 0.8; }
    </style>
</head>
<body>
    <div class="header">
        <h1>🐱 Cat's Blog</h1>
        <p>备份自 halo.gocat.top | 共 {count} 篇文章</p>
    </div>
    <ul class="article-list">
""".replace("{count}", str(len(ordered)))

    parts = [head]
    for art in ordered:
        slug = art['url'].split('/')[-1]
        # BUG FIX: escape scraped titles so '<' / '&' cannot break the markup.
        title = html.escape(art['title'])
        parts.append(f"""
        <li class="article-item">
            <h2><a href="/blog/{slug}.html">{title}</a></h2>
            <p class="article-date">📅 {art['date']}</p>
            <a class="article-link" href="/blog/{slug}.html">阅读全文 →</a>
        </li>
""")

    parts.append("""    </ul>
</body>
</html>""")
    return "".join(parts)

def generate_article_page(title, content, url, date):
    """Generate a standalone HTML page for one article.

    Args:
        title: article title (plain text; escaped here before embedding).
        content: article body as an HTML fragment (inserted verbatim).
        url: original article URL, shown as the source link.
        date: publication date string ('' when unknown).

    Returns:
        str: a complete HTML document.
    """
    # Rewrite root-relative src/href ("/x" but not protocol-relative "//x")
    # to absolute URLs on the origin site so assets keep working offline.
    content = re.sub(r'(src|href)="/(?!/)', r'\1="https://halo.gocat.top/', content)

    # BUG FIX: escape the scraped title so '<' / '&' cannot break the markup.
    safe_title = html.escape(title)

    page = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title} - Cat's Blog</title>
    <style>
        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 900px; margin: 0 auto; padding: 20px; background: #f5f5f5; }}
        .article {{ background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
        h1 {{ color: #333; margin-top: 0; }}
        .meta {{ color: #666; font-size: 0.9em; margin-bottom: 20px; }}
        .back {{ display: inline-block; margin-bottom: 20px; color: #007bff; text-decoration: none; }}
        .back:hover {{ text-decoration: underline; }}
        img {{ max-width: 100%; height: auto; }}
        pre {{ background: #f4f4f4; padding: 15px; overflow-x: auto; border-radius: 4px; }}
        code {{ background: #f4f4f4; padding: 2px 6px; border-radius: 3px; }}
    </style>
</head>
<body>
    <a class="back" href="/blog/">← 返回列表</a>
    <article class="article">
        <h1>{safe_title}</h1>
        <p class="meta">📅 {date} | <a href="{url}">原文链接</a></p>
        <hr style="border: none; border-top: 1px solid #eee; margin: 20px 0;">
        <div class="content">
        {content}
        </div>
    </article>
</body>
</html>"""
    return page

def main():
    """Mirror every sitemap article into ARTICLES_DIR and build an index.

    Already-downloaded articles are reused from disk (title/date recovered
    from the saved HTML) so the script is safe to re-run incrementally.
    """
    print("📥 获取文章列表...")
    urls = get_sitemap()
    urls = [u for u in urls if '/archives/' in u]  # Filter only articles
    print(f"找到 {len(urls)} 篇文章")

    articles = []
    for i, url in enumerate(urls):
        slug = url.split('/')[-1]
        html_file = os.path.join(ARTICLES_DIR, f"{slug}.html")

        print(f"[{i+1}/{len(urls)}] 抓取: {slug}")

        if os.path.exists(html_file):
            # Cached: recover title/date from the previously saved page.
            print(f"  ✓ 已存在，跳过")
            # BUG FIX: explicit UTF-8 — the platform default codec (e.g.
            # cp936/cp1252 on Windows) corrupts or rejects the Chinese text.
            with open(html_file, 'r', encoding='utf-8') as f:
                content = f.read()
            title_match = re.search(r'<title>([^<]+)</title>', content)
            title = title_match.group(1).replace(' - Cat\'s Blog', '') if title_match else slug
            date_match = re.search(r'📅 ([\d-]+)', content)
            date = date_match.group(1) if date_match else ""
        else:
            title, content, date = scrape_article(url)
            if title:
                page_html = generate_article_page(title, content, url, date)
                # BUG FIX: explicit UTF-8 (see read above).
                with open(html_file, 'w', encoding='utf-8') as f:
                    f.write(page_html)
                print(f"  ✓ 保存: {title[:50]}")
            else:
                print(f"  ✗ 失败")
                continue
            # Be polite — but only rate-limit when we actually hit the
            # network; cached files above don't need the delay.
            time.sleep(0.5)

        articles.append({'title': title, 'url': url, 'date': date, 'slug': slug})

    # Generate index
    print("\n📝 生成索引页...")
    index_html = generate_index(articles)
    with open(os.path.join(ARTICLES_DIR, "index.html"), 'w', encoding='utf-8') as f:
        f.write(index_html)

    print(f"\n✅ 完成! 共处理 {len(articles)} 篇文章")
    print(f"📁 文件保存在: {ARTICLES_DIR}")

# Entry point: run the crawl only when executed directly, not on import.
if __name__ == "__main__":
    main()
