#!/usr/bin/env python3
import html
import os
import re
import time

import requests
from bs4 import BeautifulSoup

# Base URL of the Halo blog being mirrored.
BASE_URL = 'https://halo.gocat.top'
# Local directory where the scraped article pages and index.html are written.
ARTICLES_DIR = '/tmp/halo_articles'
os.makedirs(ARTICLES_DIR, exist_ok=True)
# Minimal browser-like User-Agent so the server does not reject the requests.
headers = {'User-Agent': 'Mozilla/5.0'}

def get_sitemap():
    """Fetch the blog sitemap and return all article (/archives/) URLs.

    Returns:
        list[str]: absolute article URLs extracted from ``<loc>`` entries.

    Raises:
        requests.HTTPError: if the sitemap request returns an error status.
    """
    resp = requests.get(BASE_URL + '/sitemap.xml', headers=headers, timeout=30)
    # Previously a 4xx/5xx response was parsed silently and yielded an empty
    # list; fail loudly instead so the caller knows the sitemap fetch broke.
    resp.raise_for_status()
    return re.findall(r'<loc>(https://halo\.gocat\.top/archives/[^<]+)</loc>', resp.text)

def scrape_article(url):
    """Download one article page and extract its title, body HTML, and date.

    Args:
        url: Absolute URL of the article page.

    Returns:
        (title, content, date) on success, (None, None, None) on any failure.
        ``content`` is the article's HTML fragment (or the full page as a
        fallback when no recognizable container is found); ``date`` is a
        YYYY-MM-DD string or '' when not found.
    """
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        # Don't save a 404/500 error page as if it were article content.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        title_tag = soup.find('h1') or soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else 'Untitled'
        # Halo permalinks embed the publish date as /YYYY-MM-DD/ in the page.
        date_match = re.search(r'/(\d{4}-\d{2}-\d{2})/', resp.text)
        date = date_match.group(1) if date_match else ''
        # Try increasingly generic containers for the article body.
        article = (soup.find('article')
                   or soup.find('div', class_=re.compile('content|article|post'))
                   or soup.find('main')
                   or soup.find('div', id='content'))
        if article:
            # Strip scripts, styling, and page chrome from the saved copy.
            for tag in article.find_all(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                tag.decompose()
            content = str(article)
        else:
            content = resp.text
        return title, content, date
    except Exception as e:
        # Best-effort scrape: report which URL failed so the caller can skip it.
        print(f'Error scraping {url}: {e}')
        return None, None, None

# Shared stylesheet embedded into every generated page.
# NOTE: this string is interpolated with f-strings (not %-formatting), so '%'
# must appear literally; the previous 'max-width: 100%%' emitted invalid CSS.
CSS = '''
body { font-family: -apple-system, BlinkMacSystemFont, sans-serif; max-width: 900px; margin: 0 auto; padding: 20px; background: #f5f5f5; }
h1 { color: #333; border-bottom: 2px solid #007bff; padding-bottom: 10px; }
.article-list { list-style: none; padding: 0; }
.article-item { background: white; margin: 10px 0; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.article-item h2 { margin: 0 0 10px 0; font-size: 1.2em; }
.article-item h2 a { color: #007bff; text-decoration: none; }
.article-date { color: #666; font-size: 0.9em; }
.header { background: #007bff; color: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
.header h1 { color: white; border: none; margin: 0; }
.back { display: inline-block; margin-bottom: 20px; color: #007bff; text-decoration: none; }
.article { background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
img { max-width: 100%; height: auto; }
pre { background: #f4f4f4; padding: 15px; overflow-x: auto; border-radius: 4px; }
code { background: #f4f4f4; padding: 2px 6px; border-radius: 3px; }
'''

def generate_index(articles):
    """Render the blog index page listing all articles, newest first.

    Args:
        articles: list of dicts with 'title', 'url', and 'date' keys.
                  Sorted in place by date, descending.

    Returns:
        Complete HTML document as a string.
    """
    # Newest first; ISO YYYY-MM-DD dates sort correctly as plain strings.
    articles.sort(key=lambda x: x['date'], reverse=True)
    items = ''
    for art in articles:
        slug = art['url'].split('/')[-1]
        # Escape the scraped title so characters like '<' or '&' cannot
        # break (or inject into) the generated markup.
        title = html.escape(art['title'])
        items += f'''<li class="article-item"><h2><a href="/blog/{slug}.html">{title}</a></h2><p class="article-date">📅 {art['date']}</p><a href="/blog/{slug}.html">阅读全文 →</a></li>'''
    return f'''<!DOCTYPE html><html><head><meta charset="UTF-8"><title>Cat's Blog</title><style>{CSS}</style></head><body><div class="header"><h1>🐱 Cat's Blog</h1><p>备份自 halo.gocat.top | {len(articles)} 篇文章</p></div><ul class="article-list">{items}</ul></body></html>'''

def generate_page(title, content, url, date):
    """Render a standalone HTML page for one scraped article.

    Args:
        title: Article title (plain text; escaped before embedding).
        content: Article body as an HTML fragment (embedded as-is).
        url: Original article URL, linked back from the page.
        date: Publish date string (may be '').

    Returns:
        Complete HTML document as a string.
    """
    # Rewrite root-relative src/href attributes (but not protocol-relative
    # '//...' ones) to absolute URLs on the original host so images and
    # links keep working in the offline copy.
    content = re.sub(r'(src|href)="/(?!/)', r'\1="https://halo.gocat.top/', content)
    # Escape the plain-text title; the body is already HTML and stays raw.
    safe_title = html.escape(title)
    return f'''<!DOCTYPE html><html><head><meta charset="UTF-8"><title>{safe_title}</title><style>{CSS}</style></head><body><a class="back" href="/blog/">← 返回列表</a><article class="article"><h1>{safe_title}</h1><p class="article-date">📅 {date} | <a href="{url}">原文链接</a></p><hr style="border:none;border-top:1px solid #eee;margin:20px 0"><div>{content}</div></article></body></html>'''

def main():
    """Mirror every blog article into ARTICLES_DIR and rebuild the index page."""
    print('Getting article list...')
    urls = [u for u in get_sitemap() if '/archives/' in u]
    print(f'Found {len(urls)} articles')
    articles = []
    for i, url in enumerate(urls):
        slug = url.split('/')[-1]
        html_file = ARTICLES_DIR + '/' + slug + '.html'
        print(f'[{i+1}/{len(urls)}] {slug}')
        if os.path.exists(html_file):
            # Cached copy: recover title/date from the saved page rather than
            # re-fetching.  UTF-8 is explicit because the pages contain CJK
            # text and emoji, and the platform default encoding may not.
            with open(html_file, encoding='utf-8') as f:
                h = f.read()
            t = re.search(r'<title>([^<]+)</title>', h)
            d = re.search(r'📅 ([\d-]+)', h)
            title = t.group(1).replace(' - Cat\'s Blog', '') if t else slug
            date = d.group(1) if d else ''
        else:
            title, content, date = scrape_article(url)
            # scrape_article signals failure with None (an empty title would
            # be a falsy success value, so compare against None explicitly).
            if title is None:
                continue
            with open(html_file, 'w', encoding='utf-8') as f:
                f.write(generate_page(title, content, url, date))
            # Be polite to the server; only throttle after a real fetch,
            # not when serving from the local cache.
            time.sleep(0.3)
        articles.append({'title': title, 'url': url, 'date': date, 'slug': slug})
    with open(ARTICLES_DIR + '/index.html', 'w', encoding='utf-8') as f:
        f.write(generate_index(articles))
    print(f'Done! {len(articles)} articles')

# Guard the entry point so importing this module (e.g. from a test or another
# script) does not immediately kick off a full scrape.
if __name__ == '__main__':
    main()
