#!/usr/bin/env python3
"""
Stage 1: EML File Scanner
Scans .eml files exported from Foxmail

Usage:
    python3 01_parse.py --scan /path/to/eml/folder --output /output
"""

import os
import re
import json
from pathlib import Path
from typing import List, Dict, Tuple
import argparse

# Default location of the JSONL output.
# NOTE(review): no function visible in this file reads this constant; main()
# builds its own path from the --output argument. Confirm before removing.
OUTPUT_FILE = "/output/emails.jsonl"

# Category label -> keywords that select it. classify_email() walks this dict
# in insertion order and returns on the first keyword found as a substring of
# the lower-cased "subject body sender" text, so earlier categories win.
# Consequence of that ordering: 'customer/order' already lists 'quotation',
# 'quote' and '报价' (which is also a substring of '报价单'), so within the
# 'quotation' category only 'pricing' can ever be the matching keyword.
CATEGORIES = {
    'customer/order': ['order', 'invoice', 'payment', 'quotation', 'quote', '客户', '订单', '发票', '报价'],
    'customer/inquiry': ['inquiry', 'enquiry', 'question', 'request for', 'information', '咨询', '询问'],
    'supplier': ['supplier', 'vendor', 'procurement', 'vendor contact', '供应商', '供货商', '采购'],
    'shipment': ['shipment', 'shipping', 'delivery', 'tracking', 'freight', 'cargo', '发货', '物流', '货运', '运输'],
    'quotation': ['quotation', 'quote', 'pricing', '报价单', '报价'],
    'contract': ['contract', 'agreement', 'terms', '条款', '合同', '协议'],
    'complaint': ['complaint', 'issue', 'problem', 'defect', 'quality', '投诉', '问题', '质量'],
}

def clean_text(text: str) -> str:
    """Normalise whitespace in *text* and cap its length.

    Every run of whitespace becomes a single space, leading and trailing
    whitespace is removed, and the result is cut to at most 8000 characters.
    Falsy input (``None`` or ``""``) yields an empty string.
    """
    if text:
        collapsed = re.sub(r'\s+', ' ', text)
        trimmed = collapsed.strip()
        return trimmed[:8000]
    return ""

def classify_email(subject: str, body: str, sender: str) -> Tuple[str, List[str]]:
    """Pick a category for an email by keyword lookup.

    The subject, body and sender are joined with spaces and lower-cased, then
    CATEGORIES is searched in its definition order. The first keyword that
    occurs as a substring decides the result.

    Returns:
        ``(category, [keyword])`` for the first hit, or
        ``('uncategorized', [])`` when no keyword occurs.
    """
    haystack = f"{subject} {body} {sender}".lower()
    first_hit = next(
        (
            (label, [needle])
            for label, needles in CATEGORIES.items()
            for needle in needles
            if needle.lower() in haystack
        ),
        None,
    )
    if first_hit is None:
        return 'uncategorized', []
    return first_hit

# Header prefixes recognised by parse_eml, mapped to the record field each one
# fills. The ASCII names are compared case-insensitively (mail header field
# names are not case-sensitive); the Chinese labels are matched as written.
_HEADER_FIELDS = (
    ('from:', 'sender'),
    ('to:', 'recipient'),
    ('subject:', 'subject'),
    ('date:', 'date'),
    ('发件人:', 'sender'),
    ('收件人:', 'recipient'),
    ('主题:', 'subject'),
    ('日期:', 'date'),
)


def _decode_eml_bytes(raw: bytes) -> str:
    """Decode raw message bytes: strict UTF-8 first, then strict GBK.

    Decoding strictly lets invalid UTF-8 trigger the GBK attempt instead of
    silently dropping bytes. 'utf-8-sig' also removes a leading BOM, which
    would otherwise stop the first header line from being recognised.
    """
    for encoding in ('utf-8-sig', 'gbk'):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    # Neither codec decodes cleanly: keep whatever UTF-8 can read.
    return raw.decode('utf-8-sig', errors='ignore')


def _split_header_lines(lines: List[str]) -> Tuple[List[str], int]:
    """Return (unfolded header lines, index of the first body line).

    Headers end at the first blank (whitespace-only) line. A line that starts
    with a space or tab is a folded continuation and is appended to the header
    before it. When there is no blank line at all, the body index is 0, so the
    caller treats the whole text as the body.
    """
    headers: List[str] = []
    for index, line in enumerate(lines):
        stripped = line.rstrip()
        if not stripped:
            return headers, index + 1
        if headers and line[0] in ' \t':
            headers[-1] = f"{headers[-1]} {stripped.strip()}"
        else:
            headers.append(stripped)
    return headers, 0


def parse_eml(filepath: Path) -> List[Dict]:
    """Parse a single .eml file into at most one email record.

    The file is read as bytes and decoded (UTF-8, falling back to GBK). The
    header section is scanned for From/To/Subject/Date or their Chinese
    equivalents, and everything after the first blank line becomes the body.
    Header values and body are passed through clean_text(), and the record is
    categorised with classify_email().

    Args:
        filepath: Path of the .eml file.

    Returns:
        A one-element list holding the record dict (keys: id, folder, sender,
        recipient, subject, date, body, category, keywords, source), or an
        empty list when the file cannot be read or has neither a subject nor
        a body.
    """
    # Reading is the only step expected to fail; report it and skip the file
    # so that one unreadable file does not abort a whole scan.
    try:
        raw = filepath.read_bytes()
    except OSError as e:
        print(f"  Error: {e}")
        return []

    lines = _decode_eml_bytes(raw).split('\n')
    header_lines, body_start = _split_header_lines(lines)

    email_data = {
        'id': filepath.stem,
        'folder': str(filepath.parent.name),
        'sender': '',
        'recipient': '',
        'subject': '',
        'date': '',
        'body': '',
        'category': 'uncategorized',
        'keywords': [],
        'source': str(filepath)
    }

    # If a header occurs more than once, the last occurrence wins.
    for header in header_lines:
        for prefix, field in _HEADER_FIELDS:
            if header[:len(prefix)].lower() == prefix:
                email_data[field] = clean_text(header[len(prefix):])
                break

    email_data['body'] = clean_text('\n'.join(lines[body_start:]))

    email_data['category'], email_data['keywords'] = classify_email(
        email_data['subject'], email_data['body'], email_data['sender']
    )

    # Records with neither subject nor body carry nothing worth exporting.
    if email_data['subject'] or email_data['body']:
        return [email_data]
    return []

def find_eml_files(root_path: str) -> List[Path]:
    """Find all .eml files below *root_path*, recursively.

    The extension is matched case-insensitively, so ``.EML`` files are found
    on case-sensitive filesystems too. Directories whose name happens to end
    in ``.eml`` are skipped. The result is sorted so repeated scans process
    files in the same order.

    Args:
        root_path: Directory to search.

    Returns:
        Sorted list of paths to regular files ending in ``.eml``.
    """
    root = Path(root_path)
    return sorted(
        candidate
        for candidate in root.rglob('*')
        if candidate.name.lower().endswith('.eml') and candidate.is_file()
    )

def main():
    """Command-line entry point: scan a folder and write emails.jsonl.

    Reads ``--scan`` (required, folder searched for .eml files) and
    ``--output`` (directory for the result, default ``/output``). Each parsed
    email is written as one JSON object per line to ``emails.jsonl`` inside
    the output directory. Nothing is written when no .eml file is found.
    """
    parser = argparse.ArgumentParser(description='EML Email Scanner')
    parser.add_argument('--scan', required=True, help='Path to .eml files')
    parser.add_argument('--output', default='/output', help='Output directory')
    args = parser.parse_args()

    print(f"Scanning: {args.scan}")

    eml_files = find_eml_files(args.scan)
    print(f"Found {len(eml_files)} .eml files")

    if not eml_files:
        print("No .eml files found!")
        return

    os.makedirs(args.output, exist_ok=True)
    output_file = os.path.join(args.output, "emails.jsonl")

    total = 0
    # Records are dumped with ensure_ascii=False, so the file must be opened
    # as UTF-8 explicitly; the platform default encoding may be unable to
    # represent the Chinese text these mails contain.
    with open(output_file, 'w', encoding='utf-8') as f:
        for i, eml in enumerate(eml_files):
            # Progress line every 500 files.
            if i % 500 == 0:
                print(f"[{i}/{len(eml_files)}]")

            records = parse_eml(eml)
            for record in records:
                f.write(json.dumps(record, ensure_ascii=False) + '\n')
            total += len(records)

    print(f"\n✅ Done! {total} emails saved to {output_file}")

# Run the scanner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
