#!/usr/bin/env python3
"""
PDF OCR + Summarize using existing Ollama (Docker)
For scanned contracts/agreements

Usage:
    python3 pdf_summarize.py /input/contract.pdf

Output:
    Structured summary of key terms, clauses, parties
"""

import os
import sys
import json
import warnings
warnings.filterwarnings('ignore')

# Config - use existing Ollama.
# Both values can be overridden through the OLLAMA_HOST / OLLAMA_MODEL
# environment variables; the literals below are the fallbacks.
OLLAMA_HOST = os.environ.get('OLLAMA_HOST', 'http://localhost:11434')
# Model name sent with every chat request.
MODEL = os.environ.get('OLLAMA_MODEL', 'qwen2.5:3b')  # Check with: docker exec ollama ollama list

# ============== PDF TEXT EXTRACTION ==============

def extract_text_from_pdf(pdf_path: str) -> str:
    """Read the embedded text layer of a PDF using pdfminer.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text when its stripped length exceeds 100 characters.
        An empty string when the text is shorter than that or when pdfminer
        raises (the failure is printed as a warning, not propagated).
    """
    from pdfminer.high_level import extract_text

    extracted = ""
    try:
        candidate = extract_text(pdf_path)
        # Anything this short is treated as "no usable text layer".
        if candidate and len(candidate.strip()) > 100:
            print(f"[OK] Text extracted via pdfminer: {len(candidate)} chars")
            extracted = candidate
    except Exception as err:
        print(f"[WARN] pdfminer failed: {err}")

    return extracted

def ocr_pdf(pdf_path: str, dpi: int = 200, lang: str = 'eng+chi_sim') -> str:
    """OCR a scanned PDF by rasterising its pages and running pytesseract.

    Args:
        pdf_path: Path of the PDF file to OCR.
        dpi: Resolution passed to pdf2image when rendering the pages.
        lang: Tesseract language specification passed to pytesseract
            (several languages are joined with ``+``).

    Returns:
        The recognised text of all pages joined by blank lines, or an empty
        string if rendering or OCR raises (the error is printed, not
        propagated).
    """
    from pdf2image import convert_from_path
    import pytesseract

    print("  Converting PDF to images (OCR)...")
    try:
        images = convert_from_path(pdf_path, dpi=dpi)
        total = len(images)
        print(f"  Found {total} pages")

        full_text = []
        for page_no, img in enumerate(images, start=1):
            print(f"  OCR page {page_no}/{total}...")
            full_text.append(pytesseract.image_to_string(img, lang=lang))

        result = "\n\n".join(full_text)
        print(f"[OK] OCR complete: {len(result)} chars")
        return result
    except Exception as e:
        # Best effort: any failure (rendering or a single page) yields "".
        print(f"[ERROR] OCR failed: {e}")
        return ""

def extract_pdf_content(pdf_path: str) -> str:
    """Return the text of a PDF, falling back to OCR when little text is found.

    The embedded text layer is tried first. If its stripped length is below
    200 characters the document is OCR'd instead. When OCR produces no text
    (for example because it failed), the sparse extracted text is kept rather
    than being replaced by an empty result.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The extracted or OCR'd text; may be empty if both approaches fail.
    """
    print(f"\nProcessing: {pdf_path}")

    # Try text extraction first
    text = extract_text_from_pdf(pdf_path)

    # If too little text, try OCR
    if len(text.strip()) < 200:
        print("  Text too sparse, running OCR...")
        ocr_text = ocr_pdf(pdf_path)
        if ocr_text.strip():
            text = ocr_text
        elif text.strip():
            # Do not throw away real (if short) text because OCR came back empty.
            print("  OCR produced no text, keeping sparse extracted text")
    else:
        print(f"  Extracted {len(text)} chars of text")

    return text

# ============== SUMMARIZE WITH OLLAMA ==============

def check_ollama() -> list:
    """Check whether the Ollama server answers and list its models.

    Sends a GET request to ``{OLLAMA_HOST}/api/tags`` with a 5 second timeout.

    Returns:
        The ``models`` list from the JSON response on HTTP 200; an empty list
        on any other status code or on any exception (both are printed as
        warnings, never raised).
    """
    import httpx

    try:
        resp = httpx.get(f"{OLLAMA_HOST}/api/tags", timeout=5)
        if resp.status_code == 200:
            models = resp.json().get('models', [])
            # .get() so an entry without a name is not reported as "not reachable".
            names = [m.get('name', '?') for m in models]
            print(f"[OK] Ollama connected. Models: {names}")
            return models
        # Previously a non-200 answer returned [] without any message.
        print(f"[WARN] Ollama at {OLLAMA_HOST} returned HTTP {resp.status_code}")
    except Exception as e:
        print(f"[WARN] Ollama not reachable at {OLLAMA_HOST}: {e}")
    return []

def summarize_with_ollama(text: str, max_chars: int = 8000, timeout: float = 120) -> str:
    """Ask Ollama for a structured summary of a contract/agreement.

    Only the first ``max_chars`` characters of ``text`` are embedded in the
    prompt; a warning is printed when the document is longer so the user
    knows the summary covers only part of it.

    Args:
        text: Full document text.
        max_chars: Maximum number of document characters sent to the model.
        timeout: HTTP timeout in seconds for the chat request.

    Returns:
        The model's summary on HTTP 200. On a non-200 response or any
        exception, a string starting with ``"Error: "`` describing the
        failure (nothing is raised).
    """
    import httpx

    if len(text) > max_chars:
        print(f"[WARN] Document has {len(text)} chars; only the first {max_chars} are sent to the model")
    excerpt = text[:max_chars]

    prompt = f"""You are a legal document analyst. Extract and summarize the key information from this contract/agreement.

Extract the following sections:

## 📋 DOCUMENT TYPE & PARTIES
- Type of document (Contract, Agreement, MOU, etc.)
- Party A (Full name, role)
- Party B (Full name, role)
- Date signed/effective

## 📌 KEY TERMS
- Contract duration/period
- Payment terms
- Key obligations of each party
- Termination conditions

## ⚠️ IMPORTANT CLAUSES
- Liability limitations
- Confidentiality terms
- Non-compete or exclusivity
- Force majeure
- Dispute resolution

## 💰 FINANCIAL TERMS
- Contract value/price
- Payment schedule
- Penalties or bonuses

## 🔍 RISK NOTES
- Unusual or concerning terms
- Hidden fees or obligations
- Termination risks

---

DOCUMENT CONTENT:
{excerpt}

Provide a clear, structured summary. If information is not available, state "Not specified". Be concise but thorough."""

    print(f"\nSending to Ollama: {MODEL}...")

    try:
        response = httpx.post(
            f"{OLLAMA_HOST}/api/chat",
            json={
                "model": MODEL,
                "messages": [
                    {"role": "user", "content": prompt}
                ],
                # Ask for one complete JSON reply instead of a token stream.
                "stream": False,
                "options": {
                    "temperature": 0.3,
                    "num_ctx": 8192,
                }
            },
            timeout=timeout
        )

        if response.status_code == 200:
            result = response.json()
            summary = result['message']['content']
            print("[OK] Summary generated")
            return summary
        else:
            print(f"[ERROR] Ollama returned: {response.status_code}")
            return f"Error: {response.text}"
    except Exception as e:
        # Covers connection errors, timeouts and unexpected response shapes.
        print(f"[ERROR] Ollama failed: {e}")
        return f"Error: {e}"

# ============== MAIN ==============

def main():
    """Command-line entry point: extract a PDF's text, summarize it, save it.

    Reads the PDF path from ``sys.argv[1]``. Exits with status 1 when no path
    is given, when the file does not exist, or when no text could be
    extracted. Otherwise prints the summary and writes it to
    ``<pdf_path>.summary.txt`` (UTF-8).

    Returns:
        The summary string returned by ``summarize_with_ollama``.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 pdf_summarize.py <pdf_file>")
        print("   or: python3 pdf_summarize.py /input/contract.pdf")
        print(f"\nOllama: {OLLAMA_HOST}")
        print(f"Model: {MODEL}")
        sys.exit(1)

    pdf_path = sys.argv[1]

    if not os.path.exists(pdf_path):
        print(f"[ERROR] File not found: {pdf_path}")
        sys.exit(1)

    # Check Ollama (informational only; the result is not used to abort)
    check_ollama()

    # Extract content
    text = extract_pdf_content(pdf_path)

    if not text.strip():
        print("[ERROR] Could not extract any text from PDF")
        sys.exit(1)

    # Summarize
    summary = summarize_with_ollama(text)

    # Output
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(summary)

    # Save to file. The summary can contain emoji and non-ASCII text, so the
    # encoding is pinned instead of relying on the platform default.
    output_path = pdf_path + ".summary.txt"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"Source: {pdf_path}\n")
        f.write(f"Model: {MODEL}\n")
        f.write("=" * 60 + "\n\n")
        f.write(summary)
    print(f"\n[Saved] {output_path}")

    return summary

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
