"""
Data sanitization and validation utilities for Vera Medical CRM
"""

import re
import unicodedata
from typing import Optional

# ============================================================================
# Phone Number Sanitization
# ============================================================================

def sanitize_phone(phone: Optional[str]) -> Optional[str]:
    """
    Sanitize and format phone number
    Accepts various formats and returns (XXX) XXX-XXXX for 10 digits or
    +1 (XXX) XXX-XXXX for 11 digits starting with 1. Anything else is
    returned trimmed but otherwise unchanged (might be international or
    carry an extension).

    Returns None when the input is None, empty, or only whitespace.
    """
    if not phone:
        return None

    trimmed = phone.strip()
    if not trimmed:
        # Whitespace-only input carries no number; treat it like an empty value
        return None

    # Keep only decimal digits. int() maps non-ASCII decimal digits
    # (e.g. full-width digits) to their value, so the formatted
    # result always uses ASCII digits.
    digits = ''.join(str(int(ch)) for ch in trimmed if ch.isdecimal())

    # If 10 digits, format as (XXX) XXX-XXXX
    if len(digits) == 10:
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"

    # If 11 digits starting with 1, format as +1 (XXX) XXX-XXXX
    if len(digits) == 11 and digits[0] == '1':
        return f"+1 ({digits[1:4]}) {digits[4:7]}-{digits[7:]}"

    # Otherwise return the trimmed original
    return trimmed


# ============================================================================
# Postal Code Sanitization (Canadian)
# ============================================================================

def sanitize_postal_code(postal_code: Optional[str]) -> Optional[str]:
    """
    Sanitize and format Canadian postal code
    Accepts various formats and returns A1A 1A1 format.

    All whitespace and hyphens are ignored, letters are upper-cased, and a
    letter 'O' found in a digit position (2nd, 4th, 6th character) is
    replaced with '0'. Only the first six remaining characters are
    considered; anything after them is dropped when they form a valid code.

    Returns the trimmed original when no valid code can be derived
    (non-blocking validation), and None when the input is None, empty, or
    only whitespace.
    """
    if not postal_code:
        return None

    trimmed = postal_code.strip()
    if not trimmed:
        return None

    # Remove every whitespace character (tabs/newlines included, not just
    # ASCII spaces) and hyphens, then upper-case
    compact = re.sub(r'[\s-]+', '', trimmed).upper()

    candidate = list(compact[:6])
    if len(candidate) == 6:
        # Positions 1, 3, 5 must be digits: fix the common 'O' for '0' typo
        for i in (1, 3, 5):
            if candidate[i] == 'O':
                candidate[i] = '0'
        code = ''.join(candidate)

        # [0-9] rather than \d: \d would also accept non-ASCII digits
        if re.fullmatch(r'[A-Z][0-9][A-Z][0-9][A-Z][0-9]', code):
            return f"{code[:3]} {code[3:]}"

    # Doesn't match the pattern: keep the trimmed original so invalid postal
    # codes can still be stored
    return trimmed


def validate_postal_code(postal_code: Optional[str]) -> bool:
    """
    Validate Canadian postal code format (A1A 1A1)
    Whitespace and hyphens are ignored and letters may be in either case.
    Empty or None is considered valid because the field is optional.
    """
    if not postal_code:
        return True  # Optional field

    # Ignore all whitespace (not only ASCII spaces) and hyphens
    compact = re.sub(r'[\s-]+', '', postal_code).upper()

    # [0-9] rather than \d so non-ASCII digits are rejected; fullmatch so
    # nothing may follow the six characters
    return re.fullmatch(r'[A-Z][0-9][A-Z][0-9][A-Z][0-9]', compact) is not None


# ============================================================================
# Email Sanitization
# ============================================================================

def sanitize_email(email: Optional[str]) -> Optional[str]:
    """
    Sanitize email address
    Returns lowercase trimmed email, or None when the value is empty or
    fails a basic structural check:
      - exactly one '@' with a non-empty part before it
      - a domain made of at least two non-empty dot-separated labels
      - no whitespace inside the address
    """
    if not email:
        return None

    cleaned = email.strip().lower()

    local, at, domain = cleaned.partition('@')

    # Need an '@', something before it, and no second '@' after it
    if not at or not local or '@' in domain:
        return None

    # Whitespace inside an address is never valid
    if any(ch.isspace() for ch in cleaned):
        return None

    # Domain must contain a dot and must not have empty labels
    # (rejects "user@localhost", "user@example." and "user@.com")
    labels = domain.split('.')
    if len(labels) < 2 or not all(labels):
        return None

    return cleaned


def validate_email(email: Optional[str]) -> bool:
    """
    Validate email format
    Empty or None is considered valid because the field is optional.
    The value is checked as given (no trimming or case folding).
    """
    if not email:
        return True  # Optional field

    # Basic email pattern. fullmatch is used instead of match with '$'
    # because '$' also matches just before a trailing newline, which would
    # let "user@example.com\n" pass.
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return re.fullmatch(pattern, email) is not None


# ============================================================================
# Name Sanitization
# ============================================================================

def sanitize_name(name: Optional[str]) -> Optional[str]:
    """
    Sanitize person name
    Collapses whitespace and capitalizes the first letter of each word,
    lower-casing the rest. Each part of a hyphenated name (Mary-Jane) and
    the part after an apostrophe (O'Brien, D'Angelo) is capitalized too,
    including when it is the first or only word.

    The prefixes van, von, de, del, da, di stay lowercase unless they are
    the first word.

    Returns None when the input is None, empty, or only whitespace.
    """
    if not name:
        return None

    words = name.split()
    if not words:
        # Whitespace-only input
        return None

    # Common prefixes that should stay lowercase when not leading the name
    prefixes = {'van', 'von', 'de', 'del', 'da', 'di'}

    # Hyphen, straight apostrophe and right single quotation mark. The
    # capture group makes re.split keep the separators in its result.
    separators = "([-'\u2019])"

    def _capitalize_word(word: str) -> str:
        # Capitalize every piece between separators; capitalize() leaves the
        # separator pieces themselves unchanged.
        return ''.join(piece.capitalize() for piece in re.split(separators, word))

    capitalized = []
    for i, word in enumerate(words):
        if i > 0 and word.lower() in prefixes:
            capitalized.append(word.lower())
        else:
            capitalized.append(_capitalize_word(word))

    return ' '.join(capitalized)


def sanitize_company_name(name: Optional[str]) -> Optional[str]:
    """
    Sanitize company name
    Preserves capitalization; trims the ends and collapses every run of
    whitespace into a single space.

    Returns None when the input is None, empty, or only whitespace.
    """
    if not name:
        return None

    # split() with no argument drops leading/trailing whitespace and splits
    # on any whitespace run, so no further strip() is needed
    collapsed = ' '.join(name.split())

    # Whitespace-only input collapses to '' -> report it as missing
    return collapsed or None


# ============================================================================
# Text Sanitization
# ============================================================================

def sanitize_text(text: Optional[str]) -> Optional[str]:
    """
    Sanitize general text fields
    Trims the value and collapses each run of whitespace (spaces, tabs,
    newlines) into one space. Returns None when nothing is left.
    """
    if text:
        # No-argument split() discards surrounding whitespace and breaks on
        # any whitespace run
        tokens = text.split()
        if tokens:
            return ' '.join(tokens)

    return None


# ============================================================================
# Fuzzy Matching for Deduplication
# ============================================================================

def normalize_for_comparison(text: Optional[str]) -> str:
    """
    Normalize text for fuzzy matching
    Folds accented letters to their base letter, converts to lowercase, and
    removes spaces, punctuation and anything else outside a-z / 0-9.
    Returns "" for None or empty input.
    """
    if not text:
        return ""

    # Decompose accented characters (e.g. e-acute -> 'e' + combining mark)
    # and drop the combining marks, so accented and unaccented spellings of
    # a name normalize to the same string instead of losing the letter.
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # casefold() equals lower() for ASCII and also expands sharp s to 'ss'
    return re.sub(r'[^a-z0-9]', '', base_chars.casefold())


def fuzzy_match_score(str1: Optional[str], str2: Optional[str]) -> float:
    """
    Calculate similarity score between two strings (0.0 to 1.0)
    Uses simple character-based comparison on the normalized strings:
      - 0.0 if either string is empty before or after normalization
      - 1.0 if the normalized strings are equal
      - shorter/longer length ratio if one contains the other
      - otherwise the overlap of their distinct characters
        (shared characters / all characters used by either string)

    Note: the last rule ignores character order and repetition, so strings
    built from the same set of characters score 1.0 even when they differ.
    """
    if not str1 or not str2:
        return 0.0

    norm1 = normalize_for_comparison(str1)
    norm2 = normalize_for_comparison(str2)

    # Check for empty results BEFORE comparing: two inputs with nothing
    # comparable in them (e.g. "---" and "...") both normalize to "" and
    # must not be reported as a perfect match.
    if not norm1 or not norm2:
        return 0.0

    if norm1 == norm2:
        return 1.0

    # Check if one contains the other
    if norm1 in norm2 or norm2 in norm1:
        shorter, longer = sorted((len(norm1), len(norm2)))
        return shorter / longer

    # Simple character overlap score on the sets of distinct characters.
    # Both sets are non-empty here, so the union is never empty.
    set1 = set(norm1)
    set2 = set(norm2)

    return len(set1 & set2) / len(set1 | set2)


# ============================================================================
# Data Structure Sanitization
# ============================================================================

def sanitize_form_data(data: dict) -> dict:
    """
    Sanitize all form data based on field type
    Returns a new sanitized dictionary; the input is not modified.

    The sanitizer is picked from the lower-cased key, first match wins:
      - contains 'phone'                         -> sanitize_phone
      - contains 'email'                         -> sanitize_email
      - contains 'postal'                        -> sanitize_postal_code
      - contains 'first_name' or 'last_name'
        (and not 'company')                      -> sanitize_name
      - contains 'company' and 'name'            -> sanitize_company_name
      - contains 'firm' and 'name'               -> sanitize_company_name
      - any other string                         -> sanitize_text

    None and blank strings become None. Non-string values (booleans,
    numbers, lists, ...) are copied through unchanged whatever the key is.
    """
    sanitized = {}

    for key, value in data.items():
        if value is None:
            sanitized[key] = None
            continue

        # The sanitizers only work on strings. A non-string value under a
        # matching key (e.g. a boolean "email_opt_in") would otherwise crash
        # inside them, so pass it through untouched.
        if not isinstance(value, str):
            sanitized[key] = value
            continue

        # Skip empty values
        if not value.strip():
            sanitized[key] = None
            continue

        field = key.lower()

        # Phone fields
        if 'phone' in field:
            sanitized[key] = sanitize_phone(value)

        # Email fields
        elif 'email' in field:
            sanitized[key] = sanitize_email(value)

        # Postal code fields
        elif 'postal' in field:
            sanitized[key] = sanitize_postal_code(value)

        # Name fields (but not company names)
        elif ('first_name' in field or 'last_name' in field) and 'company' not in field:
            sanitized[key] = sanitize_name(value)

        # Company name fields
        elif 'company' in field and 'name' in field:
            sanitized[key] = sanitize_company_name(value)

        # Firm name
        elif 'firm' in field and 'name' in field:
            sanitized[key] = sanitize_company_name(value)

        # Any other text field
        else:
            sanitized[key] = sanitize_text(value)

    return sanitized


# ============================================================================
# Validation
# ============================================================================

def validate_form_data(data: dict) -> tuple[bool, list[str]]:
    """
    Validate form data
    Returns (is_valid, list_of_errors)

    Every non-empty string value whose key contains 'email'
    (case-insensitive) must pass validate_email; each failure adds
    "Invalid email format: <key>" to the error list.

    Note: Postal codes are deliberately not checked here. They are optional
    and non-blocking, so an invalid postal code never produces an error.
    Non-string values are skipped because validate_email only accepts text.
    """
    errors = []

    for key, value in data.items():
        # Empty values are optional; non-string values (e.g. a boolean
        # opt-in flag under an "email_..." key) cannot be pattern-checked
        if not isinstance(value, str) or not value:
            continue

        # Validate emails
        if 'email' in key.lower() and not validate_email(value):
            errors.append(f"Invalid email format: {key}")

    return (not errors, errors)