"""
Text Cleaning Utilities for Menu Data
Cleans and normalizes text before embedding generation
"""
import re
from typing import List, Dict, Any
from app.utils.logger import logger


def clean_text(text: str) -> str:
    """
    Clean and normalize a single text string

    Steps, in order: replace HTML tags with a space, drop characters
    outside the allowed set, normalize Arabic letters via normalize_arabic,
    then collapse whitespace and trim.

    Args:
        text: Input text (falsy values such as None or "" yield "")

    Returns:
        Cleaned text with single spaces between words
    """
    if not text:
        return ""

    # Replace HTML tags with a space so adjacent words are not glued
    # together (e.g. "rice<br>salad" must not become "ricesalad")
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove special characters (keep Arabic, English, numbers, whitespace,
    # the Arabic and Latin comma, and the full stop)
    text = re.sub(r'[^\u0600-\u06FFa-zA-Z0-9\s،,.]', '', text)

    # Normalize Arabic characters
    text = normalize_arabic(text)

    # Collapse whitespace last: removing tags or symbols above can leave
    # runs of spaces behind (e.g. "tea & milk" -> "tea  milk")
    text = re.sub(r'\s+', ' ', text)

    # Strip leading/trailing whitespace
    return text.strip()


def normalize_arabic(text: str) -> str:
    """
    Normalize Arabic text

    Substitutions are applied in this order:
    - Remove diacritics (تشكيل)
    - Normalize Alef variants (أ، إ، آ → ا)
    - Normalize Teh Marbuta (ة → ه)
    - Normalize Yeh variants (ى → ي)

    Args:
        text: Arabic text

    Returns:
        Normalized text ("" when the input is empty or None)
    """
    if not text:
        return ""

    # (pattern, replacement) pairs, applied one after another
    substitutions = (
        (r'[\u064B-\u065F\u0670]', ''),  # Arabic diacritics
        (r'[أإآ]', 'ا'),                 # Alef variants
        (r'ة', 'ه'),                     # Teh Marbuta
        (r'ى', 'ي'),                     # Yeh variants
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text


def clean_menu_texts(menu_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Clean all text fields in menu data

    Each item is shallow-copied, so the input dictionaries are not modified.
    The fields name_ar, name_en, description and category are converted to
    str and passed through clean_text when present and non-empty.

    A 'tags' value is cleaned element by element when it is a list. When it
    is a string it is first split on the Latin or Arabic comma. Tags that
    are None or that are empty after cleaning are dropped. Tags of any other
    type are left untouched.

    Args:
        menu_data: List of menu items

    Returns:
        New list of cleaned menu items
    """
    logger.info(f"🧹 Cleaning {len(menu_data)} menu items...")

    text_fields = ('name_ar', 'name_en', 'description', 'category')
    cleaned_data = []

    for item in menu_data:
        cleaned_item = item.copy()

        # Clean text fields
        for field in text_fields:
            if cleaned_item.get(field):
                cleaned_item[field] = clean_text(str(cleaned_item[field]))

        # Clean tags (list of strings, or one delimited string)
        tags = cleaned_item.get('tags')
        if tags and isinstance(tags, (list, str)):
            if isinstance(tags, str):
                # Accept both the Latin and the Arabic comma as separators
                tags = re.split(r'[,،]', tags)

            cleaned_tags = []
            for tag in tags:
                if tag is None:
                    continue
                # Coerce like the text fields above: clean_text needs a str
                cleaned_tag = clean_text(tag if isinstance(tag, str) else str(tag))
                if cleaned_tag:
                    cleaned_tags.append(cleaned_tag)
            cleaned_item['tags'] = cleaned_tags

        cleaned_data.append(cleaned_item)

    logger.info(f"✅ Cleaned {len(cleaned_data)} items")

    return cleaned_data


def prepare_text_for_embedding(item: Dict[str, Any]) -> str:
    """
    Prepare a single menu item for embedding

    Combines, in this order: name_ar (x2) + name_en (x2) + description
    + category + tags (x3). Missing or empty fields are skipped, and
    non-string values are converted with str() before joining.

    Args:
        item: Menu item dictionary

    Returns:
        Combined text string, passed through clean_text
    """
    text_parts = []

    # Add names (with weight - repeat 2x for importance)
    for name_field in ('name_ar', 'name_en'):
        if item.get(name_field):
            text_parts.extend([item[name_field]] * 2)

    # Add description and category once each
    for field in ('description', 'category'):
        if item.get(field):
            text_parts.append(item[field])

    # Add tags (with weight - repeat 3x for importance)
    tags = item.get('tags')
    if tags:
        if isinstance(tags, list):
            text_parts.extend(tags * 3)
        elif isinstance(tags, str):
            text_parts.extend([tags] * 3)

    # Drop empty parts and coerce the rest to str: str.join raises
    # TypeError on non-string elements (e.g. a numeric category or tag)
    combined_text = ' '.join(str(part) for part in text_parts if part)

    # Clean the combined text
    return clean_text(combined_text)


def extract_keywords(text: str, min_length: int = 2) -> List[str]:
    """
    Extract keywords from text

    The text is split on whitespace, commas (Latin and Arabic) and full
    stops. Each token is passed through clean_text. Tokens that are empty
    or shorter than min_length after cleaning are discarded, and
    duplicates are removed while keeping first-occurrence order.

    Args:
        text: Input text
        min_length: Minimum keyword length, measured on the cleaned keyword

    Returns:
        List of unique keywords in order of first appearance
    """
    if not text:
        return []

    # Split by whitespace and punctuation
    words = re.split(r'[\s،,.]+', text)

    seen = set()
    unique_keywords = []
    for word in words:
        keyword = clean_text(word)

        # Check the length after cleaning: a raw token such as "a!" is two
        # characters long but cleans down to the one-character "a"
        if not keyword or len(keyword) < min_length:
            continue

        # Remove duplicates while preserving order
        if keyword not in seen:
            seen.add(keyword)
            unique_keywords.append(keyword)

    return unique_keywords


def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate simple text similarity (Jaccard similarity)

    Both texts are reduced to keyword sets with extract_keywords. The
    score is the number of shared keywords divided by the number of
    distinct keywords across both texts.

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score (0-1); 0.0 when either text is empty or yields
        no keywords
    """
    if not (text1 and text2):
        return 0.0

    first_set = set(extract_keywords(text1))
    second_set = set(extract_keywords(text2))
    if not (first_set and second_set):
        return 0.0

    shared = first_set & second_set
    combined = first_set | second_set
    if not combined:
        return 0.0
    return len(shared) / len(combined)


def validate_menu_item(item: Dict[str, Any]) -> bool:
    """
    Check that a menu item carries every required field

    The required fields are 'code' and 'name_ar'. A field counts as
    missing when its key is absent or its value is falsy. Only the first
    missing field is reported, through a logged warning.

    Args:
        item: Menu item dictionary

    Returns:
        True if all required fields are present and non-empty, else False
    """
    required = ('code', 'name_ar')

    # Find the first required field that is absent or empty, if any
    field = next(
        (name for name in required if name not in item or not item[name]),
        None,
    )
    if field is None:
        return True

    logger.warning(f"⚠️  Item missing required field: {field}")
    return False


def enrich_menu_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Enrich menu item with additional fields

    Works on a shallow copy of the item and adds:
    - 'search_text': output of prepare_text_for_embedding for the item
    - 'keywords': keywords extracted from that search text
    - 'name_ar_normalized': normalize_arabic applied to 'name_ar', only
      when the item has a 'name_ar' key

    Args:
        item: Menu item dictionary

    Returns:
        Enriched copy of the item
    """
    search_text = prepare_text_for_embedding(item)

    enriched = item.copy()
    enriched.update({
        'search_text': search_text,
        'keywords': extract_keywords(search_text),
    })

    if 'name_ar' in enriched:
        enriched['name_ar_normalized'] = normalize_arabic(enriched['name_ar'])

    return enriched

