import streamlit as st
import easyocr
import pandas as pd
from io import BytesIO
from PIL import Image
import numpy as np
import os
from pathlib import Path
from gliner import GLiNER
import cv2
import re

# Set environment variables for model storage
os.environ['GLINER_HOME'] = str(Path.home() / '.gliner_models')
os.environ['TRANSFORMERS_CACHE'] = str(Path.home() / '.gliner_models' / 'cache')

# Initialize EasyOCR reader with English and Arabic support
reader = easyocr.Reader(['en', 'ar'])

def get_model_path():
    """Get the path to the local model directory."""
    base_dir = Path.home() / '.gliner_models'
    model_dir = base_dir / 'gliner_large-v2.1'
    return model_dir

def download_model():
    """Download the model if it doesn't exist locally."""
    model_dir = get_model_path()
    if not model_dir.exists():
        st.info("Downloading GLiNER model for the first time... This may take a few minutes.")
        try:
            model_dir.parent.mkdir(parents=True, exist_ok=True)
            temp_model = GLiNER.from_pretrained("urchade/gliner_large-v2.1")
            temp_model.save_pretrained(str(model_dir))
            st.success("Model downloaded successfully!")
            return temp_model
        except Exception as e:
            st.error(f"Error downloading model: {str(e)}")
            raise e
    return None

@st.cache_resource
def load_gliner_model():
    """Load the GLiNER model, downloading it if necessary."""
    model_dir = get_model_path()
    if model_dir.exists():
        try:
            return GLiNER.from_pretrained(str(model_dir))
        except Exception as e:
            st.warning("Error loading existing model. Attempting to redownload...")
            import shutil
            shutil.rmtree(model_dir, ignore_errors=True)
    
    model = download_model()
    if model:
        return model
    return GLiNER.from_pretrained(str(model_dir))

def preprocess_image(image):
    """
    Preprocess the image using OpenCV:
    - Convert to grayscale
    - Apply median blur for denoising
    - Apply thresholding (Otsu) for binarization
    """
    img_array = np.array(image)
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    denoised = cv2.medianBlur(gray, 3)
    _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thresh

def clean_extracted_text(text):
    """
    Clean the extracted text:
    - Remove unwanted characters while preserving Arabic Unicode blocks,
      English letters, digits, spaces, and common punctuation.
    - Normalize extra spaces.
    """
    cleaned = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FFA-Za-z0-9\s@.,-]', '', text)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

def extract_text_from_image(image):
    """
    Preprocess the image and extract text using EasyOCR.
    """
    preprocessed_image = preprocess_image(image)
    return reader.readtext(preprocessed_image, detail=0, paragraph=True)

def process_entities(text: str, model, threshold: float, nested_ner: bool) -> dict:
    """
    Process text with GLiNER model to extract business card entities.
    """
    # Define business card labels
    labels = "person name, company name, job title, phone, email, address"
    labels = [label.strip() for label in labels.split(",")]
    
    # Get predictions
    entities = model.predict_entities(
        text, 
        labels, 
        flat_ner=not nested_ner,
        threshold=threshold
    )
    
    # Format results
    formatted_entities = []
    for entity in entities:
        formatted_entities.append({
            "entity": entity["label"],
            "word": entity["text"],
            "start": entity["start"],
            "end": entity["end"]
        })
    
    # Organize results by category
    results = {
        "Person Name": [],
        "Company Name": [],
        "Job Title": [],
        "Phone": [],
        "Email": [],
        "Address": []
    }
    
    for entity in formatted_entities:
        category = entity["entity"].title()
        if category in results:
            results[category].append(entity["word"])
    
    # Join multiple entries with semicolons
    return {k: "; ".join(set(v)) if v else "" for k, v in results.items()}

def main():
    st.title("Business Card Information Extractor")
    
    # Model settings in sidebar
    st.sidebar.title("Settings")
    
    threshold = st.sidebar.slider(
        "Detection Threshold",
        min_value=0.0,
        max_value=1.0,
        value=0.3,
        step=0.05,
        help="Lower values will detect more entities"
    )
    
    nested_ner = st.sidebar.checkbox(
        "Enable Nested NER",
        value=True,
        help="Allow detection of nested entities"
    )

    # Upload options
    upload_type = st.sidebar.radio("Upload Type", ("Single", "Batch"))
    
    # File uploader for business card images
    uploaded_files = st.file_uploader(
        "Upload Business Card Image(s)", 
        type=["png", "jpg", "jpeg"], 
        accept_multiple_files=(upload_type == "Batch")
    )

    if uploaded_files:
        # Load GLiNER model
        model = load_gliner_model()
        
        results = []
        files_to_process = uploaded_files if isinstance(uploaded_files, list) else [uploaded_files]
        progress_bar = st.progress(0)
        
        for idx, file in enumerate(files_to_process):
            with st.expander(f"Processing {file.name}"):
                image = Image.open(file)
                # Extract text using OCR after preprocessing
                extracted_text_list = extract_text_from_image(image)
                raw_text = " ".join(extracted_text_list)
                # Clean the extracted text
                clean_text = clean_extracted_text(raw_text)
                
                st.text("Extracted Text:")
                st.text(clean_text)
                
                # Process extracted text with GLiNER for entity recognition
                result = process_entities(clean_text, model, threshold, nested_ner)
                result["File Name"] = file.name
                results.append(result)
                
                st.json(result)
            
            progress_bar.progress((idx + 1) / len(files_to_process))
        
        if results:
            st.success("Processing Complete!")
            
            # Convert results to a DataFrame
            df = pd.DataFrame(results)
            cols = ["File Name"] + [col for col in df.columns if col != "File Name"]
            df = df[cols]
            
            st.dataframe(df, use_container_width=True)
            
            csv = df.to_csv(index=False)
            st.download_button(
                "Download Results CSV",
                csv,
                "business_card_results.csv",
                "text/csv",
                key='download-csv'
            )

if __name__ == "__main__":
    main()