from app.logger import get_logger
logger = get_logger(__name__)

from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
import torch

# Load processor and model (ViT)
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:--------> ",device)
model.to(device)

def generate_caption(image_path: str) -> str:
    logger.info("Generating caption...")
    try:
        # Open and convert image to RGB
        image = Image.open(image_path).convert('RGB')
        
        # Preprocess image and prepare inputs
        inputs = processor(images=image, return_tensors="pt").to(device)

        # Generate caption (greedy decoding for now)
        output = model.generate(**inputs, max_length=16, num_beams=1)
        
        # Decode output to text
        caption = tokenizer.decode(output[0], skip_special_tokens=True)
        
        logger.info(f"Caption generated: {caption}")
        return caption
    except Exception as e:
        logger.exception("Failed to generate caption")
        raise