import cv2
import numpy as np
from torchvision.transforms.v2 import ToPILImage
from PIL import Image
from transformers import pipeline
import torch
from imgutils.detect import detect_heads
from src.utils.device import determine_accelerator

topil = ToPILImage()

# Pixels brighter than this grayscale value are treated as blank background.
_BACKGROUND_THRESHOLD = 253
# Regions narrower or shorter than this many pixels are ignored.
_MIN_REGION_SIDE = 256

_HEAD_PROMPT = (
    "You are given a black-and-white line drawing as input. "
    "Please analyze the image carefully. "
    "If the drawing contains the majority of a head or face—meaning most key "
    "facial features or the overall shape of the head are visible—respond "
    "with 'True'. Otherwise, respond with 'False'. Do not contain any "
    "punctuation or extra spaces in your answer. Just respond with 'True' "
    "or 'False'"
)

# 1. Initialize the filtering pipeline
device = determine_accelerator()
print("Loading AI Model...")
pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-12b-it",
    device=device,
    torch_dtype=torch.bfloat16,
)


def _model_confirms_head(crop_pil):
    """Ask the vision-language pipeline whether *crop_pil* shows a head/face.

    Returns False only when the model's reply is an explicit "False"
    (ignoring case and surrounding whitespace); any other reply keeps the
    region.
    """
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": crop_pil},
                {"type": "text", "text": _HEAD_PROMPT},
            ],
        },
    ]
    output = pipe(text=messages, max_new_tokens=200)
    reply = str(output[0]["generated_text"][-1]["content"])
    # Normalize before comparing: an exact == 'False' check would miss replies
    # such as "False\n", "False." or "false".
    return not reply.strip().lower().startswith("false")


def crop_and_mask_characters_gradio(pil_img):
    """Locate large non-background regions of an image that contain a head.

    The image is thresholded so that every pixel that is not near-white
    becomes foreground, external contours are extracted, and each contour's
    bounding box is examined. A box is kept when it is at least 256 pixels in
    both dimensions, ``detect_heads`` reports at least one detection on the
    crop, and the Gemma pipeline does not answer 'False' to the head/face
    prompt.

    Nothing is written to disk and no masks are produced.

    Args:
        pil_img: The input image. A PIL image is converted to RGB first;
            any other array-like is passed to ``np.array`` unchanged and is
            assumed to be in RGB channel order.

    Returns:
        A list of ``(index, (x, y, w, h))`` tuples, where ``index`` counts
        accepted regions starting at 1 and ``(x, y, w, h)`` is the bounding
        box returned by ``cv2.boundingRect``.
    """
    if isinstance(pil_img, Image.Image):
        # Guarantees a 3-channel array regardless of the source mode.
        pil_img = pil_img.convert("RGB")
    img = np.array(pil_img)

    # The array is RGB (not OpenCV's BGR), so use the RGB conversion code.
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Inverse binary threshold: anything that is not near-white is foreground.
    _, thresh = cv2.threshold(
        gray, _BACKGROUND_THRESHOLD, 255, cv2.THRESH_BINARY_INV
    )
    contours, _ = cv2.findContours(
        thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    coord_info_list = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w < _MIN_REGION_SIDE or h < _MIN_REGION_SIDE:
            continue

        # Convert once and reuse for both the detector and the model.
        crop_pil = topil(img[y:y + h, x:x + w])

        # Cheap detector first; only surviving regions reach the large model.
        if len(detect_heads(crop_pil)) == 0:
            continue
        if not _model_confirms_head(crop_pil):
            continue

        coord_info_list.append((len(coord_info_list) + 1, (x, y, w, h)))

    return coord_info_list