# Spaces: Running
import cv2 | |
import numpy as np | |
from torchvision.transforms.v2 import ToPILImage | |
from PIL import Image | |
from transformers import pipeline | |
import torch | |
from imgutils.detect import detect_heads | |
from src.utils.device import determine_accelerator | |
# --- Module-level setup (executed once, at import time) ---------------------

# Device handed to the pipeline below, as reported by the project helper.
device = determine_accelerator()

# Callable that turns NumPy image crops into PIL images for the detectors.
topil = ToPILImage()

# Loading the checkpoint is slow, so announce it before starting.
print("Loading AI Model...")

# Image+text -> text pipeline used to filter candidate character crops.
pipe = pipeline(
    task="image-text-to-text",
    model="google/gemma-3-12b-it",
    torch_dtype=torch.bfloat16,
    device=device,
)
def crop_and_mask_characters_gradio(pil_img):
    """Find bounding boxes of character regions that contain a head or face.

    The image is thresholded against a near-white background and the
    external contours of the remaining content are extracted. A contour's
    bounding box is kept only if it is at least 256 pixels wide and tall,
    ``detect_heads`` reports at least one head in the crop, and the
    module-level ``pipe`` model does not answer 'False' to the head/face
    prompt.

    Args:
        pil_img: The input image; anything ``np.array`` can turn into an
            ``H x W`` or ``H x W x C`` array. Colour channels are assumed
            to be in RGB(A) order, as PIL images are -- TODO confirm that
            no caller passes BGR data.

    Returns:
        A list of ``(index, (x, y, w, h))`` tuples, one per accepted
        region. ``index`` counts accepted regions starting at 1 and the
        box is expressed in pixel coordinates of the input image. The list
        is empty when no region qualifies.
    """
    min_side = 256          # boxes narrower or shorter than this are skipped
    white_threshold = 253   # pixels at or below this value count as content

    head_prompt = (
        "You are given a black-and-white line drawing as input. "
        "Please analyze the image carefully. "
        "If the drawing contains the majority of a head or face—meaning most key "
        "facial features or the overall shape of the head are visible—respond "
        "with 'True'. Otherwise, respond with 'False'. "
        "Do not contain any punctuation or extra spaces in your answer. "
        "Just respond with 'True' or 'False'"
    )

    img = np.array(pil_img)

    # PIL hands over RGB(A) data, so use the RGB conversion codes; an image
    # that is already single-channel needs no conversion at all.
    if img.ndim == 2:
        gray = img
    elif img.shape[2] == 4:
        gray = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
    else:
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Inverted binary threshold: everything that is not near-white becomes
    # foreground, so each drawing on the white page forms its own blob.
    _, thresh = cv2.threshold(gray, white_threshold, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(
        thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    coord_info_list = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w < min_side or h < min_side:
            continue

        # Convert the crop once and reuse it for both detectors.
        crop_pil = topil(img[y:y + h, x:x + w])

        # Cheap detector first: skip the expensive model call when no head
        # is found at all.
        if not detect_heads(crop_pil):
            continue

        messages = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": "You are a helpful assistant."}
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": crop_pil},
                    {"type": "text", "text": head_prompt},
                ],
            },
        ]
        output = pipe(text=messages, max_new_tokens=200)
        answer = output[0]["generated_text"][-1]["content"]

        # The model does not always follow the formatting instruction, so
        # tolerate surrounding whitespace, quotes, a trailing period and
        # case differences when looking for a negative answer.
        if answer.strip().strip("'\".").lower() == "false":
            continue

        # Indices are 1-based and count accepted regions only.
        coord_info_list.append((len(coord_info_list) + 1, (x, y, w, h)))

    return coord_info_list