# webtoon_cropper/src/wise_crop/detect_and_crop.py
# Repository page metadata: uploaded by wise-water, commit "init commit" (13aa528)
import cv2
import numpy as np
from torchvision.transforms.v2 import ToPILImage
from PIL import Image
from transformers import pipeline
import torch
from imgutils.detect import detect_heads
from src.utils.device import determine_accelerator
# Converter used below to turn cropped NumPy regions into PIL images.
topil = ToPILImage()
# 1. Initialize the filtering pipeline
# NOTE(review): determine_accelerator() presumably picks the compute device
# (e.g. GPU vs. CPU) -- confirm against src.utils.device.
device = determine_accelerator()
print("Loading AI Model...")
# This runs at import time: importing the module triggers the model load.
# The pipeline is created once and shared by every call in this module;
# torch_dtype requests bfloat16 for the model.
pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-12b-it",
    device=device,
    torch_dtype=torch.bfloat16,
)
# Regions whose bounding box is narrower or shorter than this are skipped.
_MIN_REGION_SIDE = 256
# Pixels brighter than this value are treated as (near-)white background.
_BACKGROUND_THRESHOLD = 253
# Instruction sent to the vision-language model together with each crop.
_HEAD_CHECK_PROMPT = "You are given a black-and-white line drawing as input. Please analyze the image carefully. If the drawing contains the majority of a head or face—meaning most key facial features or the overall shape of the head are visible—respond with 'True'. Otherwise, respond with 'False'. Do not contain any punctuation or extra spaces in your answer. Just respond with 'True' or 'False'"


def _is_negative_reply(reply):
    """Return True when the model's reply is a 'False' verdict.

    The comparison ignores surrounding whitespace, quotes, trailing
    punctuation and letter case, because the model does not always follow
    the "no punctuation or extra spaces" instruction exactly.
    """
    if not isinstance(reply, str):
        return False
    return reply.strip().strip(".!'\"`").strip().lower() == "false"


def crop_and_mask_characters_gradio(pil_img):
    """Find large non-white regions of an image that contain a head.

    The image is thresholded against a near-white background, the external
    contours of the remaining content are extracted, and each sufficiently
    large bounding box is kept only if both the head detector and the
    vision-language model agree that it shows a head or face.

    Args:
        pil_img: The input image (a PIL image or anything ``np.array`` can
            turn into an H x W or H x W x C array, channels in RGB order).

    Returns:
        list[tuple[int, tuple[int, int, int, int]]]: One ``(index, (x, y, w,
        h))`` entry per accepted region, where ``index`` counts accepted
        regions starting at 1 and ``(x, y, w, h)`` is the bounding box in
        pixel coordinates of the input image.
    """
    img = np.array(pil_img)

    if img.ndim == 2:
        # Already single-channel; cvtColor would reject a 2-D input.
        gray = img
    else:
        # Arrays built from PIL images are RGB(A), not OpenCV's BGR order.
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Inverted threshold: background (> threshold) becomes 0, content 255.
    _, thresh = cv2.threshold(
        gray, _BACKGROUND_THRESHOLD, 255, cv2.THRESH_BINARY_INV
    )
    contours, _ = cv2.findContours(
        thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    coord_info_list = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w < _MIN_REGION_SIDE or h < _MIN_REGION_SIDE:
            continue

        # Convert the crop once and reuse it for both models.
        crop_pil = topil(img[y:y + h, x:x + w])

        # Cheap detector first: skip the expensive LLM call when no head
        # is found at all.
        if not detect_heads(crop_pil):
            continue

        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": "You are a helpful assistant."}],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": crop_pil},
                    {"type": "text", "text": _HEAD_CHECK_PROMPT},
                ],
            },
        ]
        output = pipe(text=messages, max_new_tokens=200)
        # The last message of the returned conversation is the model's reply.
        reply = output[0]["generated_text"][-1]["content"]
        if _is_negative_reply(reply):
            continue

        # Indices are 1-based and count only the accepted regions.
        coord_info_list.append((len(coord_info_list) + 1, (x, y, w, h)))

    return coord_info_list