Commit a6db851 committed by Muzammal Shafique
Parent(s): 8a798e1
Initial commit of hateful meme app
Files changed:
- app.py +70 -0
- clip_processor/merges.txt +0 -0
- clip_processor/preprocessor_config.json +28 -0
- clip_processor/special_tokens_map.json +30 -0
- clip_processor/tokenizer.json +0 -0
- clip_processor/tokenizer_config.json +32 -0
- clip_processor/vocab.json +0 -0
- hateful_meme_clip_model.pth +3 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,70 @@
import gradio as gr
import torch
from transformers import CLIPProcessor, CLIPModel
import torch.nn as nn
import pytesseract

class CLIPMultimodalClassifier(nn.Module):
    def __init__(self, clip_model):
        super(CLIPMultimodalClassifier, self).__init__()
        self.clip_model = clip_model
        # CLIP's projection dimension (512 for ViT-B/16)
        embed_dim = clip_model.config.projection_dim
        # Classification layer: from (embed_dim_image + embed_dim_text) to 2 classes
        self.classifier = nn.Linear(embed_dim * 2, 2)

    def forward(self, input_ids, attention_mask, pixel_values):
        # Get image and text embeddings from CLIP
        outputs = self.clip_model(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  pixel_values=pixel_values)
        # CLIPModel outputs have image_embeds and text_embeds
        img_embeds = outputs.image_embeds    # shape: (batch, embed_dim)
        text_embeds = outputs.text_embeds    # shape: (batch, embed_dim)
        # Concatenate the embeddings
        fused_embeds = torch.cat([img_embeds, text_embeds], dim=1)  # shape: (batch, 2*embed_dim)
        # Feed the fused embedding to the classifier to get logits
        logits = self.classifier(fused_embeds)
        return logits

MODEL_PATH = "hateful_meme_clip_model.pth"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the saved model weights for inference
model_infer = CLIPMultimodalClassifier(CLIPModel.from_pretrained("openai/clip-vit-base-patch16"))
model_infer.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model_infer.to(device)
model_infer.eval()

# Load the saved processor
proc = CLIPProcessor.from_pretrained("clip_processor")

# Define inference function
def classify_meme(image):
    # image is a PIL Image input from Gradio
    # 1. Extract text from the image using OCR
    text = pytesseract.image_to_string(image)
    if text.strip() == "":
        text = "<no text found>"  # handle cases with no detected text
    # 2. Preprocess image and text (truncate OCR text to CLIP's 77-token limit)
    inputs = proc(text=[text], images=image, return_tensors="pt",
                  padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    pixel_values = inputs["pixel_values"].to(device)
    # 3. Get model prediction
    with torch.no_grad():
        logits = model_infer(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        probs = logits.softmax(dim=1).cpu().numpy()[0]
    # 4. Interpret the result (assumes class index 1 = 'Hateful',
    #    the Hateful Memes label convention)
    confidence = probs[1]  # probability of class 'Hateful'
    label = "Hateful" if confidence >= 0.5 else "Not Hateful"
    # Return label and confidence
    return f"Meme is {label} \nConfidence: {confidence:.2f}"

# Create Gradio interface
iface = gr.Interface(fn=classify_meme,
                     inputs=gr.Image(type="pil"),
                     outputs=gr.Textbox(label="Prediction"),
                     title="Hateful Meme Classifier",
                     description="Upload a meme image to check whether it is hateful. The model analyzes both the image and the text in the meme. The decision threshold is 0.5: if the confidence (probability of 'Hateful') is below 0.5, the meme is classified as not hateful; otherwise it is classified as hateful.")

# Launch interface for local testing (if running locally, this starts a web server)
iface.launch(debug=True)
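A quick smoke test for classify_meme outside the Gradio UI; this is a sketch, not part of the commit. It assumes the definitions above have already been executed in the current session (importing app.py directly would also trigger iface.launch()), and "meme.jpg" is a placeholder path:

from PIL import Image

# Placeholder path; substitute any local meme image.
img = Image.open("meme.jpg").convert("RGB")
print(classify_meme(img))  # e.g. "Meme is Not Hateful \nConfidence: 0.12"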
clip_processor/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
clip_processor/preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "CLIPProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
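These settings reproduce CLIP's standard image pipeline: resize the shortest edge to 224 with bicubic resampling (resample=3), center-crop to 224x224, rescale pixel values by 1/255 (rescale_factor), and normalize with the listed mean and std. A minimal sketch of loading and applying them, not part of the commit ("meme.jpg" is again a placeholder):

from PIL import Image
from transformers import CLIPImageProcessor

# Reads the preprocessor_config.json shown above from the local directory.
image_processor = CLIPImageProcessor.from_pretrained("clip_processor")
pixel_values = image_processor(Image.open("meme.jpg"), return_tensors="pt")["pixel_values"]
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])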
clip_processor/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
clip_processor/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
clip_processor/tokenizer_config.json
ADDED
@@ -0,0 +1,32 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": false,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 77,
  "pad_token": "<|endoftext|>",
  "processor_class": "CLIPProcessor",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": "<|endoftext|>"
}
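Per this config, input text is lower-cased, wrapped in <|startoftext|> ... <|endoftext|>, and capped at 77 tokens (model_max_length, the CLIP text encoder's limit). A short illustrative sketch, not part of the commit:

from transformers import CLIPTokenizer

# Reads vocab.json, merges.txt, and the config above from the local directory.
tokenizer = CLIPTokenizer.from_pretrained("clip_processor")
enc = tokenizer("An Example Caption", truncation=True, return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(enc["input_ids"][0]))
# e.g. ['<|startoftext|>', 'an</w>', 'example</w>', 'caption</w>', '<|endoftext|>']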
clip_processor/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
hateful_meme_clip_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7305010365472397f078de38b136d6f9ea007d8885647347dfe5d6c781d30ea8
size 598660114
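The weights are stored via Git LFS, so only this pointer (content hash and byte size) lives in the repository. After pulling the actual file, it can be verified against the pointer; a small sketch using only the standard library:

import hashlib
import os

# Expected digest and size, copied from the LFS pointer above.
EXPECTED_SHA256 = "7305010365472397f078de38b136d6f9ea007d8885647347dfe5d6c781d30ea8"
EXPECTED_SIZE = 598660114  # bytes

h = hashlib.sha256()
with open("hateful_meme_clip_model.pth", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)
print(h.hexdigest() == EXPECTED_SHA256)
print(os.path.getsize("hateful_meme_clip_model.pth") == EXPECTED_SIZE)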
requirements.txt
ADDED
@@ -0,0 +1,4 @@
gradio==5.26.0
torch==2.6.0
transformers==4.51.1
pytesseract==0.3.13
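Note that pytesseract is only a Python wrapper: the Tesseract OCR engine itself must be installed separately (on Hugging Face Spaces this is usually done through a packages.txt listing tesseract-ocr; no such file appears in this commit, so that step is an assumption here). A quick availability check:

import pytesseract

# Raises pytesseract.TesseractNotFoundError if the tesseract binary is not on PATH.
print(pytesseract.get_tesseract_version())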