Muzammal Shafique committed on
Commit a6db851 · 1 Parent(s): 8a798e1

Initial commit of hateful meme app

app.py ADDED
@@ -0,0 +1,70 @@
+ import gradio as gr
+ import torch
+ from transformers import CLIPProcessor, CLIPModel
+ import torch.nn as nn
+ import pytesseract
+
+ class CLIPMultimodalClassifier(nn.Module):
+     def __init__(self, clip_model):
+         super(CLIPMultimodalClassifier, self).__init__()
+         self.clip_model = clip_model
+         # CLIP's projection dimension (512 for ViT-B/16)
+         embed_dim = clip_model.config.projection_dim
+         # Classification layer: from (image_embed_dim + text_embed_dim) to 2 classes
+         self.classifier = nn.Linear(embed_dim * 2, 2)
+     def forward(self, input_ids, attention_mask, pixel_values):
+         # Get image and text embeddings from CLIP
+         outputs = self.clip_model(input_ids=input_ids,
+                                   attention_mask=attention_mask,
+                                   pixel_values=pixel_values)
+         # CLIPModel outputs expose image_embeds and text_embeds
+         img_embeds = outputs.image_embeds    # shape: (batch, embed_dim)
+         text_embeds = outputs.text_embeds    # shape: (batch, embed_dim)
+         # Concatenate the two embeddings
+         fused_embeds = torch.cat([img_embeds, text_embeds], dim=1)  # shape: (batch, 2 * embed_dim)
+         # Feed the fused embedding to the classifier to get logits
+         logits = self.classifier(fused_embeds)
+         return logits
+
+ MODEL_PATH = "hateful_meme_clip_model.pth"
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ # Load the saved model weights for inference
+ model_infer = CLIPMultimodalClassifier(CLIPModel.from_pretrained("openai/clip-vit-base-patch16"))
+ model_infer.load_state_dict(torch.load(MODEL_PATH, map_location=device))
+ model_infer.to(device)
+ model_infer.eval()
+
+ # Load the saved processor
+ proc = CLIPProcessor.from_pretrained("clip_processor")
+
+ # Define the inference function
+ def classify_meme(image):
+     # image is a PIL Image provided by Gradio
+     # 1. Extract the overlaid text from the image using OCR
+     text = pytesseract.image_to_string(image)
+     if text.strip() == "":
+         text = "<no text found>"  # handle memes with no detected text
+     # 2. Preprocess image and text (truncate so long OCR text fits CLIP's 77-token limit)
+     inputs = proc(text=[text], images=image, return_tensors="pt", padding=True, truncation=True)
+     input_ids = inputs["input_ids"].to(device)
+     attention_mask = inputs["attention_mask"].to(device)
+     pixel_values = inputs["pixel_values"].to(device)
+     # 3. Get the model prediction
+     with torch.no_grad():
+         logits = model_infer(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
+     probs = logits.softmax(dim=1).cpu().numpy()[0]
+     # 4. Interpret the result: the score for class index 0 is compared against the 0.5 threshold
+     confidence = probs[0]
+     label = "Not Hateful" if confidence < 0.5 else "Hateful"
+     # Return the label and the score
+     return f"Meme is {label} \nConfidence: {confidence:.2f}"
+
+ # Create the Gradio interface
+ iface = gr.Interface(fn=classify_meme,
+                      inputs=gr.Image(type="pil"),
+                      outputs=gr.Textbox(label="Prediction"),
+                      title="Hateful Meme Classifier",
+                      description="Upload a meme image to check whether it is hateful. The model analyzes both the image and the text in the meme. The decision threshold is 0.5: a score below 0.5 is classified as not hateful, otherwise as hateful.")
+
+ # Launch the interface (when run locally, this starts a web server)
+ iface.launch(debug=True)
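Note: app.py expects two artifacts committed alongside it, hateful_meme_clip_model.pth (a state_dict for CLIPMultimodalClassifier) and the clip_processor/ directory below. The training code is not part of this commit, so the following is only a sketch of how such artifacts could be exported after fine-tuning; the "model" module in the import is hypothetical (in this commit the class lives in app.py).

import torch
from transformers import CLIPModel, CLIPProcessor
from model import CLIPMultimodalClassifier  # hypothetical module; see the class defined in app.py above

clf = CLIPMultimodalClassifier(CLIPModel.from_pretrained("openai/clip-vit-base-patch16"))
# ... fine-tune `clf` on hateful-meme data (training loop omitted) ...

# Save only the weights; app.py reloads them with load_state_dict().
torch.save(clf.state_dict(), "hateful_meme_clip_model.pth")

# Save the processor files (tokenizer + image preprocessing config) that app.py
# loads from the local "clip_processor" directory.
CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16").save_pretrained("clip_processor")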
clip_processor/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
clip_processor/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "crop_size": {
+     "height": 224,
+     "width": 224
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "processor_class": "CLIPProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 224
+   }
+ }
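This is the stock CLIP image-preprocessing configuration saved by CLIPProcessor.save_pretrained: resize and center-crop to 224x224, rescale by 1/255, then normalize with CLIP's channel mean/std. A quick way to see its effect, assuming the clip_processor/ directory from this commit is on disk (the solid-color image is only a placeholder):

from PIL import Image
from transformers import CLIPProcessor

proc = CLIPProcessor.from_pretrained("clip_processor")
dummy = Image.new("RGB", (640, 480), color=(128, 128, 128))  # placeholder meme image
batch = proc(images=dummy, return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])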
clip_processor/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
clip_processor/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
clip_processor/tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "49406": {
+       "content": "<|startoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "49407": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|startoftext|>",
+   "clean_up_tokenization_spaces": false,
+   "do_lower_case": true,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 77,
+   "pad_token": "<|endoftext|>",
+   "processor_class": "CLIPProcessor",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
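The "model_max_length": 77 entry reflects CLIP's fixed text length (the text encoder has position embeddings for at most 77 tokens), which is why classify_meme in app.py passes truncation=True when preprocessing OCR output. A short illustration, assuming the clip_processor/ directory from this commit is on disk:

from transformers import CLIPProcessor

proc = CLIPProcessor.from_pretrained("clip_processor")
long_text = "meme text " * 200  # stand-in for noisy, lengthy OCR output
enc = proc(text=[long_text], return_tensors="pt", padding=True, truncation=True)
print(enc["input_ids"].shape)  # torch.Size([1, 77]), capped at model_max_length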
clip_processor/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
hateful_meme_clip_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7305010365472397f078de38b136d6f9ea007d8885647347dfe5d6c781d30ea8
+ size 598660114
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio==5.26.0
+ torch==2.6.0
+ transformers==4.51.1
+ pytesseract==0.3.13
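pytesseract is only a Python wrapper: it requires the Tesseract OCR binary to be installed at the system level (on Hugging Face Spaces this is typically done by listing tesseract-ocr in a packages.txt file). An optional startup check, not part of this commit, that fails early with a clear message if the binary is missing:

import pytesseract

try:
    # Raises TesseractNotFoundError if the tesseract binary is not on PATH.
    print("Tesseract version:", pytesseract.get_tesseract_version())
except pytesseract.TesseractNotFoundError:
    raise SystemExit("Tesseract OCR binary not found; install the system package "
                     "(e.g. add 'tesseract-ocr' to packages.txt on Hugging Face Spaces).")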