Commit a6db851 committed by Muzammal Shafique
Parent(s): 8a798e1
Initial commit of hateful meme app
Files changed:
- app.py +70 -0
- clip_processor/merges.txt +0 -0
- clip_processor/preprocessor_config.json +28 -0
- clip_processor/special_tokens_map.json +30 -0
- clip_processor/tokenizer.json +0 -0
- clip_processor/tokenizer_config.json +32 -0
- clip_processor/vocab.json +0 -0
- hateful_meme_clip_model.pth +3 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,70 @@
import gradio as gr
import torch
from transformers import CLIPProcessor, CLIPModel
import torch.nn as nn
import pytesseract

class CLIPMultimodalClassifier(nn.Module):
    def __init__(self, clip_model):
        super(CLIPMultimodalClassifier, self).__init__()
        self.clip_model = clip_model
        # CLIP's projection dimension (512 for ViT-B/16)
        embed_dim = clip_model.config.projection_dim
        # Classification layer: from (embed_dim_image + embed_dim_text) to 2 classes
        self.classifier = nn.Linear(embed_dim * 2, 2)

    def forward(self, input_ids, attention_mask, pixel_values):
        # Get image and text embeddings from CLIP
        outputs = self.clip_model(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  pixel_values=pixel_values)
        # CLIPModel outputs have image_embeds and text_embeds
        img_embeds = outputs.image_embeds    # shape: (batch, embed_dim)
        text_embeds = outputs.text_embeds    # shape: (batch, embed_dim)
        # Concatenate the embeddings
        fused_embeds = torch.cat([img_embeds, text_embeds], dim=1)  # shape: (batch, 2*embed_dim)
        # Feed the fused embedding to the classifier to get logits
        logits = self.classifier(fused_embeds)
        return logits

MODEL_PATH = "hateful_meme_clip_model.pth"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the saved model weights for inference
model_infer = CLIPMultimodalClassifier(CLIPModel.from_pretrained("openai/clip-vit-base-patch16"))
model_infer.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model_infer.to(device)
model_infer.eval()

# Load the saved processor
proc = CLIPProcessor.from_pretrained("clip_processor")

# Define inference function
def classify_meme(image):
    # image is a PIL Image input from Gradio
    # 1. Extract text from the image using OCR
    text = pytesseract.image_to_string(image)
    if text.strip() == "":
        text = "<no text found>"  # handle cases with no detected text
    # 2. Preprocess image and text (truncate OCR text to CLIP's 77-token limit)
    inputs = proc(text=[text], images=image, return_tensors="pt",
                  padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    pixel_values = inputs["pixel_values"].to(device)
    # 3. Get model prediction
    with torch.no_grad():
        logits = model_infer(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        probs = logits.softmax(dim=1).cpu().numpy()[0]
    # 4. Interpret the result (assumes class index 1 = 'Hateful',
    #    the Hateful Memes label convention)
    confidence = probs[1]  # probability of class 'Hateful'
    label = "Hateful" if confidence >= 0.5 else "Not Hateful"
    # Return label and confidence
    return f"Meme is {label} \nConfidence: {confidence:.2f}"

# Create Gradio interface
iface = gr.Interface(fn=classify_meme,
                     inputs=gr.Image(type="pil"),
                     outputs=gr.Textbox(label="Prediction"),
                     title="Hateful Meme Classifier",
                     description="Upload a meme image to check whether it is hateful. The model analyzes both the image and the text in the meme. The decision threshold is 0.5: if the confidence (probability of 'Hateful') is below 0.5, the meme is classified as not hateful; otherwise it is classified as hateful.")

# Launch interface for local testing (if running locally, this starts a web server)
iface.launch(debug=True)
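A quick smoke test for classify_meme outside the Gradio UI; this is a sketch, not part of the commit. It assumes the definitions above have already been executed in the current session (importing app.py directly would also trigger iface.launch()), and "meme.jpg" is a placeholder path:

from PIL import Image

# Placeholder path; substitute any local meme image.
img = Image.open("meme.jpg").convert("RGB")
print(classify_meme(img))  # e.g. "Meme is Not Hateful \nConfidence: 0.12"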
clip_processor/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
clip_processor/preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "CLIPProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}
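These settings reproduce CLIP's standard image pipeline: resize the shortest edge to 224 with bicubic resampling (resample=3), center-crop to 224x224, rescale pixel values by 1/255 (rescale_factor), and normalize with the listed mean and std. A minimal sketch of loading and applying them, not part of the commit ("meme.jpg" is again a placeholder):

from PIL import Image
from transformers import CLIPImageProcessor

# Reads the preprocessor_config.json shown above from the local directory.
image_processor = CLIPImageProcessor.from_pretrained("clip_processor")
pixel_values = image_processor(Image.open("meme.jpg"), return_tensors="pt")["pixel_values"]
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])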
clip_processor/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
clip_processor/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
clip_processor/tokenizer_config.json
ADDED
@@ -0,0 +1,32 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": false,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 77,
  "pad_token": "<|endoftext|>",
  "processor_class": "CLIPProcessor",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": "<|endoftext|>"
}
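Per this config, input text is lower-cased, wrapped in <|startoftext|> ... <|endoftext|>, and capped at 77 tokens (model_max_length, the CLIP text encoder's limit). A short illustrative sketch, not part of the commit:

from transformers import CLIPTokenizer

# Reads vocab.json, merges.txt, and the config above from the local directory.
tokenizer = CLIPTokenizer.from_pretrained("clip_processor")
enc = tokenizer("An Example Caption", truncation=True, return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(enc["input_ids"][0]))
# e.g. ['<|startoftext|>', 'an</w>', 'example</w>', 'caption</w>', '<|endoftext|>']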
clip_processor/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
hateful_meme_clip_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7305010365472397f078de38b136d6f9ea007d8885647347dfe5d6c781d30ea8
size 598660114
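The weights are stored via Git LFS, so only this pointer (content hash and byte size) lives in the repository. After pulling the actual file, it can be verified against the pointer; a small sketch using only the standard library:

import hashlib
import os

# Expected digest and size, copied from the LFS pointer above.
EXPECTED_SHA256 = "7305010365472397f078de38b136d6f9ea007d8885647347dfe5d6c781d30ea8"
EXPECTED_SIZE = 598660114  # bytes

h = hashlib.sha256()
with open("hateful_meme_clip_model.pth", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)
print(h.hexdigest() == EXPECTED_SHA256)
print(os.path.getsize("hateful_meme_clip_model.pth") == EXPECTED_SIZE)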
requirements.txt
ADDED
@@ -0,0 +1,4 @@
gradio==5.26.0
torch==2.6.0
transformers==4.51.1
pytesseract==0.3.13
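Note that pytesseract is only a Python wrapper: the Tesseract OCR engine itself must be installed separately (on Hugging Face Spaces this is usually done through a packages.txt listing tesseract-ocr; no such file appears in this commit, so that step is an assumption here). A quick availability check:

import pytesseract

# Raises pytesseract.TesseractNotFoundError if the tesseract binary is not on PATH.
print(pytesseract.get_tesseract_version())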