Diptaraj Sen committed
Commit d10976f · 1 Parent(s): a14c96e

image captioning model changed

Files changed (2):
  1. .gitignore +2 -0
  2. app/captioning.py +9 -7
.gitignore CHANGED
@@ -2,4 +2,6 @@ venv/
 __pycache__/
 outputs/
 logs/
+test.ipynb
+.streamlit/secrets.toml
 *.pyc
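
The second new ignore entry keeps Streamlit's local secrets file, .streamlit/secrets.toml, out of version control. For reference, a minimal sketch of how values in that file are read at runtime via st.secrets; the HF_TOKEN key name is a hypothetical example, not taken from this repo:

# .streamlit/secrets.toml (now git-ignored) might contain, for example:
#   HF_TOKEN = "hf_..."   # hypothetical key name
import streamlit as st

# st.secrets parses the TOML file and exposes it as a read-only mapping
hf_token = st.secrets["HF_TOKEN"]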
app/captioning.py CHANGED
@@ -1,14 +1,16 @@
 from app.logger import get_logger
 logger = get_logger(__name__)
 
-from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+from transformers import AutoTokenizer, AutoImageProcessor, VisionEncoderDecoderModel
 from PIL import Image
 import torch
 
-# Load processor and model (ViT)
-model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+model_id = "cnmoro/nano-image-captioning"
+# Load model, tokenizer, and image processor
+model = VisionEncoderDecoderModel.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+image_processor = AutoImageProcessor.from_pretrained(model_id)
+
 # Move model to GPU if available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("DEVICE:--------> ",device)
@@ -21,10 +23,10 @@ def generate_caption(image_path: str) -> str:
     image = Image.open(image_path).convert('RGB')
 
     # Preprocess image and prepare inputs
-    inputs = processor(images=image, return_tensors="pt").to(device)
+    inputs = image_processor(images=image, return_tensors="pt").to(device)
 
     # Generate caption (greedy decoding for now)
-    output = model.generate(**inputs, max_length=16, num_beams=1)
+    output = model.generate(**inputs, max_length=30, num_beams=1)
 
     # Decode output to text
     caption = tokenizer.decode(output[0], skip_special_tokens=True)
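
For context, a minimal standalone sketch of the captioning flow after this commit. Everything outside the shown hunks is an assumption: the example image path is hypothetical, and the model.to(device) / model.eval() calls are added here only to make the script self-contained, not taken from the diff:

from transformers import AutoTokenizer, AutoImageProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

model_id = "cnmoro/nano-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
image_processor = AutoImageProcessor.from_pretrained(model_id)

# Move model to GPU if available (the diff computes `device`; moving the
# model onto it is assumed here)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

image = Image.open("example.jpg").convert("RGB")  # hypothetical path
inputs = image_processor(images=image, return_tensors="pt").to(device)

with torch.no_grad():
    # Greedy decoding, matching the "for now" comment in the diff;
    # raising num_beams (e.g. to 4) trades speed for caption quality
    output = model.generate(**inputs, max_length=30, num_beams=1)

caption = tokenizer.decode(output[0], skip_special_tokens=True)
print(caption)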