Diptaraj Sen committed
Commit: d10976f
Parent(s): a14c96e
image captioning model changed
Files changed:
.gitignore (+2 -0)
app/captioning.py (+9 -7)
.gitignore CHANGED
@@ -2,4 +2,6 @@ venv/
 __pycache__/
 outputs/
 logs/
+test.ipynb
+.streamlit/secrets.toml
 *.pyc
app/captioning.py CHANGED
@@ -1,14 +1,16 @@
 from app.logger import get_logger
 logger = get_logger(__name__)
 
-from transformers import
+from transformers import AutoTokenizer, AutoImageProcessor, VisionEncoderDecoderModel
 from PIL import Image
 import torch
 
-
-model
-
-tokenizer = AutoTokenizer.from_pretrained(
+model_id = "cnmoro/nano-image-captioning"
+# Load model, tokenizer, and image processor
+model = VisionEncoderDecoderModel.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+image_processor = AutoImageProcessor.from_pretrained(model_id)
+
 # Move model to GPU if available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("DEVICE:--------> ",device)
@@ -21,10 +23,10 @@ def generate_caption(image_path: str) -> str:
     image = Image.open(image_path).convert('RGB')
 
     # Preprocess image and prepare inputs
-    inputs =
+    inputs = image_processor(images=image, return_tensors="pt").to(device)
 
     # Generate caption (greedy decoding for now)
-    output = model.generate(**inputs, max_length=
+    output = model.generate(**inputs, max_length=30, num_beams=1)
 
     # Decode output to text
     caption = tokenizer.decode(output[0], skip_special_tokens=True)
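
For reference, here is a minimal sketch of how app/captioning.py would read after this commit. The lines inside the diff hunks are taken verbatim; the model.to(device) call and the trailing return caption are not visible in the hunks and are included only as assumptions about the unchanged parts of the file.

# Sketch of app/captioning.py after this commit.
# Lines marked "assumed" are not shown in the diff hunks.
from app.logger import get_logger
logger = get_logger(__name__)

from transformers import AutoTokenizer, AutoImageProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

model_id = "cnmoro/nano-image-captioning"
# Load model, tokenizer, and image processor
model = VisionEncoderDecoderModel.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
image_processor = AutoImageProcessor.from_pretrained(model_id)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:--------> ", device)
model.to(device)  # assumed: not visible in the diff hunks


def generate_caption(image_path: str) -> str:
    # Open the image and normalize to RGB
    image = Image.open(image_path).convert('RGB')

    # Preprocess image and prepare inputs
    inputs = image_processor(images=image, return_tensors="pt").to(device)

    # Generate caption (greedy decoding for now)
    output = model.generate(**inputs, max_length=30, num_beams=1)

    # Decode output to text
    caption = tokenizer.decode(output[0], skip_special_tokens=True)
    return caption  # assumed: implied by the -> str annotation

With the cnmoro/nano-image-captioning checkpoint loaded this way, a caller such as generate_caption("example.jpg") would return a short greedy-decoded caption string for that image.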