Upload files with `vila-upload`.
Upload auto_processor.py
Upload modeling_vila.py
Upload llm/tokenizer_config.json
- auto_processor.py +17 -9
- llm/tokenizer_config.json +1 -1
- modeling_vila.py +19 -9
auto_processor.py CHANGED

```diff
@@ -153,16 +153,19 @@ class VILAProcessor(ProcessorMixin):
     # image_processor_class = "VILAImageProcessor"
     # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")
 
-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, **kwargs):
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, padding_side="left", **kwargs):
         self.image_token = MEDIA_TOKENS["image"]
         self.video_token = MEDIA_TOKENS["video"]
         self.config = config
         self.image_processor = image_processor
         self.tokenizer = tokenizer
-        # self.pad_token_id = tokenizer.pad_token_id
-        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]
+        self.padding_side = padding_side
+
+        # This is a special setting for Qwen.
+        # self.pad_token_id = tokenizer.pad_token_id
+        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]  # 151643
         self.eos_token_id = self.tokenizer.eos_token_id
-
+
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
     @staticmethod
@@ -193,6 +196,7 @@ class VILAProcessor(ProcessorMixin):
     ) -> tuple[list[PIL.Image.Image] | None, list[torch.Tensor | list[PIL.Image.Image]] | None, Optional[dict]]:
         """
         referernce from qwen_vl_utils
+        NVILA does not depend on the function, but the interface is the same.
         """
         vision_infos = extract_vision_info(conversations)
         ## Read images or videos
@@ -233,12 +237,12 @@ class VILAProcessor(ProcessorMixin):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        padding_side = kwargs.get("padding_side", "left")
         if os.path.isdir(pretrained_model_name_or_path):
             pretrained_model_name_or_path = pretrained_model_name_or_path
         else:
             print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
             from huggingface_hub import snapshot_download
-
             pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
 
         image_processor = AutoImageProcessor.from_pretrained(
@@ -248,7 +252,7 @@ class VILAProcessor(ProcessorMixin):
             osp.join(pretrained_model_name_or_path, "llm"), trust_remote_code=True
         )
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
-        return cls(image_processor=image_processor, tokenizer=tokenizer, config=config)
+        return cls(image_processor=image_processor, tokenizer=tokenizer, config=config, padding_side=padding_side)
 
     def __repr__(self):
         # NOTE(ligeng): hard coded image_processor to avoid serialization error. Dirty fix
@@ -275,7 +279,7 @@ class VILAProcessor(ProcessorMixin):
         if kwargs.get("text", None) is not None:
             conversation = kwargs.get("text")
         assert conversation is not None, "`conversation` or `text` is required"
-        padding_side = kwargs.get("padding_side",
+        padding_side = kwargs.get("padding_side", self.padding_side)
 
         input_ids_list = []
         attention_mask = []
@@ -289,7 +293,8 @@ class VILAProcessor(ProcessorMixin):
                 media[name] += feat.media[name]
             for name in feat.media_config:
                 media_config[name].update(feat.media_config[name])
-
+
+        # pad the input_ids to batchfy
         input_ids = pad_fn(
             input_ids_list,
             padding_value=self.pad_token_id,
@@ -299,9 +304,10 @@ class VILAProcessor(ProcessorMixin):
         attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         attention_mask[input_ids == self.pad_token_id] = False
         # print("[DEBUGAAA]", self.pad_token_id, self.tokenizer.pad_token_id); exit(0)
-
+        input_texts = self.tokenizer.batch_decode(input_ids)
         return BatchFeature(
             data={
+                "input_texts": input_texts,
                 "input_ids": input_ids,
                 "attention_mask": attention_mask,
                 "media": media,
@@ -329,6 +335,8 @@ class VILAProcessor(ProcessorMixin):
         self.config.image_processor = self.image_processor
         if self.config.image_aspect_ratio == "dynamic":
             images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
+            # print("DEBUG", len(images)); input()
+            # NOTE: this only works for images appears at the first conversation
             conversation[0]["value"] = conversation[0]["value"].replace(
                 DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
             )
@@ -352,7 +360,7 @@ class VILAProcessor(ProcessorMixin):
             raise ValueError(f"Unsupported media type: {name}")
 
         inputs = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True, return_ids_only=False)
-        input_ids = inputs.input_ids[0].
+        input_ids = inputs.input_ids[0].unsqueeze(0)#.cuda()
         attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         return BatchFeature(
             data={
```
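
For reference, the attention mask the processor builds from the hard-coded Qwen pad id can be reproduced in isolation. The toy sketch below mirrors the two masking lines in `__call__` above (151643 is the `<|endoftext|>` id noted in the diff) and shows what a left-padded batch looks like.

```python
import torch

pad_token_id = 151643  # Qwen "<|endoftext|>", as hard-coded in VILAProcessor
input_ids = torch.tensor([
    [pad_token_id, pad_token_id, 100, 101],  # shorter prompt, left-padded
    [200, 201, 202, 203],                    # longest prompt, no padding
])

# Same masking as in __call__: attend everywhere except pad positions.
attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
attention_mask[input_ids == pad_token_id] = False
print(attention_mask)
# tensor([[False, False,  True,  True],
#         [ True,  True,  True,  True]])
```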
llm/tokenizer_config.json CHANGED

```diff
@@ -78,7 +78,7 @@
   "legacy": false,
   "model_max_length": 4096,
   "pad_token": "[PAD]",
-  "padding_side": "
+  "padding_side": "left",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null
```
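
A quick way to confirm the tokenizer-side default, assuming a local snapshot of this repo (the path below is a placeholder):

```python
# Hypothetical check: the bundled Qwen2 tokenizer now defaults to left padding,
# since padding_side is read from llm/tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./NVILA-checkpoint/llm", trust_remote_code=True)  # placeholder local path
print(tok.padding_side)  # expected: "left" after this change
```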
modeling_vila.py CHANGED

```diff
@@ -201,17 +201,19 @@ class VILAPretrainedModel(PreTrainedModel):
         else:
             raise ValueError("`llm_cfg` `mm_projector_cfg` `vision_tower_cfg` not found in the config.")
 
-        # loading on
-        device_map = kwargs.get("device_map", "
+        # loading on auto by default
+        device_map = kwargs.get("device_map", "auto")
         self.mm_projector = build_mm_projector(mm_projector_cfg, config)
         self.vision_tower = build_vision_tower(vision_tower_cfg, config)
-        if "auto"
+        if device_map in ["auto", "cuda"]:
             self.mm_projector = self.mm_projector.cuda()
             self.vision_tower = self.vision_tower.cuda()
         # set device_map auto can autoamtically shard llm to different devices
         self.llm, self.tokenizer = self.init_llm(llm_cfg, config, device_map=device_map)
 
-        # NOTE(ligeng):
+        # NOTE(ligeng): hard code to set padding_side to left
+        self.tokenizer.padding_side = "left"
+        # TODO(ligeng): need to add other decoders from config
         self.encoders = {"image": BasicImageEncoder(self), "video": BasicVideoEncoder(self)}
 
         self.post_config()
@@ -418,6 +420,7 @@ class VILAPretrainedModel(PreTrainedModel):
         weights_only: bool = True,
         **kwargs,
     ):
+        # print("DEBUG2", kwargs); input()
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
         return cls._from_config(config, **kwargs)
 
@@ -450,6 +453,10 @@ class VILAPretrainedModel(PreTrainedModel):
         self.vision_tower = self.vision_tower.to(torch.float16)
         ######################################################################
         self.training = self.llm.training
+        if self.training:
+            self.train()
+        else:
+            self.eval()
         ## configuration
         if getattr(self.config, "llm_cfg", None) is None:
             self.config.llm_cfg = self.llm.config
@@ -595,10 +602,6 @@ class VILAForCasualLM(VILAPretrainedModel):
         return image_features
 
     def train(self, mode: bool = True):
-        if mode:
-            self.tokenizer.padding_side = "right"
-        else:
-            self.tokenizer.padding_side = "left"
         super().train(mode)
         return self
 
@@ -657,6 +660,7 @@ class VILAForCasualLM(VILAPretrainedModel):
                 input = media_embeds[name].popleft()
                 label = torch.full([input.shape[0]], IGNORE_INDEX, device=labels[k].device, dtype=labels[k].dtype)
             elif input_ids[k][pos].item() in self.pad_token_list:
+                # skip pad tokens
                 end = pos + 1
                 pos = end
                 continue
@@ -1102,6 +1106,12 @@ class VILAForCasualLM(VILAPretrainedModel):
         input_tokens: 36000 001 002 003 004
         input_emds: <media emd> 001 002 003 004
         """
+        # NOTE: hard code to move to GPU
+        input_ids = input_ids.cuda()
+        media = {k: [v.cuda() for v in media[k]] for k in media}
+        if attention_mask is not None:
+            attention_mask = attention_mask.cuda()
+
         # TODO: there is still a padding left vs right issue unsovled here.
         # print("prev args:",input_ids.shape, media, media_config, None, attention_mask)
         inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
@@ -1110,11 +1120,11 @@ class VILAForCasualLM(VILAPretrainedModel):
         output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
         # print("output_ids", self.tokenizer.batch_decode(output_ids))
         # input("wait for debug")
+
         if return_output_ids_only:
             return_value = output_ids
         else:
             # by default, return the input_ids and output_ids concatenated to keep consistency with the community VLMs like qwen
-            # print(f"[DEBUG REMOTE] input_ids: {input_ids.shape}, output_ids: {output_ids.shape} attention_mask: {attention_mask.shape} {generation_kwargs=}"); exit(0)
             generation_config = generation_kwargs.get("generation_config", None)
             if generation_config is not None:
                 num_generations = generation_config.num_return_sequences
```