Ligeng-Zhu committed (verified)
Commit 3cd8b83 · Parent: cb6db22

Upload files with `vila-upload`.


Upload auto_processor.py
Upload modeling_vila.py
Upload llm/tokenizer_config.json

Files changed (3)
  1. auto_processor.py +17 -9
  2. llm/tokenizer_config.json +1 -1
  3. modeling_vila.py +19 -9
auto_processor.py CHANGED
@@ -153,16 +153,19 @@ class VILAProcessor(ProcessorMixin):
     # image_processor_class = "VILAImageProcessor"
     # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")
 
-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, **kwargs):
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, padding_side="left", **kwargs):
         self.image_token = MEDIA_TOKENS["image"]
         self.video_token = MEDIA_TOKENS["video"]
         self.config = config
         self.image_processor = image_processor
         self.tokenizer = tokenizer
+        self.padding_side = padding_side
+
+        # This is a special setting for Qwen.
         # self.pad_token_id = tokenizer.pad_token_id
-        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]
+        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]  # 151643
         self.eos_token_id = self.tokenizer.eos_token_id
-        # self.pad_token_id = 151643
+
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
     @staticmethod
@@ -193,6 +196,7 @@ class VILAProcessor(ProcessorMixin):
     ) -> tuple[list[PIL.Image.Image] | None, list[torch.Tensor | list[PIL.Image.Image]] | None, Optional[dict]]:
         """
         referernce from qwen_vl_utils
+        NVILA does not depend on the function, but the interface is the same.
         """
         vision_infos = extract_vision_info(conversations)
         ## Read images or videos
@@ -233,12 +237,12 @@ class VILAProcessor(ProcessorMixin):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        padding_side = kwargs.get("padding_side", "left")
         if os.path.isdir(pretrained_model_name_or_path):
             pretrained_model_name_or_path = pretrained_model_name_or_path
         else:
             print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
             from huggingface_hub import snapshot_download
-
             pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
 
         image_processor = AutoImageProcessor.from_pretrained(
@@ -248,7 +252,7 @@ class VILAProcessor(ProcessorMixin):
             osp.join(pretrained_model_name_or_path, "llm"), trust_remote_code=True
         )
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
-        return cls(image_processor=image_processor, tokenizer=tokenizer, config=config)
+        return cls(image_processor=image_processor, tokenizer=tokenizer, config=config, padding_side=padding_side)
 
     def __repr__(self):
         # NOTE(ligeng): hard coded image_processor to avoid serialization error. Dirty fix
@@ -275,7 +279,7 @@ class VILAProcessor(ProcessorMixin):
         if kwargs.get("text", None) is not None:
             conversation = kwargs.get("text")
         assert conversation is not None, "`conversation` or `text` is required"
-        padding_side = kwargs.get("padding_side", "left")
+        padding_side = kwargs.get("padding_side", self.padding_side)
 
         input_ids_list = []
         attention_mask = []
@@ -289,7 +293,8 @@ class VILAProcessor(ProcessorMixin):
                 media[name] += feat.media[name]
             for name in feat.media_config:
                 media_config[name].update(feat.media_config[name])
-
+
+        # pad the input_ids to batchfy
         input_ids = pad_fn(
             input_ids_list,
             padding_value=self.pad_token_id,
@@ -299,9 +304,10 @@ class VILAProcessor(ProcessorMixin):
         attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         attention_mask[input_ids == self.pad_token_id] = False
         # print("[DEBUGAAA]", self.pad_token_id, self.tokenizer.pad_token_id); exit(0)
-
+        input_texts = self.tokenizer.batch_decode(input_ids)
         return BatchFeature(
             data={
+                "input_texts": input_texts,
                 "input_ids": input_ids,
                 "attention_mask": attention_mask,
                 "media": media,
@@ -329,6 +335,8 @@ class VILAProcessor(ProcessorMixin):
             self.config.image_processor = self.image_processor
             if self.config.image_aspect_ratio == "dynamic":
                 images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
+                # print("DEBUG", len(images)); input()
+                # NOTE: this only works for images appears at the first conversation
                 conversation[0]["value"] = conversation[0]["value"].replace(
                     DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
                 )
@@ -352,7 +360,7 @@ class VILAProcessor(ProcessorMixin):
                 raise ValueError(f"Unsupported media type: {name}")
 
         inputs = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True, return_ids_only=False)
-        input_ids = inputs.input_ids[0].cuda().unsqueeze(0)
+        input_ids = inputs.input_ids[0].unsqueeze(0)#.cuda()
         attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         return BatchFeature(
             data={
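In short, the auto_processor.py changes make padding configurable: `__init__` and `from_pretrained` accept a `padding_side` argument (defaulting to "left"), batched calls pad with the Qwen `<|endoftext|>` id (151643) and also return the decoded prompts as `input_texts`, and the single-conversation path no longer moves `input_ids` onto CUDA. A minimal loading sketch; the repo id is illustrative, and it is assumed that `AutoProcessor` forwards keyword arguments to the custom `from_pretrained` shown above:

```python
from transformers import AutoProcessor

# Illustrative repo id: substitute the checkpoint that ships this auto_processor.py.
repo = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"

# padding_side is read from kwargs in from_pretrained (default "left") and stored
# on the processor; __call__ falls back to it when no per-call padding_side is given.
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True, padding_side="left")

print(processor.padding_side)  # "left"
print(processor.pad_token_id)  # 151643, the Qwen "<|endoftext|>" token id
```

A batched call then returns a `BatchFeature` whose data contains the new `input_texts` field (the left-padded prompts decoded back to strings, handy for debugging) alongside `input_ids`, `attention_mask`, `media`, and `media_config`.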
llm/tokenizer_config.json CHANGED
@@ -78,7 +78,7 @@
   "legacy": false,
   "model_max_length": 4096,
   "pad_token": "[PAD]",
-  "padding_side": "right",
+  "padding_side": "left",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null
modeling_vila.py CHANGED
@@ -201,17 +201,19 @@ class VILAPretrainedModel(PreTrainedModel):
         else:
             raise ValueError("`llm_cfg` `mm_projector_cfg` `vision_tower_cfg` not found in the config.")
 
-        # loading on cpu by default
-        device_map = kwargs.get("device_map", "cpu")
+        # loading on auto by default
+        device_map = kwargs.get("device_map", "auto")
         self.mm_projector = build_mm_projector(mm_projector_cfg, config)
         self.vision_tower = build_vision_tower(vision_tower_cfg, config)
-        if "auto" in device_map or "cuda" in device_map:
+        if device_map in ["auto", "cuda"]:
             self.mm_projector = self.mm_projector.cuda()
             self.vision_tower = self.vision_tower.cuda()
         # set device_map auto can autoamtically shard llm to different devices
         self.llm, self.tokenizer = self.init_llm(llm_cfg, config, device_map=device_map)
 
-        # NOTE(ligeng): need to add other decoders from config
+        # NOTE(ligeng): hard code to set padding_side to left
+        self.tokenizer.padding_side = "left"
+        # TODO(ligeng): need to add other decoders from config
         self.encoders = {"image": BasicImageEncoder(self), "video": BasicVideoEncoder(self)}
 
         self.post_config()
@@ -418,6 +420,7 @@ class VILAPretrainedModel(PreTrainedModel):
         weights_only: bool = True,
         **kwargs,
     ):
+        # print("DEBUG2", kwargs); input()
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
        return cls._from_config(config, **kwargs)
 
@@ -450,6 +453,10 @@ class VILAPretrainedModel(PreTrainedModel):
         self.vision_tower = self.vision_tower.to(torch.float16)
         ######################################################################
         self.training = self.llm.training
+        if self.training:
+            self.train()
+        else:
+            self.eval()
         ## configuration
         if getattr(self.config, "llm_cfg", None) is None:
             self.config.llm_cfg = self.llm.config
@@ -595,10 +602,6 @@ class VILAForCasualLM(VILAPretrainedModel):
         return image_features
 
     def train(self, mode: bool = True):
-        if mode:
-            self.tokenizer.padding_side = "right"
-        else:
-            self.tokenizer.padding_side = "left"
         super().train(mode)
         return self
 
@@ -657,6 +660,7 @@ class VILAForCasualLM(VILAPretrainedModel):
                 input = media_embeds[name].popleft()
                 label = torch.full([input.shape[0]], IGNORE_INDEX, device=labels[k].device, dtype=labels[k].dtype)
             elif input_ids[k][pos].item() in self.pad_token_list:
+                # skip pad tokens
                 end = pos + 1
                 pos = end
                 continue
@@ -1102,6 +1106,12 @@ class VILAForCasualLM(VILAPretrainedModel):
             input_tokens: 36000 001 002 003 004
             input_emds: <media emd> 001 002 003 004
         """
+        # NOTE: hard code to move to GPU
+        input_ids = input_ids.cuda()
+        media = {k: [v.cuda() for v in media[k]] for k in media}
+        if attention_mask is not None:
+            attention_mask = attention_mask.cuda()
+
         # TODO: there is still a padding left vs right issue unsovled here.
         # print("prev args:",input_ids.shape, media, media_config, None, attention_mask)
         inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
@@ -1110,11 +1120,11 @@ class VILAForCasualLM(VILAPretrainedModel):
         output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
         # print("output_ids", self.tokenizer.batch_decode(output_ids))
         # input("wait for debug")
+
         if return_output_ids_only:
             return_value = output_ids
         else:
             # by default, return the input_ids and output_ids concatenated to keep consistency with the community VLMs like qwen
-            # print(f"[DEBUG REMOTE] input_ids: {input_ids.shape}, output_ids: {output_ids.shape} attention_mask: {attention_mask.shape} {generation_kwargs=}"); exit(0)
             generation_config = generation_kwargs.get("generation_config", None)
             if generation_config is not None:
                 num_generations = generation_config.num_return_sequences
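Taken together, the modeling_vila.py changes move device handling into the model: `device_map` defaults to "auto" (the vision tower and mm projector go to GPU and the LLM can be sharded automatically), the tokenizer is pinned to left padding at init instead of being toggled inside `train()`, and generation moves `input_ids`, `media`, and `attention_mask` onto the GPU itself, so processor outputs can stay on CPU. A rough end-to-end sketch under those assumptions; the repo id, the conversation schema, and the exact keyword names accepted by `generate()` are inferred from the hunks above rather than documented by this commit:

```python
from transformers import AutoModel, AutoProcessor

# Illustrative repo id; substitute the checkpoint that ships these remote-code files.
repo = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"

processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
# device_map now defaults to "auto"; pass device_map="cpu" explicitly to keep the
# vision tower and projector off the GPU.
model = AutoModel.from_pretrained(repo, trust_remote_code=True, device_map="auto")

# Assumed VILA-style conversation: a list of turns with a "value" field, as the
# processor code above indexes conversation[0]["value"].
conversation = [{"from": "human", "value": "Write a haiku about GPUs."}]
inputs = processor(conversation=[conversation])

# The processor returns CPU tensors; generate() moves them onto the GPU itself
# (see the @@ -1102 hunk) before embedding and calling self.llm.generate().
output_ids = model.generate(
    input_ids=inputs.input_ids,
    media=inputs.media,
    media_config=inputs.media_config,
    attention_mask=inputs.attention_mask,
    max_new_tokens=64,
)
print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))
```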