qqc1989 committed
Commit a3b1a17 · verified · 1 Parent(s): 64e1cac

initial this repo

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. config.json +0 -0
  2. image/ssd_car.jpg +0 -0
  3. python/cv_resize.py +13 -0
  4. python/infer_image.py +249 -0
  5. python/infer_text.py +237 -0
  6. python/infer_video.py +252 -0
  7. python/preprocess.py +155 -0
  8. python/utils.py +296 -0
  9. qwen2_5-vl-3b-image-ax650/Qwen2.5-VL-3B-Instruct_vision_nchw448.axmodel +3 -0
  10. qwen2_5-vl-3b-image-ax650/model.embed_tokens.weight.bfloat16.bin +3 -0
  11. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l0_together.axmodel +3 -0
  12. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l10_together.axmodel +3 -0
  13. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l11_together.axmodel +3 -0
  14. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l12_together.axmodel +3 -0
  15. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l13_together.axmodel +3 -0
  16. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l14_together.axmodel +3 -0
  17. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l15_together.axmodel +3 -0
  18. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l16_together.axmodel +3 -0
  19. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l17_together.axmodel +3 -0
  20. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l18_together.axmodel +3 -0
  21. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l19_together.axmodel +3 -0
  22. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l1_together.axmodel +3 -0
  23. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l20_together.axmodel +3 -0
  24. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l21_together.axmodel +3 -0
  25. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l22_together.axmodel +3 -0
  26. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l23_together.axmodel +3 -0
  27. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l24_together.axmodel +3 -0
  28. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l25_together.axmodel +3 -0
  29. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l26_together.axmodel +3 -0
  30. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l27_together.axmodel +3 -0
  31. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l28_together.axmodel +3 -0
  32. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l29_together.axmodel +3 -0
  33. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l2_together.axmodel +3 -0
  34. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l30_together.axmodel +3 -0
  35. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l31_together.axmodel +3 -0
  36. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l32_together.axmodel +3 -0
  37. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l33_together.axmodel +3 -0
  38. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l34_together.axmodel +3 -0
  39. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l35_together.axmodel +3 -0
  40. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l3_together.axmodel +3 -0
  41. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l4_together.axmodel +3 -0
  42. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l5_together.axmodel +3 -0
  43. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l6_together.axmodel +3 -0
  44. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l7_together.axmodel +3 -0
  45. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l8_together.axmodel +3 -0
  46. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l9_together.axmodel +3 -0
  47. qwen2_5-vl-3b-image-ax650/qwen2_5_vl_post.axmodel +3 -0
  48. qwen2_5-vl-tokenizer/chat_template.json +3 -0
  49. qwen2_5-vl-tokenizer/config.json +62 -0
  50. qwen2_5-vl-tokenizer/generation_config.json +14 -0
config.json ADDED
File without changes
image/ssd_car.jpg ADDED
python/cv_resize.py ADDED
@@ -0,0 +1,13 @@
+ import cv2
+ from glob import glob
+ import os
+
+ paths = sorted(glob("demo/*.jpg"))
+ print(paths)
+ outdir = "demo_cv308"
+ os.makedirs(outdir, exist_ok=True)
+
+ for p in paths:
+     img = cv2.imread(p)
+     img = cv2.resize(img, (308, 308))
+     cv2.imwrite(f"{outdir}/{os.path.basename(p)}", img)
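cv_resize.py only normalizes frames that already sit under demo/; nothing in this commit produces them. A minimal companion sketch for dumping roughly one frame per second from a local clip with OpenCV ("input.mp4" is a hypothetical path, not a file in this repo):

    import cv2
    import os

    os.makedirs("demo", exist_ok=True)
    cap = cv2.VideoCapture("input.mp4")          # hypothetical source clip
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0      # fall back if the container reports 0
    step = max(1, int(round(fps)))               # keep ~1 frame per second, matching fps=1.0 in infer_video.py
    idx = saved = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if idx % step == 0:
            cv2.imwrite(f"demo/{saved:04d}.jpg", frame)
            saved += 1
        idx += 1
    cap.release()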
python/infer_image.py ADDED
@@ -0,0 +1,249 @@
+ from transformers import AutoTokenizer, AutoConfig
+ import numpy as np
+ from ml_dtypes import bfloat16
+ from axengine import InferenceSession
+ from PIL import Image
+ from torchvision import transforms
+ import torchvision.transforms as T
+ from torchvision.transforms.functional import InterpolationMode
+ import torch
+ from transformers import AutoTokenizer, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ import onnxruntime
+ import gc
+ from glob import glob
+ from utils import get_rope_index
+ from transformers.image_utils import PILImageResampling
+ from preprocess import Qwen2VLImageProcessorExport
+
+ def post_process(data, topk=1, topp=0.001, temperature=0.1):
+     def top_p(l: np.ndarray, p: float) -> np.ndarray:
+         index = np.argsort(l)
+         res = l.copy()
+         sum_p = 0
+         for i in index[::-1]:
+             if sum_p >= p:
+                 res[i] = 0
+             sum_p += res[i]
+         return res / sum_p
+
+     def softmax(l: np.ndarray) -> np.ndarray:
+         l_max = l - l.max()
+         l_exp = np.exp(l_max)
+         res = l_exp / np.sum(l_exp)
+         return res.astype(np.float64)
+
+     r = data.astype(np.float32)
+     r = r.flatten()
+     # topk
+     candidate_index = np.argpartition(r, -topk)[-topk:]
+     candidate_value = r[candidate_index]
+     # temperature
+     candidate_value /= temperature
+     # softmax
+     candidate_soft = softmax(candidate_value)
+     # topp
+     candidate_soft = top_p(candidate_soft, topp)
+     candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()
+     pos = np.random.multinomial(1, candidate_soft).argmax()
+     next_token = candidate_index[pos]
+     return next_token, candidate_index, candidate_soft
+
+
+ if __name__ == "__main__":
+
+     prefill_len = 320
+
+     checkpoint_dir = f"../Qwen2.5-VL-3B-Instruct-AX650-prefill_320/"
+     cfg = AutoConfig.from_pretrained(
+         checkpoint_dir, trust_remote_code=True
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         checkpoint_dir, trust_remote_code=True
+     )
+
+     processor = AutoProcessor.from_pretrained(checkpoint_dir)
+
+     path = "demo1.jpg"
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {
+                     "type": "image",
+                     # "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                     # "image": "demo.jpg"
+                     "image": path,
+                     "max_pixels": 448 * 448,
+                 },
+                 {"type": "text", "text": "Describe this image."},
+             ],
+         }
+     ]
+
+     # Preparation for inference
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+
+     position_ids, _ = get_rope_index(cfg, inputs["input_ids"], image_grid_thw=inputs['image_grid_thw'])
+
+     # pixel_values = inputs['pixel_values_videos']
+     # print("pixel_values", pixel_values.shape)
+     # extract img feature by vit
+     vit_session = InferenceSession.load_from_model(f'{checkpoint_dir}/Qwen2.5-VL-3B-Instruct_vision_nchw448.axmodel')
+
+     image = Image.open(path)
+     image = image.resize((448, 448))
+     img_processor = Qwen2VLImageProcessorExport(max_pixels=448*448, patch_size=14, temporal_patch_size=2, merge_size=2)
+     pixel_values, grid_thw = img_processor._preprocess(image, do_resize=True, resample=PILImageResampling.BICUBIC,
+                                                        do_rescale=False, do_normalize=False,
+                                                        do_convert_rgb=True)
+
+     # seq_len, dim = pixel_values.shape
+     # ht = pixel_values.reshape(t, seq_len//t, dim)
+     print("pixel_values.shape", pixel_values.shape)
+     t, seq_len, _, _ = pixel_values.shape
+     ht = pixel_values
+     vit_output = []
+     for i in range(t):
+         out = vit_session.run({"hidden_states": ht[i]})[0]
+         vit_output.append(out.astype(bfloat16))
+
+     del vit_session
+     gc.collect()
+
+     vit_output = np.concatenate(vit_output, axis=0)
+     vit_output = vit_output[None, :, :]
+
+     print("vit feature extract done!")
+
+     token_ids = inputs['input_ids'].squeeze().numpy().tolist()
+
+     image_start_index = np.where(np.array(token_ids) == 151652)[0].tolist()[0]
+     image_insert_index = image_start_index + 1
+     embeds = np.load(f"{checkpoint_dir}/model.embed_tokens.weight.npy")
+     prefill_data = np.take(embeds, token_ids, axis=0)
+     prefill_data = prefill_data.astype(bfloat16)
+
+     prefill_data[image_insert_index : image_insert_index + vit_output.shape[1]] = vit_output[0, :, :]
+     token_len = len(token_ids)
+
+     lastN = 1023
+     kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
+     k_caches = [
+         np.zeros((1, lastN, kv_dim), dtype=bfloat16)
+         for _ in range(cfg.num_hidden_layers)
+     ]
+     v_caches = [
+         np.zeros((1, lastN, kv_dim), dtype=bfloat16)
+         for _ in range(cfg.num_hidden_layers)
+     ]
+
+     prefill_decoder_sessins = []
+     for i in range(cfg.num_hidden_layers):
+         session = InferenceSession.load_from_model(
+             f"{checkpoint_dir}/qwen2_5_vl_p{prefill_len}_l{i}_together.axmodel"
+         )
+         prefill_decoder_sessins.append(session)
+     post_process_session = InferenceSession.load_from_model(
+         f"{checkpoint_dir}/qwen2_5_vl_post.axmodel"
+         # "../Qwen2.5-VL-3B-Instruct-AX650-video-prefill_512/qwen2_5_vl_post.axmodel"
+     )
+     print("model load done!")
+
+     """
+     prefill
+     """
+
+     for i in range(cfg.num_hidden_layers):
+         prefill_decoder_sessins[i].set_runtime_context(group_id=1)
+
+     if prefill_len > 0:
+         indices = np.zeros((3, prefill_len), dtype=np.uint32)
+         indices[:, 0:token_len] = position_ids.squeeze(1).numpy().astype(np.uint32)
+
+         mask = np.zeros((1, prefill_len, prefill_len)) - 65536
+         data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)
+
+         data[:, 0:token_len] = prefill_data
+         for i, t in enumerate(token_ids):
+             mask[:, i, : i + 1] = 0
+         mask = mask.astype(bfloat16)
+         for i in range(cfg.num_hidden_layers):
+             input_feed = {
+                 "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
+                 "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
+                 "indices": indices,
+                 "input": data,
+                 "mask": mask,
+             }
+             outputs = prefill_decoder_sessins[i].run(input_feed)
+             k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
+             v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
+             data = outputs[2][:, :token_len, :]
+
+     post_out = post_process_session.run({"input": data[:, token_len - 1, :]})[0]
+     next_token, posssible_tokens, possible_soft = post_process(post_out, topk=1)
+     posibles = [tokenizer.decode([t]) for t in posssible_tokens]
+     posible_soft = [str((t, s)) for t, s in zip(posibles, possible_soft)]
+     token_ids.append(next_token)
+     print("prefill done!")
+
+     # set to decoder
+     for i in range(cfg.num_hidden_layers):
+         prefill_decoder_sessins[i].set_runtime_context(group_id=0)
+
+     # lastN = np.max(indices)
+     start_ids = np.max(indices) + 1
+     mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
+     mask[:, :, :lastN] -= 65536
+     mask[:, :, :token_len] = 0
+     for start_indice in range(lastN + 1):
+         if prefill_len > 0 and start_indice < token_len:
+             continue
+         next_token = token_ids[start_indice]
+         indices = np.array([start_ids], np.uint32).reshape((1, 1))
+         start_ids += 1
+         data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16)
+
+         for i in range(cfg.num_hidden_layers):
+             input_feed = {
+                 "K_cache": k_caches[i],
+                 "V_cache": v_caches[i],
+                 "indices": indices,
+                 "input": data,
+                 "mask": mask,
+             }
+             outputs = prefill_decoder_sessins[i].run(input_feed)
+             k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
+             v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
+             data = outputs[2]
+         mask[..., start_indice] = 0
+         if start_indice < token_len - 1:
+             pass
+         else:
+             post_out = post_process_session.run({"input": data})[0]
+             next_token, posssible_tokens, possible_soft = post_process(post_out)
+             token_ids.append(next_token)
+             if next_token == tokenizer.eos_token_id:
+                 # print("hit eos!")
+                 break
+     print(tokenizer.decode(token_ids[token_len:]))
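The three infer scripts np.load a model.embed_tokens.weight.npy from the checkpoint directory, while this commit ships model.embed_tokens.weight.bfloat16.bin. A minimal conversion sketch, assuming the .bin is a raw row-major bfloat16 dump of the (vocab_size, hidden_size) embedding matrix; that layout is not documented in this commit, so check the resulting shape against config.json:

    import numpy as np
    from ml_dtypes import bfloat16

    HIDDEN_SIZE = 2048   # assumed for Qwen2.5-VL-3B; read cfg.hidden_size to be sure
    raw = np.fromfile("qwen2_5-vl-3b-image-ax650/model.embed_tokens.weight.bfloat16.bin", dtype=np.uint16)
    emb = raw.view(bfloat16).reshape(-1, HIDDEN_SIZE)   # rows = vocabulary entries
    # save as float32: np.save/np.load stay trivial, and the scripts cast back to bfloat16 anyway
    np.save("qwen2_5-vl-3b-image-ax650/model.embed_tokens.weight.npy", emb.astype(np.float32))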
python/infer_text.py ADDED
@@ -0,0 +1,237 @@
+ from transformers import AutoTokenizer, AutoConfig
+ import numpy as np
+ from ml_dtypes import bfloat16
+ from axengine import InferenceSession
+ from PIL import Image
+ from torchvision import transforms
+ import torchvision.transforms as T
+ from torchvision.transforms.functional import InterpolationMode
+ import torch
+ from transformers import AutoTokenizer, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ import onnxruntime
+ import gc
+ from glob import glob
+ from utils import get_rope_index
+
+
+ def post_process(data, topk=1, topp=0.9, temperature=0.6):
+     def top_p(l: np.ndarray, p: float) -> np.ndarray:
+         index = np.argsort(l)
+         res = l.copy()
+         sum_p = 0
+         for i in index[::-1]:
+             if sum_p >= p:
+                 res[i] = 0
+             sum_p += res[i]
+         return res / sum_p
+
+     def softmax(l: np.ndarray) -> np.ndarray:
+         l_max = l - l.max()
+         l_exp = np.exp(l_max)
+         res = l_exp / np.sum(l_exp)
+         return res.astype(np.float64)
+
+     r = data.astype(np.float32)
+     r = r.flatten()
+     # topk
+     candidate_index = np.argpartition(r, -topk)[-topk:]
+     candidate_value = r[candidate_index]
+     # temperature
+     candidate_value /= temperature
+     # softmax
+     candidate_soft = softmax(candidate_value)
+     # topp
+     candidate_soft = top_p(candidate_soft, topp)
+     candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()
+     pos = np.random.multinomial(1, candidate_soft).argmax()
+     next_token = candidate_index[pos]
+     return next_token, candidate_index, candidate_soft
+
+
+ if __name__ == "__main__":
+
+     prefill_len = 512
+
+     checkpoint_dir = f"../Qwen2.5-VL-3B-Instruct-AX650-video-prefill_512/"
+     cfg = AutoConfig.from_pretrained(
+         checkpoint_dir, trust_remote_code=True
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         checkpoint_dir, trust_remote_code=True
+     )
+
+     processor = AutoProcessor.from_pretrained(checkpoint_dir)
+
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": "你是谁"},  # "Who are you?"
+             ]
+         }
+     ]
+
+     # Preparation for inference
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+
+     position_ids, _ = get_rope_index(cfg, inputs["input_ids"])
+
+     # pixel_values = inputs['pixel_values_videos']
+
+     # # extract img feature by vit
+     # vit_session = InferenceSession.load_from_model(f'{checkpoint_dir}/Qwen2.5-VL-3B-Instruct_vision.axmodel')
+
+     # t = inputs['video_grid_thw'][0,0]
+
+     # seq_len, dim = pixel_values.shape
+     # ht = pixel_values.reshape(t, seq_len//t, dim)
+     # vit_output = []
+     # for i in range(t):
+     #     print(i)
+     #     out = vit_session.run({"hidden_states": ht[i].numpy()})[0]  # (1, 576, 1176)
+     #     vit_output.append(out.astype(bfloat16))
+
+     # del vit_session
+     # gc.collect()
+
+     # vit_output = np.concatenate(vit_output, axis=0)
+     # # vit_output = np.load("vit_out.npy")
+     # np.save("vit_out_ax.npy", vit_output)
+     # vit_output = vit_output[None, :, :]
+
+     # print("vit feature extract done!")
+
+     token_ids = inputs['input_ids'].squeeze().numpy().tolist()
+
+     # image_start_index = np.where(np.array(token_ids) == 151652)[0].tolist()[0]
+     # image_insert_index = image_start_index + 1
+     embeds = np.load(f"{checkpoint_dir}/model.embed_tokens.weight.npy")
+     prefill_data = np.take(embeds, token_ids, axis=0)
+     prefill_data = prefill_data.astype(bfloat16)
+
+     # prefill_data[image_insert_index : image_insert_index + vit_output.shape[1]] = vit_output[0, :, :]
+     token_len = len(token_ids)
+
+     lastN = 1023
+     kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
+     k_caches = [
+         np.zeros((1, lastN, kv_dim), dtype=bfloat16)
+         for _ in range(cfg.num_hidden_layers)
+     ]
+     v_caches = [
+         np.zeros((1, lastN, kv_dim), dtype=bfloat16)
+         for _ in range(cfg.num_hidden_layers)
+     ]
+
+     prefill_decoder_sessins = []
+     for i in range(cfg.num_hidden_layers):
+         session = InferenceSession.load_from_model(
+             f"{checkpoint_dir}/qwen2_5_vl_p{prefill_len}_l{i}_together.axmodel"
+         )
+         prefill_decoder_sessins.append(session)
+     post_process_session = InferenceSession.load_from_model(
+         f"{checkpoint_dir}/qwen2_5_vl_post.axmodel"
+         # "../Qwen2.5-VL-3B-Instruct-AX650-video-prefill_512/qwen2_5_vl_post.axmodel"
+     )
+     print("model load done!")
+
+     """
+     prefill
+     """
+     print("position_ids", position_ids)
+
+     for i in range(cfg.num_hidden_layers):
+         prefill_decoder_sessins[i].set_runtime_context(group_id=1)
+
+     if prefill_len > 0:
+         indices = np.zeros((3, prefill_len), dtype=np.uint32)
+         indices[:, 0:token_len] = position_ids.squeeze(1).numpy().astype(np.uint32)
+
+         mask = np.zeros((1, prefill_len, prefill_len)) - 65536
+         data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)
+
+         data[:, 0:token_len] = prefill_data
+         for i, t in enumerate(token_ids):
+             mask[:, i, : i + 1] = 0
+         mask = mask.astype(bfloat16)
+         for i in range(cfg.num_hidden_layers):
+             input_feed = {
+                 "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
+                 "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
+                 "indices": indices,
+                 "input": data,
+                 "mask": mask,
+             }
+             outputs = prefill_decoder_sessins[i].run(input_feed)
+             k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
+             v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
+             data = outputs[2][:, :token_len, :]
+
+     post_out = post_process_session.run({"input": data[:, token_len - 1, :]})[0]
+     next_token, posssible_tokens, possible_soft = post_process(post_out, topk=1)
+     posibles = [tokenizer.decode([t]) for t in posssible_tokens]
+     posible_soft = [str((t, s)) for t, s in zip(posibles, possible_soft)]
+     token_ids.append(next_token)
+     print("prefill done!")
+
+     # set to decoder
+     for i in range(cfg.num_hidden_layers):
+         prefill_decoder_sessins[i].set_runtime_context(group_id=0)
+
+     # lastN = np.max(indices)
+     start_ids = np.max(indices) + 1
+     mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
+     mask[:, :, :lastN] -= 65536
+     mask[:, :, :token_len] = 0
+     for start_indice in range(lastN + 1):
+         if prefill_len > 0 and start_indice < token_len:
+             continue
+         next_token = token_ids[start_indice]
+         indices = np.array([start_ids], np.uint32).reshape((1, 1))
+         start_ids += 1
+         data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16)
+
+         for i in range(cfg.num_hidden_layers):
+             input_feed = {
+                 "K_cache": k_caches[i],
+                 "V_cache": v_caches[i],
+                 "indices": indices,
+                 "input": data,
+                 "mask": mask,
+             }
+             outputs = prefill_decoder_sessins[i].run(input_feed)
+             k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
+             v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
+             data = outputs[2]
+         mask[..., start_indice] = 0
+         if start_indice < token_len - 1:
+             pass
+         else:
+             post_out = post_process_session.run({"input": data})[0]
+             next_token, posssible_tokens, possible_soft = post_process(post_out)
+             print("next_token", next_token)
+             token_ids.append(next_token)
+             if next_token == tokenizer.eos_token_id:
+                 # print("hit eos!")
+                 break
+     print(tokenizer.decode(token_ids[token_len:]))
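infer_text.py raises the post_process defaults to topp=0.9 and temperature=0.6, but both call sites still use topk=1, so only the arg-max logit ever reaches the sampler and decoding stays effectively greedy. A tiny check, assuming post_process from this script is in scope:

    import numpy as np

    logits = np.array([0.1, 2.0, -1.0, 0.5], dtype=np.float32)   # toy 4-token vocabulary
    tok, cand, probs = post_process(logits, topk=1)
    print(tok, cand, probs)   # -> 1 [1] [1.]  (always the arg-max, regardless of topp/temperature)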
python/infer_video.py ADDED
@@ -0,0 +1,252 @@
+ from transformers import AutoTokenizer, AutoConfig
+ import numpy as np
+ from ml_dtypes import bfloat16
+ from axengine import InferenceSession
+ from PIL import Image
+ from torchvision import transforms
+ import torchvision.transforms as T
+ from torchvision.transforms.functional import InterpolationMode
+ import torch
+ from transformers import AutoTokenizer, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ import onnxruntime
+ import gc
+ from glob import glob
+ from utils import get_rope_index
+ from transformers.image_utils import PILImageResampling
+ from preprocess import Qwen2VLImageProcessorExport
+
+ def post_process(data, topk=1, topp=0.001, temperature=0.1):
+     def top_p(l: np.ndarray, p: float) -> np.ndarray:
+         index = np.argsort(l)
+         res = l.copy()
+         sum_p = 0
+         for i in index[::-1]:
+             if sum_p >= p:
+                 res[i] = 0
+             sum_p += res[i]
+         return res / sum_p
+
+     def softmax(l: np.ndarray) -> np.ndarray:
+         l_max = l - l.max()
+         l_exp = np.exp(l_max)
+         res = l_exp / np.sum(l_exp)
+         return res.astype(np.float64)
+
+     r = data.astype(np.float32)
+     r = r.flatten()
+     # topk
+     candidate_index = np.argpartition(r, -topk)[-topk:]
+     candidate_value = r[candidate_index]
+     # temperature
+     candidate_value /= temperature
+     # softmax
+     candidate_soft = softmax(candidate_value)
+     # topp
+     candidate_soft = top_p(candidate_soft, topp)
+     candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()
+     pos = np.random.multinomial(1, candidate_soft).argmax()
+     next_token = candidate_index[pos]
+     return next_token, candidate_index, candidate_soft
+
+
+ if __name__ == "__main__":
+
+     prefill_len = 512
+
+     checkpoint_dir = f"../Qwen2.5-VL-3B-Instruct-AX650-video-prefill_512/"
+     cfg = AutoConfig.from_pretrained(
+         checkpoint_dir, trust_remote_code=True
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         checkpoint_dir, trust_remote_code=True
+     )
+
+     processor = AutoProcessor.from_pretrained(checkpoint_dir)
+     paths = sorted(glob("demo_cv308/*.jpg"))
+     print(paths)
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {
+                     "type": "video",
+                     "video": paths,
+                     "max_pixels": 308 * 308,
+                     "fps": 1.0,
+                 },
+                 {"type": "text", "text": "描述一下这个视频的内容"},  # "Describe the content of this video."
+             ],
+         }
+     ]
+
+     # Preparation for inference
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+
+     position_ids, _ = get_rope_index(cfg, inputs["input_ids"], video_grid_thw=inputs['video_grid_thw'], second_per_grid_ts=inputs['second_per_grid_ts'])
+
+     # pixel_values = inputs['pixel_values_videos']
+     # print("pixel_values", pixel_values.shape)
+     # extract img feature by vit
+     vit_session = InferenceSession.load_from_model(f'{checkpoint_dir}/Qwen2.5-VL-3B-Instruct_vision_nhwc.axmodel')
+
+     t = inputs['video_grid_thw'][0, 0]
+
+     images = []
+     for p in paths:
+         img = Image.open(p)
+         images.append(img)
+
+     img_processor = Qwen2VLImageProcessorExport(max_pixels=308*308, patch_size=14, temporal_patch_size=2, merge_size=2)
+     pixel_values, grid_thw = img_processor._preprocess(images, do_resize=True, resample=PILImageResampling.BICUBIC,
+                                                        do_rescale=False, do_normalize=False,
+                                                        do_convert_rgb=True)
+
+     # seq_len, dim = pixel_values.shape
+     # ht = pixel_values.reshape(t, seq_len//t, dim)
+     print("pixel_values.shape", pixel_values.shape)
+     t, seq_len, _, _ = pixel_values.shape
+     ht = pixel_values
+     vit_output = []
+     for i in range(t):
+         out = vit_session.run({"hidden_states": ht[i]})[0]  # (1, 576, 1176)
+         vit_output.append(out.astype(bfloat16))
+
+     del vit_session
+     gc.collect()
+
+     vit_output = np.concatenate(vit_output, axis=0)
+     vit_output = vit_output[None, :, :]
+
+     print("vit feature extract done!")
+
+     token_ids = inputs['input_ids'].squeeze().numpy().tolist()
+
+     image_start_index = np.where(np.array(token_ids) == 151652)[0].tolist()[0]
+     image_insert_index = image_start_index + 1
+     embeds = np.load(f"{checkpoint_dir}/model.embed_tokens.weight.npy")
+     prefill_data = np.take(embeds, token_ids, axis=0)
+     prefill_data = prefill_data.astype(bfloat16)
+
+     prefill_data[image_insert_index : image_insert_index + vit_output.shape[1]] = vit_output[0, :, :]
+     token_len = len(token_ids)
+
+     lastN = 1023
+     kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
+     k_caches = [
+         np.zeros((1, lastN, kv_dim), dtype=bfloat16)
+         for _ in range(cfg.num_hidden_layers)
+     ]
+     v_caches = [
+         np.zeros((1, lastN, kv_dim), dtype=bfloat16)
+         for _ in range(cfg.num_hidden_layers)
+     ]
+
+     prefill_decoder_sessins = []
+     for i in range(cfg.num_hidden_layers):
+         session = InferenceSession.load_from_model(
+             f"{checkpoint_dir}/qwen2_5_vl_p{prefill_len}_l{i}_together.axmodel"
+         )
+         prefill_decoder_sessins.append(session)
+     post_process_session = InferenceSession.load_from_model(
+         f"{checkpoint_dir}/qwen2_5_vl_post.axmodel"
+         # "../Qwen2.5-VL-3B-Instruct-AX650-video-prefill_512/qwen2_5_vl_post.axmodel"
+     )
+     print("model load done!")
+
+     """
+     prefill
+     """
+
+     for i in range(cfg.num_hidden_layers):
+         prefill_decoder_sessins[i].set_runtime_context(group_id=1)
+
+     if prefill_len > 0:
+         indices = np.zeros((3, prefill_len), dtype=np.uint32)
+         indices[:, 0:token_len] = position_ids.squeeze(1).numpy().astype(np.uint32)
+
+         mask = np.zeros((1, prefill_len, prefill_len)) - 65536
+         data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)
+
+         data[:, 0:token_len] = prefill_data
+         for i, t in enumerate(token_ids):
+             mask[:, i, : i + 1] = 0
+         mask = mask.astype(bfloat16)
+         for i in range(cfg.num_hidden_layers):
+             input_feed = {
+                 "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
+                 "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
+                 "indices": indices,
+                 "input": data,
+                 "mask": mask,
+             }
+             outputs = prefill_decoder_sessins[i].run(input_feed)
+             k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
+             v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
+             data = outputs[2][:, :token_len, :]
+
+     post_out = post_process_session.run({"input": data[:, token_len - 1, :]})[0]
+     next_token, posssible_tokens, possible_soft = post_process(post_out, topk=1)
+     posibles = [tokenizer.decode([t]) for t in posssible_tokens]
+     posible_soft = [str((t, s)) for t, s in zip(posibles, possible_soft)]
+     token_ids.append(next_token)
+     print("prefill done!")
+
+     # set to decoder
+     for i in range(cfg.num_hidden_layers):
+         prefill_decoder_sessins[i].set_runtime_context(group_id=0)
+
+     # lastN = np.max(indices)
+     start_ids = np.max(indices) + 1
+     mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
+     mask[:, :, :lastN] -= 65536
+     mask[:, :, :token_len] = 0
+     for start_indice in range(lastN + 1):
+         if prefill_len > 0 and start_indice < token_len:
+             continue
+         next_token = token_ids[start_indice]
+         indices = np.array([start_ids], np.uint32).reshape((1, 1))
+         start_ids += 1
+         data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16)
+
+         for i in range(cfg.num_hidden_layers):
+             input_feed = {
+                 "K_cache": k_caches[i],
+                 "V_cache": v_caches[i],
+                 "indices": indices,
+                 "input": data,
+                 "mask": mask,
+             }
+             outputs = prefill_decoder_sessins[i].run(input_feed)
+             k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
+             v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
+             data = outputs[2]
+         mask[..., start_indice] = 0
+         if start_indice < token_len - 1:
+             pass
+         else:
+             post_out = post_process_session.run({"input": data})[0]
+             next_token, posssible_tokens, possible_soft = post_process(post_out)
+             token_ids.append(next_token)
+             if next_token == tokenizer.eos_token_id:
+                 # print("hit eos!")
+                 break
+     print(tokenizer.decode(token_ids[token_len:]))
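The video pipeline compiles the decoder layers for a 512-token prefill, so the 308×308 frame size is what keeps the prompt within budget: each frame is 22×22 patches, which the 2×2 merge reduces to 11×11 = 121 LLM tokens per temporal patch of two frames. A rough budget helper (a sketch that mirrors the patching constants in preprocess.py):

    def vision_tokens(num_frames, h=308, w=308, patch=14, t_patch=2, merge=2):
        # odd frame counts get the last frame repeated by the preprocessor, hence the ceil
        grid_t = (num_frames + t_patch - 1) // t_patch
        grid_h, grid_w = h // patch, w // patch            # 22 x 22 patches per frame
        return grid_t * (grid_h // merge) * (grid_w // merge)

    print(vision_tokens(8))   # 4 * 11 * 11 = 484 tokens, leaving room for the text prompt in p512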
python/preprocess.py ADDED
@@ -0,0 +1,155 @@
+ from typing import Dict, List, Optional, Union
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor, smart_resize
+ from transformers.image_transforms import (
+     convert_to_rgb,
+     resize,
+     to_channel_dimension_format,
+ )
+ from transformers.image_utils import (
+     OPENAI_CLIP_MEAN,
+     OPENAI_CLIP_STD,
+     ChannelDimension,
+     ImageInput,
+     PILImageResampling,
+     VideoInput,
+     get_image_size,
+     infer_channel_dimension_format,
+     is_scaled_image,
+     make_batched_videos,
+     make_flat_list_of_images,
+     make_list_of_images,
+     to_numpy_array,
+     valid_images,
+     validate_preprocess_arguments,
+ )
+
+ from transformers.utils import TensorType, logging
+ import numpy as np
+
+ logger = logging.get_logger(__name__)
+
+
+ class Qwen2VLImageProcessorExport(Qwen2VLImageProcessor):
+
+     def _preprocess(
+         self,
+         images: Union[ImageInput, VideoInput],
+         do_resize: bool = None,
+         resample: PILImageResampling = None,
+         do_rescale: bool = None,
+         rescale_factor: float = None,
+         do_normalize: bool = None,
+         image_mean: Optional[Union[float, List[float]]] = None,
+         image_std: Optional[Union[float, List[float]]] = None,
+         do_convert_rgb: bool = None,
+         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+     ):
+         """
+         Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
+
+         Args:
+             images (`ImageInput`):
+                 Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
+             vision_info (`List[Dict]`, *optional*):
+                 Optional list of dictionaries containing additional information about vision inputs.
+             do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                 Whether to resize the image.
+             resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                 Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
+             do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                 Whether to rescale the image.
+             rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                 Scale factor to use if rescaling the image.
+             do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                 Whether to normalize the image.
+             image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                 Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+             image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                 Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+             do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                 Whether to convert the image to RGB.
+             data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                 The channel dimension format for the output image. Can be one of:
+                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                 - Unset: Use the channel dimension format of the input image.
+             input_data_format (`ChannelDimension` or `str`, *optional*):
+                 The channel dimension format for the input image. Can be one of:
+                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+         """
+         images = make_list_of_images(images)
+
+         if do_convert_rgb:
+             images = [convert_to_rgb(image) for image in images]
+
+         # All transformations expect numpy arrays.
+         images = [to_numpy_array(image) for image in images]
+
+         if do_rescale and is_scaled_image(images[0]):
+             logger.warning_once(
+                 "It looks like you are trying to rescale already rescaled images. If the input"
+                 " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+             )
+         if input_data_format is None:
+             # We assume that all images have the same channel dimension format.
+             input_data_format = infer_channel_dimension_format(images[0])
+
+         height, width = get_image_size(images[0], channel_dim=input_data_format)
+         resized_height, resized_width = height, width
+         processed_images = []
+         for image in images:
+             if do_resize:
+                 resized_height, resized_width = smart_resize(
+                     height,
+                     width,
+                     factor=self.patch_size * self.merge_size,
+                     min_pixels=self.min_pixels,
+                     max_pixels=self.max_pixels,
+                 )
+                 image = resize(
+                     image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
+                 )
+
+             if do_rescale:
+                 image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
+
+             if do_normalize:
+                 image = self.normalize(
+                     image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                 )
+
+             image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+             processed_images.append(image)
+
+         patches = np.array(processed_images)
+         if data_format == ChannelDimension.LAST:
+             patches = patches.transpose(0, 3, 1, 2)
+         if patches.shape[0] % self.temporal_patch_size != 0:
+             repeats = np.repeat(patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0)
+             patches = np.concatenate([patches, repeats], axis=0)
+         channel = patches.shape[1]
+         grid_t = patches.shape[0] // self.temporal_patch_size
+         grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
+         patches = patches.reshape(
+             grid_t,                        # 0
+             self.temporal_patch_size,      # 1
+             channel,                       # 2
+             grid_h // self.merge_size,     # 3
+             self.merge_size,               # 4
+             self.patch_size,               # 5
+             grid_w // self.merge_size,     # 6
+             self.merge_size,               # 7
+             self.patch_size,               # 8
+         )
+         # patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
+         # flatten_patches = patches.reshape(
+         #     grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
+         # )
+
+         patches = patches.transpose(0, 3, 6, 4, 7, 1, 5, 8, 2)
+         flatten_patches = patches.reshape(
+             grid_t, grid_h * grid_w, self.temporal_patch_size * self.patch_size * self.patch_size, channel
+         )
+         return flatten_patches, (grid_t, grid_h, grid_w)
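Compared with the upstream Qwen2VLImageProcessor, this export variant skips the final flatten to (grid_t*grid_h*grid_w, C*2*14*14) (kept above as a comment) and instead returns (grid_t, grid_h*grid_w, 2*14*14, C), so each temporal patch can be fed to the vision .axmodel one at a time, which is exactly how infer_image.py and infer_video.py iterate over it. A quick shape check (a sketch; demo.jpg is a placeholder path):

    from PIL import Image
    from transformers.image_utils import PILImageResampling
    from preprocess import Qwen2VLImageProcessorExport

    proc = Qwen2VLImageProcessorExport(max_pixels=448 * 448, patch_size=14,
                                       temporal_patch_size=2, merge_size=2)
    img = Image.open("demo.jpg").resize((448, 448))
    patches, (gt, gh, gw) = proc._preprocess(img, do_resize=True,
                                             resample=PILImageResampling.BICUBIC,
                                             do_rescale=False, do_normalize=False,
                                             do_convert_rgb=True)
    print(patches.shape, (gt, gh, gw))   # expected: (1, 1024, 392, 3) (1, 32, 32) for a 448x448 input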
python/utils.py ADDED
@@ -0,0 +1,296 @@
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+
+ def get_rope_index(
+     config,
+     input_ids: Optional[torch.LongTensor] = None,
+     image_grid_thw: Optional[torch.LongTensor] = None,
+     video_grid_thw: Optional[torch.LongTensor] = None,
+     second_per_grid_ts: Optional[torch.Tensor] = None,
+     attention_mask: Optional[torch.Tensor] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+
+     Explanation:
+         Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
+
+         For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
+         Examples:
+             input_ids: [T T T T T], here T is for text.
+             temporal position_ids: [0, 1, 2, 3, 4]
+             height position_ids: [0, 1, 2, 3, 4]
+             width position_ids: [0, 1, 2, 3, 4]
+
+         For vision and text embedding sequence, we calculate 3D rotary position embedding for the vision part
+         and 1D rotary position embedding for the text part.
+         Examples:
+             Temporal (Time): 3 patches, representing different segments of the video in time.
+             Height: 2 patches, dividing each frame vertically.
+             Width: 2 patches, dividing each frame horizontally.
+             We also have some important parameters:
+             fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
+             tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
+             temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
+             interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will have a difference of 50 in the temporal position IDs.
+             input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+             vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
+             vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+             vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+             text temporal position_ids: [101, 102, 103, 104, 105]
+             text height position_ids: [101, 102, 103, 104, 105]
+             text width position_ids: [101, 102, 103, 104, 105]
+             Here we calculate the text start position_ids as the max vision position_ids plus 1.
+
+     Args:
+         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+             it.
+         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+             The temporal, height and width of feature shape of each image in LLM.
+         video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+             The temporal, height and width of feature shape of each video in LLM.
+         second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
+             The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
+         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+             - 1 for tokens that are **not masked**,
+             - 0 for tokens that are **masked**.
+
+     Returns:
+         position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
+         mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
+     """
+     spatial_merge_size = config.vision_config.spatial_merge_size
+     image_token_id = config.image_token_id
+     video_token_id = config.video_token_id
+     vision_start_token_id = config.vision_start_token_id
+     mrope_position_deltas = []
+     if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
+         total_input_ids = input_ids
+         if attention_mask is None:
+             attention_mask = torch.ones_like(total_input_ids)
+         position_ids = torch.ones(
+             3,
+             input_ids.shape[0],
+             input_ids.shape[1],
+             dtype=input_ids.dtype,
+             device=input_ids.device,
+         )
+         image_index, video_index = 0, 0
+         attention_mask = attention_mask.to(total_input_ids.device)
+         for i, input_ids in enumerate(total_input_ids):
+             input_ids = input_ids[attention_mask[i] == 1]
+             image_nums, video_nums = 0, 0
+             vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
+             vision_tokens = input_ids[vision_start_indices + 1]
+             image_nums = (vision_tokens == image_token_id).sum()
+             video_nums = (vision_tokens == video_token_id).sum()
+             input_tokens = input_ids.tolist()
+             llm_pos_ids_list: list = []
+             st = 0
+             remain_images, remain_videos = image_nums, video_nums
+             for _ in range(image_nums + video_nums):
+                 if image_token_id in input_tokens and remain_images > 0:
+                     ed_image = input_tokens.index(image_token_id, st)
+                 else:
+                     ed_image = len(input_tokens) + 1
+                 if video_token_id in input_tokens and remain_videos > 0:
+                     ed_video = input_tokens.index(video_token_id, st)
+                 else:
+                     ed_video = len(input_tokens) + 1
+                 if ed_image < ed_video:
+                     t, h, w = (
+                         image_grid_thw[image_index][0],
+                         image_grid_thw[image_index][1],
+                         image_grid_thw[image_index][2],
+                     )
+                     second_per_grid_t = 0
+                     image_index += 1
+                     remain_images -= 1
+                     ed = ed_image
+
+                 else:
+                     t, h, w = (
+                         video_grid_thw[video_index][0],
+                         video_grid_thw[video_index][1],
+                         video_grid_thw[video_index][2],
+                     )
+                     if second_per_grid_ts is not None:
+                         second_per_grid_t = second_per_grid_ts[video_index]
+                     else:
+                         second_per_grid_t = 1.0
+                     video_index += 1
+                     remain_videos -= 1
+                     ed = ed_video
+                 llm_grid_t, llm_grid_h, llm_grid_w = (
+                     t.item(),
+                     h.item() // spatial_merge_size,
+                     w.item() // spatial_merge_size,
+                 )
+                 text_len = ed - st
+
+                 st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                 llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                 range_tensor = torch.arange(llm_grid_t).view(-1, 1)
+                 expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
+
+                 time_tensor = expanded_range * second_per_grid_t * config.vision_config.tokens_per_second
+
+                 time_tensor_long = time_tensor.long()
+                 t_index = time_tensor_long.flatten()
+
+                 h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                 w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+                 llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+                 st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+             if st < len(input_tokens):
+                 st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                 text_len = len(input_tokens) - st
+                 llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+             llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+             position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
+             mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
+         mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
+         return position_ids, mrope_position_deltas
+     else:
+         if attention_mask is not None:
+             position_ids = attention_mask.long().cumsum(-1) - 1
+             position_ids.masked_fill_(attention_mask == 0, 1)
+             position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+             max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
+             mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+         else:
+             position_ids = (
+                 torch.arange(input_ids.shape[1], device=input_ids.device)
+                 .view(1, 1, -1)
+                 .expand(3, input_ids.shape[0], -1)
+             )
+             mrope_position_deltas = torch.zeros(
+                 [input_ids.shape[0], 1],
+                 device=input_ids.device,
+                 dtype=input_ids.dtype,
+             )
+
+         return position_ids, mrope_position_deltas
+
+
+ def get_window_index(grid_thw, window_size=112, spatial_merge_size=2, patch_size=14):
+     spatial_merge_unit = spatial_merge_size * spatial_merge_size
+     window_index: list = []
+     cu_window_seqlens: list = [0]
+     window_index_id = 0
+     vit_merger_window_size = window_size // spatial_merge_size // patch_size
+
+     for grid_t, grid_h, grid_w in grid_thw:
+         llm_grid_h, llm_grid_w = (
+             grid_h // spatial_merge_size,
+             grid_w // spatial_merge_size,
+         )
+         index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
+         pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
+         pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+         num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+         num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+         index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
+         index_padded = index_padded.reshape(
+             grid_t,
+             num_windows_h,
+             vit_merger_window_size,
+             num_windows_w,
+             vit_merger_window_size,
+         )
+         index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+             grid_t,
+             num_windows_h * num_windows_w,
+             vit_merger_window_size,
+             vit_merger_window_size,
+         )
+         seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
+         index_padded = index_padded.reshape(-1)
+         index_new = index_padded[index_padded != -100]
+         window_index.append(index_new + window_index_id)
+         cu_seqlens_tmp = seqlens.cumsum(0) * spatial_merge_unit + cu_window_seqlens[-1]
+         cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+         window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
+     window_index = torch.cat(window_index, dim=0)
+
+     return window_index, cu_window_seqlens
+
+
+ class Qwen2_5_VisionRotaryEmbedding(nn.Module):
+     def __init__(self, dim: int, theta: float = 10000.0) -> None:
+         super().__init__()
+         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+     def forward(self, seqlen: int) -> torch.Tensor:
+         seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+         freqs = torch.outer(seq, self.inv_freq)
+         return freqs
+
+
+ def rot_pos_emb(grid_thw, spatial_merge_size=2, hidden_size=2048, num_heads=16):
+     head_dim = hidden_size // num_heads
+     rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
+     pos_ids = []
+     for t, h, w in grid_thw:
+         hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+         hpos_ids = hpos_ids.reshape(
+             h // spatial_merge_size,
+             spatial_merge_size,
+             w // spatial_merge_size,
+             spatial_merge_size,
+         )
+         hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+         hpos_ids = hpos_ids.flatten()
+
+         wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+         wpos_ids = wpos_ids.reshape(
+             h // spatial_merge_size,
+             spatial_merge_size,
+             w // spatial_merge_size,
+             spatial_merge_size,
+         )
+         wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+         wpos_ids = wpos_ids.flatten()
+         print("hpos_ids", hpos_ids.shape)
+         pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+     pos_ids = torch.cat(pos_ids, dim=0)
+     max_grid_size = grid_thw[:, 1:].max()
+     # return max_grid_size, pos_ids
+     rotary_pos_emb_full = rotary_pos_emb(max_grid_size)
+     rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+     return rotary_pos_emb
+
+
+ def rot_pos_id(grid_thw, spatial_merge_size=2):
+     pos_ids = []
+     for t, h, w in grid_thw:
+         hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+         hpos_ids = hpos_ids.reshape(
+             h // spatial_merge_size,
+             spatial_merge_size,
+             w // spatial_merge_size,
+             spatial_merge_size,
+         )
+         hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+         hpos_ids = hpos_ids.flatten()
+
+         wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+         wpos_ids = wpos_ids.reshape(
+             h // spatial_merge_size,
+             spatial_merge_size,
+             w // spatial_merge_size,
+             spatial_merge_size,
+         )
+         wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+         wpos_ids = wpos_ids.flatten()
+         pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1))
+     pos_ids = torch.cat(pos_ids, dim=0)
+
+     return pos_ids
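get_rope_index mirrors the M-RoPE indexing of the upstream Qwen2.5-VL modeling code; with no image or video grids it degenerates to ordinary 1D positions replicated across the temporal/height/width axes, which is the path infer_text.py exercises. A minimal call (the config namespace and token ids below are illustrative stand-ins, not values read from this repo):

    import types
    import torch
    from utils import get_rope_index

    cfg = types.SimpleNamespace(
        vision_config=types.SimpleNamespace(spatial_merge_size=2, tokens_per_second=25),
        image_token_id=151655, video_token_id=151656, vision_start_token_id=151652,
    )
    ids = torch.tensor([[100, 101, 102, 103, 104]])   # pure-text sequence
    pos, deltas = get_rope_index(cfg, ids)
    print(pos.shape, deltas)   # torch.Size([3, 1, 5]) tensor([[0]]) -- t/h/w ids are all [0..4]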
qwen2_5-vl-3b-image-ax650/Qwen2.5-VL-3B-Instruct_vision_nchw448.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47c6a5c75e3941c49123018f352785dbcbd028dd7d1e741a16c6453f9c9209cf
+ size 921254437
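Every .axmodel and .bin in this commit is a Git LFS pointer; after `git lfs pull`, the oid in a pointer can be checked against the downloaded payload. A small verification sketch using the pointer above:

    import hashlib

    def sha256_of(path, chunk=1 << 20):
        h = hashlib.sha256()
        with open(path, "rb") as f:
            while buf := f.read(chunk):
                h.update(buf)
        return h.hexdigest()

    path = "qwen2_5-vl-3b-image-ax650/Qwen2.5-VL-3B-Instruct_vision_nchw448.axmodel"
    print(sha256_of(path) == "47c6a5c75e3941c49123018f352785dbcbd028dd7d1e741a16c6453f9c9209cf")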
qwen2_5-vl-3b-image-ax650/model.embed_tokens.weight.bfloat16.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b84907567aa829c6f24cadbdeb68c3c44d25fc0a8be8e917fd603cb64f72810d
+ size 622329856
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l0_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:45c870e33c94182a3bba8ca328133291dec3c4946610481755a1de37b8379164
+ size 86641264
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l10_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c6b45f5df73bf8c702c0d5bab60c85e2056a7ebd0aab45a0110eb185481f7b5
+ size 86643728
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l11_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:381ce660b87875613ee91f8e82b17661642eea481229b13d066610c42eac13bf
+ size 86643728
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l12_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa11b8cf730de19f78516701401ffc242a85e2869643f1971c151b4516b3d43f
+ size 86644080
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l13_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd0f2f7993c43c841fd90be53c55474bd54910ce3cec325f9b114ac002efb612
+ size 86643888
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l14_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41525a42c3d78d04f3c6485de9415e92aeee0101895e7d76917c244ca2100811
+ size 86643440
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l15_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fabaaa6e5199467048c0c55beb8dfc60446c9511a12fcd7c725c6e8b446076b5
+ size 86643504
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l16_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1cc277bff420c4cbf5a00ad58368828650c68cfddfda95416973192444824e84
+ size 86643888
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l17_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02bea7cc7c031891238a140ec5df4176de5a23aec7a458ef33537f2f4eb01dc2
+ size 86643248
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l18_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c8130068bd6d060671b445607d1aa35e302b649471d6a88f279b2299d1eab00
+ size 86643728
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l19_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8cc7633e9d7795f46a5be0de1d884e44a7248669918f75a08911954356b2bb97
+ size 86642768
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l1_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b657f2779b85f1baf1cf448b88f6966bcb0fc22bbda7f106248f348208275c8
+ size 86637264
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l20_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f232bbcd9e2d64c3f0ae9853b3aece5a1f7756fb6e582ff66baedf2ced5b890
+ size 86642256
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l21_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:692940f18fee8844c13900d90ed9c5b85abde5d3ded81e72131db9a7fdf87c14
+ size 86643408
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l22_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67716a9cfb6733b16757ae0507e9a3599488fd78ad9fbf32358b9de9da3e9f07
+ size 86643728
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l23_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1d5bee3047692f79d708dbd297daab31d3c63dff6180c7a4072a3007fbb7eb5
+ size 86642448
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l24_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e454da2a0a0a92dfba358e99ecc8bc6eef518b84896101b15619e1f566a0eb1
+ size 86643920
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l25_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb0eb5fef9d9c1015767e3bd9d5612632c1717d761d6da7f64d6679b8b52337a
+ size 86643024
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l26_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0028b84436b3f54598798024bf9c1e404f2cead942ecdc9cebddf8c2391d2f6d
+ size 86643408
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l27_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f3275466bf43cd6444c5cb82dc937c7a1dc839d2bc8bb05c93ecf9d472c509a0
+ size 86642672
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l28_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:212e92ea3e46869a292a2c6e44c6d6c37808d93102d47f970df6d6cffe573e77
+ size 86643184
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l29_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c2f2b233469dc19cd76567978e426536907933ba67fda4250b5c716b2f3ebc1
+ size 86644080
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l2_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5d6947979aa008d6f50b72fb8d578fe7b2dfc972f1f4944578b63228cef61ad
+ size 86638576
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l30_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4d86e37806cc1f06a506a782c7c4652449c42a168ea573f68c883ddcfc64796
+ size 86643024
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l31_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:172d662877b4a9013449fefcb79ebc8f5d0a56b10871382b0c7c8ee3eebfe339
+ size 86642736
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l32_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0978b287a9500bfbbe906422b8bcc190fcdb305aff9eb1f3065e1e830aa9396d
+ size 86643440
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l33_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c455ddc93f9a1199676a1f790f85e22ebf8213836e4a70bc661f26e4cf6c66c3
+ size 86644080
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l34_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b2beeb8482259dda3b6275a6ef33fbe6ee93cf70cb9b074187193ce7c8f0d97
+ size 86644336
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l35_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9247f1a202b8ed5e007b0923640ef9977099c95130313c9ac3ef370978d9a1d
+ size 86643856
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l3_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25d616dfcbfe5b777bdf69e3ac52f0c50939934388b6fc2fec99545651c5aafe
+ size 86641392
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l4_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a7d88693f4b01807fe552f19f37b66ad011ca5a36ee241623f3749a94c1e7c7
+ size 86640688
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l5_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e2ce5e2dba9c9b0b3243f57eaf4707932e3b86fb31ff9cd05425fdefa06b0bb1
+ size 86643152
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l6_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31ca7a6025017eba34f2f2d0904458a1e221b046eed71b4255fc554317c88de0
+ size 86643696
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l7_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a57e3392728b1fd358d01484c32df85c99b23708190c40727523c814a9f6f60
+ size 86644048
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l8_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09f0291e623c33f0b0951d92fa22191b400a92d445d95d616367446f4808ee01
+ size 86644272
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l9_together.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:527546142999c59efec9c58b97a8d0cfea3d3d179d40bd8d4006db74cf39e031
+ size 86644304
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_post.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:455e7705d3bf4ebbc602476276904b76573e9094f7a1e6bde4ec782666bd95d5
+ size 339965940
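
All of the `.axmodel` entries above are Git LFS pointer stubs rather than the weights themselves: a checkout without `git lfs` contains only the three lines `version`, `oid sha256:<digest>`, and `size` (bytes). Below is a minimal Python sketch for checking a downloaded blob against such a stub; the local paths in the usage comment are hypothetical placeholders, not files guaranteed to exist in this layout.

```python
import hashlib
from pathlib import Path


def parse_lfs_pointer(pointer_path: str) -> dict:
    """Parse a Git LFS pointer stub (version / oid / size) into a dict."""
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        if not line.strip():
            continue
        key, _, value = line.partition(" ")
        fields[key] = value
    algo, _, digest = fields["oid"].partition(":")  # oid is "sha256:<hex digest>"
    return {"version": fields["version"], "algo": algo,
            "digest": digest, "size": int(fields["size"])}


def verify_lfs_object(pointer_path: str, blob_path: str) -> bool:
    """Check a downloaded blob against the oid/size recorded in its pointer."""
    meta = parse_lfs_pointer(pointer_path)
    blob = Path(blob_path)
    if blob.stat().st_size != meta["size"]:
        return False
    h = hashlib.new(meta["algo"])
    with blob.open("rb") as f:
        # Stream in 1 MiB chunks so multi-hundred-MB axmodel files are not
        # loaded into memory at once.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == meta["digest"]


# Hypothetical paths -- adjust to wherever the pointer stub and resolved blob live.
# print(verify_lfs_object(
#     "qwen2_5-vl-3b-image-ax650/qwen2_5_vl_post.axmodel",   # pointer stub
#     "downloads/qwen2_5_vl_post.axmodel"))                  # resolved blob
```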
qwen2_5-vl-tokenizer/chat_template.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+ }
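
The `chat_template` above is a Jinja2 template: it prepends a default system turn when none is given, wraps every message in `<|im_start|>…<|im_end|>`, and expands image/video entries into `<|vision_start|><|image_pad|><|vision_end|>` or `<|vision_start|><|video_pad|><|vision_end|>` placeholders. A small sketch that renders it directly with `jinja2` (in practice `transformers`' `apply_chat_template` performs this rendering); the example image path is the one shipped in this repo, and the expected output shown in the comments follows from the template text.

```python
import json
from jinja2 import Environment  # assumes Jinja2 >= 2.10 (namespace() support)

with open("qwen2_5-vl-tokenizer/chat_template.json") as f:
    template_str = json.load(f)["chat_template"]

template = Environment().from_string(template_str)

messages = [
    {"role": "user", "content": [
        {"type": "image", "image": "image/ssd_car.jpg"},
        {"type": "text", "text": "Describe this picture."},
    ]},
]

prompt = template.render(messages=messages, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <|vision_start|><|image_pad|><|vision_end|>Describe this picture.<|im_end|>
# <|im_start|>assistant
```

The `<|image_pad|>` placeholder marks where the vision encoder's output embeddings are spliced into the token sequence before the text decoder runs.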
qwen2_5-vl-tokenizer/config.json ADDED
@@ -0,0 +1,62 @@
+ {
+ "architectures": [
+ "Qwen2_5_VLForConditionalGeneration"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "vision_start_token_id": 151652,
+ "vision_end_token_id": 151653,
+ "vision_token_id": 151654,
+ "image_token_id": 151655,
+ "video_token_id": 151656,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 128000,
+ "max_window_layers": 70,
+ "model_type": "qwen2_5_vl",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 36,
+ "num_key_value_heads": 2,
+ "rms_norm_eps": 1e-06,
+ "rope_theta": 1000000.0,
+ "sliding_window": 32768,
+ "tie_word_embeddings": true,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.41.2",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vision_config": {
+ "initializer_range": 0.02,
+ "depth": 32,
+ "hidden_act": "silu",
+ "hidden_size": 1280,
+ "intermediate_size": 3420,
+ "num_heads": 16,
+ "in_chans": 3,
+ "out_hidden_size": 2048,
+ "patch_size": 14,
+ "spatial_merge_size": 2,
+ "spatial_patch_size": 14,
+ "window_size": 112,
+ "fullatt_block_indexes": [
+ 7,
+ 15,
+ 23,
+ 31
+ ],
+ "tokens_per_second": 2,
+ "temporal_patch_size": 2
+ },
+ "rope_scaling": {
+ "type": "mrope",
+ "mrope_section": [
+ 16,
+ 24,
+ 24
+ ]
+ },
+ "vocab_size": 151936
+ }
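
A few quantities follow directly from this config and line up with the repo layout: `num_hidden_layers: 36` matches the 36 per-layer `qwen2_5_vl_p320_l*_together.axmodel` files, and the vision model is exported at 448x448 (per the `_vision_nchw448` name). The sketch below derives them from the JSON; it assumes the standard ViT patch-and-merge arithmetic implied by `vision_config`, not anything specific to the AX650 runtime.

```python
import json

with open("qwen2_5-vl-tokenizer/config.json") as f:
    cfg = json.load(f)

# Attention geometry of the 36-layer text decoder (one axmodel per layer above).
head_dim = cfg["hidden_size"] // cfg["num_attention_heads"]   # 2048 / 16 = 128
kv_heads = cfg["num_key_value_heads"]                         # 2 -> grouped-query attention
print(f"{cfg['num_hidden_layers']} layers, head_dim={head_dim}, kv_heads={kv_heads}")

# Vision tokens for a 448x448 input, using patch_size=14 and spatial_merge_size=2.
vis = cfg["vision_config"]
patches_per_side = 448 // vis["patch_size"]                   # 448 / 14 = 32
merged_tokens = (patches_per_side ** 2) // (vis["spatial_merge_size"] ** 2)
print(f"{patches_per_side}x{patches_per_side} patches -> {merged_tokens} image tokens")
```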
qwen2_5-vl-tokenizer/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "bos_token_id": 151643,
+ "pad_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "repetition_penalty": 1.05,
+ "temperature": 0.1,
+ "top_p": 0.001,
+ "top_k": 1,
+ "transformers_version": "4.37.0"
+ }
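
With `top_k: 1`, `top_p: 0.001`, and `temperature: 0.1`, the `do_sample: true` setting still collapses to near-greedy decoding; only `repetition_penalty: 1.05` reshapes the logits. The sketch below is purely illustrative of what these values imply for a decoding step (it is not the on-device runtime's decoder); the vocabulary size is taken from `config.json` above.

```python
import numpy as np


def sample_next_token(logits, generated, cfg, rng=np.random.default_rng()):
    """Illustrative decoding step using the generation_config values above."""
    logits = logits.astype(np.float64).copy()

    # repetition_penalty = 1.05: damp tokens that have already been generated.
    for tok in set(generated):
        if logits[tok] > 0:
            logits[tok] /= cfg["repetition_penalty"]
        else:
            logits[tok] *= cfg["repetition_penalty"]

    # temperature = 0.1 sharpens the distribution before filtering.
    logits /= cfg["temperature"]

    # top_k = 1: keep only the k highest-scoring tokens.
    topk_idx = np.argsort(-logits)[: cfg["top_k"]]
    probs = np.exp(logits[topk_idx] - logits[topk_idx].max())
    probs /= probs.sum()

    # top_p = 0.001: keep the smallest prefix whose cumulative mass reaches
    # top_p (always at least one token). With top_k = 1 this changes nothing,
    # so the whole pipeline amounts to near-greedy decoding.
    cum = np.cumsum(probs)
    cut = int(np.searchsorted(cum, cfg["top_p"])) + 1
    keep = topk_idx[:cut]
    keep_probs = probs[:cut] / probs[:cut].sum()

    return int(rng.choice(keep, p=keep_probs))


cfg = {"repetition_penalty": 1.05, "temperature": 0.1, "top_p": 0.001, "top_k": 1}
logits = np.random.randn(151936)   # vocab_size from config.json above
print(sample_next_token(logits, generated=[], cfg=cfg))
```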