Initial commit of this repo
- config.json +0 -0
- image/ssd_car.jpg +0 -0
- python/cv_resize.py +13 -0
- python/infer_image.py +249 -0
- python/infer_text.py +237 -0
- python/infer_video.py +252 -0
- python/preprocess.py +155 -0
- python/utils.py +296 -0
- qwen2_5-vl-3b-image-ax650/Qwen2.5-VL-3B-Instruct_vision_nchw448.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/model.embed_tokens.weight.bfloat16.bin +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l0_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l10_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l11_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l12_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l13_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l14_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l15_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l16_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l17_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l18_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l19_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l1_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l20_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l21_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l22_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l23_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l24_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l25_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l26_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l27_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l28_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l29_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l2_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l30_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l31_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l32_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l33_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l34_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l35_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l3_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l4_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l5_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l6_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l7_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l8_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l9_together.axmodel +3 -0
- qwen2_5-vl-3b-image-ax650/qwen2_5_vl_post.axmodel +3 -0
- qwen2_5-vl-tokenizer/chat_template.json +3 -0
- qwen2_5-vl-tokenizer/config.json +62 -0
- qwen2_5-vl-tokenizer/generation_config.json +14 -0
config.json
ADDED
File without changes
image/ssd_car.jpg
ADDED
python/cv_resize.py
ADDED
@@ -0,0 +1,13 @@
import cv2
from glob import glob
import os

paths = sorted(glob("demo/*.jpg"))
print(paths)
outdir = "demo_cv308"
os.makedirs(outdir, exist_ok=True)

for p in paths:
    img = cv2.imread(p)
    img = cv2.resize(img, (308, 308))
    cv2.imwrite(f"{outdir}/{os.path.basename(p)}", img)
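Note: the script above reads frames from a demo/ directory that is not shipped in this repo. A minimal sketch of one way to produce such frames with OpenCV, assuming a hypothetical local file demo.mp4 and roughly one frame per second to match the fps: 1.0 setting used later in python/infer_video.py:

import cv2
import os

os.makedirs("demo", exist_ok=True)
cap = cv2.VideoCapture("demo.mp4")           # hypothetical input video
src_fps = cap.get(cv2.CAP_PROP_FPS) or 30    # fall back to 30 if the container reports 0
step = max(int(round(src_fps / 1.0)), 1)     # keep roughly one frame per second

idx = saved = 0
while True:
    ok, frame = cap.read()
    if not ok:
        break
    if idx % step == 0:
        cv2.imwrite(f"demo/{saved:04d}.jpg", frame)
        saved += 1
    idx += 1
cap.release()
print(f"saved {saved} frames to demo/")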
python/infer_image.py
ADDED
@@ -0,0 +1,249 @@
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from ml_dtypes import bfloat16
from axengine import InferenceSession
from PIL import Image
from torchvision import transforms
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
import torch
from transformers import AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import onnxruntime
import gc
from glob import glob
from utils import get_rope_index
from transformers.image_utils import PILImageResampling
from preprocess import Qwen2VLImageProcessorExport

def post_process(data, topk=1, topp=0.001, temperature=0.1):
    def top_p(l: np.ndarray, p: float) -> np.ndarray:
        index = np.argsort(l)
        res = l.copy()
        sum_p = 0
        for i in index[::-1]:
            if sum_p >= p:
                res[i] = 0
            sum_p += res[i]
        return res / sum_p

    def softmax(l: np.ndarray) -> np.ndarray:
        l_max = l - l.max()
        l_exp = np.exp(l_max)
        res = l_exp / np.sum(l_exp)
        return res.astype(np.float64)

    r = data.astype(np.float32)
    r = r.flatten()
    # topk
    candidate_index = np.argpartition(r, -topk)[-topk:]
    candidate_value = r[candidate_index]
    # temperature
    candidate_value /= temperature
    # softmax
    candidate_soft = softmax(candidate_value)
    # topp
    candidate_soft = top_p(candidate_soft, topp)
    candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()
    pos = np.random.multinomial(1, candidate_soft).argmax()
    next_token = candidate_index[pos]
    return next_token, candidate_index, candidate_soft



if __name__ == "__main__":

    prefill_len = 320

    checkpoint_dir = f"../Qwen2.5-VL-3B-Instruct-AX650-prefill_320/"
    cfg = AutoConfig.from_pretrained(
        checkpoint_dir, trust_remote_code=True
    )

    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint_dir, trust_remote_code=True
    )

    processor = AutoProcessor.from_pretrained(checkpoint_dir)

    path = "demo1.jpg"
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    # "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
                    # "image": "demo.jpg"
                    "image": path,
                    "max_pixels": 448 * 448,
                },
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    position_ids, _ = get_rope_index(cfg, inputs["input_ids"], image_grid_thw=inputs['image_grid_thw'])

    # pixel_values = inputs['pixel_values_videos']
    # print("pixel_values", pixel_values.shape)
    # extract img feature by vit
    vit_session = InferenceSession.load_from_model(f'{checkpoint_dir}/Qwen2.5-VL-3B-Instruct_vision_nchw448.axmodel')


    image = Image.open(path)
    image = image.resize((448, 448))
    img_processor = Qwen2VLImageProcessorExport(max_pixels=448*448, patch_size=14, temporal_patch_size=2, merge_size=2)
    pixel_values, grid_thw = img_processor._preprocess(image, do_resize=True, resample=PILImageResampling.BICUBIC,
                                                       do_rescale=False, do_normalize=False,
                                                       do_convert_rgb=True)

    # seq_len, dim = pixel_values.shape
    # ht = pixel_values.reshape(t, seq_len//t, dim)
    print("pixel_values.shape", pixel_values.shape)
    t, seq_len, _, _ = pixel_values.shape
    ht = pixel_values
    vit_output = []
    for i in range(t):
        out = vit_session.run({"hidden_states": ht[i]})[0]
        vit_output.append(out.astype(bfloat16))

    del vit_session
    gc.collect()

    vit_output = np.concatenate(vit_output, axis=0)
    vit_output = vit_output[None, :, :]

    print("vit feature extract done!")

    token_ids = inputs['input_ids'].squeeze().numpy().tolist()

    image_start_index = np.where(np.array(token_ids) == 151652)[0].tolist()[0]
    image_insert_index = image_start_index + 1
    embeds = np.load(f"{checkpoint_dir}/model.embed_tokens.weight.npy")
    prefill_data = np.take(embeds, token_ids, axis=0)
    prefill_data = prefill_data.astype(bfloat16)

    prefill_data[image_insert_index : image_insert_index + vit_output.shape[1]] = vit_output[0, :, :]
    token_len = len(token_ids)


    lastN = 1023
    kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
    k_caches = [
        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
        for _ in range(cfg.num_hidden_layers)
    ]
    v_caches = [
        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
        for _ in range(cfg.num_hidden_layers)
    ]

    prefill_decoder_sessins = []
    for i in range(cfg.num_hidden_layers):
        session = InferenceSession.load_from_model(
            f"{checkpoint_dir}/qwen2_5_vl_p{prefill_len}_l{i}_together.axmodel"
        )
        prefill_decoder_sessins.append(session)
    post_process_session = InferenceSession.load_from_model(
        f"{checkpoint_dir}/qwen2_5_vl_post.axmodel"
        # "../Qwen2.5-VL-3B-Instruct-AX650-video-prefill_512/qwen2_5_vl_post.axmodel"
    )
    print("model load done!")

    """
    prefill
    """

    for i in range(cfg.num_hidden_layers):
        prefill_decoder_sessins[i].set_runtime_context(group_id=1)

    if prefill_len > 0:
        indices = np.zeros((3, prefill_len), dtype=np.uint32)

        indices[:, 0:token_len] = position_ids.squeeze(1).numpy().astype(np.uint32)

        mask = np.zeros((1, prefill_len, prefill_len)) - 65536
        data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)

        data[:, 0:token_len] = prefill_data
        for i, t in enumerate(token_ids):
            mask[:, i, : i + 1] = 0
        mask = mask.astype(bfloat16)
        for i in range(cfg.num_hidden_layers):
            input_feed = {
                "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
                "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
                "indices": indices,
                "input": data,
                "mask": mask,
            }
            outputs = prefill_decoder_sessins[i].run(input_feed)
            k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
            v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
            data = outputs[2][:, :token_len, :]

    post_out = post_process_session.run({"input": data[:, token_len - 1, :]})[0]
    next_token, posssible_tokens, possible_soft = post_process(post_out, topk=1)
    posibles = [tokenizer.decode([t]) for t in posssible_tokens]
    posible_soft = [str((t, s)) for t, s in zip(posibles, possible_soft)]
    token_ids.append(next_token)
    print("prefill done!")

    # set to decoder
    for i in range(cfg.num_hidden_layers):
        prefill_decoder_sessins[i].set_runtime_context(group_id=0)

    # lastN = np.max(indices)
    start_ids = np.max(indices) + 1
    mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
    mask[:, :, :lastN] -= 65536
    mask[:, :, :token_len] = 0
    for start_indice in range(lastN + 1):
        if prefill_len > 0 and start_indice < token_len:
            continue
        next_token = token_ids[start_indice]
        indices = np.array([start_ids], np.uint32).reshape((1, 1))
        start_ids += 1
        data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16)

        for i in range(cfg.num_hidden_layers):
            input_feed = {
                "K_cache": k_caches[i],
                "V_cache": v_caches[i],
                "indices": indices,
                "input": data,
                "mask": mask,
            }
            outputs = prefill_decoder_sessins[i].run(input_feed)
            k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
            v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
            data = outputs[2]
        mask[..., start_indice] = 0
        if start_indice < token_len - 1:
            pass
        else:
            post_out = post_process_session.run({"input": data})[0]
            next_token, posssible_tokens, possible_soft = post_process(post_out)
            token_ids.append(next_token)
            if next_token == tokenizer.eos_token_id:
                # print("hit eos!")
                break
    print(tokenizer.decode(token_ids[token_len:]))
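A quick sanity check of the sampler defined at the top of this file: with topk=1 the top-k / top-p / temperature pipeline reduces to greedy decoding, so post_process should simply return the argmax of the logits. A minimal sketch, assuming post_process has been pasted into an interactive session (the toy logits below are made up):

import numpy as np

logits = np.array([0.1, 2.5, -1.0, 0.3, 1.9, 0.0], dtype=np.float32)  # toy 6-token vocabulary
token, cand_idx, cand_p = post_process(logits, topk=1, topp=0.001, temperature=0.1)
print(token)             # 1 -> index of the largest logit, i.e. greedy decoding
print(cand_idx, cand_p)  # [1] [1.]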
python/infer_text.py
ADDED
@@ -0,0 +1,237 @@
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from ml_dtypes import bfloat16
from axengine import InferenceSession
from PIL import Image
from torchvision import transforms
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
import torch
from transformers import AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import onnxruntime
import gc
from glob import glob
from utils import get_rope_index


def post_process(data, topk=1, topp=0.9, temperature=0.6):
    def top_p(l: np.ndarray, p: float) -> np.ndarray:
        index = np.argsort(l)
        res = l.copy()
        sum_p = 0
        for i in index[::-1]:
            if sum_p >= p:
                res[i] = 0
            sum_p += res[i]
        return res / sum_p

    def softmax(l: np.ndarray) -> np.ndarray:
        l_max = l - l.max()
        l_exp = np.exp(l_max)
        res = l_exp / np.sum(l_exp)
        return res.astype(np.float64)

    r = data.astype(np.float32)
    r = r.flatten()
    # topk
    candidate_index = np.argpartition(r, -topk)[-topk:]
    candidate_value = r[candidate_index]
    # temperature
    candidate_value /= temperature
    # softmax
    candidate_soft = softmax(candidate_value)
    # topp
    candidate_soft = top_p(candidate_soft, topp)
    candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()
    pos = np.random.multinomial(1, candidate_soft).argmax()
    next_token = candidate_index[pos]
    return next_token, candidate_index, candidate_soft



if __name__ == "__main__":

    prefill_len = 512

    checkpoint_dir = f"../Qwen2.5-VL-3B-Instruct-AX650-video-prefill_512/"
    cfg = AutoConfig.from_pretrained(
        checkpoint_dir, trust_remote_code=True
    )

    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint_dir, trust_remote_code=True
    )


    processor = AutoProcessor.from_pretrained(checkpoint_dir)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "你是谁"},  # "Who are you?"
            ]
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    position_ids, _ = get_rope_index(cfg, inputs["input_ids"])

    # pixel_values = inputs['pixel_values_videos']

    # # extract img feature by vit
    # vit_session = InferenceSession.load_from_model(f'{checkpoint_dir}/Qwen2.5-VL-3B-Instruct_vision.axmodel')

    # t = inputs['video_grid_thw'][0,0]

    # seq_len, dim = pixel_values.shape
    # ht = pixel_values.reshape(t, seq_len//t, dim)
    # vit_output = []
    # for i in range(t):
    #     print(i)
    #     out = vit_session.run({"hidden_states": ht[i].numpy()})[0]  # (1, 576, 1176)
    #     vit_output.append(out.astype(bfloat16))

    # del vit_session
    # gc.collect()

    # vit_output = np.concatenate(vit_output, axis=0)
    # # vit_output = np.load("vit_out.npy")
    # np.save("vit_out_ax.npy", vit_output)
    # vit_output = vit_output[None,:,:]


    # print("vit feature extract done!")

    token_ids = inputs['input_ids'].squeeze().numpy().tolist()

    # image_start_index = np.where(np.array(token_ids) == 151652)[0].tolist()[0]
    # image_insert_index = image_start_index + 1
    embeds = np.load(f"{checkpoint_dir}/model.embed_tokens.weight.npy")
    prefill_data = np.take(embeds, token_ids, axis=0)
    prefill_data = prefill_data.astype(bfloat16)

    # prefill_data[ image_insert_index : image_insert_index + vit_output.shape[1]] = vit_output[0, :, :]
    token_len = len(token_ids)


    lastN = 1023
    kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
    k_caches = [
        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
        for _ in range(cfg.num_hidden_layers)
    ]
    v_caches = [
        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
        for _ in range(cfg.num_hidden_layers)
    ]

    prefill_decoder_sessins = []
    for i in range(cfg.num_hidden_layers):
        session = InferenceSession.load_from_model(
            f"{checkpoint_dir}/qwen2_5_vl_p{prefill_len}_l{i}_together.axmodel"
        )
        prefill_decoder_sessins.append(session)
    post_process_session = InferenceSession.load_from_model(
        f"{checkpoint_dir}/qwen2_5_vl_post.axmodel"
        # "../Qwen2.5-VL-3B-Instruct-AX650-video-prefill_512/qwen2_5_vl_post.axmodel"
    )
    print("model load done!")

    """
    prefill
    """
    print("position_ids", position_ids)

    for i in range(cfg.num_hidden_layers):
        prefill_decoder_sessins[i].set_runtime_context(group_id=1)

    if prefill_len > 0:
        indices = np.zeros((3, prefill_len), dtype=np.uint32)

        indices[:, 0:token_len] = position_ids.squeeze(1).numpy().astype(np.uint32)

        mask = np.zeros((1, prefill_len, prefill_len)) - 65536
        data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)

        data[:, 0:token_len] = prefill_data
        for i, t in enumerate(token_ids):
            mask[:, i, : i + 1] = 0
        mask = mask.astype(bfloat16)
        for i in range(cfg.num_hidden_layers):
            input_feed = {
                "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
                "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
                "indices": indices,
                "input": data,
                "mask": mask,
            }
            outputs = prefill_decoder_sessins[i].run(input_feed)
            k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
            v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
            data = outputs[2][:, :token_len, :]

    post_out = post_process_session.run({"input": data[:, token_len - 1, :]})[0]
    next_token, posssible_tokens, possible_soft = post_process(post_out, topk=1)
    posibles = [tokenizer.decode([t]) for t in posssible_tokens]
    posible_soft = [str((t, s)) for t, s in zip(posibles, possible_soft)]
    token_ids.append(next_token)
    print("prefill done!")

    # set to decoder
    for i in range(cfg.num_hidden_layers):
        prefill_decoder_sessins[i].set_runtime_context(group_id=0)

    # lastN = np.max(indices)
    start_ids = np.max(indices) + 1
    mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
    mask[:, :, :lastN] -= 65536
    mask[:, :, :token_len] = 0
    for start_indice in range(lastN + 1):
        if prefill_len > 0 and start_indice < token_len:
            continue
        next_token = token_ids[start_indice]
        indices = np.array([start_ids], np.uint32).reshape((1, 1))
        start_ids += 1
        data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16)

        for i in range(cfg.num_hidden_layers):
            input_feed = {
                "K_cache": k_caches[i],
                "V_cache": v_caches[i],
                "indices": indices,
                "input": data,
                "mask": mask,
            }
            outputs = prefill_decoder_sessins[i].run(input_feed)
            k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
            v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
            data = outputs[2]
        mask[..., start_indice] = 0
        if start_indice < token_len - 1:
            pass
        else:
            post_out = post_process_session.run({"input": data})[0]
            next_token, posssible_tokens, possible_soft = post_process(post_out)
            print("next_token", next_token)
            token_ids.append(next_token)
            if next_token == tokenizer.eos_token_id:
                # print("hit eos!")
                break
    print(tokenizer.decode(token_ids[token_len:]))
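The prefill attention mask built above is an additive (1, prefill_len, prefill_len) tensor: allowed positions hold 0, everything else holds -65536, and only the first token_len rows (the real prompt tokens) are opened up causally. A standalone sketch with made-up sizes:

import numpy as np

prefill_len, token_len = 8, 5            # toy sizes; the script above uses 512 and the real prompt length
mask = np.zeros((1, prefill_len, prefill_len)) - 65536
for i in range(token_len):
    mask[:, i, : i + 1] = 0              # row i may attend to positions 0..i (causal)

# each real row attends to one more position than the previous one
print((mask[0, :token_len] == 0).sum(axis=-1))   # [1 2 3 4 5]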
python/infer_video.py
ADDED
@@ -0,0 +1,252 @@
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from ml_dtypes import bfloat16
from axengine import InferenceSession
from PIL import Image
from torchvision import transforms
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
import torch
from transformers import AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import onnxruntime
import gc
from glob import glob
from utils import get_rope_index
from transformers.image_utils import PILImageResampling
from preprocess import Qwen2VLImageProcessorExport

def post_process(data, topk=1, topp=0.001, temperature=0.1):
    def top_p(l: np.ndarray, p: float) -> np.ndarray:
        index = np.argsort(l)
        res = l.copy()
        sum_p = 0
        for i in index[::-1]:
            if sum_p >= p:
                res[i] = 0
            sum_p += res[i]
        return res / sum_p

    def softmax(l: np.ndarray) -> np.ndarray:
        l_max = l - l.max()
        l_exp = np.exp(l_max)
        res = l_exp / np.sum(l_exp)
        return res.astype(np.float64)

    r = data.astype(np.float32)
    r = r.flatten()
    # topk
    candidate_index = np.argpartition(r, -topk)[-topk:]
    candidate_value = r[candidate_index]
    # temperature
    candidate_value /= temperature
    # softmax
    candidate_soft = softmax(candidate_value)
    # topp
    candidate_soft = top_p(candidate_soft, topp)
    candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()
    pos = np.random.multinomial(1, candidate_soft).argmax()
    next_token = candidate_index[pos]
    return next_token, candidate_index, candidate_soft



if __name__ == "__main__":

    prefill_len = 512

    checkpoint_dir = f"../Qwen2.5-VL-3B-Instruct-AX650-video-prefill_512/"
    cfg = AutoConfig.from_pretrained(
        checkpoint_dir, trust_remote_code=True
    )

    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint_dir, trust_remote_code=True
    )

    processor = AutoProcessor.from_pretrained(checkpoint_dir)
    paths = sorted(glob("demo_cv308/*.jpg"))
    print(paths)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": paths,
                    "max_pixels": 308 * 308,
                    "fps": 1.0,
                },
                {"type": "text", "text": "描述一下这个视频的内容"},  # "Describe the content of this video."
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    position_ids, _ = get_rope_index(cfg, inputs["input_ids"], video_grid_thw=inputs['video_grid_thw'], second_per_grid_ts=inputs['second_per_grid_ts'])

    # pixel_values = inputs['pixel_values_videos']
    # print("pixel_values", pixel_values.shape)
    # extract img feature by vit
    vit_session = InferenceSession.load_from_model(f'{checkpoint_dir}/Qwen2.5-VL-3B-Instruct_vision_nhwc.axmodel')

    t = inputs['video_grid_thw'][0, 0]

    images = []
    for p in paths:
        img = Image.open(p)
        images.append(img)

    img_processor = Qwen2VLImageProcessorExport(max_pixels=308*308, patch_size=14, temporal_patch_size=2, merge_size=2)
    pixel_values, grid_thw = img_processor._preprocess(images, do_resize=True, resample=PILImageResampling.BICUBIC,
                                                       do_rescale=False, do_normalize=False,
                                                       do_convert_rgb=True)

    # seq_len, dim = pixel_values.shape
    # ht = pixel_values.reshape(t, seq_len//t, dim)
    print("pixel_values.shape", pixel_values.shape)
    t, seq_len, _, _ = pixel_values.shape
    ht = pixel_values
    vit_output = []
    for i in range(t):
        out = vit_session.run({"hidden_states": ht[i]})[0]  # (1, 576, 1176)
        vit_output.append(out.astype(bfloat16))

    del vit_session
    gc.collect()

    vit_output = np.concatenate(vit_output, axis=0)
    vit_output = vit_output[None, :, :]

    print("vit feature extract done!")

    token_ids = inputs['input_ids'].squeeze().numpy().tolist()

    image_start_index = np.where(np.array(token_ids) == 151652)[0].tolist()[0]
    image_insert_index = image_start_index + 1
    embeds = np.load(f"{checkpoint_dir}/model.embed_tokens.weight.npy")
    prefill_data = np.take(embeds, token_ids, axis=0)
    prefill_data = prefill_data.astype(bfloat16)

    prefill_data[image_insert_index : image_insert_index + vit_output.shape[1]] = vit_output[0, :, :]
    token_len = len(token_ids)


    lastN = 1023
    kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
    k_caches = [
        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
        for _ in range(cfg.num_hidden_layers)
    ]
    v_caches = [
        np.zeros((1, lastN, kv_dim), dtype=bfloat16)
        for _ in range(cfg.num_hidden_layers)
    ]

    prefill_decoder_sessins = []
    for i in range(cfg.num_hidden_layers):
        session = InferenceSession.load_from_model(
            f"{checkpoint_dir}/qwen2_5_vl_p{prefill_len}_l{i}_together.axmodel"
        )
        prefill_decoder_sessins.append(session)
    post_process_session = InferenceSession.load_from_model(
        f"{checkpoint_dir}/qwen2_5_vl_post.axmodel"
        # "../Qwen2.5-VL-3B-Instruct-AX650-video-prefill_512/qwen2_5_vl_post.axmodel"
    )
    print("model load done!")

    """
    prefill
    """

    for i in range(cfg.num_hidden_layers):
        prefill_decoder_sessins[i].set_runtime_context(group_id=1)

    if prefill_len > 0:
        indices = np.zeros((3, prefill_len), dtype=np.uint32)

        indices[:, 0:token_len] = position_ids.squeeze(1).numpy().astype(np.uint32)

        mask = np.zeros((1, prefill_len, prefill_len)) - 65536
        data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)

        data[:, 0:token_len] = prefill_data
        for i, t in enumerate(token_ids):
            mask[:, i, : i + 1] = 0
        mask = mask.astype(bfloat16)
        for i in range(cfg.num_hidden_layers):
            input_feed = {
                "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
                "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
                "indices": indices,
                "input": data,
                "mask": mask,
            }
            outputs = prefill_decoder_sessins[i].run(input_feed)
            k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
            v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
            data = outputs[2][:, :token_len, :]

    post_out = post_process_session.run({"input": data[:, token_len - 1, :]})[0]
    next_token, posssible_tokens, possible_soft = post_process(post_out, topk=1)
    posibles = [tokenizer.decode([t]) for t in posssible_tokens]
    posible_soft = [str((t, s)) for t, s in zip(posibles, possible_soft)]
    token_ids.append(next_token)
    print("prefill done!")

    # set to decoder
    for i in range(cfg.num_hidden_layers):
        prefill_decoder_sessins[i].set_runtime_context(group_id=0)

    # lastN = np.max(indices)
    start_ids = np.max(indices) + 1
    mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
    mask[:, :, :lastN] -= 65536
    mask[:, :, :token_len] = 0
    for start_indice in range(lastN + 1):
        if prefill_len > 0 and start_indice < token_len:
            continue
        next_token = token_ids[start_indice]
        indices = np.array([start_ids], np.uint32).reshape((1, 1))
        start_ids += 1
        data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16)

        for i in range(cfg.num_hidden_layers):
            input_feed = {
                "K_cache": k_caches[i],
                "V_cache": v_caches[i],
                "indices": indices,
                "input": data,
                "mask": mask,
            }
            outputs = prefill_decoder_sessins[i].run(input_feed)
            k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
            v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
            data = outputs[2]
        mask[..., start_indice] = 0
        if start_indice < token_len - 1:
            pass
        else:
            post_out = post_process_session.run({"input": data})[0]
            next_token, posssible_tokens, possible_soft = post_process(post_out)
            token_ids.append(next_token)
            if next_token == tokenizer.eos_token_id:
                # print("hit eos!")
                break
    print(tokenizer.decode(token_ids[token_len:]))
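For the 308x308 frames produced by python/cv_resize.py, the video grid this script ends up with can be predicted by hand from the patch_size=14, merge_size=2 and temporal_patch_size=2 settings above. A small arithmetic sketch, assuming eight input frames (the frame count is just an example):

patch_size, merge_size, temporal_patch_size = 14, 2, 2
num_frames, side = 8, 308                      # e.g. eight 308x308 frames from demo_cv308/

grid_t = num_frames // temporal_patch_size     # frames are consumed two at a time -> 4
grid_h = grid_w = side // patch_size           # 308 / 14 -> 22
llm_tokens_per_t = (grid_h // merge_size) * (grid_w // merge_size)  # 11 * 11 -> 121

print(grid_t, grid_h, grid_w)                  # 4 22 22
print(grid_t * llm_tokens_per_t)               # 484 video tokens seen by the language model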
python/preprocess.py
ADDED
@@ -0,0 +1,155 @@
from typing import Dict, List, Optional, Union
from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor, smart_resize
from transformers.image_transforms import (
    convert_to_rgb,
    resize,
    to_channel_dimension_format,
)
from transformers.image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    VideoInput,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_batched_videos,
    make_flat_list_of_images,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
)

from transformers.utils import TensorType, logging
import numpy as np

logger = logging.get_logger(__name__)

class Qwen2VLImageProcessorExport(Qwen2VLImageProcessor):

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = resize(
                    image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
                )

            if do_rescale:
                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)

            if do_normalize:
                image = self.normalize(
                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
            processed_images.append(image)

        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0)
            patches = np.concatenate([patches, repeats], axis=0)
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
        patches = patches.reshape(
            grid_t,                        # 0
            self.temporal_patch_size,      # 1
            channel,                       # 2
            grid_h // self.merge_size,     # 3
            self.merge_size,               # 4
            self.patch_size,               # 5
            grid_w // self.merge_size,     # 6
            self.merge_size,               # 7
            self.patch_size,               # 8
        )
        # patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
        # flatten_patches = patches.reshape(
        #     grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
        # )

        patches = patches.transpose(0, 3, 6, 4, 7, 1, 5, 8, 2)
        flatten_patches = patches.reshape(
            grid_t, grid_h * grid_w, self.temporal_patch_size * self.patch_size * self.patch_size, channel
        )
        return flatten_patches, (grid_t, grid_h, grid_w)
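A rough shape check for the export-friendly layout returned by _preprocess above, assuming a blank 448x448 RGB test image and the same constructor arguments that python/infer_image.py passes (run from the python/ directory so the local preprocess module is importable):

from PIL import Image
from transformers.image_utils import PILImageResampling
from preprocess import Qwen2VLImageProcessorExport

img = Image.new("RGB", (448, 448))  # blank test image, stands in for a real photo
proc = Qwen2VLImageProcessorExport(max_pixels=448 * 448, patch_size=14,
                                   temporal_patch_size=2, merge_size=2)
patches, (grid_t, grid_h, grid_w) = proc._preprocess(
    img, do_resize=True, resample=PILImageResampling.BICUBIC,
    do_rescale=False, do_normalize=False, do_convert_rgb=True)

print(grid_t, grid_h, grid_w)  # 1 32 32 for a 448x448 input (448 / 14 = 32)
print(patches.shape)           # (1, 1024, 392, 3) = (grid_t, grid_h*grid_w,
                               #  temporal_patch_size*patch_size*patch_size, channels)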
python/utils.py
ADDED
@@ -0,0 +1,296 @@
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
5 |
+
|
6 |
+
def get_rope_index(
|
7 |
+
config,
|
8 |
+
input_ids: Optional[torch.LongTensor] = None,
|
9 |
+
image_grid_thw: Optional[torch.LongTensor] = None,
|
10 |
+
video_grid_thw: Optional[torch.LongTensor] = None,
|
11 |
+
second_per_grid_ts: Optional[torch.Tensor] = None,
|
12 |
+
attention_mask: Optional[torch.Tensor] = None,
|
13 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
14 |
+
"""
|
15 |
+
Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
16 |
+
|
17 |
+
Explanation:
|
18 |
+
Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
19 |
+
|
20 |
+
For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
|
21 |
+
Examples:
|
22 |
+
input_ids: [T T T T T], here T is for text.
|
23 |
+
temporal position_ids: [0, 1, 2, 3, 4]
|
24 |
+
height position_ids: [0, 1, 2, 3, 4]
|
25 |
+
width position_ids: [0, 1, 2, 3, 4]
|
26 |
+
|
27 |
+
For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
28 |
+
and 1D rotary position embeddin for text part.
|
29 |
+
Examples:
|
30 |
+
Temporal (Time): 3 patches, representing different segments of the video in time.
|
31 |
+
Height: 2 patches, dividing each frame vertically.
|
32 |
+
Width: 2 patches, dividing each frame horizontally.
|
33 |
+
We also have some important parameters:
|
34 |
+
fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
|
35 |
+
tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
|
36 |
+
temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
|
37 |
+
interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will be have a difference of 50 in the temporal position IDs.
|
38 |
+
input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
39 |
+
vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
|
40 |
+
vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
41 |
+
vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
42 |
+
text temporal position_ids: [101, 102, 103, 104, 105]
|
43 |
+
text height position_ids: [101, 102, 103, 104, 105]
|
44 |
+
text width position_ids: [101, 102, 103, 104, 105]
|
45 |
+
Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
46 |
+
|
47 |
+
Args:
|
48 |
+
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
49 |
+
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
50 |
+
it.
|
51 |
+
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
|
52 |
+
The temporal, height and width of feature shape of each image in LLM.
|
53 |
+
video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
|
54 |
+
The temporal, height and width of feature shape of each video in LLM.
|
55 |
+
second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
|
56 |
+
The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
|
57 |
+
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
58 |
+
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
59 |
+
|
60 |
+
- 1 for tokens that are **not masked**,
|
61 |
+
- 0 for tokens that are **masked**.
|
62 |
+
|
63 |
+
Returns:
|
64 |
+
position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
|
65 |
+
mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
|
66 |
+
"""
|
67 |
+
spatial_merge_size = config.vision_config.spatial_merge_size
|
68 |
+
image_token_id = config.image_token_id
|
69 |
+
video_token_id = config.video_token_id
|
70 |
+
vision_start_token_id = config.vision_start_token_id
|
71 |
+
mrope_position_deltas = []
|
72 |
+
if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
|
73 |
+
total_input_ids = input_ids
|
74 |
+
if attention_mask is None:
|
75 |
+
attention_mask = torch.ones_like(total_input_ids)
|
76 |
+
position_ids = torch.ones(
|
77 |
+
3,
|
78 |
+
input_ids.shape[0],
|
79 |
+
input_ids.shape[1],
|
80 |
+
dtype=input_ids.dtype,
|
81 |
+
device=input_ids.device,
|
82 |
+
)
|
83 |
+
image_index, video_index = 0, 0
|
84 |
+
attention_mask = attention_mask.to(total_input_ids.device)
|
85 |
+
for i, input_ids in enumerate(total_input_ids):
|
86 |
+
input_ids = input_ids[attention_mask[i] == 1]
|
87 |
+
image_nums, video_nums = 0, 0
|
88 |
+
vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
|
89 |
+
vision_tokens = input_ids[vision_start_indices + 1]
|
90 |
+
image_nums = (vision_tokens == image_token_id).sum()
|
91 |
+
video_nums = (vision_tokens == video_token_id).sum()
|
92 |
+
input_tokens = input_ids.tolist()
|
93 |
+
llm_pos_ids_list: list = []
|
94 |
+
st = 0
|
95 |
+
remain_images, remain_videos = image_nums, video_nums
|
96 |
+
for _ in range(image_nums + video_nums):
|
97 |
+
if image_token_id in input_tokens and remain_images > 0:
|
98 |
+
ed_image = input_tokens.index(image_token_id, st)
|
99 |
+
else:
|
100 |
+
ed_image = len(input_tokens) + 1
|
101 |
+
if video_token_id in input_tokens and remain_videos > 0:
|
102 |
+
ed_video = input_tokens.index(video_token_id, st)
|
103 |
+
else:
|
104 |
+
ed_video = len(input_tokens) + 1
|
105 |
+
if ed_image < ed_video:
|
106 |
+
t, h, w = (
|
107 |
+
image_grid_thw[image_index][0],
|
108 |
+
image_grid_thw[image_index][1],
|
109 |
+
image_grid_thw[image_index][2],
|
110 |
+
)
|
111 |
+
second_per_grid_t = 0
|
112 |
+
image_index += 1
|
113 |
+
remain_images -= 1
|
114 |
+
ed = ed_image
|
115 |
+
|
116 |
+
else:
|
117 |
+
t, h, w = (
|
118 |
+
video_grid_thw[video_index][0],
|
119 |
+
video_grid_thw[video_index][1],
|
120 |
+
video_grid_thw[video_index][2],
|
121 |
+
)
|
122 |
+
if second_per_grid_ts is not None:
|
123 |
+
second_per_grid_t = second_per_grid_ts[video_index]
|
124 |
+
else:
|
125 |
+
second_per_grid_t = 1.0
|
126 |
+
video_index += 1
|
127 |
+
remain_videos -= 1
|
128 |
+
ed = ed_video
|
129 |
+
llm_grid_t, llm_grid_h, llm_grid_w = (
|
130 |
+
t.item(),
|
131 |
+
h.item() // spatial_merge_size,
|
132 |
+
w.item() // spatial_merge_size,
|
133 |
+
)
|
134 |
+
text_len = ed - st
|
135 |
+
|
136 |
+
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
|
137 |
+
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
|
138 |
+
|
139 |
+
range_tensor = torch.arange(llm_grid_t).view(-1, 1)
|
140 |
+
expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
|
141 |
+
|
142 |
+
time_tensor = expanded_range * second_per_grid_t * config.vision_config.tokens_per_second
|
143 |
+
|
144 |
+
time_tensor_long = time_tensor.long()
|
145 |
+
t_index = time_tensor_long.flatten()
|
146 |
+
|
147 |
+
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
|
148 |
+
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
|
149 |
+
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
|
150 |
+
st = ed + llm_grid_t * llm_grid_h * llm_grid_w
|
151 |
+
|
152 |
+
if st < len(input_tokens):
|
153 |
+
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
|
154 |
+
text_len = len(input_tokens) - st
|
155 |
+
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
|
156 |
+
|
157 |
+
llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
|
158 |
+
position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
|
159 |
+
mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
|
160 |
+
mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
|
161 |
+
return position_ids, mrope_position_deltas
|
162 |
+
else:
|
163 |
+
if attention_mask is not None:
|
164 |
+
position_ids = attention_mask.long().cumsum(-1) - 1
|
165 |
+
position_ids.masked_fill_(attention_mask == 0, 1)
|
166 |
+
position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
|
167 |
+
max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
|
168 |
+
mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
|
169 |
+
else:
|
170 |
+
position_ids = (
|
171 |
+
torch.arange(input_ids.shape[1], device=input_ids.device)
|
172 |
+
.view(1, 1, -1)
|
173 |
+
.expand(3, input_ids.shape[0], -1)
|
174 |
+
)
|
175 |
+
mrope_position_deltas = torch.zeros(
|
176 |
+
[input_ids.shape[0], 1],
|
177 |
+
device=input_ids.device,
|
178 |
+
dtype=input_ids.dtype,
|
179 |
+
)
|
180 |
+
|
181 |
+
return position_ids, mrope_position_deltas
|
182 |
+
|
183 |
+
|
184 |
+
|
185 |
+
def get_window_index(grid_thw, window_size=112,spatial_merge_size=2,patch_size=14):
|
186 |
+
spatial_merge_unit = spatial_merge_size * spatial_merge_size
|
187 |
+
window_index: list = []
|
188 |
+
cu_window_seqlens: list = [0]
|
189 |
+
window_index_id = 0
|
190 |
+
vit_merger_window_size = window_size // spatial_merge_size // patch_size
|
191 |
+
|
192 |
+
for grid_t, grid_h, grid_w in grid_thw:
|
193 |
+
llm_grid_h, llm_grid_w = (
|
194 |
+
grid_h // spatial_merge_size,
|
195 |
+
grid_w // spatial_merge_size,
|
196 |
+
)
|
197 |
+
index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
|
198 |
+
pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
|
199 |
+
pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
|
200 |
+
num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
|
201 |
+
num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
|
202 |
+
index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
|
203 |
+
index_padded = index_padded.reshape(
|
204 |
+
grid_t,
|
205 |
+
num_windows_h,
|
206 |
+
vit_merger_window_size,
|
207 |
+
num_windows_w,
|
208 |
+
vit_merger_window_size,
|
209 |
+
)
|
210 |
+
index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
|
211 |
+
grid_t,
|
212 |
+
num_windows_h * num_windows_w,
|
213 |
+
vit_merger_window_size,
|
214 |
+
vit_merger_window_size,
|
215 |
+
)
|
216 |
+
seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
|
217 |
+
index_padded = index_padded.reshape(-1)
|
218 |
+
index_new = index_padded[index_padded != -100]
|
219 |
+
window_index.append(index_new + window_index_id)
|
220 |
+
cu_seqlens_tmp = seqlens.cumsum(0) * spatial_merge_unit + cu_window_seqlens[-1]
|
221 |
+
cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
|
222 |
+
window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
|
223 |
+
window_index = torch.cat(window_index, dim=0)
|
224 |
+
|
225 |
+
return window_index, cu_window_seqlens
|
226 |
+
|
227 |
+
class Qwen2_5_VisionRotaryEmbedding(nn.Module):
|
228 |
+
def __init__(self, dim: int, theta: float = 10000.0) -> None:
|
229 |
+
super().__init__()
|
230 |
+
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
|
231 |
+
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
232 |
+
|
233 |
+
def forward(self, seqlen: int) -> torch.Tensor:
|
234 |
+
seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
|
235 |
+
freqs = torch.outer(seq, self.inv_freq)
|
236 |
+
return freqs
|
237 |
+
|
238 |
+
+def rot_pos_emb(grid_thw, spatial_merge_size=2, hidden_size=2048, num_heads=16):
+    head_dim = hidden_size // num_heads
+    rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
+    pos_ids = []
+    for t, h, w in grid_thw:
+        hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+        hpos_ids = hpos_ids.reshape(
+            h // spatial_merge_size,
+            spatial_merge_size,
+            w // spatial_merge_size,
+            spatial_merge_size,
+        )
+        hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+        hpos_ids = hpos_ids.flatten()
+
+        wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+        wpos_ids = wpos_ids.reshape(
+            h // spatial_merge_size,
+            spatial_merge_size,
+            w // spatial_merge_size,
+            spatial_merge_size,
+        )
+        wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+        wpos_ids = wpos_ids.flatten()
+        print("hpos_ids", hpos_ids.shape)
+        pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+    pos_ids = torch.cat(pos_ids, dim=0)
+    max_grid_size = grid_thw[:, 1:].max()
+    # return max_grid_size, pos_ids
+    rotary_pos_emb_full = rotary_pos_emb(max_grid_size)
+    rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+    return rotary_pos_emb
+
+def rot_pos_id(grid_thw, spatial_merge_size=2):
+    pos_ids = []
+    for t, h, w in grid_thw:
+        hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+        hpos_ids = hpos_ids.reshape(
+            h // spatial_merge_size,
+            spatial_merge_size,
+            w // spatial_merge_size,
+            spatial_merge_size,
+        )
+        hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+        hpos_ids = hpos_ids.flatten()
+
+        wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+        wpos_ids = wpos_ids.reshape(
+            h // spatial_merge_size,
+            spatial_merge_size,
+            w // spatial_merge_size,
+            spatial_merge_size,
+        )
+        wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+        wpos_ids = wpos_ids.flatten()
+        pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1))
+    pos_ids = torch.cat(pos_ids, dim=0)
+
+    return pos_ids
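
Note: the helpers above implement the window partitioning and 2-D rotary position indexing used by the Qwen2.5-VL vision encoder. The following is a rough, non-authoritative sketch of how they fit together; the single 448x448 image grid is an illustrative assumption, not a value taken from this repo, and the snippet assumes it runs in the same module as the functions above (so torch, torch.nn and torch.nn.functional are already imported there).

import torch

# Hypothetical grid for one 448x448 image with patch_size=14: 448 / 14 = 32 patches per side,
# so grid_thw is (t, h, w) = (1, 32, 32) in patch units.
grid_thw = torch.tensor([[1, 32, 32]])

# Reorder flattened patches into local attention windows
# (window_size=112 / spatial_merge_size=2 / patch_size=14 -> 4 merged cells, i.e. 8x8 raw patches per window).
window_index, cu_window_seqlens = get_window_index(grid_thw)

# Per-patch (h, w) rotary embeddings for the vision attention blocks.
rotary_pos_emb = rot_pos_emb(grid_thw)
print(window_index.shape, rotary_pos_emb.shape)
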
qwen2_5-vl-3b-image-ax650/Qwen2.5-VL-3B-Instruct_vision_nchw448.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:47c6a5c75e3941c49123018f352785dbcbd028dd7d1e741a16c6453f9c9209cf
+size 921254437

qwen2_5-vl-3b-image-ax650/model.embed_tokens.weight.bfloat16.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b84907567aa829c6f24cadbdeb68c3c44d25fc0a8be8e917fd603cb64f72810d
+size 622329856
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l0_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45c870e33c94182a3bba8ca328133291dec3c4946610481755a1de37b8379164
+size 86641264

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l10_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c6b45f5df73bf8c702c0d5bab60c85e2056a7ebd0aab45a0110eb185481f7b5
+size 86643728

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l11_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:381ce660b87875613ee91f8e82b17661642eea481229b13d066610c42eac13bf
+size 86643728

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l12_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa11b8cf730de19f78516701401ffc242a85e2869643f1971c151b4516b3d43f
+size 86644080

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l13_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd0f2f7993c43c841fd90be53c55474bd54910ce3cec325f9b114ac002efb612
+size 86643888

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l14_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41525a42c3d78d04f3c6485de9415e92aeee0101895e7d76917c244ca2100811
+size 86643440

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l15_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fabaaa6e5199467048c0c55beb8dfc60446c9511a12fcd7c725c6e8b446076b5
+size 86643504

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l16_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cc277bff420c4cbf5a00ad58368828650c68cfddfda95416973192444824e84
+size 86643888

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l17_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02bea7cc7c031891238a140ec5df4176de5a23aec7a458ef33537f2f4eb01dc2
+size 86643248

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l18_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c8130068bd6d060671b445607d1aa35e302b649471d6a88f279b2299d1eab00
+size 86643728

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l19_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cc7633e9d7795f46a5be0de1d884e44a7248669918f75a08911954356b2bb97
+size 86642768

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l1_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b657f2779b85f1baf1cf448b88f6966bcb0fc22bbda7f106248f348208275c8
+size 86637264
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l20_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f232bbcd9e2d64c3f0ae9853b3aece5a1f7756fb6e582ff66baedf2ced5b890
+size 86642256

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l21_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:692940f18fee8844c13900d90ed9c5b85abde5d3ded81e72131db9a7fdf87c14
+size 86643408

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l22_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67716a9cfb6733b16757ae0507e9a3599488fd78ad9fbf32358b9de9da3e9f07
+size 86643728

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l23_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1d5bee3047692f79d708dbd297daab31d3c63dff6180c7a4072a3007fbb7eb5
+size 86642448

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l24_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e454da2a0a0a92dfba358e99ecc8bc6eef518b84896101b15619e1f566a0eb1
+size 86643920

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l25_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb0eb5fef9d9c1015767e3bd9d5612632c1717d761d6da7f64d6679b8b52337a
+size 86643024

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l26_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0028b84436b3f54598798024bf9c1e404f2cead942ecdc9cebddf8c2391d2f6d
+size 86643408

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l27_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3275466bf43cd6444c5cb82dc937c7a1dc839d2bc8bb05c93ecf9d472c509a0
+size 86642672

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l28_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:212e92ea3e46869a292a2c6e44c6d6c37808d93102d47f970df6d6cffe573e77
+size 86643184

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l29_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c2f2b233469dc19cd76567978e426536907933ba67fda4250b5c716b2f3ebc1
+size 86644080

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l2_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5d6947979aa008d6f50b72fb8d578fe7b2dfc972f1f4944578b63228cef61ad
+size 86638576
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l30_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4d86e37806cc1f06a506a782c7c4652449c42a168ea573f68c883ddcfc64796
+size 86643024

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l31_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:172d662877b4a9013449fefcb79ebc8f5d0a56b10871382b0c7c8ee3eebfe339
+size 86642736

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l32_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0978b287a9500bfbbe906422b8bcc190fcdb305aff9eb1f3065e1e830aa9396d
+size 86643440

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l33_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c455ddc93f9a1199676a1f790f85e22ebf8213836e4a70bc661f26e4cf6c66c3
+size 86644080

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l34_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b2beeb8482259dda3b6275a6ef33fbe6ee93cf70cb9b074187193ce7c8f0d97
+size 86644336

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l35_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9247f1a202b8ed5e007b0923640ef9977099c95130313c9ac3ef370978d9a1d
+size 86643856

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l3_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25d616dfcbfe5b777bdf69e3ac52f0c50939934388b6fc2fec99545651c5aafe
+size 86641392

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l4_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a7d88693f4b01807fe552f19f37b66ad011ca5a36ee241623f3749a94c1e7c7
+size 86640688

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l5_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2ce5e2dba9c9b0b3243f57eaf4707932e3b86fb31ff9cd05425fdefa06b0bb1
+size 86643152

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l6_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31ca7a6025017eba34f2f2d0904458a1e221b046eed71b4255fc554317c88de0
+size 86643696

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l7_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a57e3392728b1fd358d01484c32df85c99b23708190c40727523c814a9f6f60
+size 86644048

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l8_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09f0291e623c33f0b0951d92fa22191b400a92d445d95d616367446f4808ee01
+size 86644272

qwen2_5-vl-3b-image-ax650/qwen2_5_vl_p320_l9_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:527546142999c59efec9c58b97a8d0cfea3d3d179d40bd8d4006db74cf39e031
+size 86644304
qwen2_5-vl-3b-image-ax650/qwen2_5_vl_post.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:455e7705d3bf4ebbc602476276904b76573e9094f7a1e6bde4ec782666bd95d5
+size 339965940
qwen2_5-vl-tokenizer/chat_template.json
ADDED
@@ -0,0 +1,3 @@
+{
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}
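
Note: the chat template above wraps image and video content in <|vision_start|>...<|vision_end|> placeholders and prepends a default system message when none is given. A hedged sketch of rendering a one-image prompt with it follows; it assumes the qwen2_5-vl-tokenizer folder also contains the standard Qwen2.5 tokenizer files, which fall outside this 50-file view.

import json
from transformers import AutoTokenizer

# Load the tokenizer and the chat template shipped above, then render a prompt string.
tokenizer = AutoTokenizer.from_pretrained("qwen2_5-vl-tokenizer")
with open("qwen2_5-vl-tokenizer/chat_template.json") as f:
    chat_template = json.load(f)["chat_template"]

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]},
]
prompt = tokenizer.apply_chat_template(
    messages, chat_template=chat_template, tokenize=False, add_generation_prompt=True
)
# The rendered prompt should contain "<|vision_start|><|image_pad|><|vision_end|>Describe this image."
print(prompt)
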
qwen2_5-vl-tokenizer/config.json
ADDED
@@ -0,0 +1,62 @@
+{
+  "architectures": [
+    "Qwen2_5_VLForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "vision_start_token_id": 151652,
+  "vision_end_token_id": 151653,
+  "vision_token_id": 151654,
+  "image_token_id": 151655,
+  "video_token_id": 151656,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 128000,
+  "max_window_layers": 70,
+  "model_type": "qwen2_5_vl",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.41.2",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vision_config": {
+    "initializer_range": 0.02,
+    "depth": 32,
+    "hidden_act": "silu",
+    "hidden_size": 1280,
+    "intermediate_size": 3420,
+    "num_heads": 16,
+    "in_chans": 3,
+    "out_hidden_size": 2048,
+    "patch_size": 14,
+    "spatial_merge_size": 2,
+    "spatial_patch_size": 14,
+    "window_size": 112,
+    "fullatt_block_indexes": [
+      7,
+      15,
+      23,
+      31
+    ],
+    "tokens_per_second": 2,
+    "temporal_patch_size": 2
+  },
+  "rope_scaling": {
+    "type": "mrope",
+    "mrope_section": [
+      16,
+      24,
+      24
+    ]
+  },
+  "vocab_size": 151936
+}
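
Note: one easy-to-miss relationship in this config is that the m-RoPE sections (16, 24, 24) for the temporal, height, and width axes are defined over half of the per-head dimension, so they should sum to hidden_size / num_attention_heads / 2 = 64. A small, hedged sanity-check sketch (the file path is assumed to be relative to the repo root):

import json

# Check that the t/h/w m-RoPE sections together cover half of the rotary head dimension.
with open("qwen2_5-vl-tokenizer/config.json") as f:
    cfg = json.load(f)

head_dim = cfg["hidden_size"] // cfg["num_attention_heads"]        # 2048 // 16 = 128
assert sum(cfg["rope_scaling"]["mrope_section"]) == head_dim // 2  # 16 + 24 + 24 = 64
print("head_dim:", head_dim, "mrope_section:", cfg["rope_scaling"]["mrope_section"])
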
qwen2_5-vl-tokenizer/generation_config.json
ADDED
@@ -0,0 +1,14 @@
+{
+  "bos_token_id": 151643,
+  "pad_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "repetition_penalty": 1.05,
+  "temperature": 0.1,
+  "top_p": 0.001,
+  "top_k": 1,
+  "transformers_version": "4.37.0"
+}
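
Note: with top_k set to 1 the sampler always keeps only the most probable token, so decoding is effectively greedy even though do_sample is true; temperature 0.1 and top_p 0.001 reinforce that, while repetition_penalty 1.05 mildly discourages loops. A hedged sketch of reusing these defaults with transformers follows; the model and inputs objects are placeholders, not code shipped in this listing.

from transformers import GenerationConfig

# Load the decoding defaults shipped above; with top_k=1 this amounts to near-greedy decoding.
gen_cfg = GenerationConfig.from_pretrained("qwen2_5-vl-tokenizer")
print(gen_cfg.top_k, gen_cfg.temperature, gen_cfg.top_p, gen_cfg.repetition_penalty)

# Placeholder usage (model and inputs are assumed to exist elsewhere):
# output_ids = model.generate(**inputs, generation_config=gen_cfg)
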