|
import torch |
|
from torch import nn |
|
import torch.nn.functional as F |
|
from typing import Any, Dict, List, Optional, Tuple, Union |
|
|
|
def get_rope_index(
    config,
    input_ids: Optional[torch.LongTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    second_per_grid_ts: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Calculate the 3D rope index based on image and video's temporal, height and width in LLM.

    Explanation:
        Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

        For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
        Examples:
            input_ids: [T T T T T], here T is for text.
            temporal position_ids: [0, 1, 2, 3, 4]
            height position_ids: [0, 1, 2, 3, 4]
            width position_ids: [0, 1, 2, 3, 4]

        For vision and text embedding sequence, we calculate 3D rotary position embedding for the vision part
        and 1D rotary position embedding for the text part.
        Examples:
            Temporal (Time): 3 patches, representing different segments of the video in time.
            Height: 2 patches, dividing each frame vertically.
            Width: 2 patches, dividing each frame horizontally.
            We also have some important parameters:
            fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each
                second.
            tokens_per_second: Dictates how many "time-steps" or "temporal tokens" are conceptually packed into
                a one-second interval of the video. In this case, we have 25 tokens per second, so each second
                of the video is represented with 25 separate time points. It defines the temporal granularity.
            temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
            interval: The step size for the temporal position IDs, calculated as
                tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that
                consecutive temporal patches differ by 50 in their temporal position IDs. (In this function the
                step is computed as `second_per_grid_ts[video] * config.vision_config.tokens_per_second`.)
            input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
            vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
            vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
            vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
            text temporal position_ids: [101, 102, 103, 104, 105]
            text height position_ids: [101, 102, 103, 104, 105]
            text width position_ids: [101, 102, 103, 104, 105]
            Here we calculate the text start position_ids as the max vision position_ids plus 1.

    Args:
        config:
            Object exposing `image_token_id`, `video_token_id`, `vision_start_token_id`,
            `vision_config.spatial_merge_size` and `vision_config.tokens_per_second`.
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Positions where `attention_mask` is 0 are
            ignored.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
            The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
            Defaults to 1.0 for every video when omitted.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

    Returns:
        position_ids (`torch.Tensor` of shape `(3, batch_size, sequence_length)`):
            Temporal, height and width position ids. Masked positions hold the value 1.
        mrope_position_deltas (`torch.Tensor` of shape `(batch_size, 1)`):
            Per sequence, `max(position_ids) + 1 - sequence_length`.

    Raises:
        ValueError: If both `input_ids` and `attention_mask` are `None`, since the output shape
            cannot be determined.
    """
    spatial_merge_size = config.vision_config.spatial_merge_size
    image_token_id = config.image_token_id
    video_token_id = config.video_token_id
    vision_start_token_id = config.vision_start_token_id

    has_vision_grid = image_grid_thw is not None or video_grid_thw is not None

    if input_ids is None or not has_vision_grid:
        # Text-only path: all three axes share the ordinary 1D positions.
        if attention_mask is not None:
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
            max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
            mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
            return position_ids, mrope_position_deltas

        if input_ids is None:
            raise ValueError("get_rope_index requires `input_ids` or `attention_mask` to be provided.")

        position_ids = (
            torch.arange(input_ids.shape[1], device=input_ids.device)
            .view(1, 1, -1)
            .expand(3, input_ids.shape[0], -1)
        )
        mrope_position_deltas = torch.zeros(
            [input_ids.shape[0], 1],
            device=input_ids.device,
            dtype=input_ids.dtype,
        )
        return position_ids, mrope_position_deltas

    # Vision + text path.
    batch_size, seq_len = input_ids.shape[0], input_ids.shape[1]
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)
    attention_mask = attention_mask.to(input_ids.device)

    # Masked positions are never overwritten below and therefore keep the value 1.
    position_ids = torch.ones(
        3,
        batch_size,
        seq_len,
        dtype=input_ids.dtype,
        device=input_ids.device,
    )

    mrope_position_deltas = []
    # Grid rows are consumed in order across the whole batch, so these persist between rows.
    image_index, video_index = 0, 0

    for i, full_row in enumerate(input_ids):
        keep = attention_mask[i] == 1
        row_ids = full_row[keep]

        # The token right after each vision-start marker tells whether an image or a video follows.
        vision_start_indices = torch.argwhere(row_ids == vision_start_token_id).squeeze(1)
        vision_tokens = row_ids[vision_start_indices + 1]
        remain_images = int((vision_tokens == image_token_id).sum())
        remain_videos = int((vision_tokens == video_token_id).sum())

        input_tokens = row_ids.tolist()
        # Sentinel larger than any valid index, meaning "no more of this modality".
        not_found = len(input_tokens) + 1
        llm_pos_ids_list: list = []
        st = 0

        for _ in range(remain_images + remain_videos):
            if image_token_id in input_tokens and remain_images > 0:
                ed_image = input_tokens.index(image_token_id, st)
            else:
                ed_image = not_found
            if video_token_id in input_tokens and remain_videos > 0:
                ed_video = input_tokens.index(video_token_id, st)
            else:
                ed_video = not_found

            # Whichever modality appears first from `st` is processed next.
            if ed_image < ed_video:
                t, h, w = (
                    image_grid_thw[image_index][0],
                    image_grid_thw[image_index][1],
                    image_grid_thw[image_index][2],
                )
                # Images have no temporal extent: every temporal index collapses to 0.
                second_per_grid_t = 0
                image_index += 1
                remain_images -= 1
                ed = ed_image
            else:
                t, h, w = (
                    video_grid_thw[video_index][0],
                    video_grid_thw[video_index][1],
                    video_grid_thw[video_index][2],
                )
                if second_per_grid_ts is not None:
                    second_per_grid_t = second_per_grid_ts[video_index]
                else:
                    second_per_grid_t = 1.0
                video_index += 1
                remain_videos -= 1
                ed = ed_video

            llm_grid_t = int(t)
            llm_grid_h = int(h) // spatial_merge_size
            llm_grid_w = int(w) // spatial_merge_size
            text_len = ed - st

            # Text preceding this vision segment: plain 1D positions shared by all three axes.
            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

            range_tensor = torch.arange(llm_grid_t).view(-1, 1)
            expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)

            # `second_per_grid_t` may be a tensor living on another device than the
            # freshly created range; bring it alongside before multiplying.
            second_per_grid_t = torch.as_tensor(second_per_grid_t, device=range_tensor.device)
            time_tensor = expanded_range * second_per_grid_t * config.vision_config.tokens_per_second
            t_index = time_tensor.long().flatten()

            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
            llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
            st = ed + llm_grid_t * llm_grid_h * llm_grid_w

        # Trailing text after the last vision segment (or the whole row if it has none).
        if st < len(input_tokens):
            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
            text_len = len(input_tokens) - st
            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
        position_ids[..., i, keep] = llm_positions.to(position_ids.device)
        # The delta is taken against the full (padded) row length.
        mrope_position_deltas.append(int(llm_positions.max()) + 1 - seq_len)

    mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
    return position_ids, mrope_position_deltas
|
|
|
|
|
|
|
def get_window_index(grid_thw, window_size=112, spatial_merge_size=2, patch_size=14):
    """
    Compute the token permutation that groups merged vision patches into attention windows.

    Each grid is reduced to `(t, h // spatial_merge_size, w // spatial_merge_size)` merged
    positions, which are tiled into square windows whose side, in merged positions, is
    `window_size // spatial_merge_size // patch_size`.

    Args:
        grid_thw: Iterable of `(t, h, w)` triples, either a tensor of shape `(num_grids, 3)`
            or a nested sequence of integers.
        window_size: Window side used to derive the merged-position window side (see above).
        spatial_merge_size: Number of patches merged along each spatial axis.
        patch_size: Patch side used to derive the merged-position window side (see above).

    Returns:
        window_index: 1-D `torch.long` tensor listing merged-position indices window by
            window; indices of later grids are offset by the number of positions in
            earlier grids. Empty when `grid_thw` has no rows.
        cu_window_seqlens: Python list of cumulative window lengths, starting at 0 and
            scaled by `spatial_merge_size ** 2`. Padding is always added (a full extra
            window when a side is already divisible), so the list contains repeated
            values for windows that hold no real position.
    """
    spatial_merge_unit = spatial_merge_size * spatial_merge_size
    vit_merger_window_size = window_size // spatial_merge_size // patch_size

    # Work on plain Python ints so tensors and nested lists are handled alike.
    grids = grid_thw.tolist() if isinstance(grid_thw, torch.Tensor) else grid_thw

    window_index: list = []
    cu_window_seqlens: list = [0]
    window_index_id = 0

    for grid_t, grid_h, grid_w in grids:
        grid_t, grid_h, grid_w = int(grid_t), int(grid_h), int(grid_w)
        llm_grid_h = grid_h // spatial_merge_size
        llm_grid_w = grid_w // spatial_merge_size
        num_positions = grid_t * llm_grid_h * llm_grid_w

        index = torch.arange(num_positions).reshape(grid_t, llm_grid_h, llm_grid_w)

        # Pad amount is in 1..window side: an already divisible side still gains one window.
        pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
        pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
        num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
        num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size

        # -100 marks padded cells so they can be dropped after the window reshuffle.
        index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
        index_padded = index_padded.reshape(
            grid_t,
            num_windows_h,
            vit_merger_window_size,
            num_windows_w,
            vit_merger_window_size,
        )
        index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
            grid_t,
            num_windows_h * num_windows_w,
            vit_merger_window_size,
            vit_merger_window_size,
        )

        valid = index_padded != -100
        seqlens = valid.sum([2, 3]).reshape(-1)
        index_new = index_padded.reshape(-1)[valid.reshape(-1)]

        window_index.append(index_new + window_index_id)
        cu_seqlens_tmp = seqlens.cumsum(0) * spatial_merge_unit + cu_window_seqlens[-1]
        cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
        window_index_id += num_positions

    if not window_index:
        # torch.cat rejects an empty list; return an empty index instead of crashing.
        return torch.empty(0, dtype=torch.long), cu_window_seqlens

    return torch.cat(window_index, dim=0), cu_window_seqlens
|
|
|
class Qwen2_5_VisionRotaryEmbedding(nn.Module):
    """Produce rotary angle tables (position times inverse frequency) for the vision tower."""

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        """
        Args:
            dim: Rotary dimension; one inverse frequency is created per even index in `[0, dim)`.
            theta: Base of the geometric frequency progression.
        """
        super().__init__()
        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
        inverse_frequencies = 1.0 / (theta ** exponents)
        # Non-persistent: the buffer follows the module across devices but is not saved in state_dict.
        self.register_buffer("inv_freq", inverse_frequencies, persistent=False)

    def forward(self, seqlen: int) -> torch.Tensor:
        """Return a `(seqlen, len(inv_freq))` tensor whose entry `[p, k]` is `p * inv_freq[k]`."""
        positions = torch.arange(
            seqlen,
            device=self.inv_freq.device,
            dtype=self.inv_freq.dtype,
        )
        return torch.outer(positions, self.inv_freq)
|
|
|
def rot_pos_emb(grid_thw, spatial_merge_size=2, hidden_size=2048, num_heads=16):
    """
    Build the rotary angle table for every vision patch described by `grid_thw`.

    For each `(t, h, w)` grid, the (row, column) coordinate of every patch is listed so
    that the patches of one `spatial_merge_size x spatial_merge_size` group are
    contiguous; that listing is repeated `t` times. The coordinates then index a
    `Qwen2_5_VisionRotaryEmbedding` table built for the largest height/width present.

    Args:
        grid_thw: Tensor (or nested sequence) of shape `(num_grids, 3)` holding `(t, h, w)`.
            `h` and `w` must be divisible by `spatial_merge_size`.
        spatial_merge_size: Side of the patch group that is kept contiguous.
        hidden_size: Used together with `num_heads` to derive the per-head size.
        num_heads: Number of attention heads.

    Returns:
        Tensor with one row per patch (`sum(t * h * w)` rows) containing the row-coordinate
        angles followed by the column-coordinate angles.
    """
    head_dim = hidden_size // num_heads
    rotary_embedding = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)

    grid_thw = torch.as_tensor(grid_thw)

    pos_ids = []
    for t, h, w in grid_thw.tolist():
        t, h, w = int(t), int(h), int(w)
        merged_h = h // spatial_merge_size
        merged_w = w // spatial_merge_size

        # Row coordinate of every patch, regrouped so each merge block is contiguous.
        hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
        hpos_ids = hpos_ids.reshape(merged_h, spatial_merge_size, merged_w, spatial_merge_size)
        hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten()

        # Column coordinate, regrouped the same way.
        wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
        wpos_ids = wpos_ids.reshape(merged_h, spatial_merge_size, merged_w, spatial_merge_size)
        wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten()

        # Every temporal slice shares the same spatial coordinates.
        pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
    pos_ids = torch.cat(pos_ids, dim=0)

    # The table only needs to cover the largest height/width seen in any grid.
    max_grid_size = int(grid_thw[:, 1:].max())
    rotary_pos_emb_full = rotary_embedding(max_grid_size)
    return rotary_pos_emb_full[pos_ids].flatten(1)
|
|
|
def rot_pos_id(grid_thw, spatial_merge_size=2):
    """
    List the (row, column) coordinate of every vision patch described by `grid_thw`.

    For each `(t, h, w)` grid the coordinates are ordered so that the patches of one
    `spatial_merge_size x spatial_merge_size` group are contiguous, and the listing is
    repeated `t` times so there is one row per patch of every temporal slice (matching
    what `rot_pos_emb` does before looking up its rotary table).

    Args:
        grid_thw: Tensor of shape `(num_grids, 3)` or nested sequence of `(t, h, w)` triples.
            `h` and `w` must be divisible by `spatial_merge_size`.
        spatial_merge_size: Side of the patch group that is kept contiguous.

    Returns:
        `torch.long` tensor of shape `(sum(t * h * w), 2)` with columns `(row, column)`.
        Has shape `(0, 2)` when `grid_thw` has no rows.
    """
    grids = grid_thw.tolist() if isinstance(grid_thw, torch.Tensor) else grid_thw

    pos_ids = []
    for t, h, w in grids:
        t, h, w = int(t), int(h), int(w)
        merged_h = h // spatial_merge_size
        merged_w = w // spatial_merge_size

        # Row coordinate of every patch, regrouped so each merge block is contiguous.
        hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
        hpos_ids = hpos_ids.reshape(merged_h, spatial_merge_size, merged_w, spatial_merge_size)
        hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten()

        # Column coordinate, regrouped the same way.
        wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
        wpos_ids = wpos_ids.reshape(merged_h, spatial_merge_size, merged_w, spatial_merge_size)
        wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten()

        # One copy per temporal slice; previously `t` was ignored, which left
        # multi-frame grids with h * w rows instead of t * h * w.
        pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))

    if not pos_ids:
        # torch.cat rejects an empty list; return an empty coordinate table instead.
        return torch.empty((0, 2), dtype=torch.long)

    return torch.cat(pos_ids, dim=0)