"""Qwen2Audio model configuration""" |
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
from transformers.utils import logging |
|
from transformers import CONFIG_MAPPING |
|
|
|
import os |
|
from typing import Union |
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
class Qwen2SeamlessEncoderConfig(PretrainedConfig): |
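    r"""
    Configuration for the SeamlessM4T-v2-style speech encoder used as the audio
    tower of the Qwen2 multimodal model. Defaults match the values below.

    Example (illustrative; the module name is taken from the `auto_map` entries
    in `Qwen2MMConfig`):

    ```python
    >>> from configuration_qwen2_seamless import Qwen2SeamlessEncoderConfig

    >>> # Initializing an encoder configuration with the default values
    >>> configuration = Qwen2SeamlessEncoderConfig()
    >>> configuration.speech_encoder_layers
    24
    ```
    """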
    model_type = "qwen2_seamless_encoder"

    def __init__(
        self,
        speech_encoder_layers=24,
        speech_encoder_attention_heads=16,
        speech_encoder_intermediate_size=4096,
        speech_encoder_hidden_act="swish",
        speech_encoder_dropout=0.0,
        add_adapter=True,
        speech_encoder_layerdrop=0.1,
        feature_projection_input_dim=160,
        adaptor_kernel_size=8,
        adaptor_stride=8,
        adaptor_dropout=0.1,
        num_adapter_layers=1,
        position_embeddings_type="relative_key",
        conv_depthwise_kernel_size=31,
        left_max_position_embeddings=64,
        right_max_position_embeddings=8,
        speech_encoder_chunk_size=20000,
        speech_encoder_left_chunk_num=128,
        audio_path="/mnt/diskhd/Backup/DownloadModel/seamless-m4t-v2-large/",
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.speech_encoder_layers = speech_encoder_layers
        self.speech_encoder_hidden_act = speech_encoder_hidden_act
        self.speech_encoder_dropout = speech_encoder_dropout
        self.speech_encoder_attention_heads = speech_encoder_attention_heads
        self.speech_encoder_layerdrop = speech_encoder_layerdrop
        self.speech_encoder_intermediate_size = speech_encoder_intermediate_size
        self.feature_projection_input_dim = feature_projection_input_dim
        self.adaptor_kernel_size = adaptor_kernel_size
        self.adaptor_stride = adaptor_stride
        self.adaptor_dropout = adaptor_dropout
        self.num_adapter_layers = num_adapter_layers
        self.position_embeddings_type = position_embeddings_type
        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
        self.add_adapter = add_adapter
        self.left_max_position_embeddings = left_max_position_embeddings
        self.right_max_position_embeddings = right_max_position_embeddings
        self.speech_encoder_chunk_size = speech_encoder_chunk_size
        self.speech_encoder_left_chunk_num = speech_encoder_left_chunk_num
        # Path to the SeamlessM4T-v2 checkpoint providing the speech-encoder
        # weights; overridable via the `audio_path` argument or a saved config.
        self.audio_path = audio_path


class Qwen2VLVisionConfig(PretrainedConfig): |
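    r"""
    Configuration for the Qwen2-VL vision tower. Defaults match the values below.

    Example (illustrative; module name as in the `auto_map` entries of
    `Qwen2MMConfig`):

    ```python
    >>> from configuration_qwen2_seamless import Qwen2VLVisionConfig

    >>> # Initializing a vision configuration with the default values
    >>> configuration = Qwen2VLVisionConfig()
    >>> configuration.patch_size
    14
    ```
    """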
    model_type = "qwen2_vl"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        hidden_size=3584,
        hidden_act="quick_gelu",
        mlp_ratio=4,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # When the checkpoint holds a full composite Qwen2MMConfig, extract the
        # nested vision config; a standalone vision config is used as-is.
        if config_dict.get("model_type") == "qwen2_mm":
            config_dict = config_dict["vision_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class Qwen2MMConfig(PretrainedConfig): |
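    r"""
    Composite configuration tying together the Qwen2 language model, the
    Qwen2-VL vision tower, and the Seamless speech encoder. The text-model
    defaults match the values below; `vision_config` and `audio_config` each
    accept a dict, a config object, or `None` (sub-config defaults).

    Example (illustrative):

    ```python
    >>> from configuration_qwen2_seamless import Qwen2MMConfig

    >>> # Initializing the full multimodal configuration with default values
    >>> configuration = Qwen2MMConfig()
    >>> configuration.vision_config.model_type
    'qwen2_vl'
    ```
    """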
    model_type = "qwen2_mm"
    is_composition = False

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        audio_config=None,
        vision_config=None,
        rope_scaling=None,
        llm_path="/mnt/diskhd/Backup/DownloadModel/Qwen2.5-3B-Instruct/",
        **kwargs,
    ):
        # Each sub-config may arrive as a dict, a ready-made config object,
        # or None (defaults).
        if isinstance(vision_config, dict):
            self.vision_config = Qwen2VLVisionConfig(**vision_config)
        elif vision_config is None:
            self.vision_config = Qwen2VLVisionConfig()
        else:
            self.vision_config = vision_config

        if isinstance(audio_config, dict):
            self.audio_config = Qwen2SeamlessEncoderConfig(**audio_config)
        elif audio_config is None:
            self.audio_config = Qwen2SeamlessEncoderConfig()
        else:
            self.audio_config = audio_config

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # Fall back to multi-head attention when no key/value head count is given.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        # Path to the Qwen2.5 LLM checkpoint providing the language-model
        # weights; overridable via the `llm_path` argument or a saved config.
        self.llm_path = llm_path
        self.auto_map = {
            "AutoConfig": "configuration_qwen2_seamless.Qwen2MMConfig",
            "AutoModel": "modeling_qwen2_seamless.Qwen2SeamlessForConditionalGeneration",
        }
        self.rope_scaling = rope_scaling

        # `tie_word_embeddings` is owned by the base class and must be forwarded,
        # otherwise the argument is silently dropped.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
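

# A minimal usage sketch (illustrative, not part of the public API): build the
# composite config from defaults and override one nested field via a dict.
if __name__ == "__main__":
    config = Qwen2MMConfig(vision_config={"depth": 16})
    print(config.model_type)               # qwen2_mm
    print(config.vision_config.depth)      # 16
    print(config.audio_config.model_type)  # qwen2_seamless_encoder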