travis-xia committed
Commit 3080ac4 · verified · 1 Parent(s): 2586bbc

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +3 -0
  2. checkpoint-15200/model-00001-of-00004.safetensors +3 -0
  3. checkpoint-15200/model-00002-of-00004.safetensors +3 -0
  4. checkpoint-15200/model-00003-of-00004.safetensors +3 -0
  5. checkpoint-15200/model-00004-of-00004.safetensors +3 -0
  6. checkpoint-15200/modeling_qwen2_mla.py +218 -0
  7. checkpoint-15200/rng_state_0.pth +3 -0
  8. checkpoint-15200/rng_state_1.pth +3 -0
  9. checkpoint-15200/rng_state_10.pth +3 -0
  10. checkpoint-15200/rng_state_11.pth +3 -0
  11. checkpoint-15200/rng_state_12.pth +3 -0
  12. checkpoint-15200/rng_state_13.pth +3 -0
  13. checkpoint-15200/rng_state_14.pth +3 -0
  14. checkpoint-15200/rng_state_15.pth +3 -0
  15. checkpoint-15200/rng_state_2.pth +3 -0
  16. checkpoint-15200/rng_state_3.pth +3 -0
  17. checkpoint-15200/rng_state_4.pth +3 -0
  18. checkpoint-15200/rng_state_5.pth +3 -0
  19. checkpoint-15200/rng_state_6.pth +3 -0
  20. checkpoint-15200/rng_state_7.pth +3 -0
  21. checkpoint-15200/rng_state_8.pth +3 -0
  22. checkpoint-15200/rng_state_9.pth +3 -0
  23. checkpoint-15200/special_tokens_map.json +31 -0
  24. checkpoint-15200/tokenizer.json +3 -0
  25. checkpoint-15200/training_args.bin +3 -0
  26. checkpoint-15386/configuration_qwen2_mla.py +24 -0
  27. checkpoint-15386/merges.txt +0 -0
  28. checkpoint-15386/model-00001-of-00004.safetensors +3 -0
  29. checkpoint-15386/model-00002-of-00004.safetensors +3 -0
  30. checkpoint-15386/model-00003-of-00004.safetensors +3 -0
  31. checkpoint-15386/model-00004-of-00004.safetensors +3 -0
  32. checkpoint-15386/rng_state_0.pth +3 -0
  33. checkpoint-15386/rng_state_1.pth +3 -0
  34. checkpoint-15386/rng_state_10.pth +3 -0
  35. checkpoint-15386/rng_state_11.pth +3 -0
  36. checkpoint-15386/rng_state_12.pth +3 -0
  37. checkpoint-15386/rng_state_13.pth +3 -0
  38. checkpoint-15386/rng_state_14.pth +3 -0
  39. checkpoint-15386/rng_state_15.pth +3 -0
  40. checkpoint-15386/rng_state_2.pth +3 -0
  41. checkpoint-15386/rng_state_3.pth +3 -0
  42. checkpoint-15386/rng_state_4.pth +3 -0
  43. checkpoint-15386/rng_state_5.pth +3 -0
  44. checkpoint-15386/rng_state_6.pth +3 -0
  45. checkpoint-15386/rng_state_7.pth +3 -0
  46. checkpoint-15386/rng_state_8.pth +3 -0
  47. checkpoint-15386/rng_state_9.pth +3 -0
  48. checkpoint-15386/tokenizer.json +3 -0
  49. checkpoint-15386/tokenizer_config.json +215 -0
  50. checkpoint-15386/training_args.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ checkpoint-15386/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ checkpoint-15200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-15200/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:949869dc87408865accac4b7f107d22c440eb264fd656fbad1f8668560306e53
+ size 4997418264
checkpoint-15200/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3631ec046f0f1a318346107c6531a77ab442ec9264de1ca39c4858a32b68955a
+ size 4999754984
checkpoint-15200/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d09060fcb08ce9cca2fdf28b8f68ef24b1774b230e5e7155ee7f361c08066236
+ size 4998910488
checkpoint-15200/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03847a6be440de6f82575fe8df560651c519c50eaf09572846272518009ad29f
+ size 1324572984
checkpoint-15200/modeling_qwen2_mla.py ADDED
@@ -0,0 +1,218 @@
+ from typing import Optional, Tuple, Union
+
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ from transformers.cache_utils import Cache
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+ from transformers.processing_utils import Unpack
+
+ from transformers.models.qwen2.modeling_qwen2 import (
+     Qwen2Model,
+     Qwen2DecoderLayer,
+     Qwen2PreTrainedModel,
+     Qwen2ForCausalLM
+ )
+ from .configuration_qwen2_mla import Qwen2MLAConfig
+
+ from transformers.models.gemma2.modeling_gemma2 import (
+     eager_attention_forward,  # for supporting softcap
+     logger
+ )
+ from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
+     apply_rotary_pos_emb_interleave,
+     DeepseekV3RMSNorm
+ )
+
+
+ class MLAAttention(nn.Module):
+     """
+     Modified from `transformers.models.deepseek_v3.modeling_deepseek_v3.DeepseekV3Attention`;
+     adds support for attention bias and softcapping.
+     """
+     def __init__(self, config, layer_idx: int):
+         super().__init__()
+         self.config = config
+         self.layer_idx = layer_idx
+
+         self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+         self.attention_dropout = config.attention_dropout
+         self.num_heads = config.num_attention_heads
+         self.rope_theta = config.rope_theta
+         self.q_lora_rank = config.q_lora_rank
+         self.kv_lora_rank = config.kv_lora_rank
+         self.qk_rope_head_dim = config.qk_rope_head_dim
+         self.qk_nope_head_dim = config.qk_nope_head_dim
+         self.v_head_dim = config.v_head_dim
+         self.qk_head_dim = config.qk_head_dim
+
+         self.qk_latent_layernorm = getattr(config, "qk_latent_layernorm", True)
+
+         self.is_causal = True
+         # Queries: either a plain projection or a low-rank (q_lora_rank) bottleneck.
+         if self.q_lora_rank is None:
+             self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=config.attention_bias)
+         else:
+             self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=False)
+             if self.qk_latent_layernorm:
+                 self.q_a_layernorm = DeepseekV3RMSNorm(self.q_lora_rank)
+             self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=config.attention_bias)
+
+         # Keys/values are compressed into a kv_lora_rank latent plus a shared rotary part.
+         self.kv_a_proj_with_mqa = nn.Linear(
+             config.hidden_size,
+             self.kv_lora_rank + self.qk_rope_head_dim,
+             bias=config.attention_bias,
+         )
+         if self.qk_latent_layernorm:
+             self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank)
+         self.kv_b_proj = nn.Linear(
+             self.kv_lora_rank,
+             self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+             bias=False,
+         )
+
+         self.o_proj = nn.Linear(
+             self.num_heads * self.v_head_dim,
+             config.hidden_size,
+             bias=False,
+         )
+
+         self.scaling = self.qk_head_dim**-0.5
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+         attention_mask: Optional[torch.Tensor],
+         past_key_value: Optional[Cache] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         **kwargs: Unpack[FlashAttentionKwargs],
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+         batch_size, seq_length = hidden_states.shape[:-1]
+         query_shape = (batch_size, seq_length, -1, self.qk_head_dim)
+         key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim)
+
+         if self.q_lora_rank is None:
+             q_states = self.q_proj(hidden_states)
+         elif self.qk_latent_layernorm:
+             q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+         else:
+             q_states = self.q_b_proj(self.q_a_proj(hidden_states))
+         q_states = q_states.view(query_shape).transpose(1, 2)
+         q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+         compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+         k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+
+         if self.qk_latent_layernorm:
+             k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2)
+         else:
+             k_pass = self.kv_b_proj(k_pass).view(key_shape).transpose(1, 2)
+         k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+         k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)
+
+         cos, sin = position_embeddings
+         q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin)
+         k_rot = k_rot.expand(*k_pass.shape[:-1], -1)
+
+         query_states = torch.cat((q_pass, q_rot), dim=-1)
+         key_states = torch.cat((k_pass, k_rot), dim=-1)
+
+         if past_key_value is not None:
+             # sin and cos are specific to RoPE models; cache_position needed for the static cache
+             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+         if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+             value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])
+
+         attention_interface = eager_attention_forward
+         if self.config._attn_implementation != "eager":
+             if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
+                 logger.warning_once(
+                     "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                     'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                 )
+             else:
+                 attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+         attn_output, attn_weights = attention_interface(
+             self,
+             query_states,
+             key_states,
+             value_states,
+             attention_mask,
+             dropout=0.0 if not self.training else self.attention_dropout,
+             scaling=self.scaling,
+             softcap=getattr(self.config, "attn_logit_softcapping", None),
+             **kwargs,
+         )
+         if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+             attn_output = attn_output[:, :, :, : self.v_head_dim]
+         attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+         attn_output = self.o_proj(attn_output)
+         return attn_output, attn_weights
+
+
+ class Qwen2MLADecoderLayer(Qwen2DecoderLayer):
+     """
+     Qwen2 decoder layer with MLA (Multi-Head Latent Attention) instead of standard attention.
+     This class inherits from Qwen2DecoderLayer and only replaces the self_attn component.
+     """
+
+     def __init__(self, config: Qwen2MLAConfig, layer_idx: int):
+         super().__init__(config, layer_idx)
+         # Replace the standard Qwen2 attention with MLA attention
+         self.self_attn = MLAAttention(config, layer_idx)
+
+
+ class Qwen2MLAPreTrainedModel(Qwen2PreTrainedModel):
+     """
+     An abstract class to handle weights initialization and a simple interface for downloading and loading
+     pretrained models for Qwen2 with MLA attention.
+     """
+
+     config_class = Qwen2MLAConfig
+     _no_split_modules = ["Qwen2MLADecoderLayer"]
+
+
+ class Qwen2MLAModel(Qwen2MLAPreTrainedModel, Qwen2Model):
+     """
+     The Qwen2 model with MLA attention layers.
+
+     This model inherits from both Qwen2MLAPreTrainedModel and Qwen2Model,
+     replacing the standard Qwen2 decoder layers with MLA-enabled ones.
+     """
+
+     def __init__(self, config: Qwen2MLAConfig):
+         super().__init__(config)
+
+         # Replace the layers with MLA-enabled decoder layers
+         self.layers = nn.ModuleList(
+             [Qwen2MLADecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+         )
+
+
+ class Qwen2MLAForCausalLM(Qwen2MLAPreTrainedModel, Qwen2ForCausalLM):
+     """
+     The Qwen2 model with MLA attention for causal language modeling.
+
+     This model can be used for text generation tasks, providing the same
+     interface as Qwen2ForCausalLM but with the MLA attention mechanism.
+     """
+
+     def __init__(self, config: Qwen2MLAConfig):
+         super().__init__(config)
+         # Replace the base model with the MLA version
+         self.model = Qwen2MLAModel(config)
+
+
+ # Export the main classes for external use
+ __all__ = [
+     "Qwen2MLAForCausalLM",
+     "Qwen2MLAModel",
+     "Qwen2MLAPreTrainedModel",
+ ]
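Because the checkpoint ships its own configuration_qwen2_mla.py and modeling_qwen2_mla.py, loading it goes through the Transformers auto classes with remote code enabled. A minimal sketch, assuming the repository's config.json wires auto_map to Qwen2MLAConfig/Qwen2MLAForCausalLM; the repo id and prompt below are placeholders, not part of this commit:

# Hedged loading sketch; repo_id is a placeholder and the auto_map wiring is assumed.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "path/to/this-checkpoint"  # hypothetical local path or Hub repo id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,  # imports the custom Qwen2MLA* classes shipped with the repo
    torch_dtype="auto",
)

inputs = tokenizer("Hello", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))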
checkpoint-15200/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85
+ size 15984
checkpoint-15200/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73
+ size 15984
checkpoint-15200/rng_state_10.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d4c67c1d0ec6b889370f634ff29f584f829acaeaf69196a69304fdba936f9d7
+ size 15997
checkpoint-15200/rng_state_11.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:afd0c4e2ac5f61d5f4df5d9d5783018b7937173b4afdbb92816c2668982cb351
+ size 15997
checkpoint-15200/rng_state_12.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9d1c7f561c5512193e8053ecf371e4fef647a8503481f2dad76332ddbe164fd
+ size 15997
checkpoint-15200/rng_state_13.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7604a3cfaf0e3c3f2aa2b3547660615db82205d4158951bb90f69ffc560ed179
+ size 15997
checkpoint-15200/rng_state_14.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d984186a3bc06445d6c077fcbf64da96d47d3ed704e68f0212045efb4b91dbb
+ size 15997
checkpoint-15200/rng_state_15.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b3945ba3535361fa13a8460a70ebbd44ece2514a1c9dcb76abfe6f54a775c60
+ size 15997
checkpoint-15200/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b
+ size 15984
checkpoint-15200/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc
+ size 15984
checkpoint-15200/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972
+ size 15984
checkpoint-15200/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991
+ size 15984
checkpoint-15200/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa
+ size 15984
checkpoint-15200/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773
+ size 15984
checkpoint-15200/rng_state_8.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4b75777b43df70dca87f91deed2e613941188792f244d5eeb06da8ab038bd36
+ size 15984
checkpoint-15200/rng_state_9.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8f9eb93d0ac349ba8b2271c417e5fcc4b70447a424f3f861c3338fbd99a48f4
+ size 15984
checkpoint-15200/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
checkpoint-15200/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:540b7fbf60b80e8293593a86960df91d2263723d69107ffc1afc89a7c08cda12
+ size 11422162
checkpoint-15200/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4555e0e31481452ef30287c4b917c3310e94b6234d89939282be3a2ae6d6bc8
+ size 8120
checkpoint-15386/configuration_qwen2_mla.py ADDED
@@ -0,0 +1,24 @@
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+
+ class Qwen2MLAConfig(Qwen2Config):
+
+     def __init__(
+         self,
+         *args,
+         kv_lora_rank=512,
+         q_lora_rank=None,
+         qk_rope_head_dim=64,
+         qk_nope_head_dim=128,
+         v_head_dim=128,
+         qk_latent_layernorm=True,
+         **kwargs
+     ):
+         super().__init__(*args, **kwargs)
+
+         self.kv_lora_rank = kv_lora_rank
+         self.q_lora_rank = q_lora_rank
+         self.qk_rope_head_dim = qk_rope_head_dim
+         self.qk_nope_head_dim = qk_nope_head_dim
+         self.qk_head_dim = qk_rope_head_dim + qk_nope_head_dim
+         self.v_head_dim = v_head_dim
+         self.qk_latent_layernorm = qk_latent_layernorm
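For reference, a small sketch of how the extra MLA fields compose on top of a plain Qwen2Config; the hidden size and head counts below are illustrative values, not read from this checkpoint's config.json:

# Illustrative only; the real values live in the checkpoint's config.json.
from configuration_qwen2_mla import Qwen2MLAConfig  # assumes the file is on the import path

cfg = Qwen2MLAConfig(hidden_size=3584, num_attention_heads=28, num_key_value_heads=4)
print(cfg.qk_head_dim)   # 192 = qk_rope_head_dim (64) + qk_nope_head_dim (128)
print(cfg.kv_lora_rank)  # 512
print(cfg.q_lora_rank)   # None -> MLAAttention falls back to a plain q_proj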
checkpoint-15386/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-15386/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0635097b8c9df0afb076290fd5ccc2debb7c4fb641a60cbfc48dc12224c37c0b
+ size 4997418264
checkpoint-15386/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02cc2dffbf6c47ef6826f4448f681ad603b7ca77b460b92f6d5bed9857090387
+ size 4999754984
checkpoint-15386/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:354d58132fab1e43d5928978ba187b5b7ecb336b8042cfde765fb37aaaea46b3
+ size 4998910488
checkpoint-15386/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:294b8199bc7db26e25ff052f32279d51fa4efb14b5d08e13d2f2d6a843c98ac4
+ size 1324572984
checkpoint-15386/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85
+ size 15984
checkpoint-15386/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73
+ size 15984
checkpoint-15386/rng_state_10.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d4c67c1d0ec6b889370f634ff29f584f829acaeaf69196a69304fdba936f9d7
+ size 15997
checkpoint-15386/rng_state_11.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:afd0c4e2ac5f61d5f4df5d9d5783018b7937173b4afdbb92816c2668982cb351
+ size 15997
checkpoint-15386/rng_state_12.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9d1c7f561c5512193e8053ecf371e4fef647a8503481f2dad76332ddbe164fd
+ size 15997
checkpoint-15386/rng_state_13.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7604a3cfaf0e3c3f2aa2b3547660615db82205d4158951bb90f69ffc560ed179
+ size 15997
checkpoint-15386/rng_state_14.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d984186a3bc06445d6c077fcbf64da96d47d3ed704e68f0212045efb4b91dbb
+ size 15997
checkpoint-15386/rng_state_15.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b3945ba3535361fa13a8460a70ebbd44ece2514a1c9dcb76abfe6f54a775c60
+ size 15997
checkpoint-15386/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b
+ size 15984
checkpoint-15386/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc
+ size 15984
checkpoint-15386/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972
+ size 15984
checkpoint-15386/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991
+ size 15984
checkpoint-15386/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa
+ size 15984
checkpoint-15386/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773
+ size 15984
checkpoint-15386/rng_state_8.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f4b75777b43df70dca87f91deed2e613941188792f244d5eeb06da8ab038bd36
+ size 15984
checkpoint-15386/rng_state_9.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8f9eb93d0ac349ba8b2271c417e5fcc4b70447a424f3f861c3338fbd99a48f4
+ size 15984
checkpoint-15386/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:540b7fbf60b80e8293593a86960df91d2263723d69107ffc1afc89a7c08cda12
+ size 11422162
checkpoint-15386/tokenizer_config.json ADDED
@@ -0,0 +1,215 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "max_length": 256,
+   "model_max_length": 32768,
+   "pad_to_multiple_of": null,
+   "pad_token": "<|endoftext|>",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "stride": 0,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": null
+ }
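The chat_template above is the usual Qwen2 ChatML template with tool-calling support; a brief sketch of applying it, where the local path stands in for wherever these tokenizer files live:

# Hedged sketch; "checkpoint-15386" is a placeholder for the tokenizer directory.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-15386")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Say hi."},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # ends with "<|im_start|>assistant\n", ready for generation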
checkpoint-15386/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4555e0e31481452ef30287c4b917c3310e94b6234d89939282be3a2ae6d6bc8
+ size 8120