Update README.md
README.md CHANGED
@@ -15,6 +15,8 @@ This is my attempt (probably too naive) to reproduce the upcycling process used t
 
 ## Upcycling script
 
+<details>
+<summary>Script:</summary>
 ```python
 from torch import nn
 from transformers import AutoModelForCausalLM
@@ -23,6 +25,7 @@ from transformers import AutoModel
 from typing_extensions import Self
 from copy import deepcopy
 
+
 @dataclass
 class UpcyclingConfig:
     finegrained_experts: int
@@ -45,13 +48,16 @@ def iterate_in_chunks(list1, list2, chunk_size1, chunk_size2):
 
 def chunk_linear(linear: nn.Linear, chunks: int, down_proj: bool = False) -> tuple[nn.Linear, ...]:
     if not down_proj:
-        in_features = linear.
-        out_features = linear.
+        in_features = linear.out_features // chunks
+        out_features = linear.in_features
+        dim = 0
     else:
-        in_features = linear.
-        out_features = linear.
+        in_features = linear.out_features
+        out_features = linear.in_features // chunks
+        dim = 1
 
-
+    weight = linear.weight.reshape(linear.out_features, linear.in_features)
+    weights = weight.chunk(chunks, dim=dim)
     biases = linear.bias.chunk(chunks) if linear.bias is not None else [None] * chunks
     linear_layers = []
     for weight, bias in zip(weights, biases):
@@ -110,7 +116,7 @@ class Qwen2MoeForCausalLM(UpcycledModelMixin, _Qwen2MoeForCausalLM):
     sparse_moe_block_cls = Qwen2MoeSparseMoeBlock
 
 
-source_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-1.8B")
+source_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-1.8B", device_map="auto")
 model = Qwen2MoeForCausalLM.upcycled_from(
     source_model,
     UpcyclingConfig(
@@ -118,7 +124,10 @@ model = Qwen2MoeForCausalLM.upcycled_from(
         partitions_from_mlp=4,
     ),
 )
+
+model = model.bfloat16()
 ```
+</details>
 
 ### Model Description
 
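For reference, here is a minimal sketch of the splitting scheme the updated `chunk_linear` branches implement (the dimensions below are made up for illustration, not taken from any real config): gate/up projection weights, stored as `(out_features, in_features)`, are chunked along `dim=0`, while the `down_proj` weight is chunked along `dim=1`, so each partition becomes one fine-grained expert's slice of the dense MLP.

```python
from torch import nn

# Toy sizes for illustration only; real values come from the source model's config.
hidden_size, intermediate_size, chunks = 16, 40, 4

gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)  # weight: (40, 16)
down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)  # weight: (16, 40)

# gate/up projections: split the output rows (dim=0), one slice per expert
gate_slices = gate_proj.weight.chunk(chunks, dim=0)
# down projection: split the input columns (dim=1) to match the reduced intermediate size
down_slices = down_proj.weight.chunk(chunks, dim=1)

assert gate_slices[0].shape == (intermediate_size // chunks, hidden_size)
assert down_slices[0].shape == (hidden_size, intermediate_size // chunks)
```

Note that the `reshape(linear.out_features, linear.in_features)` added in the commit appears to be a no-op on a standard `nn.Linear` weight, which already has that shape; the behaviour is driven by the choice of `dim`.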