Update README.md
README.md CHANGED
@@ -15,6 +15,8 @@ This is my attempt (probably too naive) to reproduce the upcycling process used t
 
 ## Upcycling script
 
+<details>
+<summary>Script:</summary>
 ```python
 from torch import nn
 from transformers import AutoModelForCausalLM
@@ -23,6 +25,7 @@ from transformers import AutoModel
 from typing_extensions import Self
 from copy import deepcopy
 
+
 @dataclass
 class UpcyclingConfig:
     finegrained_experts: int
@@ -45,13 +48,16 @@ def iterate_in_chunks(list1, list2, chunk_size1, chunk_size2):
 
 def chunk_linear(linear: nn.Linear, chunks: int, down_proj: bool = False) -> tuple[nn.Linear, ...]:
     if not down_proj:
-        in_features = linear.
-        out_features = linear.
+        in_features = linear.out_features // chunks
+        out_features = linear.in_features
+        dim = 0
     else:
-        in_features = linear.
-        out_features = linear.
+        in_features = linear.out_features
+        out_features = linear.in_features // chunks
+        dim = 1
 
-
+    weight = linear.weight.reshape(linear.out_features, linear.in_features)
+    weights = weight.chunk(chunks, dim=dim)
     biases = linear.bias.chunk(chunks) if linear.bias is not None else [None] * chunks
     linear_layers = []
     for weight, bias in zip(weights, biases):
@@ -110,7 +116,7 @@ class Qwen2MoeForCausalLM(UpcycledModelMixin, _Qwen2MoeForCausalLM):
     sparse_moe_block_cls = Qwen2MoeSparseMoeBlock
 
 
-source_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-1.8B")
+source_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-1.8B", device_map="auto")
 model = Qwen2MoeForCausalLM.upcycled_from(
     source_model,
     UpcyclingConfig(
@@ -118,7 +124,10 @@ model = Qwen2MoeForCausalLM.upcycled_from(
         partitions_from_mlp=4,
     ),
 )
+
+model = model.bfloat16()
 ```
+</details>
 
 ### Model Description
 
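For reference, here is a minimal sketch of the splitting scheme the updated `chunk_linear` branches implement (the dimensions below are made up for illustration, not taken from any real config): gate/up projection weights, stored as `(out_features, in_features)`, are chunked along `dim=0`, while the `down_proj` weight is chunked along `dim=1`, so each partition becomes one fine-grained expert's slice of the dense MLP.

```python
from torch import nn

# Toy sizes for illustration only; real values come from the source model's config.
hidden_size, intermediate_size, chunks = 16, 40, 4

gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)  # weight: (40, 16)
down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)  # weight: (16, 40)

# gate/up projections: split the output rows (dim=0), one slice per expert
gate_slices = gate_proj.weight.chunk(chunks, dim=0)
# down projection: split the input columns (dim=1) to match the reduced intermediate size
down_slices = down_proj.weight.chunk(chunks, dim=1)

assert gate_slices[0].shape == (intermediate_size // chunks, hidden_size)
assert down_slices[0].shape == (hidden_size, intermediate_size // chunks)
```

Note that the `reshape(linear.out_features, linear.in_features)` added in the commit appears to be a no-op on a standard `nn.Linear` weight, which already has that shape; the behaviour is driven by the choice of `dim`.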