gabrielmbmb committed on
Commit
7e5fc1a
·
verified ·
1 Parent(s): 6eb8674

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +15 -6
README.md CHANGED
@@ -15,6 +15,8 @@ This is my attempt (probably too naive) to reproduce the upcycling process used t
15
 
16
  ## Upcycling script
17
 
 
 
18
  ```python
19
  from torch import nn
20
  from transformers import AutoModelForCausalLM
@@ -23,6 +25,7 @@ from transformers import AutoModel
23
  from typing_extensions import Self
24
  from copy import deepcopy
25
 
 
26
  @dataclass
27
  class UpcyclingConfig:
28
  finegrained_experts: int
@@ -45,13 +48,16 @@ def iterate_in_chunks(list1, list2, chunk_size1, chunk_size2):
45
 
46
  def chunk_linear(linear: nn.Linear, chunks: int, down_proj: bool = False) -> tuple[nn.Linear, ...]:
47
  if not down_proj:
48
- in_features = linear.in_features
49
- out_features = linear.out_features // chunks
 
50
  else:
51
- in_features = linear.in_features // chunks
52
- out_features = linear.out_features
 
53
 
54
- weights = linear.weight.chunk(chunks)
 
55
  biases = linear.bias.chunk(chunks) if linear.bias is not None else [None] * chunks
56
  linear_layers = []
57
  for weight, bias in zip(weights, biases):
@@ -110,7 +116,7 @@ class Qwen2MoeForCausalLM(UpcycledModelMixin, _Qwen2MoeForCausalLM):
110
  sparse_moe_block_cls = Qwen2MoeSparseMoeBlock
111
 
112
 
113
- source_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-1.8B")
114
  model = Qwen2MoeForCausalLM.upcycled_from(
115
  source_model,
116
  UpcyclingConfig(
@@ -118,7 +124,10 @@ model = Qwen2MoeForCausalLM.upcycled_from(
118
  partitions_from_mlp=4,
119
  ),
120
  )
 
 
121
  ```
 
122
 
123
  ### Model Description
124
 
 
15
 
16
  ## Upcycling script
17
 
18
+ <details>
19
+ <summary>Script:</summary>
20
  ```python
21
  from torch import nn
22
  from transformers import AutoModelForCausalLM
 
25
  from typing_extensions import Self
26
  from copy import deepcopy
27
 
28
+
29
  @dataclass
30
  class UpcyclingConfig:
31
  finegrained_experts: int
 
48
 
49
  def chunk_linear(linear: nn.Linear, chunks: int, down_proj: bool = False) -> tuple[nn.Linear, ...]:
50
  if not down_proj:
51
+ in_features = linear.out_features // chunks
52
+ out_features = linear.in_features
53
+ dim = 0
54
  else:
55
+ in_features = linear.out_features
56
+ out_features = linear.in_features // chunks
57
+ dim = 1
58
 
59
+ weight = linear.weight.reshape(linear.out_features, linear.in_features)
60
+ weights = weight.chunk(chunks, dim=dim)
61
  biases = linear.bias.chunk(chunks) if linear.bias is not None else [None] * chunks
62
  linear_layers = []
63
  for weight, bias in zip(weights, biases):
 
116
  sparse_moe_block_cls = Qwen2MoeSparseMoeBlock
117
 
118
 
119
+ source_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-1.8B", device_map="auto")
120
  model = Qwen2MoeForCausalLM.upcycled_from(
121
  source_model,
122
  UpcyclingConfig(
 
124
  partitions_from_mlp=4,
125
  ),
126
  )
127
+
128
+ model = model.bfloat16()
129
  ```
130
+ </details>
131
 
132
  ### Model Description
133