|
{
  "architectures": [
    "MoEGPTForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "robinfaro/GPT2-1B-base--configuration.MoEGPTConfig",
    "AutoModelForCausalLM": "robinfaro/GPT2-1B-base--modeling.MoEGPTForCausalLM"
  },
  "batch_size": 16,
  "bias": false,
  "dropout": 0.0,
  "mlp_dim_exp_factor": 1.0,
  "model_type": "moegpt",
  "moe": false,
  "moe_aux_loss_factor": 0.01,
  "moe_num_experts": 6,
  "moe_num_experts_per_tok": 2,
  "moe_router_loss": "load_balancing_z_loss",
  "moe_routing": null,
  "moe_softmax_order": "softmax_topk",
  "moe_z_loss_factor": 1.0,
  "n_embd": 1600,
  "n_head": 25,
  "n_layer": 48,
  "sequence_length": 1024,
  "torch_dtype": "float32",
  "transformers_version": "4.51.0",
  "vocab_size": 50304
}
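
Because the auto_map entries resolve AutoConfig and AutoModelForCausalLM to custom MoEGPTConfig/MoEGPTForCausalLM classes shipped inside the repository, the checkpoint must be loaded with trust_remote_code=True. A minimal loading sketch in Python; the repo id robinfaro/GPT2-1B-base is taken from the auto_map above, and everything else is standard transformers API:

from transformers import AutoConfig, AutoModelForCausalLM

# auto_map points the Auto* classes at the custom MoEGPT code in the repo,
# so trust_remote_code=True is required for both calls.
config = AutoConfig.from_pretrained(
    "robinfaro/GPT2-1B-base",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "robinfaro/GPT2-1B-base",
    trust_remote_code=True,
)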
|
|