|
{
  "architectures": [
    "MoEGPTForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "robinfaro/GPT2-1B-base--configuration.MoEGPTConfig",
    "AutoModelForCausalLM": "robinfaro/GPT2-1B-base--modeling.MoEGPTForCausalLM"
  },
  "batch_size": 16,
  "bias": false,
  "dropout": 0.0,
  "mlp_dim_exp_factor": 1.0,
  "model_type": "moegpt",
  "moe": false,
  "moe_aux_loss_factor": 0.01,
  "moe_num_experts": 6,
  "moe_num_experts_per_tok": 2,
  "moe_router_loss": "load_balancing_z_loss",
  "moe_routing": null,
  "moe_softmax_order": "softmax_topk",
  "moe_z_loss_factor": 1.0,
  "n_embd": 1600,
  "n_head": 25,
  "n_layer": 48,
  "sequence_length": 1024,
  "torch_dtype": "float32",
  "transformers_version": "4.51.0",
  "vocab_size": 50304
}
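
Because the auto_map entries resolve AutoConfig and AutoModelForCausalLM to custom MoEGPTConfig/MoEGPTForCausalLM classes shipped inside the repository, the checkpoint must be loaded with trust_remote_code=True. A minimal loading sketch in Python; the repo id robinfaro/GPT2-1B-base is taken from the auto_map above, and everything else is standard transformers API:

from transformers import AutoConfig, AutoModelForCausalLM

# auto_map points the Auto* classes at the custom MoEGPT code in the repo,
# so trust_remote_code=True is required for both calls.
config = AutoConfig.from_pretrained(
    "robinfaro/GPT2-1B-base",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "robinfaro/GPT2-1B-base",
    trust_remote_code=True,
)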
|
|