from transformers import PretrainedConfig


class MoEGPTConfig(PretrainedConfig):
    model_type = "moegpt"

    def __init__(
        self,
        vocab_size=50304,
        n_embd=768,
        n_layer=12,
        n_head=12,
        sequence_length=1024,
        moe=False,
        moe_routing="standard_gating",
        moe_num_experts=4,
        moe_num_experts_per_tok=2,
        moe_softmax_order="softmax_topk",
        moe_router_loss="load_balancing_z_loss",
        moe_aux_loss_factor=0.01,
        moe_z_loss_factor=1.0,
        mlp_dim_exp_factor=1.0,
        dropout=0.0,
        bias=False,
        architectures=["MoEGPTForCausalLM"],
        auto_map={
            "AutoConfig": "configuration.MoEGPTConfig",
            "AutoModelForCausalLM": "modeling.MoEGPTForCausalLM",
            "AutoTokenizer": "GPT2TokenizerFast",
        },
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.sequence_length = sequence_length
        self.moe = moe
        self.moe_routing = moe_routing
        self.moe_num_experts = moe_num_experts
        self.moe_num_experts_per_tok = moe_num_experts_per_tok
        self.moe_softmax_order = moe_softmax_order
        self.moe_router_loss = moe_router_loss
        self.moe_aux_loss_factor = moe_aux_loss_factor
        self.moe_z_loss_factor = moe_z_loss_factor
        self.mlp_dim_exp_factor = mlp_dim_exp_factor
        self.dropout = dropout
        self.bias = bias
        self.architectures = architectures
        self.auto_map = auto_map
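

# --- Usage sketch (illustrative, not part of the original file) ---
# Assuming this class is saved as configuration.py, as referenced by the
# auto_map above, it behaves like any other Hugging Face config: it can be
# instantiated with overrides, serialized to config.json, and reloaded.
# The directory name "moegpt-small" is a hypothetical example.
if __name__ == "__main__":
    config = MoEGPTConfig(moe=True, moe_num_experts=8, moe_num_experts_per_tok=2)
    config.save_pretrained("moegpt-small")  # writes moegpt-small/config.json
    reloaded = MoEGPTConfig.from_pretrained("moegpt-small")
    assert reloaded.moe_num_experts == 8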