{ "metadata": { "total_moe_layers": 24, "save_timestamp": "2025-09-15T02:07:56.575592", "model_type": "Qwen2MoeForCausalLM", "pytorch_version": "2.6.0+cu124", "description": "Auxiliary-loss-free MoE bias states saved during training" }, "moe_bias_states": { "model.layers.0.mlp": { "bias_values": [ 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.00555419921875, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.1.mlp": { "bias_values": [ 0.03125, -0.03125, 0.0021820068359375, 0.03125, -0.03125, -0.03125, -0.0269775390625, -0.03125, -0.0289306640625, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.0191650390625, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.007232666015625, 0.03125, -0.01239013671875, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.2.mlp": { "bias_values": [ 0.03125, 0.03125, -0.03125, -0.0279541015625, 0.00042724609375, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.0299072265625, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.003509521484375, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.3.mlp": { "bias_values": [ -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.0026092529296875, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.01531982421875, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.0294189453125, -0.03125, 0.03125, -0.03125, -0.03125, -0.0303955078125, -0.03125, -0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.4.mlp": { "bias_values": [ 0.03125, -0.03125, 0.03125, 0.0177001953125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.01190185546875, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.5.mlp": { "bias_values": [ 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.0245361328125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.0220947265625, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.031005859375, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, 0.03125, -0.03125, 0.03125, 0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.6.mlp": { "bias_values": [ 0.03125, 0.03125, -0.03125, 0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.00897216796875, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.0157470703125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.0235595703125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.7.mlp": { "bias_values": [ -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.0211181640625, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.8.mlp": { "bias_values": [ -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.00022029876708984375, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.003082275390625, -0.0299072265625, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.0299072265625, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.0162353515625, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.9.mlp": { "bias_values": [ -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.00738525390625, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.0216064453125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.10.mlp": { "bias_values": [ -0.03125, 0.0167236328125, -0.03125, -0.03125, 0.004638671875, -0.03125, -0.03125, 0.03125, -0.03125, -0.0240478515625, -0.03125, -0.031005859375, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.0186767578125, -0.0181884765625, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, 0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.11.mlp": { "bias_values": [ -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.0294189453125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.0157470703125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.0032501220703125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.12.mlp": { "bias_values": [ -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.0172119140625, -0.03125, -0.03125, 0.01092529296875, -0.03125, -0.03125, 0.03125, -0.01190185546875, -0.0279541015625, 0.03125, -0.03125, -0.03125, 0.01385498046875, -0.03125, -0.03125, -0.03125, -0.0014190673828125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.0068359375, 0.03125, -0.0308837890625, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.13.mlp": { "bias_values": [ -0.03125, 0.0172119140625, -0.03125, -0.0225830078125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.0181884765625, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.0010223388671875, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.004852294921875, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.14.mlp": { "bias_values": [ 0.03125, 0.03125, 0.03125, 0.0303955078125, -0.03125, -0.03125, 0.0038909912109375, 0.0284423828125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.006134033203125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.0245361328125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.15.mlp": { "bias_values": [ -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.0267333984375, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, 0.03125, 0.01483154296875, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.00020503997802734375, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.0260009765625, -0.03125, -0.03125, -0.03125, -0.0289306640625 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.16.mlp": { "bias_values": [ -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.0057373046875, -0.03125, 0.03125, -0.03125, -0.0260009765625, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.01239013671875, -0.03125, -0.03125, 0.03125, 0.0274658203125, 0.03125, -0.03125, 0.03125, 0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.17.mlp": { "bias_values": [ -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.0230712890625, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.0308837890625, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.0260009765625, -0.03125, -0.01385498046875, -0.03125, 0.03125, 0.0026092529296875, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.0303955078125, 0.03125, 0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.18.mlp": { "bias_values": [ -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, 0.0201416015625, 0.03125, -0.03125, -0.031005859375, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.01043701171875, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.031005859375, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.01092529296875, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.19.mlp": { "bias_values": [ 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.0274658203125, 0.03125, -0.03125, 0.03125, 0.00946044921875, -0.03125, 0.03125, -0.03125, 0.0260009765625, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.0010223388671875, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.20.mlp": { "bias_values": [ -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.0303955078125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.0072021484375, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.0303955078125, -0.00141143798828125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.21.mlp": { "bias_values": [ -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.0167236328125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.0172119140625, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.0303955078125, -0.03125, -0.03125, 0.03125, -0.03125, 0.0284423828125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.22.mlp": { "bias_values": [ -0.03125, -0.03125, 0.03125, 0.0230712890625, 0.003509521484375, -0.03125, -0.03125, 0.03125, 0.03125, -0.001373291015625, -0.03125, -0.03125, 0.007232666015625, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.005401611328125, 0.03125, -0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.00799560546875, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" }, "model.layers.23.mlp": { "bias_values": [ 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, 0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, 0.03125, -0.03125, 0.0308837890625, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.03125, -0.03125, -0.03125, -0.03125, -0.0260009765625, -0.03125, 0.03125, 0.03125, -0.03125, 0.03125, 0.03125, -0.03125, -0.03125, -0.03125, 0.03125, -0.031005859375, -0.03125, -0.03125, 0.03125, -0.03125, 0.0250244140625, -0.03125, 0.00017261505126953125, -0.03125, 0.03125, -0.03125, -0.03125 ], "bias_update_speed": 0.0001, "num_experts": 60, "module_type": "Qwen2MoeSparseMoeBlock", "device": "cuda:0", "dtype": "torch.bfloat16" } } }