Upload 11 files

Browse files

Files changed (11) hide show

esm2_3B/mlm/README.md +202 -0
esm2_3B/mlm/adapter_config.json +68 -0
esm2_3B/mlm/adapter_model.bin +3 -0
esm2_3B/mlm/optimizer.pt +3 -0
esm2_3B/mlm/rng_state.pth +3 -0
esm2_3B/mlm/scheduler.pt +3 -0
esm2_3B/mlm/special_tokens_map.json +7 -0
esm2_3B/mlm/tokenizer_config.json +52 -0
esm2_3B/mlm/trainer_state.json +1568 -0
esm2_3B/mlm/training_args.bin +3 -0
esm2_3B/mlm/vocab.txt +33 -0

esm2_3B/mlm/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: facebook/esm2_t36_3B_UR50D
+---
+# Model Card for ESM-2 3B Masked-LM LoRA Adapter
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT, `r=8`, `lora_alpha=8`) for `EsmForMaskedLM`, targeting the `query`, `key`, and `value` modules
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** facebook/esm2_t36_3B_UR50D
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!-- fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

esm2_3B/mlm/adapter_config.json ADDED Viewed

	@@ -0,0 +1,68 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "EsmForMaskedLM",
+    "parent_library": "transformers.models.esm.modeling_esm"
+  },
+  "base_model_name_or_path": "facebook/esm2_t36_3B_UR50D",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": "gaussian",
+  "layers_pattern": null,
+  "layers_to_transform": [
+    0,
+    1,
+    2,
+    3,
+    4,
+    5,
+    6,
+    7,
+    8,
+    9,
+    10,
+    11,
+    12,
+    13,
+    14,
+    15,
+    16,
+    17,
+    18,
+    19,
+    20,
+    21,
+    22,
+    23,
+    24,
+    25,
+    26,
+    27,
+    28,
+    29,
+    30,
+    31,
+    32,
+    33,
+    34,
+    35
+  ],
+  "loftq_config": {},
+  "lora_alpha": 8,
+  "lora_dropout": 0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "query",
+    "value",
+    "key"
+  ],
+  "task_type": null,
+  "use_rslora": false
+}

esm2_3B/mlm/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0430e6b1f429cb29694fea9496bd8b74ba5de91c5246b590cc8d53845e2fdb08
+size 17774506

esm2_3B/mlm/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9044e4c5caf7f8ff49c55c3db5cb6807ebd31bc03a787ef4824df99efe806336
+size 35570682

esm2_3B/mlm/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
+size 14244

esm2_3B/mlm/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:349999f14d9926b508f5fb59bb5a2d020fb9be8d8105f0ad727233cdb384b81d
+size 1192

esm2_3B/mlm/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

esm2_3B/mlm/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<cls>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "EsmTokenizer",
+  "unk_token": "<unk>"
+}

esm2_3B/mlm/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1568 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7476476710350247,
+  "eval_steps": 100,
+  "global_step": 2200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 18.108646392822266,
+      "learning_rate": 1e-05,
+      "loss": 21.0115,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 19.891128540039062,
+      "learning_rate": 1e-05,
+      "loss": 20.6659,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 20.71453285217285,
+      "learning_rate": 1e-05,
+      "loss": 20.2163,
+      "step": 20
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 21.3765926361084,
+      "learning_rate": 1e-05,
+      "loss": 18.8949,
+      "step": 30
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 19.546733856201172,
+      "learning_rate": 1e-05,
+      "loss": 17.7048,
+      "step": 40
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 20.177675247192383,
+      "learning_rate": 1e-05,
+      "loss": 16.1373,
+      "step": 50
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 20.215681076049805,
+      "learning_rate": 1e-05,
+      "loss": 15.0542,
+      "step": 60
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 20.990703582763672,
+      "learning_rate": 1e-05,
+      "loss": 13.6239,
+      "step": 70
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 18.313385009765625,
+      "learning_rate": 1e-05,
+      "loss": 12.5539,
+      "step": 80
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 23.427459716796875,
+      "learning_rate": 1e-05,
+      "loss": 11.1956,
+      "step": 90
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 38.51707077026367,
+      "learning_rate": 1e-05,
+      "loss": 9.8649,
+      "step": 100
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 17.18704605102539,
+      "learning_rate": 1e-05,
+      "loss": 8.7252,
+      "step": 110
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 20.252742767333984,
+      "learning_rate": 1e-05,
+      "loss": 7.8294,
+      "step": 120
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 20.109561920166016,
+      "learning_rate": 1e-05,
+      "loss": 6.7392,
+      "step": 130
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 17.344636917114258,
+      "learning_rate": 1e-05,
+      "loss": 5.7289,
+      "step": 140
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 73.92993927001953,
+      "learning_rate": 1e-05,
+      "loss": 4.6879,
+      "step": 150
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 12.572184562683105,
+      "learning_rate": 1e-05,
+      "loss": 3.5502,
+      "step": 160
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 11.667272567749023,
+      "learning_rate": 1e-05,
+      "loss": 2.801,
+      "step": 170
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 9.249564170837402,
+      "learning_rate": 1e-05,
+      "loss": 2.3337,
+      "step": 180
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 13.55443000793457,
+      "learning_rate": 1e-05,
+      "loss": 1.9857,
+      "step": 190
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 248.29791259765625,
+      "learning_rate": 1e-05,
+      "loss": 1.7209,
+      "step": 200
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.2151737213134766,
+      "learning_rate": 1e-05,
+      "loss": 1.5336,
+      "step": 210
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.544109582901001,
+      "learning_rate": 1e-05,
+      "loss": 1.3586,
+      "step": 220
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 2.0416696071624756,
+      "learning_rate": 1e-05,
+      "loss": 1.3511,
+      "step": 230
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.0984798669815063,
+      "learning_rate": 1e-05,
+      "loss": 1.2404,
+      "step": 240
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.1240887641906738,
+      "learning_rate": 1e-05,
+      "loss": 1.1461,
+      "step": 250
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.9642849564552307,
+      "learning_rate": 1e-05,
+      "loss": 1.1479,
+      "step": 260
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 9.870434761047363,
+      "learning_rate": 1e-05,
+      "loss": 1.0765,
+      "step": 270
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.7763110399246216,
+      "learning_rate": 1e-05,
+      "loss": 1.0369,
+      "step": 280
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.542849063873291,
+      "learning_rate": 1e-05,
+      "loss": 1.0358,
+      "step": 290
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 41.260128021240234,
+      "learning_rate": 1e-05,
+      "loss": 0.9912,
+      "step": 300
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.45591601729393005,
+      "learning_rate": 1e-05,
+      "loss": 1.0068,
+      "step": 310
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.4061857759952545,
+      "learning_rate": 1e-05,
+      "loss": 0.9619,
+      "step": 320
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.6108375787734985,
+      "learning_rate": 1e-05,
+      "loss": 0.998,
+      "step": 330
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 54.73992156982422,
+      "learning_rate": 1e-05,
+      "loss": 0.9671,
+      "step": 340
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.1031421422958374,
+      "learning_rate": 1e-05,
+      "loss": 0.9131,
+      "step": 350
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.8087090849876404,
+      "learning_rate": 1e-05,
+      "loss": 0.8975,
+      "step": 360
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.830881655216217,
+      "learning_rate": 1e-05,
+      "loss": 0.8793,
+      "step": 370
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.6168428659439087,
+      "learning_rate": 1e-05,
+      "loss": 0.8614,
+      "step": 380
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.6438579559326172,
+      "learning_rate": 1e-05,
+      "loss": 0.844,
+      "step": 390
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.7063890099525452,
+      "learning_rate": 1e-05,
+      "loss": 0.8112,
+      "step": 400
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.6345189809799194,
+      "learning_rate": 1e-05,
+      "loss": 0.7793,
+      "step": 410
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.7021321654319763,
+      "learning_rate": 1e-05,
+      "loss": 0.7494,
+      "step": 420
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.6041214466094971,
+      "learning_rate": 1e-05,
+      "loss": 0.7391,
+      "step": 430
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.7526829242706299,
+      "learning_rate": 1e-05,
+      "loss": 0.6812,
+      "step": 440
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.9282905459403992,
+      "learning_rate": 1e-05,
+      "loss": 0.7046,
+      "step": 450
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7867602109909058,
+      "learning_rate": 1e-05,
+      "loss": 0.6556,
+      "step": 460
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7166017889976501,
+      "learning_rate": 1e-05,
+      "loss": 0.6598,
+      "step": 470
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.48004817962646484,
+      "learning_rate": 1e-05,
+      "loss": 0.6207,
+      "step": 480
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5272298455238342,
+      "learning_rate": 1e-05,
+      "loss": 0.6203,
+      "step": 490
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.7496746182441711,
+      "learning_rate": 1e-05,
+      "loss": 0.5562,
+      "step": 500
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.7099195718765259,
+      "learning_rate": 1e-05,
+      "loss": 0.532,
+      "step": 510
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.6780660152435303,
+      "learning_rate": 1e-05,
+      "loss": 0.5432,
+      "step": 520
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.4947499632835388,
+      "learning_rate": 1e-05,
+      "loss": 0.5428,
+      "step": 530
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.102908968925476,
+      "learning_rate": 1e-05,
+      "loss": 0.5605,
+      "step": 540
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 13.204094886779785,
+      "learning_rate": 1e-05,
+      "loss": 0.5213,
+      "step": 550
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5231891870498657,
+      "learning_rate": 1e-05,
+      "loss": 0.4716,
+      "step": 560
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.3716018199920654,
+      "learning_rate": 1e-05,
+      "loss": 0.4907,
+      "step": 570
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.6284762620925903,
+      "learning_rate": 1e-05,
+      "loss": 0.4753,
+      "step": 580
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.4310474097728729,
+      "learning_rate": 1e-05,
+      "loss": 0.4351,
+      "step": 590
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.1138970851898193,
+      "learning_rate": 1e-05,
+      "loss": 0.4556,
+      "step": 600
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.0587685108184814,
+      "learning_rate": 1e-05,
+      "loss": 0.4312,
+      "step": 610
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.4586256742477417,
+      "learning_rate": 1e-05,
+      "loss": 0.4448,
+      "step": 620
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5893386006355286,
+      "learning_rate": 1e-05,
+      "loss": 0.4097,
+      "step": 630
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.6643804311752319,
+      "learning_rate": 1e-05,
+      "loss": 0.3823,
+      "step": 640
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.758120059967041,
+      "learning_rate": 1e-05,
+      "loss": 0.3963,
+      "step": 650
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.0745816230773926,
+      "learning_rate": 1e-05,
+      "loss": 0.3846,
+      "step": 660
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 53.08203887939453,
+      "learning_rate": 1e-05,
+      "loss": 0.3815,
+      "step": 670
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.9295670390129089,
+      "learning_rate": 1e-05,
+      "loss": 0.3577,
+      "step": 680
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.4713806211948395,
+      "learning_rate": 1e-05,
+      "loss": 0.3429,
+      "step": 690
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.426747590303421,
+      "learning_rate": 1e-05,
+      "loss": 0.3568,
+      "step": 700
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.32080256938934326,
+      "learning_rate": 1e-05,
+      "loss": 0.3549,
+      "step": 710
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.3621009290218353,
+      "learning_rate": 1e-05,
+      "loss": 0.3292,
+      "step": 720
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.7618722915649414,
+      "learning_rate": 1e-05,
+      "loss": 0.3325,
+      "step": 730
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.679786205291748,
+      "learning_rate": 1e-05,
+      "loss": 0.3231,
+      "step": 740
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5011792182922363,
+      "learning_rate": 1e-05,
+      "loss": 0.3287,
+      "step": 750
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.47687655687332153,
+      "learning_rate": 1e-05,
+      "loss": 0.302,
+      "step": 760
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5612205862998962,
+      "learning_rate": 1e-05,
+      "loss": 0.3053,
+      "step": 770
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.2981828451156616,
+      "learning_rate": 1e-05,
+      "loss": 0.2853,
+      "step": 780
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.4281419813632965,
+      "learning_rate": 1e-05,
+      "loss": 0.2746,
+      "step": 790
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.6802565455436707,
+      "learning_rate": 1e-05,
+      "loss": 0.2708,
+      "step": 800
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 3.3523566722869873,
+      "learning_rate": 1e-05,
+      "loss": 0.5448,
+      "step": 810
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 2.0021793842315674,
+      "learning_rate": 1e-05,
+      "loss": 0.3757,
+      "step": 820
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.0229527950286865,
+      "learning_rate": 1e-05,
+      "loss": 0.3256,
+      "step": 830
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5520893931388855,
+      "learning_rate": 1e-05,
+      "loss": 0.292,
+      "step": 840
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5874707698822021,
+      "learning_rate": 1e-05,
+      "loss": 0.2686,
+      "step": 850
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.38817650079727173,
+      "learning_rate": 1e-05,
+      "loss": 0.2527,
+      "step": 860
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.7592063546180725,
+      "learning_rate": 1e-05,
+      "loss": 0.243,
+      "step": 870
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.42729002237319946,
+      "learning_rate": 1e-05,
+      "loss": 0.2297,
+      "step": 880
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.3907771706581116,
+      "learning_rate": 1e-05,
+      "loss": 0.223,
+      "step": 890
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.38824358582496643,
+      "learning_rate": 1e-05,
+      "loss": 0.2173,
+      "step": 900
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.2964054048061371,
+      "learning_rate": 1e-05,
+      "loss": 0.2081,
+      "step": 910
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.37019163370132446,
+      "learning_rate": 1e-05,
+      "loss": 0.2032,
+      "step": 920
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.2846088111400604,
+      "learning_rate": 1e-05,
+      "loss": 0.1971,
+      "step": 930
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.2604421377182007,
+      "learning_rate": 1e-05,
+      "loss": 0.1936,
+      "step": 940
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.3167709410190582,
+      "learning_rate": 1e-05,
+      "loss": 0.1894,
+      "step": 950
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.4469132423400879,
+      "learning_rate": 1e-05,
+      "loss": 0.1817,
+      "step": 960
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.3273838758468628,
+      "learning_rate": 1e-05,
+      "loss": 0.1769,
+      "step": 970
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.3062090277671814,
+      "learning_rate": 1e-05,
+      "loss": 0.1758,
+      "step": 980
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.1933787316083908,
+      "learning_rate": 1e-05,
+      "loss": 0.1701,
+      "step": 990
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.2302626371383667,
+      "learning_rate": 1e-05,
+      "loss": 0.1655,
+      "step": 1000
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.1817045956850052,
+      "learning_rate": 1e-05,
+      "loss": 0.1649,
+      "step": 1010
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.2034452259540558,
+      "learning_rate": 1e-05,
+      "loss": 0.1596,
+      "step": 1020
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.393022745847702,
+      "learning_rate": 1e-05,
+      "loss": 0.1565,
+      "step": 1030
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.45332619547843933,
+      "learning_rate": 1e-05,
+      "loss": 0.1525,
+      "step": 1040
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.3920886516571045,
+      "learning_rate": 1e-05,
+      "loss": 0.1519,
+      "step": 1050
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2492988407611847,
+      "learning_rate": 1e-05,
+      "loss": 0.1486,
+      "step": 1060
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.18808089196681976,
+      "learning_rate": 1e-05,
+      "loss": 0.144,
+      "step": 1070
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.17545698583126068,
+      "learning_rate": 1e-05,
+      "loss": 0.1436,
+      "step": 1080
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.39826542139053345,
+      "learning_rate": 1e-05,
+      "loss": 0.142,
+      "step": 1090
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.25920161604881287,
+      "learning_rate": 1e-05,
+      "loss": 0.1377,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.24598653614521027,
+      "learning_rate": 1e-05,
+      "loss": 0.1343,
+      "step": 1110
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.3771628439426422,
+      "learning_rate": 1e-05,
+      "loss": 0.1308,
+      "step": 1120
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.17581653594970703,
+      "learning_rate": 1e-05,
+      "loss": 0.1304,
+      "step": 1130
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.29346442222595215,
+      "learning_rate": 1e-05,
+      "loss": 0.1272,
+      "step": 1140
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.32129135727882385,
+      "learning_rate": 1e-05,
+      "loss": 0.1249,
+      "step": 1150
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.24236874282360077,
+      "learning_rate": 1e-05,
+      "loss": 0.1246,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.2639343738555908,
+      "learning_rate": 1e-05,
+      "loss": 0.1221,
+      "step": 1170
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1657143533229828,
+      "learning_rate": 1e-05,
+      "loss": 0.1196,
+      "step": 1180
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.20446737110614777,
+      "learning_rate": 1e-05,
+      "loss": 0.1159,
+      "step": 1190
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.20889534056186676,
+      "learning_rate": 1e-05,
+      "loss": 0.1147,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.34177979826927185,
+      "learning_rate": 1e-05,
+      "loss": 0.1141,
+      "step": 1210
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.12536485493183136,
+      "learning_rate": 1e-05,
+      "loss": 0.1126,
+      "step": 1220
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.20015142858028412,
+      "learning_rate": 1e-05,
+      "loss": 0.1099,
+      "step": 1230
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.29639932513237,
+      "learning_rate": 1e-05,
+      "loss": 0.1102,
+      "step": 1240
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.41865894198417664,
+      "learning_rate": 1e-05,
+      "loss": 0.1085,
+      "step": 1250
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.14246885478496552,
+      "learning_rate": 1e-05,
+      "loss": 0.1054,
+      "step": 1260
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.29295045137405396,
+      "learning_rate": 1e-05,
+      "loss": 0.1048,
+      "step": 1270
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.5125443339347839,
+      "learning_rate": 1e-05,
+      "loss": 0.1038,
+      "step": 1280
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.7473035454750061,
+      "learning_rate": 1e-05,
+      "loss": 0.1025,
+      "step": 1290
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.7184324264526367,
+      "learning_rate": 1e-05,
+      "loss": 0.101,
+      "step": 1300
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.2857365906238556,
+      "learning_rate": 1e-05,
+      "loss": 0.0988,
+      "step": 1310
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.15471303462982178,
+      "learning_rate": 1e-05,
+      "loss": 0.0987,
+      "step": 1320
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.5794267654418945,
+      "learning_rate": 1e-05,
+      "loss": 0.0986,
+      "step": 1330
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.3085970878601074,
+      "learning_rate": 1e-05,
+      "loss": 0.0964,
+      "step": 1340
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.38039007782936096,
+      "learning_rate": 1e-05,
+      "loss": 0.0939,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.20592401921749115,
+      "learning_rate": 1e-05,
+      "loss": 0.0932,
+      "step": 1360
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.20163756608963013,
+      "learning_rate": 1e-05,
+      "loss": 0.0931,
+      "step": 1370
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.3857424855232239,
+      "learning_rate": 1e-05,
+      "loss": 0.0913,
+      "step": 1380
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.13959680497646332,
+      "learning_rate": 1e-05,
+      "loss": 0.092,
+      "step": 1390
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4001765251159668,
+      "learning_rate": 1e-05,
+      "loss": 0.0902,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.28235796093940735,
+      "learning_rate": 1e-05,
+      "loss": 0.0896,
+      "step": 1410
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.4206916391849518,
+      "learning_rate": 1e-05,
+      "loss": 0.0874,
+      "step": 1420
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.20532837510108948,
+      "learning_rate": 1e-05,
+      "loss": 0.0859,
+      "step": 1430
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.5433242917060852,
+      "learning_rate": 1e-05,
+      "loss": 0.0868,
+      "step": 1440
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.19398020207881927,
+      "learning_rate": 1e-05,
+      "loss": 0.0862,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.8159850835800171,
+      "learning_rate": 1e-05,
+      "loss": 0.0848,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.18659061193466187,
+      "learning_rate": 1e-05,
+      "loss": 0.0839,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.2018926590681076,
+      "learning_rate": 1e-05,
+      "loss": 0.0826,
+      "step": 1480
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.2839762568473816,
+      "learning_rate": 1e-05,
+      "loss": 0.0824,
+      "step": 1490
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.1700347363948822,
+      "learning_rate": 1e-05,
+      "loss": 0.0814,
+      "step": 1500
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.13129307329654694,
+      "learning_rate": 1e-05,
+      "loss": 0.0815,
+      "step": 1510
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.397320032119751,
+      "learning_rate": 1e-05,
+      "loss": 0.0805,
+      "step": 1520
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.2305113822221756,
+      "learning_rate": 1e-05,
+      "loss": 0.0789,
+      "step": 1530
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.37059059739112854,
+      "learning_rate": 1e-05,
+      "loss": 0.0775,
+      "step": 1540
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.2454056739807129,
+      "learning_rate": 1e-05,
+      "loss": 0.0781,
+      "step": 1550
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.340133398771286,
+      "learning_rate": 1e-05,
+      "loss": 0.0778,
+      "step": 1560
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.250377893447876,
+      "learning_rate": 1e-05,
+      "loss": 0.0753,
+      "step": 1570
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.18379095196723938,
+      "learning_rate": 1e-05,
+      "loss": 0.0755,
+      "step": 1580
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.16987499594688416,
+      "learning_rate": 1e-05,
+      "loss": 0.0743,
+      "step": 1590
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.1600923091173172,
+      "learning_rate": 1e-05,
+      "loss": 0.0739,
+      "step": 1600
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.15362544357776642,
+      "learning_rate": 1e-05,
+      "loss": 0.0732,
+      "step": 1610
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.40313267707824707,
+      "learning_rate": 1e-05,
+      "loss": 0.0726,
+      "step": 1620
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.43576550483703613,
+      "learning_rate": 1e-05,
+      "loss": 0.0722,
+      "step": 1630
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.255543977022171,
+      "learning_rate": 1e-05,
+      "loss": 0.0709,
+      "step": 1640
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.23503994941711426,
+      "learning_rate": 1e-05,
+      "loss": 0.0719,
+      "step": 1650
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.27073901891708374,
+      "learning_rate": 1e-05,
+      "loss": 0.0702,
+      "step": 1660
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.15940971672534943,
+      "learning_rate": 1e-05,
+      "loss": 0.0699,
+      "step": 1670
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.3071956932544708,
+      "learning_rate": 1e-05,
+      "loss": 0.0701,
+      "step": 1680
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.13690294325351715,
+      "learning_rate": 1e-05,
+      "loss": 0.0687,
+      "step": 1690
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.11188670992851257,
+      "learning_rate": 1e-05,
+      "loss": 0.0685,
+      "step": 1700
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.22904923558235168,
+      "learning_rate": 1e-05,
+      "loss": 0.069,
+      "step": 1710
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.15047454833984375,
+      "learning_rate": 1e-05,
+      "loss": 0.0676,
+      "step": 1720
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.1788366287946701,
+      "learning_rate": 1e-05,
+      "loss": 0.0663,
+      "step": 1730
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.1737486571073532,
+      "learning_rate": 1e-05,
+      "loss": 0.0665,
+      "step": 1740
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.18163177371025085,
+      "learning_rate": 1e-05,
+      "loss": 0.066,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.15237963199615479,
+      "learning_rate": 1e-05,
+      "loss": 0.0647,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.1539287269115448,
+      "learning_rate": 1e-05,
+      "loss": 0.0655,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.13951052725315094,
+      "learning_rate": 1e-05,
+      "loss": 0.0649,
+      "step": 1780
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.1300867348909378,
+      "learning_rate": 1e-05,
+      "loss": 0.0641,
+      "step": 1790
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.12030315399169922,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "step": 1800
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.10913581401109695,
+      "learning_rate": 1e-05,
+      "loss": 0.0636,
+      "step": 1810
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.17842526733875275,
+      "learning_rate": 1e-05,
+      "loss": 0.0637,
+      "step": 1820
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.20770031213760376,
+      "learning_rate": 1e-05,
+      "loss": 0.0632,
+      "step": 1830
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.12772506475448608,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "step": 1840
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.10420256853103638,
+      "learning_rate": 1e-05,
+      "loss": 0.0625,
+      "step": 1850
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.2480468451976776,
+      "learning_rate": 1e-05,
+      "loss": 0.062,
+      "step": 1860
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.17825984954833984,
+      "learning_rate": 1e-05,
+      "loss": 0.0605,
+      "step": 1870
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.12127891182899475,
+      "learning_rate": 1e-05,
+      "loss": 0.0608,
+      "step": 1880
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.09041409939527512,
+      "learning_rate": 1e-05,
+      "loss": 0.0602,
+      "step": 1890
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.10765266418457031,
+      "learning_rate": 1e-05,
+      "loss": 0.0604,
+      "step": 1900
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.12556296586990356,
+      "learning_rate": 1e-05,
+      "loss": 0.0592,
+      "step": 1910
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.2558988928794861,
+      "learning_rate": 1e-05,
+      "loss": 0.0598,
+      "step": 1920
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.12372375279664993,
+      "learning_rate": 1e-05,
+      "loss": 0.0582,
+      "step": 1930
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.0909271314740181,
+      "learning_rate": 1e-05,
+      "loss": 0.0586,
+      "step": 1940
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.19711463153362274,
+      "learning_rate": 1e-05,
+      "loss": 0.058,
+      "step": 1950
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.13144218921661377,
+      "learning_rate": 1e-05,
+      "loss": 0.0592,
+      "step": 1960
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.10313715785741806,
+      "learning_rate": 1e-05,
+      "loss": 0.0577,
+      "step": 1970
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.15561428666114807,
+      "learning_rate": 1e-05,
+      "loss": 0.0581,
+      "step": 1980
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.1571255624294281,
+      "learning_rate": 1e-05,
+      "loss": 0.0576,
+      "step": 1990
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.34297963976860046,
+      "learning_rate": 1e-05,
+      "loss": 0.0574,
+      "step": 2000
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.3346453011035919,
+      "learning_rate": 1e-05,
+      "loss": 0.0573,
+      "step": 2010
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.09346891939640045,
+      "learning_rate": 1e-05,
+      "loss": 0.0558,
+      "step": 2020
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.3849794566631317,
+      "learning_rate": 1e-05,
+      "loss": 0.0576,
+      "step": 2030
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.3006366789340973,
+      "learning_rate": 1e-05,
+      "loss": 0.0561,
+      "step": 2040
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.08292259275913239,
+      "learning_rate": 1e-05,
+      "loss": 0.0567,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.37851089239120483,
+      "learning_rate": 1e-05,
+      "loss": 0.0556,
+      "step": 2060
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.2993626892566681,
+      "learning_rate": 1e-05,
+      "loss": 0.0551,
+      "step": 2070
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.3539237976074219,
+      "learning_rate": 1e-05,
+      "loss": 0.0556,
+      "step": 2080
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.6489169597625732,
+      "learning_rate": 1e-05,
+      "loss": 0.0556,
+      "step": 2090
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.20589084923267365,
+      "learning_rate": 1e-05,
+      "loss": 0.0545,
+      "step": 2100
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.08394920825958252,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "step": 2110
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.0861760601401329,
+      "learning_rate": 1e-05,
+      "loss": 0.0534,
+      "step": 2120
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.22998709976673126,
+      "learning_rate": 1e-05,
+      "loss": 0.0538,
+      "step": 2130
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.1493302583694458,
+      "learning_rate": 1e-05,
+      "loss": 0.0542,
+      "step": 2140
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.11739397794008255,
+      "learning_rate": 1e-05,
+      "loss": 0.0539,
+      "step": 2150
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.10503773391246796,
+      "learning_rate": 1e-05,
+      "loss": 0.053,
+      "step": 2160
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2653915286064148,
+      "learning_rate": 1e-05,
+      "loss": 0.0533,
+      "step": 2170
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.1554844081401825,
+      "learning_rate": 1e-05,
+      "loss": 0.0528,
+      "step": 2180
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.20956221222877502,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "step": 2190
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.07734131067991257,
+      "learning_rate": 1e-05,
+      "loss": 0.0524,
+      "step": 2200
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 17652,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 6,
+  "save_steps": 200,
+  "total_flos": 3.689592339116851e+18,
+  "train_batch_size": 3,
+  "trial_name": null,
+  "trial_params": null
+}

esm2_3B/mlm/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e1d2e9bba2aea11f5d16663f0a2e18f04381af354b4f8929817dfa0742d7f94c
+size 4920

esm2_3B/mlm/vocab.txt ADDED Viewed

	@@ -0,0 +1,33 @@

+<cls>
+<pad>
+<eos>
+<unk>
+L
+A
+G
+V
+S
+E
+R
+T
+I
+D
+P
+K
+Q
+N
+F
+Y
+M
+H
+W
+C
+X
+B
+U
+Z
+O
+.
+-
+<null_1>
+<mask>