dsaunders23 committed
Commit b126083 · verified · 1 parent: c46ba0b

Upload TinyLlama chess eval model

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. README.md +96 -0
  2. adapter_config.json +37 -0
  3. adapter_model.safetensors +3 -0
  4. checkpoint-242/README.md +202 -0
  5. checkpoint-242/adapter_config.json +37 -0
  6. checkpoint-242/adapter_model.safetensors +3 -0
  7. checkpoint-242/optimizer.pt +3 -0
  8. checkpoint-242/rng_state.pth +3 -0
  9. checkpoint-242/scheduler.pt +3 -0
  10. checkpoint-242/special_tokens_map.json +30 -0
  11. checkpoint-242/tokenizer.json +0 -0
  12. checkpoint-242/tokenizer.model +3 -0
  13. checkpoint-242/tokenizer_config.json +44 -0
  14. checkpoint-242/trainer_state.json +880 -0
  15. checkpoint-242/training_args.bin +3 -0
  16. checkpoint-312/README.md +202 -0
  17. checkpoint-312/adapter_config.json +37 -0
  18. checkpoint-312/adapter_model.safetensors +3 -0
  19. checkpoint-312/optimizer.pt +3 -0
  20. checkpoint-312/rng_state.pth +3 -0
  21. checkpoint-312/scheduler.pt +3 -0
  22. checkpoint-312/special_tokens_map.json +30 -0
  23. checkpoint-312/tokenizer.json +0 -0
  24. checkpoint-312/tokenizer.model +3 -0
  25. checkpoint-312/tokenizer_config.json +44 -0
  26. checkpoint-312/trainer_state.json +2217 -0
  27. checkpoint-312/training_args.bin +3 -0
  28. checkpoint-363/README.md +202 -0
  29. checkpoint-363/adapter_config.json +37 -0
  30. checkpoint-363/adapter_model.safetensors +3 -0
  31. checkpoint-363/optimizer.pt +3 -0
  32. checkpoint-363/rng_state.pth +3 -0
  33. checkpoint-363/scheduler.pt +3 -0
  34. checkpoint-363/special_tokens_map.json +30 -0
  35. checkpoint-363/tokenizer.json +0 -0
  36. checkpoint-363/tokenizer.model +3 -0
  37. checkpoint-363/tokenizer_config.json +44 -0
  38. checkpoint-363/trainer_state.json +1300 -0
  39. checkpoint-363/training_args.bin +3 -0
  40. checkpoint-484/README.md +202 -0
  41. checkpoint-484/adapter_config.json +37 -0
  42. checkpoint-484/adapter_model.safetensors +3 -0
  43. checkpoint-484/optimizer.pt +3 -0
  44. checkpoint-484/rng_state.pth +3 -0
  45. checkpoint-484/scheduler.pt +3 -0
  46. checkpoint-484/special_tokens_map.json +30 -0
  47. checkpoint-484/tokenizer.json +0 -0
  48. checkpoint-484/tokenizer.model +3 -0
  49. checkpoint-484/tokenizer_config.json +44 -0
  50. checkpoint-484/trainer_state.json +1727 -0
README.md ADDED
@@ -0,0 +1,96 @@
+ ---
+ library_name: peft
+ license: apache-2.0
+ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ tags:
+ - generated_from_trainer
+ datasets:
+ - dsaunders23/ChessAlpacaPrediction
+ model-index:
+ - name: outputs/mymodel
+ results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+ <details><summary>See axolotl config</summary>
+
+ axolotl version: `0.8.0.dev0`
+ ```yaml
+ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ datasets:
+ - path: dsaunders23/ChessAlpacaPrediction
+ type: alpaca
+ output_dir: ./outputs/mymodel
+
+ sequence_len: 4096
+ adapter: lora
+
+ lora_r: 8
+ lora_alpha: 16
+ lora_dropout: 0.05
+ lora_target_modules:
+ - q_proj
+ - v_proj
+ - k_proj
+ - o_proj
+ - gate_proj
+ - down_proj
+ - up_proj
+
+ gradient_accumulation_steps: 1
+ micro_batch_size: 16
+ num_epochs: 1
+ optimizer: adamw_bnb_8bit
+ learning_rate: 0.0002
+ load_in_8bit: true
+ train_on_inputs: false
+ bf16: auto
+
+ ```
+
+ </details><br>
+
+ # outputs/mymodel
+
+ This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) on the dsaunders23/ChessAlpacaPrediction dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 16
+ - eval_batch_size: 16
+ - seed: 42
+ - optimizer: OptimizerNames.ADAMW_BNB with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 3
+ - num_epochs: 1.0
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - PEFT 0.14.0
+ - Transformers 4.49.0
+ - Pytorch 2.5.1+cu124
+ - Datasets 3.2.0
+ - Tokenizers 0.21.0
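
The card's "How to Get Started with the Model" section is still a placeholder, so here is a minimal sketch of applying the uploaded LoRA adapter to the TinyLlama base model. `ADAPTER_REPO` is a hypothetical placeholder for this repository's Hub id (not stated in the diff), and the Alpaca prompt wording is an assumption based on the config's `type: alpaca`:

```python
# Minimal sketch: load the base model and attach the LoRA adapter from this repo.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
ADAPTER_REPO = "your-username/your-adapter-repo"  # hypothetical placeholder

tokenizer = AutoTokenizer.from_pretrained(BASE)
base_model = AutoModelForCausalLM.from_pretrained(BASE)
# Reads adapter_config.json + adapter_model.safetensors from the adapter repo.
model = PeftModel.from_pretrained(base_model, ADAPTER_REPO)

# Assumed Alpaca-style prompt (the dataset was loaded with `type: alpaca`).
prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n1. e4 e5 2. Nf3 — predict Black's next move.\n\n"
    "### Response:\n"
)
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```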
adapter_config.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+ "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "v_proj",
+ "o_proj",
+ "q_proj",
+ "gate_proj",
+ "down_proj",
+ "k_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
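
For reference, the adapter_config.json above corresponds roughly to the following peft `LoraConfig` — a sketch, with fields that stay at their defaults omitted:

```python
# Sketch: the LoRA setup recorded in adapter_config.json, as a peft config.
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    r=8,             # "r": 8
    lora_alpha=16,   # "lora_alpha": 16
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type=TaskType.CAUSAL_LM,
)
```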
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:834aa01d3a27c481518b8eb0670306ea14dd35884712f4af347d7e8713cbc7d1
+ size 25271744
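
Three-line stubs like the one above are Git LFS pointers (spec v1: version, sha256 oid, byte size), not the weights themselves. A sketch of verifying a downloaded blob against its pointer (the function name is illustrative):

```python
# Sketch: check a downloaded file against the Git LFS pointer shown above.
import hashlib

def verify_lfs(pointer_path: str, blob_path: str) -> bool:
    # Pointer lines look like "oid sha256:<hex>" and "size <bytes>".
    fields = dict(line.split(" ", 1) for line in open(pointer_path) if " " in line)
    expected = fields["oid"].strip().split(":", 1)[1]
    h = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == expected
```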
checkpoint-242/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.14.0
checkpoint-242/adapter_config.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+ "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "gate_proj",
+ "down_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj",
+ "up_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
checkpoint-242/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e8988b279ceed041b49d6696eb4849dc7764ae0a1783dbaa8f2b3ae72e771fd0
+ size 25271744
checkpoint-242/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cda84e7dc66fd006a7e5db700db9fac64d1e038d9a076b5502b2a82389377c7c
+ size 13685516
checkpoint-242/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd919faa08944c698a3f9ac10339b1cd07038ae87ff6b3366876c1204571e079
+ size 14244
checkpoint-242/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26e0e13ba36eccb93aa31dc1658c17534123aefc1a1837dbb8255129bf68b872
+ size 1064
checkpoint-242/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
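
Note that special_tokens_map.json reuses `</s>` as both eos_token and pad_token. A minimal sketch of the equivalent tokenizer setup (assuming the base TinyLlama tokenizer):

```python
# Sketch: reproduce the padding setup from special_tokens_map.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tok.pad_token = tok.eos_token  # matches "pad_token": "</s>" above
batch = tok(["e2e4", "a much longer input sequence"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)  # shorter row is padded with </s>
```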
checkpoint-242/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-242/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-242/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 2048,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
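
The `chat_template` field above is the `<|user|>` / `<|system|>` / `<|assistant|>` template TinyLlama-Chat ships with. A sketch of rendering it via `apply_chat_template` (the message content is illustrative):

```python
# Sketch: render the chat template stored in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
messages = [
    {"role": "system", "content": "You are a chess assistant."},
    {"role": "user", "content": "1. e4 e5 2. Nf3 — predict Black's reply."},
]
text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)  # "<|system|>\n...</s>\n<|user|>\n...</s>\n<|assistant|>" per the template
```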
checkpoint-242/trainer_state.json ADDED
@@ -0,0 +1,880 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 242,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.01652892561983471,
+ "grad_norm": 12.011792182922363,
+ "learning_rate": 2.857142857142857e-05,
+ "loss": 4.7518,
+ "step": 2
+ },
+ {
+ "epoch": 0.03305785123966942,
+ "grad_norm": 12.689871788024902,
+ "learning_rate": 5.714285714285714e-05,
+ "loss": 4.687,
+ "step": 4
+ },
+ {
+ "epoch": 0.049586776859504134,
+ "grad_norm": 11.848198890686035,
+ "learning_rate": 8.571428571428571e-05,
+ "loss": 4.4922,
+ "step": 6
+ },
+ {
+ "epoch": 0.06611570247933884,
+ "grad_norm": 10.378408432006836,
+ "learning_rate": 0.00011428571428571428,
+ "loss": 3.4839,
+ "step": 8
+ },
+ {
+ "epoch": 0.08264462809917356,
+ "grad_norm": 9.543920516967773,
+ "learning_rate": 0.00014285714285714287,
+ "loss": 2.4164,
+ "step": 10
+ },
+ {
+ "epoch": 0.09917355371900827,
+ "grad_norm": 5.602456092834473,
+ "learning_rate": 0.00017142857142857143,
+ "loss": 1.838,
+ "step": 12
+ },
+ {
+ "epoch": 0.11570247933884298,
+ "grad_norm": 1.7414193153381348,
+ "learning_rate": 0.0002,
+ "loss": 1.4404,
+ "step": 14
+ },
+ {
+ "epoch": 0.1322314049586777,
+ "grad_norm": 1.0191770792007446,
+ "learning_rate": 0.0001999910643210378,
+ "loss": 1.3474,
+ "step": 16
+ },
+ {
+ "epoch": 0.1487603305785124,
+ "grad_norm": 3.509352922439575,
+ "learning_rate": 0.0001999642588810784,
+ "loss": 1.3789,
+ "step": 18
+ },
+ {
+ "epoch": 0.1652892561983471,
+ "grad_norm": 2.426680088043213,
+ "learning_rate": 0.00019991958847061784,
+ "loss": 1.2245,
+ "step": 20
+ },
+ {
+ "epoch": 0.18181818181818182,
+ "grad_norm": 0.8060101270675659,
+ "learning_rate": 0.00019985706107286514,
+ "loss": 1.2259,
+ "step": 22
+ },
+ {
+ "epoch": 0.19834710743801653,
+ "grad_norm": 1.6215940713882446,
+ "learning_rate": 0.00019977668786231534,
+ "loss": 1.1992,
+ "step": 24
+ },
+ {
+ "epoch": 0.21487603305785125,
+ "grad_norm": 2.476259231567383,
+ "learning_rate": 0.0001996784832027525,
+ "loss": 1.2782,
+ "step": 26
+ },
+ {
+ "epoch": 0.23140495867768596,
+ "grad_norm": 2.04018235206604,
+ "learning_rate": 0.00019956246464468294,
+ "loss": 1.23,
+ "step": 28
+ },
+ {
+ "epoch": 0.24793388429752067,
+ "grad_norm": 1.8799017667770386,
+ "learning_rate": 0.00019942865292219838,
+ "loss": 1.2164,
+ "step": 30
+ },
+ {
+ "epoch": 0.2644628099173554,
+ "grad_norm": 1.2413148880004883,
+ "learning_rate": 0.00019927707194927066,
+ "loss": 1.2924,
+ "step": 32
+ },
+ {
+ "epoch": 0.2809917355371901,
+ "grad_norm": 1.8723937273025513,
+ "learning_rate": 0.000199107748815478,
+ "loss": 1.2302,
+ "step": 34
+ },
+ {
+ "epoch": 0.2975206611570248,
+ "grad_norm": 1.393110752105713,
+ "learning_rate": 0.00019892071378116376,
+ "loss": 1.2276,
+ "step": 36
+ },
+ {
+ "epoch": 0.3140495867768595,
+ "grad_norm": 1.1459721326828003,
+ "learning_rate": 0.0001987160002720283,
+ "loss": 1.1504,
+ "step": 38
+ },
+ {
+ "epoch": 0.3305785123966942,
+ "grad_norm": 1.4680942296981812,
+ "learning_rate": 0.00019849364487315558,
+ "loss": 1.1623,
+ "step": 40
+ },
+ {
+ "epoch": 0.34710743801652894,
+ "grad_norm": 1.8715866804122925,
+ "learning_rate": 0.0001982536873224748,
+ "loss": 1.2155,
+ "step": 42
+ },
+ {
+ "epoch": 0.36363636363636365,
+ "grad_norm": 0.871064305305481,
+ "learning_rate": 0.0001979961705036587,
+ "loss": 1.1594,
+ "step": 44
+ },
+ {
+ "epoch": 0.38016528925619836,
+ "grad_norm": 0.8239800930023193,
+ "learning_rate": 0.00019772114043845965,
+ "loss": 1.1501,
+ "step": 46
+ },
+ {
+ "epoch": 0.39669421487603307,
+ "grad_norm": 0.9587319493293762,
+ "learning_rate": 0.0001974286462784851,
+ "loss": 1.1195,
+ "step": 48
+ },
+ {
+ "epoch": 0.4132231404958678,
+ "grad_norm": 1.1645926237106323,
+ "learning_rate": 0.0001971187402964132,
+ "loss": 1.1417,
+ "step": 50
+ },
+ {
+ "epoch": 0.4297520661157025,
+ "grad_norm": 0.576813817024231,
+ "learning_rate": 0.00019679147787665126,
+ "loss": 1.1445,
+ "step": 52
+ },
+ {
+ "epoch": 0.4462809917355372,
+ "grad_norm": 1.0733133554458618,
+ "learning_rate": 0.00019644691750543767,
+ "loss": 1.0979,
+ "step": 54
+ },
+ {
+ "epoch": 0.4628099173553719,
+ "grad_norm": 0.5801639556884766,
+ "learning_rate": 0.00019608512076038962,
+ "loss": 1.0977,
+ "step": 56
+ },
+ {
+ "epoch": 0.4793388429752066,
+ "grad_norm": 1.6796538829803467,
+ "learning_rate": 0.00019570615229949842,
+ "loss": 1.1925,
+ "step": 58
+ },
+ {
+ "epoch": 0.49586776859504134,
+ "grad_norm": 1.0563887357711792,
+ "learning_rate": 0.00019531007984957408,
+ "loss": 1.0657,
+ "step": 60
+ },
+ {
+ "epoch": 0.512396694214876,
+ "grad_norm": 0.9109811186790466,
+ "learning_rate": 0.00019489697419414182,
+ "loss": 1.1098,
+ "step": 62
+ },
+ {
+ "epoch": 0.5289256198347108,
+ "grad_norm": 0.7321667671203613,
+ "learning_rate": 0.0001944669091607919,
+ "loss": 1.0929,
+ "step": 64
+ },
+ {
+ "epoch": 0.5454545454545454,
+ "grad_norm": 0.685366690158844,
+ "learning_rate": 0.00019401996160798573,
+ "loss": 1.1242,
+ "step": 66
+ },
+ {
+ "epoch": 0.5619834710743802,
+ "grad_norm": 0.8959838151931763,
+ "learning_rate": 0.0001935562114113202,
+ "loss": 1.181,
+ "step": 68
+ },
+ {
+ "epoch": 0.5785123966942148,
+ "grad_norm": 0.9717262983322144,
+ "learning_rate": 0.00019307574144925287,
+ "loss": 1.2295,
+ "step": 70
+ },
+ {
+ "epoch": 0.5950413223140496,
+ "grad_norm": 1.0358582735061646,
+ "learning_rate": 0.00019257863758829035,
+ "loss": 1.1431,
+ "step": 72
+ },
+ {
+ "epoch": 0.6115702479338843,
+ "grad_norm": 0.7998526096343994,
+ "learning_rate": 0.00019206498866764288,
+ "loss": 1.1032,
+ "step": 74
+ },
+ {
+ "epoch": 0.628099173553719,
+ "grad_norm": 1.1496188640594482,
+ "learning_rate": 0.0001915348864833476,
+ "loss": 1.057,
+ "step": 76
+ },
+ {
+ "epoch": 0.6446280991735537,
+ "grad_norm": 0.652406632900238,
+ "learning_rate": 0.00019098842577186314,
+ "loss": 1.146,
+ "step": 78
+ },
+ {
+ "epoch": 0.6611570247933884,
+ "grad_norm": 0.9454944729804993,
+ "learning_rate": 0.00019042570419313925,
+ "loss": 1.1543,
+ "step": 80
+ },
+ {
+ "epoch": 0.6776859504132231,
+ "grad_norm": 0.7456652522087097,
+ "learning_rate": 0.00018984682231316333,
+ "loss": 1.1189,
+ "step": 82
+ },
+ {
+ "epoch": 0.6942148760330579,
+ "grad_norm": 0.7312512397766113,
+ "learning_rate": 0.00018925188358598813,
+ "loss": 1.0873,
+ "step": 84
+ },
+ {
+ "epoch": 0.7107438016528925,
+ "grad_norm": 0.8474765419960022,
+ "learning_rate": 0.000188640994335243,
+ "loss": 1.1698,
+ "step": 86
+ },
+ {
+ "epoch": 0.7272727272727273,
+ "grad_norm": 0.6979633569717407,
+ "learning_rate": 0.0001880142637351325,
+ "loss": 1.1417,
+ "step": 88
+ },
+ {
+ "epoch": 0.743801652892562,
+ "grad_norm": 0.5989161133766174,
+ "learning_rate": 0.00018737180379092537,
+ "loss": 1.0479,
+ "step": 90
+ },
+ {
+ "epoch": 0.7603305785123967,
+ "grad_norm": 0.5765272378921509,
+ "learning_rate": 0.00018671372931893773,
+ "loss": 1.1336,
+ "step": 92
+ },
+ {
+ "epoch": 0.7768595041322314,
+ "grad_norm": 0.6709849834442139,
+ "learning_rate": 0.00018604015792601396,
+ "loss": 1.1157,
+ "step": 94
+ },
+ {
+ "epoch": 0.7933884297520661,
+ "grad_norm": 0.8181343674659729,
+ "learning_rate": 0.00018535120998850848,
+ "loss": 1.0927,
+ "step": 96
+ },
+ {
+ "epoch": 0.8099173553719008,
+ "grad_norm": 0.6146332621574402,
+ "learning_rate": 0.00018464700863077312,
+ "loss": 1.0739,
+ "step": 98
+ },
+ {
+ "epoch": 0.8264462809917356,
+ "grad_norm": 0.9904415011405945,
+ "learning_rate": 0.00018392767970315313,
+ "loss": 1.0331,
+ "step": 100
+ },
+ {
+ "epoch": 0.8429752066115702,
+ "grad_norm": 0.6186695694923401,
+ "learning_rate": 0.0001831933517594957,
+ "loss": 1.0513,
+ "step": 102
+ },
+ {
+ "epoch": 0.859504132231405,
+ "grad_norm": 1.1912785768508911,
+ "learning_rate": 0.00018244415603417603,
+ "loss": 1.1567,
+ "step": 104
+ },
+ {
+ "epoch": 0.8760330578512396,
+ "grad_norm": 1.3681318759918213,
+ "learning_rate": 0.00018168022641864377,
+ "loss": 1.1497,
+ "step": 106
+ },
+ {
+ "epoch": 0.8925619834710744,
+ "grad_norm": 0.619476318359375,
+ "learning_rate": 0.00018090169943749476,
+ "loss": 1.1546,
+ "step": 108
+ },
+ {
+ "epoch": 0.9090909090909091,
+ "grad_norm": 0.7421219348907471,
+ "learning_rate": 0.00018010871422407236,
+ "loss": 1.1458,
+ "step": 110
+ },
+ {
+ "epoch": 0.9256198347107438,
+ "grad_norm": 0.6569286584854126,
+ "learning_rate": 0.00017930141249560233,
+ "loss": 1.12,
+ "step": 112
+ },
+ {
+ "epoch": 0.9421487603305785,
+ "grad_norm": 0.4168110191822052,
+ "learning_rate": 0.0001784799385278661,
+ "loss": 1.1682,
+ "step": 114
+ },
+ {
+ "epoch": 0.9586776859504132,
+ "grad_norm": 0.5620162487030029,
+ "learning_rate": 0.00017764443912941672,
+ "loss": 1.1828,
+ "step": 116
+ },
+ {
+ "epoch": 0.9752066115702479,
+ "grad_norm": 0.8095484375953674,
+ "learning_rate": 0.00017679506361534215,
+ "loss": 1.1953,
+ "step": 118
+ },
+ {
+ "epoch": 0.9917355371900827,
+ "grad_norm": 0.7646257281303406,
+ "learning_rate": 0.0001759319637805806,
+ "loss": 1.2148,
+ "step": 120
+ },
+ {
+ "epoch": 1.0082644628099173,
+ "grad_norm": 0.5254501104354858,
+ "learning_rate": 0.00017505529387279277,
+ "loss": 1.1359,
+ "step": 122
+ },
+ {
+ "epoch": 1.024793388429752,
+ "grad_norm": 0.6001765727996826,
+ "learning_rate": 0.00017416521056479577,
+ "loss": 1.1336,
+ "step": 124
+ },
+ {
+ "epoch": 1.0413223140495869,
+ "grad_norm": 0.35407504439353943,
+ "learning_rate": 0.00017326187292656333,
+ "loss": 1.1833,
+ "step": 126
+ },
+ {
+ "epoch": 1.0578512396694215,
+ "grad_norm": 0.414528489112854,
+ "learning_rate": 0.00017234544239679806,
+ "loss": 1.1301,
+ "step": 128
+ },
+ {
+ "epoch": 1.0743801652892562,
+ "grad_norm": 0.46355852484703064,
+ "learning_rate": 0.00017141608275408006,
+ "loss": 1.213,
+ "step": 130
+ },
+ {
+ "epoch": 1.0909090909090908,
+ "grad_norm": 0.5040593147277832,
+ "learning_rate": 0.00017047396008759754,
+ "loss": 1.132,
+ "step": 132
+ },
+ {
+ "epoch": 1.1074380165289257,
+ "grad_norm": 0.4813704192638397,
+ "learning_rate": 0.00016951924276746425,
+ "loss": 1.0831,
+ "step": 134
+ },
+ {
+ "epoch": 1.1239669421487604,
+ "grad_norm": 0.5174686312675476,
+ "learning_rate": 0.00016855210141462963,
+ "loss": 1.0514,
+ "step": 136
+ },
+ {
+ "epoch": 1.140495867768595,
+ "grad_norm": 0.4712466299533844,
+ "learning_rate": 0.00016757270887038654,
+ "loss": 1.1334,
+ "step": 138
+ },
+ {
+ "epoch": 1.1570247933884297,
+ "grad_norm": 0.5912173390388489,
+ "learning_rate": 0.00016658124016548197,
+ "loss": 1.1011,
+ "step": 140
+ },
+ {
+ "epoch": 1.1735537190082646,
+ "grad_norm": 0.6392802000045776,
+ "learning_rate": 0.00016557787248883696,
+ "loss": 1.1361,
+ "step": 142
+ },
+ {
+ "epoch": 1.1900826446280992,
+ "grad_norm": 0.7376368045806885,
+ "learning_rate": 0.00016456278515588024,
+ "loss": 1.109,
+ "step": 144
+ },
+ {
+ "epoch": 1.2066115702479339,
+ "grad_norm": 0.5020875930786133,
+ "learning_rate": 0.00016353615957650236,
+ "loss": 1.0925,
+ "step": 146
+ },
+ {
+ "epoch": 1.2231404958677685,
+ "grad_norm": 0.8081740736961365,
+ "learning_rate": 0.00016249817922263517,
+ "loss": 1.047,
+ "step": 148
+ },
+ {
+ "epoch": 1.2396694214876034,
+ "grad_norm": 0.6371219754219055,
+ "learning_rate": 0.00016144902959546286,
+ "loss": 1.113,
+ "step": 150
+ },
+ {
+ "epoch": 1.256198347107438,
+ "grad_norm": 0.7588189840316772,
+ "learning_rate": 0.00016038889819227045,
+ "loss": 1.1179,
+ "step": 152
+ },
+ {
+ "epoch": 1.2727272727272727,
+ "grad_norm": 0.6286205053329468,
+ "learning_rate": 0.00015931797447293552,
+ "loss": 1.1209,
+ "step": 154
+ },
+ {
+ "epoch": 1.2892561983471074,
+ "grad_norm": 0.797656238079071,
+ "learning_rate": 0.00015823644982606905,
+ "loss": 1.1698,
+ "step": 156
+ },
+ {
+ "epoch": 1.3057851239669422,
+ "grad_norm": 0.5368632078170776,
+ "learning_rate": 0.00015714451753481168,
+ "loss": 1.1973,
+ "step": 158
+ },
+ {
+ "epoch": 1.322314049586777,
+ "grad_norm": 0.4135212302207947,
+ "learning_rate": 0.00015604237274229147,
+ "loss": 1.1452,
+ "step": 160
+ },
+ {
+ "epoch": 1.3388429752066116,
+ "grad_norm": 0.5289668440818787,
+ "learning_rate": 0.00015493021241674918,
+ "loss": 1.1954,
+ "step": 162
+ },
+ {
+ "epoch": 1.3553719008264462,
+ "grad_norm": 0.4092061221599579,
+ "learning_rate": 0.00015380823531633729,
+ "loss": 1.1226,
+ "step": 164
+ },
+ {
+ "epoch": 1.3719008264462809,
+ "grad_norm": 0.7049645781517029,
+ "learning_rate": 0.00015267664195359917,
+ "loss": 1.0948,
+ "step": 166
+ },
+ {
+ "epoch": 1.3884297520661157,
+ "grad_norm": 0.47164198756217957,
+ "learning_rate": 0.00015153563455963499,
+ "loss": 1.0977,
+ "step": 168
+ },
+ {
+ "epoch": 1.4049586776859504,
+ "grad_norm": 0.7871695160865784,
+ "learning_rate": 0.00015038541704796003,
+ "loss": 1.1674,
+ "step": 170
+ },
+ {
+ "epoch": 1.421487603305785,
+ "grad_norm": 0.5381121635437012,
+ "learning_rate": 0.00014922619497806277,
+ "loss": 1.1415,
+ "step": 172
+ },
+ {
+ "epoch": 1.43801652892562,
+ "grad_norm": 0.39419299364089966,
+ "learning_rate": 0.00014805817551866838,
+ "loss": 1.0747,
+ "step": 174
+ },
+ {
+ "epoch": 1.4545454545454546,
+ "grad_norm": 0.38382914662361145,
+ "learning_rate": 0.00014688156741071514,
+ "loss": 1.1278,
+ "step": 176
+ },
+ {
+ "epoch": 1.4710743801652892,
+ "grad_norm": 0.32674962282180786,
+ "learning_rate": 0.00014569658093004935,
+ "loss": 0.9774,
+ "step": 178
+ },
+ {
+ "epoch": 1.487603305785124,
+ "grad_norm": 0.5443088412284851,
+ "learning_rate": 0.00014450342784984633,
+ "loss": 1.034,
+ "step": 180
+ },
+ {
+ "epoch": 1.5041322314049586,
+ "grad_norm": 0.6682401895523071,
+ "learning_rate": 0.00014330232140276366,
+ "loss": 1.1732,
+ "step": 182
+ },
+ {
+ "epoch": 1.5206611570247934,
+ "grad_norm": 0.5696044564247131,
+ "learning_rate": 0.0001420934762428335,
+ "loss": 1.0384,
+ "step": 184
+ },
+ {
+ "epoch": 1.537190082644628,
+ "grad_norm": 0.6782551407814026,
+ "learning_rate": 0.0001408771084071012,
+ "loss": 1.1107,
+ "step": 186
+ },
+ {
+ "epoch": 1.553719008264463,
+ "grad_norm": 0.8336123824119568,
+ "learning_rate": 0.00013965343527701628,
+ "loss": 1.0737,
+ "step": 188
+ },
+ {
+ "epoch": 1.5702479338842976,
+ "grad_norm": 0.539226233959198,
+ "learning_rate": 0.00013842267553958371,
+ "loss": 1.1665,
+ "step": 190
+ },
+ {
+ "epoch": 1.5867768595041323,
+ "grad_norm": 0.566620409488678,
+ "learning_rate": 0.00013718504914828135,
+ "loss": 1.1333,
+ "step": 192
+ },
+ {
+ "epoch": 1.603305785123967,
+ "grad_norm": 0.4735005795955658,
+ "learning_rate": 0.00013594077728375128,
+ "loss": 1.1709,
+ "step": 194
+ },
+ {
+ "epoch": 1.6198347107438016,
+ "grad_norm": 0.534383237361908,
+ "learning_rate": 0.00013469008231427207,
+ "loss": 1.0783,
+ "step": 196
+ },
+ {
+ "epoch": 1.6363636363636362,
+ "grad_norm": 0.8410363793373108,
+ "learning_rate": 0.0001334331877560182,
+ "loss": 1.0708,
+ "step": 198
+ },
+ {
+ "epoch": 1.6528925619834711,
+ "grad_norm": 0.6392219662666321,
+ "learning_rate": 0.00013217031823311488,
+ "loss": 1.0329,
+ "step": 200
+ },
+ {
+ "epoch": 1.6694214876033058,
+ "grad_norm": 0.5770404934883118,
+ "learning_rate": 0.00013090169943749476,
+ "loss": 1.0404,
+ "step": 202
+ },
+ {
+ "epoch": 1.6859504132231407,
+ "grad_norm": 0.6814575791358948,
+ "learning_rate": 0.00012962755808856342,
+ "loss": 1.0702,
+ "step": 204
+ },
+ {
+ "epoch": 1.7024793388429753,
+ "grad_norm": 0.673312783241272,
+ "learning_rate": 0.0001283481218926818,
+ "loss": 1.0529,
+ "step": 206
+ },
+ {
+ "epoch": 1.71900826446281,
+ "grad_norm": 0.6180073618888855,
+ "learning_rate": 0.0001270636195024719,
+ "loss": 1.0257,
+ "step": 208
+ },
+ {
+ "epoch": 1.7355371900826446,
+ "grad_norm": 0.5565724968910217,
+ "learning_rate": 0.00012577428047595344,
+ "loss": 1.1102,
+ "step": 210
+ },
+ {
+ "epoch": 1.7520661157024793,
+ "grad_norm": 0.5586270689964294,
+ "learning_rate": 0.00012448033523551865,
+ "loss": 1.0277,
+ "step": 212
+ },
+ {
+ "epoch": 1.768595041322314,
+ "grad_norm": 0.542448878288269,
+ "learning_rate": 0.00012318201502675285,
+ "loss": 1.0988,
+ "step": 214
+ },
+ {
+ "epoch": 1.7851239669421488,
+ "grad_norm": 0.513042151927948,
+ "learning_rate": 0.0001218795518771075,
+ "loss": 1.0828,
+ "step": 216
+ },
+ {
+ "epoch": 1.8016528925619835,
+ "grad_norm": 0.7613060474395752,
+ "learning_rate": 0.00012057317855443395,
+ "loss": 1.1962,
+ "step": 218
+ },
+ {
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.7522129416465759,
+ "learning_rate": 0.00011926312852538455,
+ "loss": 1.1339,
+ "step": 220
+ },
+ {
+ "epoch": 1.834710743801653,
+ "grad_norm": 0.4655594825744629,
+ "learning_rate": 0.00011794963591368893,
+ "loss": 1.0967,
+ "step": 222
+ },
+ {
+ "epoch": 1.8512396694214877,
+ "grad_norm": 0.5036570429801941,
+ "learning_rate": 0.00011663293545831302,
+ "loss": 1.0361,
+ "step": 224
+ },
+ {
+ "epoch": 1.8677685950413223,
+ "grad_norm": 0.43016380071640015,
+ "learning_rate": 0.00011531326247150803,
+ "loss": 1.1281,
+ "step": 226
+ },
+ {
+ "epoch": 1.884297520661157,
+ "grad_norm": 0.5184316635131836,
+ "learning_rate": 0.00011399085279675687,
+ "loss": 1.2083,
+ "step": 228
+ },
+ {
+ "epoch": 1.9008264462809916,
+ "grad_norm": 0.6556355357170105,
+ "learning_rate": 0.0001126659427666257,
+ "loss": 1.0266,
+ "step": 230
+ },
+ {
+ "epoch": 1.9173553719008265,
+ "grad_norm": 0.515681803226471,
+ "learning_rate": 0.00011133876916052821,
+ "loss": 1.0472,
+ "step": 232
+ },
+ {
+ "epoch": 1.9338842975206612,
+ "grad_norm": 0.4592064321041107,
+ "learning_rate": 0.00011000956916240985,
+ "loss": 1.054,
+ "step": 234
+ },
+ {
+ "epoch": 1.950413223140496,
+ "grad_norm": 0.5623230338096619,
+ "learning_rate": 0.00010867858031835975,
+ "loss": 1.1571,
+ "step": 236
+ },
+ {
+ "epoch": 1.9669421487603307,
+ "grad_norm": 0.5241667032241821,
+ "learning_rate": 0.00010734604049415822,
+ "loss": 1.0985,
+ "step": 238
+ },
+ {
+ "epoch": 1.9834710743801653,
+ "grad_norm": 0.54905104637146,
+ "learning_rate": 0.00010601218783276672,
+ "loss": 1.1088,
+ "step": 240
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.8823345303535461,
+ "learning_rate": 0.00010467726071176853,
+ "loss": 1.0991,
+ "step": 242
+ }
+ ],
+ "logging_steps": 2,
+ "max_steps": 484,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4578676410679296.0,
+ "train_batch_size": 16,
+ "trial_name": null,
+ "trial_params": null
+ }
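
trainer_state.json records one `log_history` entry per logging step (every 2 steps here, per `logging_steps`). A short sketch of summarizing the loss curve it contains:

```python
# Sketch: summarize the loss curve recorded in a checkpoint's trainer_state.json.
import json

with open("checkpoint-242/trainer_state.json") as f:
    state = json.load(f)

entries = [e for e in state["log_history"] if "loss" in e]
print(f"global_step={state['global_step']}, logged points={len(entries)}")
print(f"loss: {entries[0]['loss']:.3f} (step {entries[0]['step']}) -> "
      f"{entries[-1]['loss']:.3f} (step {entries[-1]['step']})")
```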
checkpoint-242/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41c7adf20314e4e98465f263bb56b062cceb30de0bf8fa4d85b668c720a84502
+ size 6456
checkpoint-312/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.14.0
checkpoint-312/adapter_config.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+ "bias": "none",
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "up_proj",
+ "k_proj",
+ "gate_proj",
+ "down_proj",
+ "o_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
checkpoint-312/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca4f470e10d3a1cadc4968b0acb37416dd9cbb615291d7d8b989df2f74b716df
+ size 25271744
checkpoint-312/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c71f3b563c3f684bb4165e33eef2c6e2a25979d1a2364e57b99556cc49549f5
+ size 13685836
checkpoint-312/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0b07ad1b89c33b892be1932a91fb2298ab424a36a237cfbf9776d684f2df198
+ size 14244
checkpoint-312/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07515c2b046d99c3743a994e794dd47b173671b79b64b92ecc764813da2c450b
+ size 1064
checkpoint-312/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
checkpoint-312/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-312/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-312/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 2048,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
checkpoint-312/trainer_state.json ADDED
@@ -0,0 +1,2217 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 500,
+ "global_step": 312,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.003205128205128205,
+ "grad_norm": 13.91187858581543,
+ "learning_rate": 2.2222222222222223e-05,
+ "loss": 4.8108,
+ "step": 1
+ },
+ {
+ "epoch": 0.00641025641025641,
+ "grad_norm": 14.33206558227539,
+ "learning_rate": 4.4444444444444447e-05,
+ "loss": 4.8801,
+ "step": 2
+ },
+ {
+ "epoch": 0.009615384615384616,
+ "grad_norm": 14.014120101928711,
+ "learning_rate": 6.666666666666667e-05,
+ "loss": 4.8164,
+ "step": 3
+ },
+ {
+ "epoch": 0.01282051282051282,
+ "grad_norm": 13.723206520080566,
+ "learning_rate": 8.888888888888889e-05,
+ "loss": 4.7503,
+ "step": 4
+ },
+ {
+ "epoch": 0.016025641025641024,
+ "grad_norm": 14.325286865234375,
+ "learning_rate": 0.00011111111111111112,
+ "loss": 4.3225,
+ "step": 5
+ },
+ {
+ "epoch": 0.019230769230769232,
+ "grad_norm": 13.791594505310059,
+ "learning_rate": 0.00013333333333333334,
+ "loss": 3.3933,
+ "step": 6
+ },
+ {
+ "epoch": 0.022435897435897436,
+ "grad_norm": 10.824278831481934,
+ "learning_rate": 0.00015555555555555556,
+ "loss": 2.8485,
+ "step": 7
+ },
+ {
+ "epoch": 0.02564102564102564,
+ "grad_norm": 9.639212608337402,
+ "learning_rate": 0.00017777777777777779,
+ "loss": 2.2245,
+ "step": 8
+ },
+ {
+ "epoch": 0.028846153846153848,
+ "grad_norm": 7.029407501220703,
+ "learning_rate": 0.0002,
+ "loss": 1.6899,
+ "step": 9
+ },
+ {
+ "epoch": 0.03205128205128205,
+ "grad_norm": 5.85064697265625,
+ "learning_rate": 0.00019999462497359466,
+ "loss": 1.4971,
+ "step": 10
+ },
+ {
+ "epoch": 0.035256410256410256,
+ "grad_norm": 3.8360719680786133,
+ "learning_rate": 0.0001999785004721968,
+ "loss": 1.159,
+ "step": 11
+ },
+ {
+ "epoch": 0.038461538461538464,
+ "grad_norm": 1.9359312057495117,
+ "learning_rate": 0.00019995162822919883,
+ "loss": 0.9306,
+ "step": 12
+ },
+ {
+ "epoch": 0.041666666666666664,
+ "grad_norm": 2.1098594665527344,
+ "learning_rate": 0.00019991401113338104,
+ "loss": 0.9273,
+ "step": 13
+ },
+ {
+ "epoch": 0.04487179487179487,
+ "grad_norm": 3.5646820068359375,
+ "learning_rate": 0.00019986565322860115,
+ "loss": 0.9156,
+ "step": 14
+ },
+ {
+ "epoch": 0.04807692307692308,
+ "grad_norm": 2.836466073989868,
+ "learning_rate": 0.00019980655971335945,
+ "loss": 0.8981,
+ "step": 15
+ },
+ {
+ "epoch": 0.05128205128205128,
+ "grad_norm": 2.151491641998291,
+ "learning_rate": 0.00019973673694024,
+ "loss": 0.8187,
+ "step": 16
+ },
+ {
+ "epoch": 0.05448717948717949,
+ "grad_norm": 2.594736337661743,
+ "learning_rate": 0.0001996561924152278,
+ "loss": 0.8162,
+ "step": 17
+ },
+ {
+ "epoch": 0.057692307692307696,
+ "grad_norm": 4.105694770812988,
+ "learning_rate": 0.0001995649347969019,
134
+ "loss": 0.8538,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.060897435897435896,
139
+ "grad_norm": 2.181260585784912,
140
+ "learning_rate": 0.00019946297389550433,
141
+ "loss": 0.8083,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.0641025641025641,
146
+ "grad_norm": 1.9404771327972412,
147
+ "learning_rate": 0.0001993503206718859,
148
+ "loss": 0.8117,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.0673076923076923,
153
+ "grad_norm": 1.6177146434783936,
154
+ "learning_rate": 0.00019922698723632767,
155
+ "loss": 0.668,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.07051282051282051,
160
+ "grad_norm": 2.5855164527893066,
161
+ "learning_rate": 0.00019909298684723904,
162
+ "loss": 0.9246,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.07371794871794872,
167
+ "grad_norm": 0.7814646363258362,
168
+ "learning_rate": 0.00019894833390973266,
169
+ "loss": 0.7877,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.07692307692307693,
174
+ "grad_norm": 1.592220664024353,
175
+ "learning_rate": 0.0001987930439740757,
176
+ "loss": 0.8776,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.08012820512820513,
181
+ "grad_norm": 3.1788816452026367,
182
+ "learning_rate": 0.0001986271337340182,
183
+ "loss": 0.943,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.08333333333333333,
188
+ "grad_norm": 0.8750837445259094,
189
+ "learning_rate": 0.0001984506210249986,
190
+ "loss": 0.9156,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.08653846153846154,
195
+ "grad_norm": 1.8515149354934692,
196
+ "learning_rate": 0.00019826352482222638,
197
+ "loss": 0.8454,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.08974358974358974,
202
+ "grad_norm": 1.592309594154358,
203
+ "learning_rate": 0.0001980658652386421,
204
+ "loss": 0.939,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.09294871794871795,
209
+ "grad_norm": 1.8106122016906738,
210
+ "learning_rate": 0.00019785766352275542,
211
+ "loss": 0.9646,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.09615384615384616,
216
+ "grad_norm": 1.220379114151001,
217
+ "learning_rate": 0.00019763894205636072,
218
+ "loss": 0.8782,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.09935897435897435,
223
+ "grad_norm": 1.5412297248840332,
224
+ "learning_rate": 0.00019740972435213115,
225
+ "loss": 0.7935,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.10256410256410256,
230
+ "grad_norm": 1.016067385673523,
231
+ "learning_rate": 0.00019717003505109095,
232
+ "loss": 0.905,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.10576923076923077,
237
+ "grad_norm": 0.7788788080215454,
238
+ "learning_rate": 0.00019691989991996663,
239
+ "loss": 0.7594,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.10897435897435898,
244
+ "grad_norm": 1.0361382961273193,
245
+ "learning_rate": 0.00019665934584841682,
246
+ "loss": 0.8002,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.11217948717948718,
251
+ "grad_norm": 0.8088793158531189,
252
+ "learning_rate": 0.00019638840084614182,
253
+ "loss": 0.8022,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.11538461538461539,
258
+ "grad_norm": 1.6871492862701416,
259
+ "learning_rate": 0.00019610709403987246,
260
+ "loss": 0.9974,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.11858974358974358,
265
+ "grad_norm": 0.8181275725364685,
266
+ "learning_rate": 0.000195815455670239,
267
+ "loss": 0.8345,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.12179487179487179,
272
+ "grad_norm": 1.2367571592330933,
273
+ "learning_rate": 0.0001955135170885202,
274
+ "loss": 0.9184,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.125,
279
+ "grad_norm": 0.8114968538284302,
280
+ "learning_rate": 0.00019520131075327298,
281
+ "loss": 0.7724,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.1282051282051282,
286
+ "grad_norm": 0.9863741993904114,
287
+ "learning_rate": 0.00019487887022684336,
288
+ "loss": 0.7895,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.13141025641025642,
293
+ "grad_norm": 0.8909745216369629,
294
+ "learning_rate": 0.00019454623017175812,
295
+ "loss": 0.8602,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.1346153846153846,
300
+ "grad_norm": 1.0723499059677124,
301
+ "learning_rate": 0.0001942034263469989,
302
+ "loss": 0.8249,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 0.13782051282051283,
307
+ "grad_norm": 2.3453705310821533,
308
+ "learning_rate": 0.00019385049560415794,
309
+ "loss": 0.8398,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 0.14102564102564102,
314
+ "grad_norm": 2.1112656593322754,
315
+ "learning_rate": 0.00019348747588347637,
316
+ "loss": 0.8648,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 0.14423076923076922,
321
+ "grad_norm": 0.7072998285293579,
322
+ "learning_rate": 0.00019311440620976597,
323
+ "loss": 0.7766,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 0.14743589743589744,
328
+ "grad_norm": 1.122727632522583,
329
+ "learning_rate": 0.00019273132668821364,
330
+ "loss": 0.8375,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 0.15064102564102563,
335
+ "grad_norm": 0.43819618225097656,
336
+ "learning_rate": 0.00019233827850007027,
337
+ "loss": 0.7336,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 0.15384615384615385,
342
+ "grad_norm": 0.6066083908081055,
343
+ "learning_rate": 0.00019193530389822363,
344
+ "loss": 0.7079,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 0.15705128205128205,
349
+ "grad_norm": 0.9152003526687622,
350
+ "learning_rate": 0.0001915224462026563,
351
+ "loss": 0.8533,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 0.16025641025641027,
356
+ "grad_norm": 1.2656763792037964,
357
+ "learning_rate": 0.0001910997497957885,
358
+ "loss": 0.8655,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 0.16346153846153846,
363
+ "grad_norm": 0.5455206036567688,
364
+ "learning_rate": 0.00019066726011770726,
365
+ "loss": 0.7714,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 0.16666666666666666,
370
+ "grad_norm": 0.77585369348526,
371
+ "learning_rate": 0.00019022502366128135,
372
+ "loss": 0.8717,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 0.16987179487179488,
377
+ "grad_norm": 0.5681566596031189,
378
+ "learning_rate": 0.0001897730879671634,
379
+ "loss": 0.8441,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 0.17307692307692307,
384
+ "grad_norm": 1.26576566696167,
385
+ "learning_rate": 0.00018931150161867916,
386
+ "loss": 0.9617,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 0.1762820512820513,
391
+ "grad_norm": 0.49213504791259766,
392
+ "learning_rate": 0.0001888403142366049,
393
+ "loss": 0.8874,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 0.1794871794871795,
398
+ "grad_norm": 0.5622470378875732,
399
+ "learning_rate": 0.00018835957647383303,
400
+ "loss": 0.7956,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 0.18269230769230768,
405
+ "grad_norm": 0.945149302482605,
406
+ "learning_rate": 0.00018786934000992688,
407
+ "loss": 0.8357,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 0.1858974358974359,
412
+ "grad_norm": 0.5550402998924255,
413
+ "learning_rate": 0.00018736965754556528,
414
+ "loss": 0.8787,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 0.1891025641025641,
419
+ "grad_norm": 0.841452419757843,
420
+ "learning_rate": 0.00018686058279687698,
421
+ "loss": 0.8429,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 0.19230769230769232,
426
+ "grad_norm": 0.45196735858917236,
427
+ "learning_rate": 0.00018634217048966637,
428
+ "loss": 0.8261,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 0.1955128205128205,
433
+ "grad_norm": 0.9931226372718811,
434
+ "learning_rate": 0.0001858144763535302,
435
+ "loss": 0.8594,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 0.1987179487179487,
440
+ "grad_norm": 0.7350529432296753,
441
+ "learning_rate": 0.00018527755711586678,
442
+ "loss": 0.8443,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 0.20192307692307693,
447
+ "grad_norm": 0.5463083386421204,
448
+ "learning_rate": 0.00018473147049577774,
449
+ "loss": 0.7869,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 0.20512820512820512,
454
+ "grad_norm": 1.1669566631317139,
455
+ "learning_rate": 0.00018417627519786315,
456
+ "loss": 0.9341,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 0.20833333333333334,
461
+ "grad_norm": 0.588135302066803,
462
+ "learning_rate": 0.00018361203090591071,
463
+ "loss": 0.7736,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 0.21153846153846154,
468
+ "grad_norm": 0.5148831605911255,
469
+ "learning_rate": 0.00018303879827647975,
470
+ "loss": 0.8559,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 0.21474358974358973,
475
+ "grad_norm": 0.6593618392944336,
476
+ "learning_rate": 0.00018245663893238075,
477
+ "loss": 0.7974,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 0.21794871794871795,
482
+ "grad_norm": 0.6366052627563477,
483
+ "learning_rate": 0.00018186561545605054,
484
+ "loss": 0.7543,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 0.22115384615384615,
489
+ "grad_norm": 0.5705471634864807,
490
+ "learning_rate": 0.00018126579138282503,
491
+ "loss": 0.8161,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 0.22435897435897437,
496
+ "grad_norm": 0.5039474368095398,
497
+ "learning_rate": 0.00018065723119410884,
498
+ "loss": 0.8071,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 0.22756410256410256,
503
+ "grad_norm": 0.7973775267601013,
504
+ "learning_rate": 0.0001800400003104436,
505
+ "loss": 0.7785,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 0.23076923076923078,
510
+ "grad_norm": 0.9587669372558594,
511
+ "learning_rate": 0.00017941416508447536,
512
+ "loss": 0.8423,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 0.23397435897435898,
517
+ "grad_norm": 0.4499201476573944,
518
+ "learning_rate": 0.00017877979279382135,
519
+ "loss": 0.8052,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 0.23717948717948717,
524
+ "grad_norm": 0.9492788910865784,
525
+ "learning_rate": 0.0001781369516338378,
526
+ "loss": 0.9093,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 0.2403846153846154,
531
+ "grad_norm": 0.6145200133323669,
532
+ "learning_rate": 0.000177485710710289,
533
+ "loss": 0.8359,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 0.24358974358974358,
538
+ "grad_norm": 0.5011272430419922,
539
+ "learning_rate": 0.00017682614003191807,
540
+ "loss": 0.8054,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 0.2467948717948718,
545
+ "grad_norm": 1.112430453300476,
546
+ "learning_rate": 0.0001761583105029213,
547
+ "loss": 0.8851,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 0.25,
552
+ "grad_norm": 0.8186496496200562,
553
+ "learning_rate": 0.00017548229391532572,
554
+ "loss": 0.757,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 0.2532051282051282,
559
+ "grad_norm": 1.00787353515625,
560
+ "learning_rate": 0.00017479816294127152,
561
+ "loss": 0.8089,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 0.2564102564102564,
566
+ "grad_norm": 0.9003360271453857,
567
+ "learning_rate": 0.0001741059911251997,
568
+ "loss": 0.9457,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 0.25961538461538464,
573
+ "grad_norm": 1.0145341157913208,
574
+ "learning_rate": 0.00017340585287594604,
575
+ "loss": 0.796,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 0.26282051282051283,
580
+ "grad_norm": 1.2034144401550293,
581
+ "learning_rate": 0.00017269782345874203,
582
+ "loss": 0.8504,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 0.266025641025641,
587
+ "grad_norm": 0.5833753347396851,
588
+ "learning_rate": 0.00017198197898712404,
589
+ "loss": 0.819,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 0.2692307692307692,
594
+ "grad_norm": 0.47029227018356323,
595
+ "learning_rate": 0.00017125839641475072,
596
+ "loss": 0.7965,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 0.2724358974358974,
601
+ "grad_norm": 0.6297673583030701,
602
+ "learning_rate": 0.00017052715352713075,
603
+ "loss": 0.7564,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 0.27564102564102566,
608
+ "grad_norm": 0.7019922137260437,
609
+ "learning_rate": 0.00016978832893326074,
610
+ "loss": 0.8576,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 0.27884615384615385,
615
+ "grad_norm": 0.7785760760307312,
616
+ "learning_rate": 0.0001690420020571747,
617
+ "loss": 0.8872,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 0.28205128205128205,
622
+ "grad_norm": 0.47651761770248413,
623
+ "learning_rate": 0.00016828825312940592,
624
+ "loss": 0.8333,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 0.28525641025641024,
629
+ "grad_norm": 0.5962091684341431,
630
+ "learning_rate": 0.00016752716317836229,
631
+ "loss": 0.9013,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 0.28846153846153844,
636
+ "grad_norm": 0.4600299596786499,
637
+ "learning_rate": 0.00016675881402161536,
638
+ "loss": 0.8245,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 0.2916666666666667,
643
+ "grad_norm": 0.5112613439559937,
644
+ "learning_rate": 0.00016598328825710533,
645
+ "loss": 0.8138,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 0.2948717948717949,
650
+ "grad_norm": 0.7249051332473755,
651
+ "learning_rate": 0.00016520066925426144,
652
+ "loss": 0.8401,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 0.2980769230769231,
657
+ "grad_norm": 0.6219087839126587,
658
+ "learning_rate": 0.0001644110411450398,
659
+ "loss": 0.7938,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 0.30128205128205127,
664
+ "grad_norm": 0.4909549951553345,
665
+ "learning_rate": 0.00016361448881487914,
666
+ "loss": 0.8425,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 0.30448717948717946,
671
+ "grad_norm": 0.45718565583229065,
672
+ "learning_rate": 0.0001628110978935756,
673
+ "loss": 0.8709,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 0.3076923076923077,
678
+ "grad_norm": 0.7159481048583984,
679
+ "learning_rate": 0.00016200095474607753,
680
+ "loss": 0.8469,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 0.3108974358974359,
685
+ "grad_norm": 0.6644757986068726,
686
+ "learning_rate": 0.0001611841464632011,
687
+ "loss": 0.8352,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 0.3141025641025641,
692
+ "grad_norm": 0.40486231446266174,
693
+ "learning_rate": 0.00016036076085226814,
694
+ "loss": 0.7915,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 0.3173076923076923,
699
+ "grad_norm": 0.588192880153656,
700
+ "learning_rate": 0.0001595308864276666,
701
+ "loss": 0.8802,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 0.32051282051282054,
706
+ "grad_norm": 0.2998511493206024,
707
+ "learning_rate": 0.0001586946124013354,
708
+ "loss": 0.7983,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 0.32371794871794873,
713
+ "grad_norm": 0.5495243072509766,
714
+ "learning_rate": 0.00015785202867317407,
715
+ "loss": 0.806,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 0.3269230769230769,
720
+ "grad_norm": 0.3516915738582611,
721
+ "learning_rate": 0.00015700322582137827,
722
+ "loss": 0.8455,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 0.3301282051282051,
727
+ "grad_norm": 0.6122769117355347,
728
+ "learning_rate": 0.0001561482950927029,
729
+ "loss": 0.8538,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 0.3333333333333333,
734
+ "grad_norm": 0.5758071541786194,
735
+ "learning_rate": 0.00015528732839265272,
736
+ "loss": 0.7786,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 0.33653846153846156,
741
+ "grad_norm": 0.606889009475708,
742
+ "learning_rate": 0.00015442041827560274,
743
+ "loss": 0.8535,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 0.33974358974358976,
748
+ "grad_norm": 0.5758721232414246,
749
+ "learning_rate": 0.00015354765793484834,
750
+ "loss": 0.8368,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 0.34294871794871795,
755
+ "grad_norm": 0.41782525181770325,
756
+ "learning_rate": 0.000152669141192587,
757
+ "loss": 0.8372,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 0.34615384615384615,
762
+ "grad_norm": 0.44819149374961853,
763
+ "learning_rate": 0.00015178496248983254,
764
+ "loss": 0.7807,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 0.34935897435897434,
769
+ "grad_norm": 0.42602694034576416,
770
+ "learning_rate": 0.00015089521687626243,
771
+ "loss": 0.7363,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 0.3525641025641026,
776
+ "grad_norm": 0.5617648363113403,
777
+ "learning_rate": 0.00015000000000000001,
778
+ "loss": 0.8311,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 0.3557692307692308,
783
+ "grad_norm": 0.38159874081611633,
784
+ "learning_rate": 0.00014909940809733222,
785
+ "loss": 0.7536,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 0.358974358974359,
790
+ "grad_norm": 0.3874197006225586,
791
+ "learning_rate": 0.00014819353798236427,
792
+ "loss": 0.8124,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 0.36217948717948717,
797
+ "grad_norm": 0.3920519948005676,
798
+ "learning_rate": 0.00014728248703661182,
799
+ "loss": 0.7821,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 0.36538461538461536,
804
+ "grad_norm": 0.31610092520713806,
805
+ "learning_rate": 0.00014636635319853275,
806
+ "loss": 0.8025,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 0.3685897435897436,
811
+ "grad_norm": 0.9610485434532166,
812
+ "learning_rate": 0.00014544523495299842,
813
+ "loss": 0.8597,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 0.3717948717948718,
818
+ "grad_norm": 0.6983340978622437,
819
+ "learning_rate": 0.0001445192313207067,
820
+ "loss": 0.8763,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 0.375,
825
+ "grad_norm": 0.37733662128448486,
826
+ "learning_rate": 0.00014358844184753712,
827
+ "loss": 0.787,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 0.3782051282051282,
832
+ "grad_norm": 0.6264870166778564,
833
+ "learning_rate": 0.00014265296659384956,
834
+ "loss": 0.6865,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 0.3814102564102564,
839
+ "grad_norm": 0.6618545651435852,
840
+ "learning_rate": 0.0001417129061237278,
841
+ "loss": 0.8416,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 0.38461538461538464,
846
+ "grad_norm": 0.5489368438720703,
847
+ "learning_rate": 0.00014076836149416887,
848
+ "loss": 0.8113,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 0.38782051282051283,
853
+ "grad_norm": 0.5143547654151917,
854
+ "learning_rate": 0.00013981943424421932,
855
+ "loss": 0.7212,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 0.391025641025641,
860
+ "grad_norm": 0.3641842007637024,
861
+ "learning_rate": 0.00013886622638405952,
862
+ "loss": 0.7081,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 0.3942307692307692,
867
+ "grad_norm": 0.5204703211784363,
868
+ "learning_rate": 0.00013790884038403795,
869
+ "loss": 0.8671,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 0.3974358974358974,
874
+ "grad_norm": 0.6415024995803833,
875
+ "learning_rate": 0.00013694737916365517,
876
+ "loss": 0.7905,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.40064102564102566,
881
+ "grad_norm": 0.5021610260009766,
882
+ "learning_rate": 0.0001359819460805001,
883
+ "loss": 0.8396,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.40384615384615385,
888
+ "grad_norm": 0.6839991211891174,
889
+ "learning_rate": 0.00013501264491913906,
890
+ "loss": 0.8807,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 0.40705128205128205,
895
+ "grad_norm": 0.3714783787727356,
896
+ "learning_rate": 0.00013403957987995882,
897
+ "loss": 0.7946,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 0.41025641025641024,
902
+ "grad_norm": 0.5053361654281616,
903
+ "learning_rate": 0.00013306285556796495,
904
+ "loss": 0.8481,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 0.41346153846153844,
909
+ "grad_norm": 0.5429531335830688,
910
+ "learning_rate": 0.00013208257698153677,
911
+ "loss": 0.7554,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 0.4166666666666667,
916
+ "grad_norm": 0.3953702449798584,
917
+ "learning_rate": 0.00013109884950114007,
918
+ "loss": 0.8,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 0.4198717948717949,
923
+ "grad_norm": 0.6481508016586304,
924
+ "learning_rate": 0.00013011177887799845,
925
+ "loss": 0.8627,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 0.4230769230769231,
930
+ "grad_norm": 0.7805072069168091,
931
+ "learning_rate": 0.00012912147122272523,
932
+ "loss": 0.8365,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 0.42628205128205127,
937
+ "grad_norm": 0.6625213027000427,
938
+ "learning_rate": 0.00012812803299391628,
939
+ "loss": 0.8348,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 0.42948717948717946,
944
+ "grad_norm": 0.5890776515007019,
945
+ "learning_rate": 0.0001271315709867059,
946
+ "loss": 0.8482,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 0.4326923076923077,
951
+ "grad_norm": 0.8653396964073181,
952
+ "learning_rate": 0.00012613219232128608,
953
+ "loss": 0.7627,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 0.4358974358974359,
958
+ "grad_norm": 0.4106805920600891,
959
+ "learning_rate": 0.00012513000443139112,
960
+ "loss": 0.8412,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 0.4391025641025641,
965
+ "grad_norm": 0.9956967234611511,
966
+ "learning_rate": 0.00012412511505274844,
967
+ "loss": 0.9127,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 0.4423076923076923,
972
+ "grad_norm": 0.46844878792762756,
973
+ "learning_rate": 0.000123117632211497,
974
+ "loss": 0.774,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 0.44551282051282054,
979
+ "grad_norm": 0.966278612613678,
980
+ "learning_rate": 0.0001221076642125742,
981
+ "loss": 0.8659,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 0.44871794871794873,
986
+ "grad_norm": 0.5847793221473694,
987
+ "learning_rate": 0.00012109531962807332,
988
+ "loss": 0.7507,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 0.4519230769230769,
993
+ "grad_norm": 0.5927683115005493,
994
+ "learning_rate": 0.00012008070728557186,
995
+ "loss": 0.8908,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 0.4551282051282051,
1000
+ "grad_norm": 0.42794185876846313,
1001
+ "learning_rate": 0.00011906393625643244,
1002
+ "loss": 0.7478,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 0.4583333333333333,
1007
+ "grad_norm": 0.6426613926887512,
1008
+ "learning_rate": 0.00011804511584407763,
1009
+ "loss": 0.7498,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 0.46153846153846156,
1014
+ "grad_norm": 0.40398654341697693,
1015
+ "learning_rate": 0.00011702435557223987,
1016
+ "loss": 0.7539,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 0.46474358974358976,
1021
+ "grad_norm": 0.4018569886684418,
1022
+ "learning_rate": 0.00011600176517318741,
1023
+ "loss": 0.8294,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 0.46794871794871795,
1028
+ "grad_norm": 0.33486562967300415,
1029
+ "learning_rate": 0.00011497745457592816,
1030
+ "loss": 0.8322,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 0.47115384615384615,
1035
+ "grad_norm": 0.539478063583374,
1036
+ "learning_rate": 0.00011395153389439233,
1037
+ "loss": 0.8697,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 0.47435897435897434,
1042
+ "grad_norm": 0.4381621479988098,
1043
+ "learning_rate": 0.0001129241134155949,
1044
+ "loss": 0.8216,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 0.4775641025641026,
1049
+ "grad_norm": 1.0025720596313477,
1050
+ "learning_rate": 0.00011189530358778005,
1051
+ "loss": 0.9283,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 0.4807692307692308,
1056
+ "grad_norm": 0.6928549408912659,
1057
+ "learning_rate": 0.00011086521500854745,
1058
+ "loss": 0.8163,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 0.483974358974359,
1063
+ "grad_norm": 0.3628673851490021,
1064
+ "learning_rate": 0.00010983395841296348,
1065
+ "loss": 0.7866,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 0.48717948717948717,
1070
+ "grad_norm": 0.6659094095230103,
1071
+ "learning_rate": 0.00010880164466165674,
1072
+ "loss": 0.767,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 0.49038461538461536,
1077
+ "grad_norm": 0.4747048318386078,
1078
+ "learning_rate": 0.00010776838472890065,
1079
+ "loss": 0.7455,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 0.4935897435897436,
1084
+ "grad_norm": 0.5610392093658447,
1085
+ "learning_rate": 0.00010673428969068364,
1086
+ "loss": 0.7943,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 0.4967948717948718,
1091
+ "grad_norm": 0.8494670391082764,
1092
+ "learning_rate": 0.00010569947071276847,
1093
+ "loss": 0.8778,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 0.5,
1098
+ "grad_norm": 0.4916403293609619,
1099
+ "learning_rate": 0.00010466403903874176,
1100
+ "loss": 0.7698,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 0.5032051282051282,
1105
+ "grad_norm": 1.1345371007919312,
1106
+ "learning_rate": 0.00010362810597805526,
1107
+ "loss": 0.9339,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 0.5064102564102564,
1112
+ "grad_norm": 0.8065443634986877,
1113
+ "learning_rate": 0.00010259178289406011,
1114
+ "loss": 0.7978,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 0.5096153846153846,
1119
+ "grad_norm": 1.3246110677719116,
1120
+ "learning_rate": 0.0001015551811920351,
1121
+ "loss": 0.9917,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 0.5128205128205128,
1126
+ "grad_norm": 0.5924827456474304,
1127
+ "learning_rate": 0.00010051841230721065,
1128
+ "loss": 0.8334,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 0.5160256410256411,
1133
+ "grad_norm": 0.5810033679008484,
1134
+ "learning_rate": 9.948158769278939e-05,
1135
+ "loss": 0.8431,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 0.5192307692307693,
1140
+ "grad_norm": 0.517737627029419,
1141
+ "learning_rate": 9.844481880796491e-05,
1142
+ "loss": 0.8149,
1143
+ "step": 162
1144
+ },
1145
+ {
1146
+ "epoch": 0.5224358974358975,
1147
+ "grad_norm": 0.6809769868850708,
1148
+ "learning_rate": 9.740821710593989e-05,
1149
+ "loss": 0.9241,
1150
+ "step": 163
1151
+ },
1152
+ {
1153
+ "epoch": 0.5256410256410257,
1154
+ "grad_norm": 0.5955147743225098,
1155
+ "learning_rate": 9.637189402194476e-05,
1156
+ "loss": 0.761,
1157
+ "step": 164
1158
+ },
1159
+ {
1160
+ "epoch": 0.5288461538461539,
1161
+ "grad_norm": 0.6794393062591553,
1162
+ "learning_rate": 9.533596096125825e-05,
1163
+ "loss": 0.8291,
1164
+ "step": 165
1165
+ },
1166
+ {
1167
+ "epoch": 0.532051282051282,
1168
+ "grad_norm": 0.6929296255111694,
1169
+ "learning_rate": 9.430052928723153e-05,
1170
+ "loss": 0.7753,
1171
+ "step": 166
1172
+ },
1173
+ {
1174
+ "epoch": 0.5352564102564102,
1175
+ "grad_norm": 0.6990981698036194,
1176
+ "learning_rate": 9.326571030931637e-05,
1177
+ "loss": 0.8932,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 0.5384615384615384,
1182
+ "grad_norm": 0.591814398765564,
1183
+ "learning_rate": 9.223161527109937e-05,
1184
+ "loss": 0.8114,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 0.5416666666666666,
1189
+ "grad_norm": 0.6408451199531555,
1190
+ "learning_rate": 9.119835533834331e-05,
1191
+ "loss": 0.7942,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 0.5448717948717948,
1196
+ "grad_norm": 0.6977259516716003,
1197
+ "learning_rate": 9.016604158703654e-05,
1198
+ "loss": 0.912,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 0.5480769230769231,
1203
+ "grad_norm": 0.604136049747467,
1204
+ "learning_rate": 8.913478499145254e-05,
1205
+ "loss": 0.789,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 0.5512820512820513,
1210
+ "grad_norm": 0.5467025637626648,
1211
+ "learning_rate": 8.810469641222001e-05,
1212
+ "loss": 0.7825,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 0.5544871794871795,
1217
+ "grad_norm": 0.7560824155807495,
1218
+ "learning_rate": 8.707588658440511e-05,
1219
+ "loss": 0.7609,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 0.5576923076923077,
1224
+ "grad_norm": 0.9269303679466248,
1225
+ "learning_rate": 8.604846610560771e-05,
1226
+ "loss": 0.8706,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 0.5608974358974359,
1231
+ "grad_norm": 0.7465280294418335,
1232
+ "learning_rate": 8.502254542407186e-05,
1233
+ "loss": 0.7986,
1234
+ "step": 175
1235
+ },
1236
+ {
1237
+ "epoch": 0.5641025641025641,
1238
+ "grad_norm": 0.8691240549087524,
1239
+ "learning_rate": 8.399823482681262e-05,
1240
+ "loss": 0.7843,
1241
+ "step": 176
1242
+ },
1243
+ {
1244
+ "epoch": 0.5673076923076923,
1245
+ "grad_norm": 0.59829181432724,
1246
+ "learning_rate": 8.297564442776014e-05,
1247
+ "loss": 0.7441,
1248
+ "step": 177
1249
+ },
1250
+ {
1251
+ "epoch": 0.5705128205128205,
1252
+ "grad_norm": 0.9093548655509949,
1253
+ "learning_rate": 8.195488415592238e-05,
1254
+ "loss": 0.8501,
1255
+ "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 0.5737179487179487,
1259
+ "grad_norm": 0.8965813517570496,
1260
+ "learning_rate": 8.093606374356759e-05,
1261
+ "loss": 0.8309,
1262
+ "step": 179
1263
+ },
1264
+ {
1265
+ "epoch": 0.5769230769230769,
1266
+ "grad_norm": 0.7008676528930664,
1267
+ "learning_rate": 7.991929271442817e-05,
1268
+ "loss": 0.6563,
1269
+ "step": 180
1270
+ },
1271
+ {
1272
+ "epoch": 0.5801282051282052,
1273
+ "grad_norm": 0.8001984357833862,
1274
+ "learning_rate": 7.89046803719267e-05,
1275
+ "loss": 0.8305,
1276
+ "step": 181
1277
+ },
1278
+ {
1279
+ "epoch": 0.5833333333333334,
1280
+ "grad_norm": 0.8418125510215759,
1281
+ "learning_rate": 7.789233578742582e-05,
1282
+ "loss": 0.8296,
1283
+ "step": 182
1284
+ },
1285
+ {
1286
+ "epoch": 0.5865384615384616,
1287
+ "grad_norm": 0.8208316564559937,
1288
+ "learning_rate": 7.688236778850306e-05,
1289
+ "loss": 0.7777,
1290
+ "step": 183
1291
+ },
1292
+ {
1293
+ "epoch": 0.5897435897435898,
1294
+ "grad_norm": 0.8942621946334839,
1295
+ "learning_rate": 7.587488494725157e-05,
1296
+ "loss": 0.9101,
1297
+ "step": 184
1298
+ },
1299
+ {
1300
+ "epoch": 0.592948717948718,
1301
+ "grad_norm": 0.6328911781311035,
1302
+ "learning_rate": 7.48699955686089e-05,
1303
+ "loss": 0.7004,
1304
+ "step": 185
1305
+ },
1306
+ {
1307
+ "epoch": 0.5961538461538461,
1308
+ "grad_norm": 0.5853061079978943,
1309
+ "learning_rate": 7.386780767871397e-05,
1310
+ "loss": 0.8373,
1311
+ "step": 186
1312
+ },
1313
+ {
1314
+ "epoch": 0.5993589743589743,
1315
+ "grad_norm": 0.9095894694328308,
1316
+ "learning_rate": 7.286842901329412e-05,
1317
+ "loss": 0.833,
1318
+ "step": 187
1319
+ },
1320
+ {
1321
+ "epoch": 0.6025641025641025,
1322
+ "grad_norm": 0.562564492225647,
1323
+ "learning_rate": 7.187196700608373e-05,
1324
+ "loss": 0.7822,
1325
+ "step": 188
1326
+ },
1327
+ {
1328
+ "epoch": 0.6057692307692307,
1329
+ "grad_norm": 0.812235951423645,
1330
+ "learning_rate": 7.087852877727481e-05,
1331
+ "loss": 0.823,
1332
+ "step": 189
1333
+ },
1334
+ {
1335
+ "epoch": 0.6089743589743589,
1336
+ "grad_norm": 0.9851651191711426,
1337
+ "learning_rate": 6.988822112200156e-05,
1338
+ "loss": 0.8251,
1339
+ "step": 190
1340
+ },
1341
+ {
1342
+ "epoch": 0.6121794871794872,
1343
+ "grad_norm": 0.9343296885490417,
1344
+ "learning_rate": 6.890115049885994e-05,
1345
+ "loss": 0.7613,
1346
+ "step": 191
1347
+ },
1348
+ {
1349
+ "epoch": 0.6153846153846154,
1350
+ "grad_norm": 0.6352812647819519,
1351
+ "learning_rate": 6.791742301846326e-05,
1352
+ "loss": 0.7761,
1353
+ "step": 192
1354
+ },
1355
+ {
1356
+ "epoch": 0.6185897435897436,
1357
+ "grad_norm": 1.490954041481018,
1358
+ "learning_rate": 6.693714443203507e-05,
1359
+ "loss": 1.0911,
1360
+ "step": 193
1361
+ },
1362
+ {
1363
+ "epoch": 0.6217948717948718,
1364
+ "grad_norm": 0.7327432036399841,
1365
+ "learning_rate": 6.59604201200412e-05,
1366
+ "loss": 0.7859,
1367
+ "step": 194
1368
+ },
1369
+ {
1370
+ "epoch": 0.625,
1371
+ "grad_norm": 0.7584162950515747,
1372
+ "learning_rate": 6.498735508086093e-05,
1373
+ "loss": 0.6795,
1374
+ "step": 195
1375
+ },
1376
+ {
1377
+ "epoch": 0.6282051282051282,
1378
+ "grad_norm": 0.9393740892410278,
1379
+ "learning_rate": 6.40180539194999e-05,
1380
+ "loss": 0.7382,
1381
+ "step": 196
1382
+ },
1383
+ {
1384
+ "epoch": 0.6314102564102564,
1385
+ "grad_norm": 1.4272030591964722,
1386
+ "learning_rate": 6.305262083634488e-05,
1387
+ "loss": 0.8325,
1388
+ "step": 197
1389
+ },
1390
+ {
1391
+ "epoch": 0.6346153846153846,
1392
+ "grad_norm": 1.7151983976364136,
1393
+ "learning_rate": 6.209115961596208e-05,
1394
+ "loss": 0.9443,
1395
+ "step": 198
1396
+ },
1397
+ {
1398
+ "epoch": 0.6378205128205128,
1399
+ "grad_norm": 0.856202244758606,
1400
+ "learning_rate": 6.113377361594049e-05,
1401
+ "loss": 0.7448,
1402
+ "step": 199
1403
+ },
1404
+ {
1405
+ "epoch": 0.6410256410256411,
1406
+ "grad_norm": 1.3806068897247314,
1407
+ "learning_rate": 6.018056575578075e-05,
1408
+ "loss": 0.9231,
1409
+ "step": 200
1410
+ },
1411
+ {
1412
+ "epoch": 0.6442307692307693,
1413
+ "grad_norm": 0.8063072562217712,
1414
+ "learning_rate": 5.923163850583113e-05,
1415
+ "loss": 0.7305,
1416
+ "step": 201
1417
+ },
1418
+ {
1419
+ "epoch": 0.6474358974358975,
1420
+ "grad_norm": 1.1578689813613892,
1421
+ "learning_rate": 5.828709387627218e-05,
1422
+ "loss": 0.7892,
1423
+ "step": 202
1424
+ },
1425
+ {
1426
+ "epoch": 0.6506410256410257,
1427
+ "grad_norm": 0.881730854511261,
1428
+ "learning_rate": 5.73470334061505e-05,
1429
+ "loss": 0.7162,
1430
+ "step": 203
1431
+ },
1432
+ {
1433
+ "epoch": 0.6538461538461539,
1434
+ "grad_norm": 0.9713548421859741,
1435
+ "learning_rate": 5.6411558152462894e-05,
1436
+ "loss": 0.7252,
1437
+ "step": 204
1438
+ },
1439
+ {
1440
+ "epoch": 0.657051282051282,
1441
+ "grad_norm": 1.12835693359375,
1442
+ "learning_rate": 5.54807686792933e-05,
1443
+ "loss": 0.7446,
1444
+ "step": 205
1445
+ },
1446
+ {
1447
+ "epoch": 0.6602564102564102,
1448
+ "grad_norm": 1.3624720573425293,
1449
+ "learning_rate": 5.4554765047001613e-05,
1450
+ "loss": 0.7664,
1451
+ "step": 206
1452
+ },
1453
+ {
1454
+ "epoch": 0.6634615384615384,
1455
+ "grad_norm": 1.169715166091919,
1456
+ "learning_rate": 5.363364680146725e-05,
1457
+ "loss": 0.7936,
1458
+ "step": 207
1459
+ },
1460
+ {
1461
+ "epoch": 0.6666666666666666,
1462
+ "grad_norm": 1.94355309009552,
1463
+ "learning_rate": 5.271751296338823e-05,
1464
+ "loss": 0.8667,
1465
+ "step": 208
1466
+ },
1467
+ {
1468
+ "epoch": 0.6698717948717948,
1469
+ "grad_norm": 1.5784542560577393,
1470
+ "learning_rate": 5.180646201763577e-05,
1471
+ "loss": 0.8658,
1472
+ "step": 209
1473
+ },
1474
+ {
1475
+ "epoch": 0.6730769230769231,
1476
+ "grad_norm": 0.9312598705291748,
1477
+ "learning_rate": 5.090059190266779e-05,
1478
+ "loss": 0.7996,
1479
+ "step": 210
1480
+ },
1481
+ {
1482
+ "epoch": 0.6762820512820513,
1483
+ "grad_norm": 0.9065409302711487,
1484
+ "learning_rate": 5.000000000000002e-05,
1485
+ "loss": 0.7893,
1486
+ "step": 211
1487
+ },
1488
+ {
1489
+ "epoch": 0.6794871794871795,
1490
+ "grad_norm": 1.3561393022537231,
1491
+ "learning_rate": 4.9104783123737566e-05,
1492
+ "loss": 0.8413,
1493
+ "step": 212
1494
+ },
1495
+ {
1496
+ "epoch": 0.6826923076923077,
1497
+ "grad_norm": 0.8747413158416748,
1498
+ "learning_rate": 4.821503751016746e-05,
1499
+ "loss": 0.799,
1500
+ "step": 213
1501
+ },
1502
+ {
1503
+ "epoch": 0.6858974358974359,
1504
+ "grad_norm": 0.8541598916053772,
1505
+ "learning_rate": 4.733085880741301e-05,
1506
+ "loss": 0.7965,
1507
+ "step": 214
1508
+ },
1509
+ {
1510
+ "epoch": 0.6891025641025641,
1511
+ "grad_norm": 1.1472944021224976,
1512
+ "learning_rate": 4.645234206515171e-05,
1513
+ "loss": 0.7915,
1514
+ "step": 215
1515
+ },
1516
+ {
1517
+ "epoch": 0.6923076923076923,
1518
+ "grad_norm": 1.0317872762680054,
1519
+ "learning_rate": 4.5579581724397255e-05,
1520
+ "loss": 0.8,
1521
+ "step": 216
1522
+ },
1523
+ {
1524
+ "epoch": 0.6955128205128205,
1525
+ "grad_norm": 1.4401482343673706,
1526
+ "learning_rate": 4.471267160734731e-05,
1527
+ "loss": 0.7188,
1528
+ "step": 217
1529
+ },
1530
+ {
1531
+ "epoch": 0.6987179487179487,
1532
+ "grad_norm": 0.7811070680618286,
1533
+ "learning_rate": 4.385170490729712e-05,
1534
+ "loss": 0.8123,
1535
+ "step": 218
1536
+ },
1537
+ {
1538
+ "epoch": 0.7019230769230769,
1539
+ "grad_norm": 1.1124131679534912,
1540
+ "learning_rate": 4.2996774178621736e-05,
1541
+ "loss": 0.9242,
1542
+ "step": 219
1543
+ },
1544
+ {
1545
+ "epoch": 0.7051282051282052,
1546
+ "grad_norm": 0.9388600587844849,
1547
+ "learning_rate": 4.2147971326825966e-05,
1548
+ "loss": 0.8065,
1549
+ "step": 220
1550
+ },
1551
+ {
1552
+ "epoch": 0.7083333333333334,
1553
+ "grad_norm": 0.8694519400596619,
1554
+ "learning_rate": 4.130538759866457e-05,
1555
+ "loss": 0.8012,
1556
+ "step": 221
1557
+ },
1558
+ {
1559
+ "epoch": 0.7115384615384616,
1560
+ "grad_norm": 0.756574809551239,
1561
+ "learning_rate": 4.046911357233343e-05,
1562
+ "loss": 0.6749,
1563
+ "step": 222
1564
+ },
1565
+ {
1566
+ "epoch": 0.7147435897435898,
1567
+ "grad_norm": 0.8523420095443726,
1568
+ "learning_rate": 3.963923914773187e-05,
1569
+ "loss": 0.7184,
1570
+ "step": 223
1571
+ },
1572
+ {
1573
+ "epoch": 0.717948717948718,
1574
+ "grad_norm": 0.8101953864097595,
1575
+ "learning_rate": 3.8815853536798904e-05,
1576
+ "loss": 0.8064,
1577
+ "step": 224
1578
+ },
1579
+ {
1580
+ "epoch": 0.7211538461538461,
1581
+ "grad_norm": 0.9404911994934082,
1582
+ "learning_rate": 3.79990452539225e-05,
1583
+ "loss": 0.7452,
1584
+ "step": 225
1585
+ },
1586
+ {
1587
+ "epoch": 0.7243589743589743,
1588
+ "grad_norm": 0.8205745220184326,
1589
+ "learning_rate": 3.7188902106424416e-05,
1590
+ "loss": 0.7859,
1591
+ "step": 226
1592
+ },
1593
+ {
1594
+ "epoch": 0.7275641025641025,
1595
+ "grad_norm": 0.9722088575363159,
1596
+ "learning_rate": 3.638551118512089e-05,
1597
+ "loss": 0.7248,
1598
+ "step": 227
1599
+ },
1600
+ {
1601
+ "epoch": 0.7307692307692307,
1602
+ "grad_norm": 0.7296696901321411,
1603
+ "learning_rate": 3.558895885496023e-05,
1604
+ "loss": 0.7329,
1605
+ "step": 228
1606
+ },
1607
+ {
1608
+ "epoch": 0.7339743589743589,
1609
+ "grad_norm": 1.253202199935913,
1610
+ "learning_rate": 3.479933074573858e-05,
1611
+ "loss": 0.8777,
1612
+ "step": 229
1613
+ },
1614
+ {
1615
+ "epoch": 0.7371794871794872,
1616
+ "grad_norm": 1.1534240245819092,
1617
+ "learning_rate": 3.401671174289469e-05,
1618
+ "loss": 0.7485,
1619
+ "step": 230
1620
+ },
1621
+ {
1622
+ "epoch": 0.7403846153846154,
1623
+ "grad_norm": 1.1224077939987183,
1624
+ "learning_rate": 3.324118597838464e-05,
1625
+ "loss": 0.7728,
1626
+ "step": 231
1627
+ },
1628
+ {
1629
+ "epoch": 0.7435897435897436,
1630
+ "grad_norm": 0.8976945281028748,
1631
+ "learning_rate": 3.2472836821637744e-05,
1632
+ "loss": 0.7541,
1633
+ "step": 232
1634
+ },
1635
+ {
1636
+ "epoch": 0.7467948717948718,
1637
+ "grad_norm": 0.840302050113678,
1638
+ "learning_rate": 3.1711746870594086e-05,
1639
+ "loss": 0.6973,
1640
+ "step": 233
1641
+ },
1642
+ {
1643
+ "epoch": 0.75,
1644
+ "grad_norm": 0.988998293876648,
1645
+ "learning_rate": 3.0957997942825336e-05,
1646
+ "loss": 0.6981,
1647
+ "step": 234
1648
+ },
1649
+ {
1650
+ "epoch": 0.7532051282051282,
1651
+ "grad_norm": 1.0511542558670044,
1652
+ "learning_rate": 3.021167106673928e-05,
1653
+ "loss": 0.6618,
1654
+ "step": 235
1655
+ },
1656
+ {
1657
+ "epoch": 0.7564102564102564,
1658
+ "grad_norm": 1.1155120134353638,
1659
+ "learning_rate": 2.9472846472869298e-05,
1660
+ "loss": 0.7775,
1661
+ "step": 236
1662
+ },
1663
+ {
1664
+ "epoch": 0.7596153846153846,
1665
+ "grad_norm": 1.064674735069275,
1666
+ "learning_rate": 2.874160358524931e-05,
1667
+ "loss": 0.747,
1668
+ "step": 237
1669
+ },
1670
+ {
1671
+ "epoch": 0.7628205128205128,
1672
+ "grad_norm": 1.1070621013641357,
1673
+ "learning_rate": 2.8018021012875994e-05,
1674
+ "loss": 0.8083,
1675
+ "step": 238
1676
+ },
1677
+ {
1678
+ "epoch": 0.7660256410256411,
1679
+ "grad_norm": 0.8089228868484497,
1680
+ "learning_rate": 2.7302176541257986e-05,
1681
+ "loss": 0.7037,
1682
+ "step": 239
1683
+ },
1684
+ {
1685
+ "epoch": 0.7692307692307693,
1686
+ "grad_norm": 0.7307572960853577,
1687
+ "learning_rate": 2.659414712405398e-05,
1688
+ "loss": 0.7206,
1689
+ "step": 240
1690
+ },
1691
+ {
1692
+ "epoch": 0.7724358974358975,
1693
+ "grad_norm": 1.0370497703552246,
1694
+ "learning_rate": 2.5894008874800325e-05,
1695
+ "loss": 0.7379,
1696
+ "step": 241
1697
+ },
1698
+ {
1699
+ "epoch": 0.7756410256410257,
1700
+ "grad_norm": 1.3511935472488403,
1701
+ "learning_rate": 2.5201837058728505e-05,
1702
+ "loss": 0.7198,
1703
+ "step": 242
1704
+ },
1705
+ {
1706
+ "epoch": 0.7788461538461539,
1707
+ "grad_norm": 0.8478284478187561,
1708
+ "learning_rate": 2.451770608467432e-05,
1709
+ "loss": 0.8123,
1710
+ "step": 243
1711
+ },
1712
+ {
1713
+ "epoch": 0.782051282051282,
1714
+ "grad_norm": 1.0344876050949097,
1715
+ "learning_rate": 2.3841689497078746e-05,
1716
+ "loss": 0.7769,
1717
+ "step": 244
1718
+ },
1719
+ {
1720
+ "epoch": 0.7852564102564102,
1721
+ "grad_norm": 1.0734611749649048,
1722
+ "learning_rate": 2.3173859968081944e-05,
1723
+ "loss": 0.7204,
1724
+ "step": 245
1725
+ },
1726
+ {
1727
+ "epoch": 0.7884615384615384,
1728
+ "grad_norm": 1.7516003847122192,
1729
+ "learning_rate": 2.251428928971102e-05,
1730
+ "loss": 0.7088,
1731
+ "step": 246
1732
+ },
1733
+ {
1734
+ "epoch": 0.7916666666666666,
1735
+ "grad_norm": 1.0216751098632812,
1736
+ "learning_rate": 2.1863048366162208e-05,
1737
+ "loss": 0.7828,
1738
+ "step": 247
1739
+ },
1740
+ {
1741
+ "epoch": 0.7948717948717948,
1742
+ "grad_norm": 1.4974614381790161,
1743
+ "learning_rate": 2.1220207206178688e-05,
1744
+ "loss": 0.8549,
1745
+ "step": 248
1746
+ },
1747
+ {
1748
+ "epoch": 0.7980769230769231,
1749
+ "grad_norm": 1.2746527194976807,
1750
+ "learning_rate": 2.058583491552465e-05,
1751
+ "loss": 0.7863,
1752
+ "step": 249
1753
+ },
1754
+ {
1755
+ "epoch": 0.8012820512820513,
1756
+ "grad_norm": 1.0694308280944824,
1757
+ "learning_rate": 1.995999968955641e-05,
1758
+ "loss": 0.7612,
1759
+ "step": 250
1760
+ },
1761
+ {
1762
+ "epoch": 0.8044871794871795,
1763
+ "grad_norm": 1.4098316431045532,
1764
+ "learning_rate": 1.9342768805891178e-05,
1765
+ "loss": 0.7792,
1766
+ "step": 251
1767
+ },
1768
+ {
1769
+ "epoch": 0.8076923076923077,
1770
+ "grad_norm": 0.9499948024749756,
1771
+ "learning_rate": 1.8734208617174988e-05,
1772
+ "loss": 0.8431,
1773
+ "step": 252
1774
+ },
1775
+ {
1776
+ "epoch": 0.8108974358974359,
1777
+ "grad_norm": 1.0735160112380981,
1778
+ "learning_rate": 1.8134384543949478e-05,
1779
+ "loss": 0.6933,
1780
+ "step": 253
1781
+ },
1782
+ {
1783
+ "epoch": 0.8141025641025641,
1784
+ "grad_norm": 1.3675204515457153,
1785
+ "learning_rate": 1.754336106761927e-05,
1786
+ "loss": 0.7038,
1787
+ "step": 254
1788
+ },
1789
+ {
1790
+ "epoch": 0.8173076923076923,
1791
+ "grad_norm": 1.0891550779342651,
1792
+ "learning_rate": 1.696120172352025e-05,
1793
+ "loss": 0.6681,
1794
+ "step": 255
1795
+ },
1796
+ {
1797
+ "epoch": 0.8205128205128205,
1798
+ "grad_norm": 1.0907151699066162,
1799
+ "learning_rate": 1.6387969094089316e-05,
1800
+ "loss": 0.8043,
1801
+ "step": 256
1802
+ },
1803
+ {
1804
+ "epoch": 0.8237179487179487,
1805
+ "grad_norm": 0.935387134552002,
1806
+ "learning_rate": 1.5823724802136865e-05,
1807
+ "loss": 0.8365,
1808
+ "step": 257
1809
+ },
1810
+ {
1811
+ "epoch": 0.8269230769230769,
1812
+ "grad_norm": 0.8595330715179443,
1813
+ "learning_rate": 1.526852950422226e-05,
1814
+ "loss": 0.7014,
1815
+ "step": 258
1816
+ },
1817
+ {
1818
+ "epoch": 0.8301282051282052,
1819
+ "grad_norm": 1.1038745641708374,
1820
+ "learning_rate": 1.4722442884133214e-05,
1821
+ "loss": 0.7596,
1822
+ "step": 259
1823
+ },
1824
+ {
1825
+ "epoch": 0.8333333333333334,
1826
+ "grad_norm": 1.1582711935043335,
1827
+ "learning_rate": 1.4185523646469822e-05,
1828
+ "loss": 0.6689,
1829
+ "step": 260
1830
+ },
1831
+ {
1832
+ "epoch": 0.8365384615384616,
1833
+ "grad_norm": 1.1454054117202759,
1834
+ "learning_rate": 1.3657829510333654e-05,
1835
+ "loss": 0.8129,
1836
+ "step": 261
1837
+ },
1838
+ {
1839
+ "epoch": 0.8397435897435898,
1840
+ "grad_norm": 1.1534847021102905,
1841
+ "learning_rate": 1.3139417203123027e-05,
1842
+ "loss": 0.6611,
1843
+ "step": 262
1844
+ },
1845
+ {
1846
+ "epoch": 0.842948717948718,
1847
+ "grad_norm": 1.1782026290893555,
1848
+ "learning_rate": 1.263034245443473e-05,
1849
+ "loss": 0.7166,
1850
+ "step": 263
1851
+ },
1852
+ {
1853
+ "epoch": 0.8461538461538461,
1854
+ "grad_norm": 1.3515312671661377,
1855
+ "learning_rate": 1.2130659990073146e-05,
1856
+ "loss": 0.789,
1857
+ "step": 264
1858
+ },
1859
+ {
1860
+ "epoch": 0.8493589743589743,
1861
+ "grad_norm": 1.0562694072723389,
1862
+ "learning_rate": 1.1640423526166988e-05,
1863
+ "loss": 0.7457,
1864
+ "step": 265
1865
+ },
1866
+ {
1867
+ "epoch": 0.8525641025641025,
1868
+ "grad_norm": 1.0518815517425537,
1869
+ "learning_rate": 1.1159685763395111e-05,
1870
+ "loss": 0.7977,
1871
+ "step": 266
1872
+ },
1873
+ {
1874
+ "epoch": 0.8557692307692307,
1875
+ "grad_norm": 1.1951806545257568,
1876
+ "learning_rate": 1.0688498381320855e-05,
1877
+ "loss": 0.7813,
1878
+ "step": 267
1879
+ },
1880
+ {
1881
+ "epoch": 0.8589743589743589,
1882
+ "grad_norm": 1.2347551584243774,
1883
+ "learning_rate": 1.0226912032836611e-05,
1884
+ "loss": 0.6702,
1885
+ "step": 268
1886
+ },
1887
+ {
1888
+ "epoch": 0.8621794871794872,
1889
+ "grad_norm": 2.0265257358551025,
1890
+ "learning_rate": 9.774976338718677e-06,
1891
+ "loss": 0.9113,
1892
+ "step": 269
1893
+ },
1894
+ {
1895
+ "epoch": 0.8653846153846154,
1896
+ "grad_norm": 1.1159110069274902,
1897
+ "learning_rate": 9.332739882292752e-06,
1898
+ "loss": 0.7241,
1899
+ "step": 270
1900
+ },
1901
+ {
1902
+ "epoch": 0.8685897435897436,
1903
+ "grad_norm": 1.0101889371871948,
1904
+ "learning_rate": 8.900250204211514e-06,
1905
+ "loss": 0.7407,
1906
+ "step": 271
1907
+ },
1908
+ {
1909
+ "epoch": 0.8717948717948718,
1910
+ "grad_norm": 1.1704083681106567,
1911
+ "learning_rate": 8.47755379734373e-06,
1912
+ "loss": 0.6598,
1913
+ "step": 272
1914
+ },
1915
+ {
1916
+ "epoch": 0.875,
1917
+ "grad_norm": 0.9426171779632568,
1918
+ "learning_rate": 8.064696101776358e-06,
1919
+ "loss": 0.7204,
1920
+ "step": 273
1921
+ },
1922
+ {
1923
+ "epoch": 0.8782051282051282,
1924
+ "grad_norm": 1.2830696105957031,
1925
+ "learning_rate": 7.661721499929753e-06,
1926
+ "loss": 0.7574,
1927
+ "step": 274
1928
+ },
1929
+ {
1930
+ "epoch": 0.8814102564102564,
1931
+ "grad_norm": 1.5801949501037598,
1932
+ "learning_rate": 7.2686733117863784e-06,
1933
+ "loss": 0.7968,
1934
+ "step": 275
1935
+ },
1936
+ {
1937
+ "epoch": 0.8846153846153846,
1938
+ "grad_norm": 1.4336384534835815,
1939
+ "learning_rate": 6.8855937902340576e-06,
1940
+ "loss": 0.6974,
1941
+ "step": 276
1942
+ },
1943
+ {
1944
+ "epoch": 0.8878205128205128,
1945
+ "grad_norm": 0.9879603385925293,
1946
+ "learning_rate": 6.512524116523633e-06,
1947
+ "loss": 0.634,
1948
+ "step": 277
1949
+ },
1950
+ {
1951
+ "epoch": 0.8910256410256411,
1952
+ "grad_norm": 1.2858949899673462,
1953
+ "learning_rate": 6.149504395842087e-06,
1954
+ "loss": 0.7762,
1955
+ "step": 278
1956
+ },
1957
+ {
1958
+ "epoch": 0.8942307692307693,
1959
+ "grad_norm": 1.6432108879089355,
1960
+ "learning_rate": 5.7965736530010916e-06,
1961
+ "loss": 0.821,
1962
+ "step": 279
1963
+ },
1964
+ {
1965
+ "epoch": 0.8974358974358975,
1966
+ "grad_norm": 1.3427728414535522,
1967
+ "learning_rate": 5.453769828241872e-06,
1968
+ "loss": 0.7391,
1969
+ "step": 280
1970
+ },
1971
+ {
1972
+ "epoch": 0.9006410256410257,
1973
+ "grad_norm": 1.405120849609375,
1974
+ "learning_rate": 5.121129773156663e-06,
1975
+ "loss": 0.7824,
1976
+ "step": 281
1977
+ },
1978
+ {
1979
+ "epoch": 0.9038461538461539,
1980
+ "grad_norm": 1.232499122619629,
1981
+ "learning_rate": 4.798689246727006e-06,
1982
+ "loss": 0.7732,
1983
+ "step": 282
1984
+ },
1985
+ {
1986
+ "epoch": 0.907051282051282,
1987
+ "grad_norm": 1.4786527156829834,
1988
+ "learning_rate": 4.486482911479839e-06,
1989
+ "loss": 0.7186,
1990
+ "step": 283
1991
+ },
1992
+ {
1993
+ "epoch": 0.9102564102564102,
1994
+ "grad_norm": 1.3841599225997925,
1995
+ "learning_rate": 4.184544329761009e-06,
1996
+ "loss": 0.7316,
1997
+ "step": 284
1998
+ },
1999
+ {
2000
+ "epoch": 0.9134615384615384,
2001
+ "grad_norm": 1.110377550125122,
2002
+ "learning_rate": 3.892905960127546e-06,
2003
+ "loss": 0.7166,
2004
+ "step": 285
2005
+ },
2006
+ {
2007
+ "epoch": 0.9166666666666666,
2008
+ "grad_norm": 1.2394565343856812,
2009
+ "learning_rate": 3.611599153858214e-06,
2010
+ "loss": 0.6894,
2011
+ "step": 286
2012
+ },
2013
+ {
2014
+ "epoch": 0.9198717948717948,
2015
+ "grad_norm": 2.2001726627349854,
2016
+ "learning_rate": 3.3406541515832003e-06,
2017
+ "loss": 0.8162,
2018
+ "step": 287
2019
+ },
2020
+ {
2021
+ "epoch": 0.9230769230769231,
2022
+ "grad_norm": 1.696381688117981,
2023
+ "learning_rate": 3.0801000800333877e-06,
2024
+ "loss": 0.6764,
2025
+ "step": 288
2026
+ },
2027
+ {
2028
+ "epoch": 0.9262820512820513,
2029
+ "grad_norm": 1.1135708093643188,
2030
+ "learning_rate": 2.8299649489090475e-06,
2031
+ "loss": 0.7059,
2032
+ "step": 289
2033
+ },
2034
+ {
2035
+ "epoch": 0.9294871794871795,
2036
+ "grad_norm": 1.3587653636932373,
2037
+ "learning_rate": 2.590275647868867e-06,
2038
+ "loss": 0.7251,
2039
+ "step": 290
2040
+ },
2041
+ {
2042
+ "epoch": 0.9326923076923077,
2043
+ "grad_norm": 1.686203956604004,
2044
+ "learning_rate": 2.3610579436393e-06,
2045
+ "loss": 0.6625,
2046
+ "step": 291
2047
+ },
2048
+ {
2049
+ "epoch": 0.9358974358974359,
2050
+ "grad_norm": 1.2855104207992554,
2051
+ "learning_rate": 2.1423364772445887e-06,
2052
+ "loss": 0.7562,
2053
+ "step": 292
2054
+ },
2055
+ {
2056
+ "epoch": 0.9391025641025641,
2057
+ "grad_norm": 1.6745336055755615,
2058
+ "learning_rate": 1.9341347613579087e-06,
2059
+ "loss": 0.7244,
2060
+ "step": 293
2061
+ },
2062
+ {
2063
+ "epoch": 0.9423076923076923,
2064
+ "grad_norm": 1.2314362525939941,
2065
+ "learning_rate": 1.7364751777736332e-06,
2066
+ "loss": 0.6981,
2067
+ "step": 294
2068
+ },
2069
+ {
2070
+ "epoch": 0.9455128205128205,
2071
+ "grad_norm": 0.9058342576026917,
2072
+ "learning_rate": 1.5493789750014031e-06,
2073
+ "loss": 0.7398,
2074
+ "step": 295
2075
+ },
2076
+ {
2077
+ "epoch": 0.9487179487179487,
2078
+ "grad_norm": 1.4387075901031494,
2079
+ "learning_rate": 1.3728662659818204e-06,
2080
+ "loss": 0.6769,
2081
+ "step": 296
2082
+ },
2083
+ {
2084
+ "epoch": 0.9519230769230769,
2085
+ "grad_norm": 1.3964595794677734,
2086
+ "learning_rate": 1.2069560259243328e-06,
2087
+ "loss": 0.685,
2088
+ "step": 297
2089
+ },
2090
+ {
2091
+ "epoch": 0.9551282051282052,
2092
+ "grad_norm": 1.1776217222213745,
2093
+ "learning_rate": 1.0516660902673448e-06,
2094
+ "loss": 0.7934,
2095
+ "step": 298
2096
+ },
2097
+ {
2098
+ "epoch": 0.9583333333333334,
2099
+ "grad_norm": 1.2884339094161987,
2100
+ "learning_rate": 9.070131527609604e-07,
2101
+ "loss": 0.6712,
2102
+ "step": 299
2103
+ },
2104
+ {
2105
+ "epoch": 0.9615384615384616,
2106
+ "grad_norm": 1.7066468000411987,
2107
+ "learning_rate": 7.730127636723539e-07,
2108
+ "loss": 0.6243,
2109
+ "step": 300
2110
+ },
2111
+ {
2112
+ "epoch": 0.9647435897435898,
2113
+ "grad_norm": 0.9889491200447083,
2114
+ "learning_rate": 6.496793281141056e-07,
2115
+ "loss": 0.6915,
2116
+ "step": 301
2117
+ },
2118
+ {
2119
+ "epoch": 0.967948717948718,
2120
+ "grad_norm": 0.9342219233512878,
2121
+ "learning_rate": 5.370261044956971e-07,
2122
+ "loss": 0.7163,
2123
+ "step": 302
2124
+ },
2125
+ {
2126
+ "epoch": 0.9711538461538461,
2127
+ "grad_norm": 1.2161084413528442,
2128
+ "learning_rate": 4.3506520309813947e-07,
2129
+ "loss": 0.6957,
2130
+ "step": 303
2131
+ },
2132
+ {
2133
+ "epoch": 0.9743589743589743,
2134
+ "grad_norm": 1.4092503786087036,
2135
+ "learning_rate": 3.4380758477219333e-07,
2136
+ "loss": 0.7295,
2137
+ "step": 304
2138
+ },
2139
+ {
2140
+ "epoch": 0.9775641025641025,
2141
+ "grad_norm": 1.5113911628723145,
2142
+ "learning_rate": 2.6326305976001055e-07,
2143
+ "loss": 0.6846,
2144
+ "step": 305
2145
+ },
2146
+ {
2147
+ "epoch": 0.9807692307692307,
2148
+ "grad_norm": 1.2269924879074097,
2149
+ "learning_rate": 1.9344028664056713e-07,
2150
+ "loss": 0.7242,
2151
+ "step": 306
2152
+ },
2153
+ {
2154
+ "epoch": 0.9839743589743589,
2155
+ "grad_norm": 1.3103538751602173,
2156
+ "learning_rate": 1.3434677139885222e-07,
2157
+ "loss": 0.6923,
2158
+ "step": 307
2159
+ },
2160
+ {
2161
+ "epoch": 0.9871794871794872,
2162
+ "grad_norm": 1.182924747467041,
2163
+ "learning_rate": 8.598886661895788e-08,
2164
+ "loss": 0.693,
2165
+ "step": 308
2166
+ },
2167
+ {
2168
+ "epoch": 0.9903846153846154,
2169
+ "grad_norm": 1.258489966392517,
2170
+ "learning_rate": 4.837177080119215e-08,
2171
+ "loss": 0.7017,
2172
+ "step": 309
2173
+ },
2174
+ {
2175
+ "epoch": 0.9935897435897436,
2176
+ "grad_norm": 1.2968997955322266,
2177
+ "learning_rate": 2.1499527803214846e-08,
2178
+ "loss": 0.7148,
2179
+ "step": 310
2180
+ },
2181
+ {
2182
+ "epoch": 0.9967948717948718,
2183
+ "grad_norm": 1.3392126560211182,
2184
+ "learning_rate": 5.375026405352035e-09,
2185
+ "loss": 0.7221,
2186
+ "step": 311
2187
+ },
2188
+ {
2189
+ "epoch": 1.0,
2190
+ "grad_norm": 1.7223619222640991,
2191
+ "learning_rate": 0.0,
2192
+ "loss": 0.5389,
2193
+ "step": 312
2194
+ }
2195
+ ],
2196
+ "logging_steps": 1,
2197
+ "max_steps": 312,
2198
+ "num_input_tokens_seen": 0,
2199
+ "num_train_epochs": 1,
2200
+ "save_steps": 500,
2201
+ "stateful_callbacks": {
2202
+ "TrainerControl": {
2203
+ "args": {
2204
+ "should_epoch_stop": false,
2205
+ "should_evaluate": false,
2206
+ "should_log": false,
2207
+ "should_save": true,
2208
+ "should_training_stop": true
2209
+ },
2210
+ "attributes": {}
2211
+ }
2212
+ },
2213
+ "total_flos": 3978364611723264.0,
2214
+ "train_batch_size": 16,
2215
+ "trial_name": null,
2216
+ "trial_params": null
2217
+ }
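
Note: the `learning_rate` values in the log above trace a cosine decay that reaches 0.0 exactly at step 312 (`max_steps`). A minimal sketch that regenerates such a curve with the `transformers` scheduler helper; the peak LR of 2e-4 and the 8 warmup steps are illustrative assumptions, not values recorded in this file:

```python
# Sketch: reproduce a cosine LR curve like the tail logged above.
# peak lr (2e-4) and num_warmup_steps=8 are assumptions for illustration.
import torch
from transformers import get_cosine_schedule_with_warmup

opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=2e-4)
sched = get_cosine_schedule_with_warmup(opt, num_warmup_steps=8, num_training_steps=312)

for step in range(312):
    opt.step()
    sched.step()

print(sched.get_last_lr())  # [0.0] -- the schedule hits zero at step 312, as logged
```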
checkpoint-312/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8381fa317ea32919a2925037ed9df681449cd986caf35e43c8ea133a79d4cc2
+ size 6456
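
Note: three-line stanzas like the one above are Git LFS pointers, not the binaries themselves; the actual bytes are fetched via `git lfs pull` or `huggingface_hub`. A minimal sketch for checking a downloaded file against the digest recorded in its pointer (the local path assumes a clone of this repo):

```python
# Sketch: verify a downloaded LFS artifact against the sha256 from its pointer file.
import hashlib

def sha256_of(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest()

# Digest copied from the pointer above; the path assumes a local clone.
expected = "d8381fa317ea32919a2925037ed9df681449cd986caf35e43c8ea133a79d4cc2"
print(sha256_of("checkpoint-312/training_args.bin") == expected)
```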
checkpoint-363/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-363/adapter_config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": null,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_bias": false,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 8,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "gate_proj",
+     "down_proj",
+     "v_proj",
+     "k_proj",
+     "o_proj",
+     "up_proj",
+     "q_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
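
Note: the config above is a standard PEFT LoRA adapter (r=8, alpha=16, dropout 0.05) over all seven TinyLlama projection matrices. A minimal loading sketch; the checkpoint path is a placeholder for wherever this adapter directory is checked out locally:

```python
# Sketch: attach this LoRA adapter to the base model it was trained from.
# "checkpoint-363" is a placeholder local path to the adapter directory.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = PeftModel.from_pretrained(base, "checkpoint-363")
model.eval()
```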
checkpoint-363/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa819f5afaaf986d91b55dff514533e38e7e6f1208309d3e7737646b55f6c00c
+ size 25271744
checkpoint-363/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc66ed55f5edb45f17b75f7a2e0a00db8a6b7afa50e8f5163e5c32a42c413ebe
+ size 13685836
checkpoint-363/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:63905123f21867a969842b3650d5858830a9f958f6d66a0ec0ae5af10dcfc123
+ size 14244
checkpoint-363/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a3cdfee7f940157d7e68d9516390e12b7802eca5630eddbc7a6e04bf73df963
+ size 1064
checkpoint-363/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
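
Note: the map above reuses `</s>` as `pad_token` because Llama tokenizers ship no dedicated pad token. One common consequence, sketched below: for batched generation, padding is usually switched to the left (the right-padding configured in this repo matches training; the switch is a usage assumption):

```python
# Sketch: eos-as-pad plus left padding for batched generation.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tok.pad_token = tok.eos_token      # mirrors "pad_token": "</s>" above
tok.padding_side = "left"          # generation-time choice (assumption)
batch = tok(["1. e4", "1. d4 d5 2. c4"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)    # both prompts padded to the longer length
```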
checkpoint-363/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-363/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-363/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "legacy": false,
+   "model_max_length": 2048,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
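
Note: the `chat_template` field above is the Zephyr-style `<|system|>` / `<|user|>` / `<|assistant|>` format TinyLlama-Chat was tuned on. A minimal rendering sketch; the message content is a made-up example:

```python
# Sketch: render a prompt through the chat_template defined above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
messages = [
    {"role": "user", "content": "1. e4 e5 2. Nf3 Nc6 3. Bb5 -- evaluate this position."},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # <|user|>\n...</s>\n<|assistant|>
```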
checkpoint-363/trainer_state.json ADDED
@@ -0,0 +1,1300 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 363,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01652892561983471,
13
+ "grad_norm": 12.011792182922363,
14
+ "learning_rate": 2.857142857142857e-05,
15
+ "loss": 4.7518,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.03305785123966942,
20
+ "grad_norm": 12.689871788024902,
21
+ "learning_rate": 5.714285714285714e-05,
22
+ "loss": 4.687,
23
+ "step": 4
24
+ },
25
+ {
26
+ "epoch": 0.049586776859504134,
27
+ "grad_norm": 11.848198890686035,
28
+ "learning_rate": 8.571428571428571e-05,
29
+ "loss": 4.4922,
30
+ "step": 6
31
+ },
32
+ {
33
+ "epoch": 0.06611570247933884,
34
+ "grad_norm": 10.378408432006836,
35
+ "learning_rate": 0.00011428571428571428,
36
+ "loss": 3.4839,
37
+ "step": 8
38
+ },
39
+ {
40
+ "epoch": 0.08264462809917356,
41
+ "grad_norm": 9.543920516967773,
42
+ "learning_rate": 0.00014285714285714287,
43
+ "loss": 2.4164,
44
+ "step": 10
45
+ },
46
+ {
47
+ "epoch": 0.09917355371900827,
48
+ "grad_norm": 5.602456092834473,
49
+ "learning_rate": 0.00017142857142857143,
50
+ "loss": 1.838,
51
+ "step": 12
52
+ },
53
+ {
54
+ "epoch": 0.11570247933884298,
55
+ "grad_norm": 1.7414193153381348,
56
+ "learning_rate": 0.0002,
57
+ "loss": 1.4404,
58
+ "step": 14
59
+ },
60
+ {
61
+ "epoch": 0.1322314049586777,
62
+ "grad_norm": 1.0191770792007446,
63
+ "learning_rate": 0.0001999910643210378,
64
+ "loss": 1.3474,
65
+ "step": 16
66
+ },
67
+ {
68
+ "epoch": 0.1487603305785124,
69
+ "grad_norm": 3.509352922439575,
70
+ "learning_rate": 0.0001999642588810784,
71
+ "loss": 1.3789,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 0.1652892561983471,
76
+ "grad_norm": 2.426680088043213,
77
+ "learning_rate": 0.00019991958847061784,
78
+ "loss": 1.2245,
79
+ "step": 20
80
+ },
81
+ {
82
+ "epoch": 0.18181818181818182,
83
+ "grad_norm": 0.8060101270675659,
84
+ "learning_rate": 0.00019985706107286514,
85
+ "loss": 1.2259,
86
+ "step": 22
87
+ },
88
+ {
89
+ "epoch": 0.19834710743801653,
90
+ "grad_norm": 1.6215940713882446,
91
+ "learning_rate": 0.00019977668786231534,
92
+ "loss": 1.1992,
93
+ "step": 24
94
+ },
95
+ {
96
+ "epoch": 0.21487603305785125,
97
+ "grad_norm": 2.476259231567383,
98
+ "learning_rate": 0.0001996784832027525,
99
+ "loss": 1.2782,
100
+ "step": 26
101
+ },
102
+ {
103
+ "epoch": 0.23140495867768596,
104
+ "grad_norm": 2.04018235206604,
105
+ "learning_rate": 0.00019956246464468294,
106
+ "loss": 1.23,
107
+ "step": 28
108
+ },
109
+ {
110
+ "epoch": 0.24793388429752067,
111
+ "grad_norm": 1.8799017667770386,
112
+ "learning_rate": 0.00019942865292219838,
113
+ "loss": 1.2164,
114
+ "step": 30
115
+ },
116
+ {
117
+ "epoch": 0.2644628099173554,
118
+ "grad_norm": 1.2413148880004883,
119
+ "learning_rate": 0.00019927707194927066,
120
+ "loss": 1.2924,
121
+ "step": 32
122
+ },
123
+ {
124
+ "epoch": 0.2809917355371901,
125
+ "grad_norm": 1.8723937273025513,
126
+ "learning_rate": 0.000199107748815478,
127
+ "loss": 1.2302,
128
+ "step": 34
129
+ },
130
+ {
131
+ "epoch": 0.2975206611570248,
132
+ "grad_norm": 1.393110752105713,
133
+ "learning_rate": 0.00019892071378116376,
134
+ "loss": 1.2276,
135
+ "step": 36
136
+ },
137
+ {
138
+ "epoch": 0.3140495867768595,
139
+ "grad_norm": 1.1459721326828003,
140
+ "learning_rate": 0.0001987160002720283,
141
+ "loss": 1.1504,
142
+ "step": 38
143
+ },
144
+ {
145
+ "epoch": 0.3305785123966942,
146
+ "grad_norm": 1.4680942296981812,
147
+ "learning_rate": 0.00019849364487315558,
148
+ "loss": 1.1623,
149
+ "step": 40
150
+ },
151
+ {
152
+ "epoch": 0.34710743801652894,
153
+ "grad_norm": 1.8715866804122925,
154
+ "learning_rate": 0.0001982536873224748,
155
+ "loss": 1.2155,
156
+ "step": 42
157
+ },
158
+ {
159
+ "epoch": 0.36363636363636365,
160
+ "grad_norm": 0.871064305305481,
161
+ "learning_rate": 0.0001979961705036587,
162
+ "loss": 1.1594,
163
+ "step": 44
164
+ },
165
+ {
166
+ "epoch": 0.38016528925619836,
167
+ "grad_norm": 0.8239800930023193,
168
+ "learning_rate": 0.00019772114043845965,
169
+ "loss": 1.1501,
170
+ "step": 46
171
+ },
172
+ {
173
+ "epoch": 0.39669421487603307,
174
+ "grad_norm": 0.9587319493293762,
175
+ "learning_rate": 0.0001974286462784851,
176
+ "loss": 1.1195,
177
+ "step": 48
178
+ },
179
+ {
180
+ "epoch": 0.4132231404958678,
181
+ "grad_norm": 1.1645926237106323,
182
+ "learning_rate": 0.0001971187402964132,
183
+ "loss": 1.1417,
184
+ "step": 50
185
+ },
186
+ {
187
+ "epoch": 0.4297520661157025,
188
+ "grad_norm": 0.576813817024231,
189
+ "learning_rate": 0.00019679147787665126,
190
+ "loss": 1.1445,
191
+ "step": 52
192
+ },
193
+ {
194
+ "epoch": 0.4462809917355372,
195
+ "grad_norm": 1.0733133554458618,
196
+ "learning_rate": 0.00019644691750543767,
197
+ "loss": 1.0979,
198
+ "step": 54
199
+ },
200
+ {
201
+ "epoch": 0.4628099173553719,
202
+ "grad_norm": 0.5801639556884766,
203
+ "learning_rate": 0.00019608512076038962,
204
+ "loss": 1.0977,
205
+ "step": 56
206
+ },
207
+ {
208
+ "epoch": 0.4793388429752066,
209
+ "grad_norm": 1.6796538829803467,
210
+ "learning_rate": 0.00019570615229949842,
211
+ "loss": 1.1925,
212
+ "step": 58
213
+ },
214
+ {
215
+ "epoch": 0.49586776859504134,
216
+ "grad_norm": 1.0563887357711792,
217
+ "learning_rate": 0.00019531007984957408,
218
+ "loss": 1.0657,
219
+ "step": 60
220
+ },
221
+ {
222
+ "epoch": 0.512396694214876,
223
+ "grad_norm": 0.9109811186790466,
224
+ "learning_rate": 0.00019489697419414182,
225
+ "loss": 1.1098,
226
+ "step": 62
227
+ },
228
+ {
229
+ "epoch": 0.5289256198347108,
230
+ "grad_norm": 0.7321667671203613,
231
+ "learning_rate": 0.0001944669091607919,
232
+ "loss": 1.0929,
233
+ "step": 64
234
+ },
235
+ {
236
+ "epoch": 0.5454545454545454,
237
+ "grad_norm": 0.685366690158844,
238
+ "learning_rate": 0.00019401996160798573,
239
+ "loss": 1.1242,
240
+ "step": 66
241
+ },
242
+ {
243
+ "epoch": 0.5619834710743802,
244
+ "grad_norm": 0.8959838151931763,
245
+ "learning_rate": 0.0001935562114113202,
246
+ "loss": 1.181,
247
+ "step": 68
248
+ },
249
+ {
250
+ "epoch": 0.5785123966942148,
251
+ "grad_norm": 0.9717262983322144,
252
+ "learning_rate": 0.00019307574144925287,
253
+ "loss": 1.2295,
254
+ "step": 70
255
+ },
256
+ {
257
+ "epoch": 0.5950413223140496,
258
+ "grad_norm": 1.0358582735061646,
259
+ "learning_rate": 0.00019257863758829035,
260
+ "loss": 1.1431,
261
+ "step": 72
262
+ },
263
+ {
264
+ "epoch": 0.6115702479338843,
265
+ "grad_norm": 0.7998526096343994,
266
+ "learning_rate": 0.00019206498866764288,
267
+ "loss": 1.1032,
268
+ "step": 74
269
+ },
270
+ {
271
+ "epoch": 0.628099173553719,
272
+ "grad_norm": 1.1496188640594482,
273
+ "learning_rate": 0.0001915348864833476,
274
+ "loss": 1.057,
275
+ "step": 76
276
+ },
277
+ {
278
+ "epoch": 0.6446280991735537,
279
+ "grad_norm": 0.652406632900238,
280
+ "learning_rate": 0.00019098842577186314,
281
+ "loss": 1.146,
282
+ "step": 78
283
+ },
284
+ {
285
+ "epoch": 0.6611570247933884,
286
+ "grad_norm": 0.9454944729804993,
287
+ "learning_rate": 0.00019042570419313925,
288
+ "loss": 1.1543,
289
+ "step": 80
290
+ },
291
+ {
292
+ "epoch": 0.6776859504132231,
293
+ "grad_norm": 0.7456652522087097,
294
+ "learning_rate": 0.00018984682231316333,
295
+ "loss": 1.1189,
296
+ "step": 82
297
+ },
298
+ {
299
+ "epoch": 0.6942148760330579,
300
+ "grad_norm": 0.7312512397766113,
301
+ "learning_rate": 0.00018925188358598813,
302
+ "loss": 1.0873,
303
+ "step": 84
304
+ },
305
+ {
306
+ "epoch": 0.7107438016528925,
307
+ "grad_norm": 0.8474765419960022,
308
+ "learning_rate": 0.000188640994335243,
309
+ "loss": 1.1698,
310
+ "step": 86
311
+ },
312
+ {
313
+ "epoch": 0.7272727272727273,
314
+ "grad_norm": 0.6979633569717407,
315
+ "learning_rate": 0.0001880142637351325,
316
+ "loss": 1.1417,
317
+ "step": 88
318
+ },
319
+ {
320
+ "epoch": 0.743801652892562,
321
+ "grad_norm": 0.5989161133766174,
322
+ "learning_rate": 0.00018737180379092537,
323
+ "loss": 1.0479,
324
+ "step": 90
325
+ },
326
+ {
327
+ "epoch": 0.7603305785123967,
328
+ "grad_norm": 0.5765272378921509,
329
+ "learning_rate": 0.00018671372931893773,
330
+ "loss": 1.1336,
331
+ "step": 92
332
+ },
333
+ {
334
+ "epoch": 0.7768595041322314,
335
+ "grad_norm": 0.6709849834442139,
336
+ "learning_rate": 0.00018604015792601396,
337
+ "loss": 1.1157,
338
+ "step": 94
339
+ },
340
+ {
341
+ "epoch": 0.7933884297520661,
342
+ "grad_norm": 0.8181343674659729,
343
+ "learning_rate": 0.00018535120998850848,
344
+ "loss": 1.0927,
345
+ "step": 96
346
+ },
347
+ {
348
+ "epoch": 0.8099173553719008,
349
+ "grad_norm": 0.6146332621574402,
350
+ "learning_rate": 0.00018464700863077312,
351
+ "loss": 1.0739,
352
+ "step": 98
353
+ },
354
+ {
355
+ "epoch": 0.8264462809917356,
356
+ "grad_norm": 0.9904415011405945,
357
+ "learning_rate": 0.00018392767970315313,
358
+ "loss": 1.0331,
359
+ "step": 100
360
+ },
361
+ {
362
+ "epoch": 0.8429752066115702,
363
+ "grad_norm": 0.6186695694923401,
364
+ "learning_rate": 0.0001831933517594957,
365
+ "loss": 1.0513,
366
+ "step": 102
367
+ },
368
+ {
369
+ "epoch": 0.859504132231405,
370
+ "grad_norm": 1.1912785768508911,
371
+ "learning_rate": 0.00018244415603417603,
372
+ "loss": 1.1567,
373
+ "step": 104
374
+ },
375
+ {
376
+ "epoch": 0.8760330578512396,
377
+ "grad_norm": 1.3681318759918213,
378
+ "learning_rate": 0.00018168022641864377,
379
+ "loss": 1.1497,
380
+ "step": 106
381
+ },
382
+ {
383
+ "epoch": 0.8925619834710744,
384
+ "grad_norm": 0.619476318359375,
385
+ "learning_rate": 0.00018090169943749476,
386
+ "loss": 1.1546,
387
+ "step": 108
388
+ },
389
+ {
390
+ "epoch": 0.9090909090909091,
391
+ "grad_norm": 0.7421219348907471,
392
+ "learning_rate": 0.00018010871422407236,
393
+ "loss": 1.1458,
394
+ "step": 110
395
+ },
396
+ {
397
+ "epoch": 0.9256198347107438,
398
+ "grad_norm": 0.6569286584854126,
399
+ "learning_rate": 0.00017930141249560233,
400
+ "loss": 1.12,
401
+ "step": 112
402
+ },
403
+ {
404
+ "epoch": 0.9421487603305785,
405
+ "grad_norm": 0.4168110191822052,
406
+ "learning_rate": 0.0001784799385278661,
407
+ "loss": 1.1682,
408
+ "step": 114
409
+ },
410
+ {
411
+ "epoch": 0.9586776859504132,
412
+ "grad_norm": 0.5620162487030029,
413
+ "learning_rate": 0.00017764443912941672,
414
+ "loss": 1.1828,
415
+ "step": 116
416
+ },
417
+ {
418
+ "epoch": 0.9752066115702479,
419
+ "grad_norm": 0.8095484375953674,
420
+ "learning_rate": 0.00017679506361534215,
421
+ "loss": 1.1953,
422
+ "step": 118
423
+ },
424
+ {
425
+ "epoch": 0.9917355371900827,
426
+ "grad_norm": 0.7646257281303406,
427
+ "learning_rate": 0.0001759319637805806,
428
+ "loss": 1.2148,
429
+ "step": 120
430
+ },
431
+ {
432
+ "epoch": 1.0082644628099173,
433
+ "grad_norm": 0.5254501104354858,
434
+ "learning_rate": 0.00017505529387279277,
435
+ "loss": 1.1359,
436
+ "step": 122
437
+ },
438
+ {
439
+ "epoch": 1.024793388429752,
440
+ "grad_norm": 0.6001765727996826,
441
+ "learning_rate": 0.00017416521056479577,
442
+ "loss": 1.1336,
443
+ "step": 124
444
+ },
445
+ {
446
+ "epoch": 1.0413223140495869,
447
+ "grad_norm": 0.35407504439353943,
448
+ "learning_rate": 0.00017326187292656333,
449
+ "loss": 1.1833,
450
+ "step": 126
451
+ },
452
+ {
453
+ "epoch": 1.0578512396694215,
454
+ "grad_norm": 0.414528489112854,
455
+ "learning_rate": 0.00017234544239679806,
456
+ "loss": 1.1301,
457
+ "step": 128
458
+ },
459
+ {
460
+ "epoch": 1.0743801652892562,
461
+ "grad_norm": 0.46355852484703064,
462
+ "learning_rate": 0.00017141608275408006,
463
+ "loss": 1.213,
464
+ "step": 130
465
+ },
466
+ {
467
+ "epoch": 1.0909090909090908,
468
+ "grad_norm": 0.5040593147277832,
469
+ "learning_rate": 0.00017047396008759754,
470
+ "loss": 1.132,
471
+ "step": 132
472
+ },
473
+ {
474
+ "epoch": 1.1074380165289257,
475
+ "grad_norm": 0.4813704192638397,
476
+ "learning_rate": 0.00016951924276746425,
477
+ "loss": 1.0831,
478
+ "step": 134
479
+ },
480
+ {
481
+ "epoch": 1.1239669421487604,
482
+ "grad_norm": 0.5174686312675476,
483
+ "learning_rate": 0.00016855210141462963,
484
+ "loss": 1.0514,
485
+ "step": 136
486
+ },
487
+ {
488
+ "epoch": 1.140495867768595,
489
+ "grad_norm": 0.4712466299533844,
490
+ "learning_rate": 0.00016757270887038654,
491
+ "loss": 1.1334,
492
+ "step": 138
493
+ },
494
+ {
495
+ "epoch": 1.1570247933884297,
496
+ "grad_norm": 0.5912173390388489,
497
+ "learning_rate": 0.00016658124016548197,
498
+ "loss": 1.1011,
499
+ "step": 140
500
+ },
501
+ {
502
+ "epoch": 1.1735537190082646,
503
+ "grad_norm": 0.6392802000045776,
504
+ "learning_rate": 0.00016557787248883696,
505
+ "loss": 1.1361,
506
+ "step": 142
507
+ },
508
+ {
509
+ "epoch": 1.1900826446280992,
510
+ "grad_norm": 0.7376368045806885,
511
+ "learning_rate": 0.00016456278515588024,
512
+ "loss": 1.109,
513
+ "step": 144
514
+ },
515
+ {
516
+ "epoch": 1.2066115702479339,
517
+ "grad_norm": 0.5020875930786133,
518
+ "learning_rate": 0.00016353615957650236,
519
+ "loss": 1.0925,
520
+ "step": 146
521
+ },
522
+ {
523
+ "epoch": 1.2231404958677685,
524
+ "grad_norm": 0.8081740736961365,
525
+ "learning_rate": 0.00016249817922263517,
526
+ "loss": 1.047,
527
+ "step": 148
528
+ },
529
+ {
530
+ "epoch": 1.2396694214876034,
531
+ "grad_norm": 0.6371219754219055,
532
+ "learning_rate": 0.00016144902959546286,
533
+ "loss": 1.113,
534
+ "step": 150
535
+ },
536
+ {
537
+ "epoch": 1.256198347107438,
538
+ "grad_norm": 0.7588189840316772,
539
+ "learning_rate": 0.00016038889819227045,
540
+ "loss": 1.1179,
541
+ "step": 152
542
+ },
543
+ {
544
+ "epoch": 1.2727272727272727,
545
+ "grad_norm": 0.6286205053329468,
546
+ "learning_rate": 0.00015931797447293552,
547
+ "loss": 1.1209,
548
+ "step": 154
549
+ },
550
+ {
551
+ "epoch": 1.2892561983471074,
552
+ "grad_norm": 0.797656238079071,
553
+ "learning_rate": 0.00015823644982606905,
554
+ "loss": 1.1698,
555
+ "step": 156
556
+ },
557
+ {
558
+ "epoch": 1.3057851239669422,
559
+ "grad_norm": 0.5368632078170776,
560
+ "learning_rate": 0.00015714451753481168,
561
+ "loss": 1.1973,
562
+ "step": 158
563
+ },
564
+ {
565
+ "epoch": 1.322314049586777,
566
+ "grad_norm": 0.4135212302207947,
567
+ "learning_rate": 0.00015604237274229147,
568
+ "loss": 1.1452,
569
+ "step": 160
570
+ },
571
+ {
572
+ "epoch": 1.3388429752066116,
573
+ "grad_norm": 0.5289668440818787,
574
+ "learning_rate": 0.00015493021241674918,
575
+ "loss": 1.1954,
576
+ "step": 162
577
+ },
578
+ {
579
+ "epoch": 1.3553719008264462,
580
+ "grad_norm": 0.4092061221599579,
581
+ "learning_rate": 0.00015380823531633729,
582
+ "loss": 1.1226,
583
+ "step": 164
584
+ },
585
+ {
586
+ "epoch": 1.3719008264462809,
587
+ "grad_norm": 0.7049645781517029,
588
+ "learning_rate": 0.00015267664195359917,
589
+ "loss": 1.0948,
590
+ "step": 166
591
+ },
592
+ {
593
+ "epoch": 1.3884297520661157,
594
+ "grad_norm": 0.47164198756217957,
595
+ "learning_rate": 0.00015153563455963499,
596
+ "loss": 1.0977,
597
+ "step": 168
598
+ },
599
+ {
600
+ "epoch": 1.4049586776859504,
601
+ "grad_norm": 0.7871695160865784,
602
+ "learning_rate": 0.00015038541704796003,
603
+ "loss": 1.1674,
604
+ "step": 170
605
+ },
606
+ {
607
+ "epoch": 1.421487603305785,
608
+ "grad_norm": 0.5381121635437012,
609
+ "learning_rate": 0.00014922619497806277,
610
+ "loss": 1.1415,
611
+ "step": 172
612
+ },
613
+ {
614
+ "epoch": 1.43801652892562,
615
+ "grad_norm": 0.39419299364089966,
616
+ "learning_rate": 0.00014805817551866838,
617
+ "loss": 1.0747,
618
+ "step": 174
619
+ },
620
+ {
621
+ "epoch": 1.4545454545454546,
622
+ "grad_norm": 0.38382914662361145,
623
+ "learning_rate": 0.00014688156741071514,
624
+ "loss": 1.1278,
625
+ "step": 176
626
+ },
627
+ {
628
+ "epoch": 1.4710743801652892,
629
+ "grad_norm": 0.32674962282180786,
630
+ "learning_rate": 0.00014569658093004935,
631
+ "loss": 0.9774,
632
+ "step": 178
633
+ },
634
+ {
635
+ "epoch": 1.487603305785124,
636
+ "grad_norm": 0.5443088412284851,
637
+ "learning_rate": 0.00014450342784984633,
638
+ "loss": 1.034,
639
+ "step": 180
640
+ },
641
+ {
642
+ "epoch": 1.5041322314049586,
643
+ "grad_norm": 0.6682401895523071,
644
+ "learning_rate": 0.00014330232140276366,
645
+ "loss": 1.1732,
646
+ "step": 182
647
+ },
648
+ {
649
+ "epoch": 1.5206611570247934,
650
+ "grad_norm": 0.5696044564247131,
651
+ "learning_rate": 0.0001420934762428335,
652
+ "loss": 1.0384,
653
+ "step": 184
654
+ },
655
+ {
656
+ "epoch": 1.537190082644628,
657
+ "grad_norm": 0.6782551407814026,
658
+ "learning_rate": 0.0001408771084071012,
659
+ "loss": 1.1107,
660
+ "step": 186
661
+ },
662
+ {
663
+ "epoch": 1.553719008264463,
664
+ "grad_norm": 0.8336123824119568,
665
+ "learning_rate": 0.00013965343527701628,
666
+ "loss": 1.0737,
667
+ "step": 188
668
+ },
669
+ {
670
+ "epoch": 1.5702479338842976,
671
+ "grad_norm": 0.539226233959198,
672
+ "learning_rate": 0.00013842267553958371,
673
+ "loss": 1.1665,
674
+ "step": 190
675
+ },
676
+ {
677
+ "epoch": 1.5867768595041323,
678
+ "grad_norm": 0.566620409488678,
679
+ "learning_rate": 0.00013718504914828135,
680
+ "loss": 1.1333,
681
+ "step": 192
682
+ },
683
+ {
684
+ "epoch": 1.603305785123967,
685
+ "grad_norm": 0.4735005795955658,
686
+ "learning_rate": 0.00013594077728375128,
687
+ "loss": 1.1709,
688
+ "step": 194
689
+ },
690
+ {
691
+ "epoch": 1.6198347107438016,
692
+ "grad_norm": 0.534383237361908,
693
+ "learning_rate": 0.00013469008231427207,
694
+ "loss": 1.0783,
695
+ "step": 196
696
+ },
697
+ {
698
+ "epoch": 1.6363636363636362,
699
+ "grad_norm": 0.8410363793373108,
700
+ "learning_rate": 0.0001334331877560182,
701
+ "loss": 1.0708,
702
+ "step": 198
703
+ },
704
+ {
705
+ "epoch": 1.6528925619834711,
706
+ "grad_norm": 0.6392219662666321,
707
+ "learning_rate": 0.00013217031823311488,
708
+ "loss": 1.0329,
709
+ "step": 200
710
+ },
711
+ {
712
+ "epoch": 1.6694214876033058,
713
+ "grad_norm": 0.5770404934883118,
714
+ "learning_rate": 0.00013090169943749476,
715
+ "loss": 1.0404,
716
+ "step": 202
717
+ },
718
+ {
719
+ "epoch": 1.6859504132231407,
720
+ "grad_norm": 0.6814575791358948,
721
+ "learning_rate": 0.00012962755808856342,
722
+ "loss": 1.0702,
723
+ "step": 204
724
+ },
725
+ {
726
+ "epoch": 1.7024793388429753,
727
+ "grad_norm": 0.673312783241272,
728
+ "learning_rate": 0.0001283481218926818,
729
+ "loss": 1.0529,
730
+ "step": 206
731
+ },
732
+ {
733
+ "epoch": 1.71900826446281,
734
+ "grad_norm": 0.6180073618888855,
735
+ "learning_rate": 0.0001270636195024719,
736
+ "loss": 1.0257,
737
+ "step": 208
738
+ },
739
+ {
740
+ "epoch": 1.7355371900826446,
741
+ "grad_norm": 0.5565724968910217,
742
+ "learning_rate": 0.00012577428047595344,
743
+ "loss": 1.1102,
744
+ "step": 210
745
+ },
746
+ {
747
+ "epoch": 1.7520661157024793,
748
+ "grad_norm": 0.5586270689964294,
749
+ "learning_rate": 0.00012448033523551865,
750
+ "loss": 1.0277,
751
+ "step": 212
752
+ },
753
+ {
754
+ "epoch": 1.768595041322314,
755
+ "grad_norm": 0.542448878288269,
756
+ "learning_rate": 0.00012318201502675285,
757
+ "loss": 1.0988,
758
+ "step": 214
759
+ },
760
+ {
761
+ "epoch": 1.7851239669421488,
762
+ "grad_norm": 0.513042151927948,
763
+ "learning_rate": 0.0001218795518771075,
764
+ "loss": 1.0828,
765
+ "step": 216
766
+ },
767
+ {
768
+ "epoch": 1.8016528925619835,
769
+ "grad_norm": 0.7613060474395752,
770
+ "learning_rate": 0.00012057317855443395,
771
+ "loss": 1.1962,
772
+ "step": 218
773
+ },
774
+ {
775
+ "epoch": 1.8181818181818183,
776
+ "grad_norm": 0.7522129416465759,
777
+ "learning_rate": 0.00011926312852538455,
778
+ "loss": 1.1339,
779
+ "step": 220
780
+ },
781
+ {
782
+ "epoch": 1.834710743801653,
783
+ "grad_norm": 0.4655594825744629,
784
+ "learning_rate": 0.00011794963591368893,
785
+ "loss": 1.0967,
786
+ "step": 222
787
+ },
788
+ {
789
+ "epoch": 1.8512396694214877,
790
+ "grad_norm": 0.5036570429801941,
791
+ "learning_rate": 0.00011663293545831302,
792
+ "loss": 1.0361,
793
+ "step": 224
794
+ },
795
+ {
796
+ "epoch": 1.8677685950413223,
797
+ "grad_norm": 0.43016380071640015,
798
+ "learning_rate": 0.00011531326247150803,
799
+ "loss": 1.1281,
800
+ "step": 226
801
+ },
802
+ {
803
+ "epoch": 1.884297520661157,
804
+ "grad_norm": 0.5184316635131836,
805
+ "learning_rate": 0.00011399085279675687,
806
+ "loss": 1.2083,
807
+ "step": 228
808
+ },
809
+ {
810
+ "epoch": 1.9008264462809916,
811
+ "grad_norm": 0.6556355357170105,
812
+ "learning_rate": 0.0001126659427666257,
813
+ "loss": 1.0266,
814
+ "step": 230
815
+ },
816
+ {
817
+ "epoch": 1.9173553719008265,
818
+ "grad_norm": 0.515681803226471,
819
+ "learning_rate": 0.00011133876916052821,
820
+ "loss": 1.0472,
821
+ "step": 232
822
+ },
823
+ {
824
+ "epoch": 1.9338842975206612,
825
+ "grad_norm": 0.4592064321041107,
826
+ "learning_rate": 0.00011000956916240985,
827
+ "loss": 1.054,
828
+ "step": 234
829
+ },
830
+ {
831
+ "epoch": 1.950413223140496,
832
+ "grad_norm": 0.5623230338096619,
833
+ "learning_rate": 0.00010867858031835975,
834
+ "loss": 1.1571,
835
+ "step": 236
836
+ },
837
+ {
838
+ "epoch": 1.9669421487603307,
839
+ "grad_norm": 0.5241667032241821,
840
+ "learning_rate": 0.00010734604049415822,
841
+ "loss": 1.0985,
842
+ "step": 238
843
+ },
844
+ {
845
+ "epoch": 1.9834710743801653,
846
+ "grad_norm": 0.54905104637146,
847
+ "learning_rate": 0.00010601218783276672,
848
+ "loss": 1.1088,
849
+ "step": 240
850
+ },
851
+ {
852
+ "epoch": 2.0,
853
+ "grad_norm": 0.8823345303535461,
854
+ "learning_rate": 0.00010467726071176853,
855
+ "loss": 1.0991,
856
+ "step": 242
857
+ },
858
+ {
859
+ "epoch": 2.0165289256198347,
860
+ "grad_norm": 0.5789780020713806,
861
+ "learning_rate": 0.00010334149770076747,
862
+ "loss": 1.1429,
863
+ "step": 244
864
+ },
865
+ {
866
+ "epoch": 2.0330578512396693,
867
+ "grad_norm": 0.6136715412139893,
868
+ "learning_rate": 0.00010200513751875227,
869
+ "loss": 1.0347,
870
+ "step": 246
871
+ },
872
+ {
873
+ "epoch": 2.049586776859504,
874
+ "grad_norm": 0.6462894082069397,
875
+ "learning_rate": 0.00010066841899143425,
876
+ "loss": 1.1008,
877
+ "step": 248
878
+ },
879
+ {
880
+ "epoch": 2.0661157024793386,
881
+ "grad_norm": 0.7272156476974487,
882
+ "learning_rate": 9.93315810085658e-05,
883
+ "loss": 1.1642,
884
+ "step": 250
885
+ },
886
+ {
887
+ "epoch": 2.0826446280991737,
888
+ "grad_norm": 0.5664356350898743,
889
+ "learning_rate": 9.799486248124775e-05,
890
+ "loss": 1.0799,
891
+ "step": 252
892
+ },
893
+ {
894
+ "epoch": 2.0991735537190084,
895
+ "grad_norm": 0.559697687625885,
896
+ "learning_rate": 9.665850229923258e-05,
897
+ "loss": 1.0536,
898
+ "step": 254
899
+ },
900
+ {
901
+ "epoch": 2.115702479338843,
902
+ "grad_norm": 0.7208287715911865,
903
+ "learning_rate": 9.532273928823151e-05,
904
+ "loss": 1.0595,
905
+ "step": 256
906
+ },
907
+ {
908
+ "epoch": 2.1322314049586777,
909
+ "grad_norm": 0.6985677480697632,
910
+ "learning_rate": 9.398781216723331e-05,
911
+ "loss": 1.0626,
912
+ "step": 258
913
+ },
914
+ {
915
+ "epoch": 2.1487603305785123,
916
+ "grad_norm": 0.7583006024360657,
917
+ "learning_rate": 9.26539595058418e-05,
918
+ "loss": 0.9919,
919
+ "step": 260
920
+ },
921
+ {
922
+ "epoch": 2.165289256198347,
923
+ "grad_norm": 0.6283029317855835,
924
+ "learning_rate": 9.132141968164026e-05,
925
+ "loss": 1.1292,
926
+ "step": 262
927
+ },
928
+ {
929
+ "epoch": 2.1818181818181817,
930
+ "grad_norm": 1.4162311553955078,
931
+ "learning_rate": 8.999043083759017e-05,
932
+ "loss": 1.1834,
933
+ "step": 264
934
+ },
935
+ {
936
+ "epoch": 2.1983471074380168,
937
+ "grad_norm": 0.5169870853424072,
938
+ "learning_rate": 8.866123083947182e-05,
939
+ "loss": 1.049,
940
+ "step": 266
941
+ },
942
+ {
943
+ "epoch": 2.2148760330578514,
944
+ "grad_norm": 0.9136308431625366,
945
+ "learning_rate": 8.733405723337432e-05,
946
+ "loss": 1.1336,
947
+ "step": 268
948
+ },
949
+ {
950
+ "epoch": 2.231404958677686,
951
+ "grad_norm": 0.6882078647613525,
952
+ "learning_rate": 8.600914720324316e-05,
953
+ "loss": 1.0953,
954
+ "step": 270
955
+ },
956
+ {
957
+ "epoch": 2.2479338842975207,
958
+ "grad_norm": 0.505447506904602,
959
+ "learning_rate": 8.4686737528492e-05,
960
+ "loss": 1.1014,
961
+ "step": 272
962
+ },
963
+ {
964
+ "epoch": 2.2644628099173554,
965
+ "grad_norm": 0.40685173869132996,
966
+ "learning_rate": 8.336706454168701e-05,
967
+ "loss": 1.0878,
968
+ "step": 274
969
+ },
970
+ {
971
+ "epoch": 2.28099173553719,
972
+ "grad_norm": 0.5715451836585999,
973
+ "learning_rate": 8.20503640863111e-05,
974
+ "loss": 1.0555,
975
+ "step": 276
976
+ },
977
+ {
978
+ "epoch": 2.2975206611570247,
979
+ "grad_norm": 0.5256667733192444,
980
+ "learning_rate": 8.073687147461547e-05,
981
+ "loss": 1.0678,
982
+ "step": 278
983
+ },
984
+ {
985
+ "epoch": 2.3140495867768593,
986
+ "grad_norm": 0.46319133043289185,
987
+ "learning_rate": 7.942682144556604e-05,
988
+ "loss": 1.0699,
989
+ "step": 280
990
+ },
991
+ {
992
+ "epoch": 2.330578512396694,
993
+ "grad_norm": 0.6000749468803406,
994
+ "learning_rate": 7.812044812289249e-05,
995
+ "loss": 1.1389,
996
+ "step": 282
997
+ },
998
+ {
999
+ "epoch": 2.347107438016529,
1000
+ "grad_norm": 0.5024062395095825,
1001
+ "learning_rate": 7.681798497324716e-05,
1002
+ "loss": 1.1033,
1003
+ "step": 284
1004
+ },
1005
+ {
1006
+ "epoch": 2.3636363636363638,
1007
+ "grad_norm": 0.6761860251426697,
1008
+ "learning_rate": 7.55196647644814e-05,
1009
+ "loss": 1.0665,
1010
+ "step": 286
1011
+ },
1012
+ {
1013
+ "epoch": 2.3801652892561984,
1014
+ "grad_norm": 0.7175805568695068,
1015
+ "learning_rate": 7.422571952404663e-05,
1016
+ "loss": 1.0511,
1017
+ "step": 288
1018
+ },
1019
+ {
1020
+ "epoch": 2.396694214876033,
1021
+ "grad_norm": 0.6883595585823059,
1022
+ "learning_rate": 7.293638049752812e-05,
1023
+ "loss": 1.0765,
1024
+ "step": 290
1025
+ },
1026
+ {
1027
+ "epoch": 2.4132231404958677,
1028
+ "grad_norm": 0.6382430791854858,
1029
+ "learning_rate": 7.165187810731823e-05,
1030
+ "loss": 1.1036,
1031
+ "step": 292
1032
+ },
1033
+ {
1034
+ "epoch": 2.4297520661157024,
1035
+ "grad_norm": 0.9323483109474182,
1036
+ "learning_rate": 7.037244191143661e-05,
1037
+ "loss": 1.1113,
1038
+ "step": 294
1039
+ },
1040
+ {
1041
+ "epoch": 2.446280991735537,
1042
+ "grad_norm": 0.9768907427787781,
1043
+ "learning_rate": 6.909830056250527e-05,
1044
+ "loss": 1.0983,
1045
+ "step": 296
1046
+ },
1047
+ {
1048
+ "epoch": 2.462809917355372,
1049
+ "grad_norm": 0.6783613562583923,
1050
+ "learning_rate": 6.782968176688514e-05,
1051
+ "loss": 1.1141,
1052
+ "step": 298
1053
+ },
1054
+ {
1055
+ "epoch": 2.479338842975207,
1056
+ "grad_norm": 0.753887951374054,
1057
+ "learning_rate": 6.656681224398183e-05,
1058
+ "loss": 1.1234,
1059
+ "step": 300
1060
+ },
1061
+ {
1062
+ "epoch": 2.4958677685950414,
1063
+ "grad_norm": 0.6788628697395325,
1064
+ "learning_rate": 6.530991768572794e-05,
1065
+ "loss": 1.0407,
1066
+ "step": 302
1067
+ },
1068
+ {
1069
+ "epoch": 2.512396694214876,
1070
+ "grad_norm": 0.6727221012115479,
1071
+ "learning_rate": 6.405922271624874e-05,
1072
+ "loss": 1.091,
1073
+ "step": 304
1074
+ },
1075
+ {
1076
+ "epoch": 2.5289256198347108,
1077
+ "grad_norm": 0.7418019771575928,
1078
+ "learning_rate": 6.281495085171869e-05,
1079
+ "loss": 1.0884,
1080
+ "step": 306
1081
+ },
1082
+ {
1083
+ "epoch": 2.5454545454545454,
1084
+ "grad_norm": 0.7063189148902893,
1085
+ "learning_rate": 6.15773244604163e-05,
1086
+ "loss": 1.0218,
1087
+ "step": 308
1088
+ },
1089
+ {
1090
+ "epoch": 2.56198347107438,
1091
+ "grad_norm": 1.0706840753555298,
1092
+ "learning_rate": 6.0346564722983736e-05,
1093
+ "loss": 1.0873,
1094
+ "step": 310
1095
+ },
1096
+ {
1097
+ "epoch": 2.5785123966942147,
1098
+ "grad_norm": 0.683253288269043,
1099
+ "learning_rate": 5.912289159289883e-05,
1100
+ "loss": 1.0346,
1101
+ "step": 312
1102
+ },
1103
+ {
1104
+ "epoch": 2.5950413223140494,
1105
+ "grad_norm": 0.5816308856010437,
1106
+ "learning_rate": 5.790652375716652e-05,
1107
+ "loss": 1.0113,
1108
+ "step": 314
1109
+ },
1110
+ {
1111
+ "epoch": 2.6115702479338845,
1112
+ "grad_norm": 0.7935141324996948,
1113
+ "learning_rate": 5.6697678597236356e-05,
1114
+ "loss": 1.1771,
1115
+ "step": 316
1116
+ },
1117
+ {
1118
+ "epoch": 2.628099173553719,
1119
+ "grad_norm": 0.6827735900878906,
1120
+ "learning_rate": 5.549657215015367e-05,
1121
+ "loss": 1.0899,
1122
+ "step": 318
1123
+ },
1124
+ {
1125
+ "epoch": 2.644628099173554,
1126
+ "grad_norm": 0.5408580899238586,
1127
+ "learning_rate": 5.430341906995065e-05,
1128
+ "loss": 1.1056,
1129
+ "step": 320
1130
+ },
1131
+ {
1132
+ "epoch": 2.6611570247933884,
1133
+ "grad_norm": 0.6996527910232544,
1134
+ "learning_rate": 5.31184325892849e-05,
1135
+ "loss": 1.0494,
1136
+ "step": 322
1137
+ },
1138
+ {
1139
+ "epoch": 2.677685950413223,
1140
+ "grad_norm": 0.6700767874717712,
1141
+ "learning_rate": 5.1941824481331626e-05,
1142
+ "loss": 1.1206,
1143
+ "step": 324
1144
+ },
1145
+ {
1146
+ "epoch": 2.6942148760330578,
1147
+ "grad_norm": 0.7690112590789795,
1148
+ "learning_rate": 5.077380502193725e-05,
1149
+ "loss": 1.1541,
1150
+ "step": 326
1151
+ },
1152
+ {
1153
+ "epoch": 2.7107438016528924,
1154
+ "grad_norm": 0.7921670079231262,
1155
+ "learning_rate": 4.961458295203999e-05,
1156
+ "loss": 1.0684,
1157
+ "step": 328
1158
+ },
1159
+ {
1160
+ "epoch": 2.7272727272727275,
1161
+ "grad_norm": 0.6989784240722656,
1162
+ "learning_rate": 4.8464365440365044e-05,
1163
+ "loss": 1.0097,
1164
+ "step": 330
1165
+ },
1166
+ {
1167
+ "epoch": 2.7438016528925617,
1168
+ "grad_norm": 0.6038585901260376,
1169
+ "learning_rate": 4.7323358046400844e-05,
1170
+ "loss": 1.0751,
1171
+ "step": 332
1172
+ },
1173
+ {
1174
+ "epoch": 2.760330578512397,
1175
+ "grad_norm": 0.8160024881362915,
1176
+ "learning_rate": 4.6191764683662744e-05,
1177
+ "loss": 0.9734,
1178
+ "step": 334
1179
+ },
1180
+ {
1181
+ "epoch": 2.7768595041322315,
1182
+ "grad_norm": 0.7741029858589172,
1183
+ "learning_rate": 4.506978758325081e-05,
1184
+ "loss": 1.1186,
1185
+ "step": 336
1186
+ },
1187
+ {
1188
+ "epoch": 2.793388429752066,
1189
+ "grad_norm": 0.8177542090415955,
1190
+ "learning_rate": 4.395762725770852e-05,
1191
+ "loss": 1.1538,
1192
+ "step": 338
1193
+ },
1194
+ {
1195
+ "epoch": 2.809917355371901,
1196
+ "grad_norm": 0.7617602944374084,
1197
+ "learning_rate": 4.285548246518837e-05,
1198
+ "loss": 1.1162,
1199
+ "step": 340
1200
+ },
1201
+ {
1202
+ "epoch": 2.8264462809917354,
1203
+ "grad_norm": 0.6426255106925964,
1204
+ "learning_rate": 4.176355017393099e-05,
1205
+ "loss": 1.0404,
1206
+ "step": 342
1207
+ },
1208
+ {
1209
+ "epoch": 2.84297520661157,
1210
+ "grad_norm": 0.6394379138946533,
1211
+ "learning_rate": 4.0682025527064486e-05,
1212
+ "loss": 1.1107,
1213
+ "step": 344
1214
+ },
1215
+ {
1216
+ "epoch": 2.8595041322314048,
1217
+ "grad_norm": 0.6632633209228516,
1218
+ "learning_rate": 3.961110180772955e-05,
1219
+ "loss": 1.1085,
1220
+ "step": 346
1221
+ },
1222
+ {
1223
+ "epoch": 2.87603305785124,
1224
+ "grad_norm": 0.5437737703323364,
1225
+ "learning_rate": 3.8550970404537144e-05,
1226
+ "loss": 1.0072,
1227
+ "step": 348
1228
+ },
1229
+ {
1230
+ "epoch": 2.8925619834710745,
1231
+ "grad_norm": 0.8178601264953613,
1232
+ "learning_rate": 3.750182077736486e-05,
1233
+ "loss": 1.0606,
1234
+ "step": 350
1235
+ },
1236
+ {
1237
+ "epoch": 2.909090909090909,
1238
+ "grad_norm": 0.8268756866455078,
1239
+ "learning_rate": 3.646384042349764e-05,
1240
+ "loss": 0.9642,
1241
+ "step": 352
1242
+ },
1243
+ {
1244
+ "epoch": 2.925619834710744,
1245
+ "grad_norm": 1.064460277557373,
1246
+ "learning_rate": 3.543721484411976e-05,
1247
+ "loss": 1.0881,
1248
+ "step": 354
1249
+ },
1250
+ {
1251
+ "epoch": 2.9421487603305785,
1252
+ "grad_norm": 0.7875952124595642,
1253
+ "learning_rate": 3.442212751116305e-05,
1254
+ "loss": 1.0141,
1255
+ "step": 356
1256
+ },
1257
+ {
1258
+ "epoch": 2.958677685950413,
1259
+ "grad_norm": 0.6071433424949646,
1260
+ "learning_rate": 3.3418759834518056e-05,
1261
+ "loss": 1.0692,
1262
+ "step": 358
1263
+ },
1264
+ {
1265
+ "epoch": 2.975206611570248,
1266
+ "grad_norm": 0.6311193704605103,
1267
+ "learning_rate": 3.24272911296135e-05,
1268
+ "loss": 0.9481,
1269
+ "step": 360
1270
+ },
1271
+ {
1272
+ "epoch": 2.991735537190083,
1273
+ "grad_norm": 0.7501534223556519,
1274
+ "learning_rate": 3.1447898585370384e-05,
1275
+ "loss": 1.1183,
1276
+ "step": 362
1277
+ }
1278
+ ],
1279
+ "logging_steps": 2,
1280
+ "max_steps": 484,
1281
+ "num_input_tokens_seen": 0,
1282
+ "num_train_epochs": 4,
1283
+ "save_steps": 500,
1284
+ "stateful_callbacks": {
1285
+ "TrainerControl": {
1286
+ "args": {
1287
+ "should_epoch_stop": false,
1288
+ "should_evaluate": false,
1289
+ "should_log": false,
1290
+ "should_save": true,
1291
+ "should_training_stop": false
1292
+ },
1293
+ "attributes": {}
1294
+ }
1295
+ },
1296
+ "total_flos": 6858422416834560.0,
1297
+ "train_batch_size": 16,
1298
+ "trial_name": null,
1299
+ "trial_params": null
1300
+ }
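
Note: `log_history` above is plain JSON, so the loss and LR curves can be pulled straight out of the checkpoint for inspection. A minimal sketch (the path assumes a local clone of this repo):

```python
# Sketch: extract the logged training-loss curve from trainer_state.json.
import json

with open("checkpoint-363/trainer_state.json") as f:
    state = json.load(f)

points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
print(f"{len(points)} logged points; last: step {points[-1][0]}, loss {points[-1][1]:.4f}")
```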
checkpoint-363/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41c7adf20314e4e98465f263bb56b062cceb30de0bf8fa4d85b668c720a84502
+ size 6456
checkpoint-484/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-484/adapter_config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+   "bias": "none",
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": null,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_bias": false,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 8,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "gate_proj",
+     "down_proj",
+     "v_proj",
+     "k_proj",
+     "o_proj",
+     "up_proj",
+     "q_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
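
Note: every checkpoint carries this identical adapter_config.json, so any of them can be folded into the base weights for adapter-free deployment. A minimal sketch using PEFT's merge helper; the paths and output directory name are assumptions:

```python
# Sketch: fold the r=8 LoRA weights into the base model and save a standalone copy.
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = PeftModel.from_pretrained(base, "checkpoint-484")  # placeholder local path
merged = model.merge_and_unload()            # returns a plain transformers model
merged.save_pretrained("tinyllama-chess-merged")  # arbitrary output dir
```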
checkpoint-484/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7fe424a4de02d7bbd037f66fdfa874457df9673741824c6c8c7dce80527da55c
+ size 25271744
checkpoint-484/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f83c5d25655b3897a8e1bace7994f0312ba565ffe9c2f2c15888cec7bfac0686
+ size 13685836
checkpoint-484/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef0fa4de13c24e8107d077d830c61aa7c6410c4cd4d9f472aaeb4a44800c77ff
+ size 14244
checkpoint-484/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c16c78b962934cb2d4fb81e707fc4c46873b51a6b49f76873cdca14ac338be13
+ size 1064
checkpoint-484/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
checkpoint-484/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-484/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-484/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 2048,
38
+ "pad_token": "</s>",
39
+ "padding_side": "right",
40
+ "sp_model_kwargs": {},
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
checkpoint-484/trainer_state.json ADDED
@@ -0,0 +1,1727 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
+ "eval_steps": 500,
6
+ "global_step": 484,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01652892561983471,
13
+ "grad_norm": 12.011792182922363,
14
+ "learning_rate": 2.857142857142857e-05,
15
+ "loss": 4.7518,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.03305785123966942,
20
+ "grad_norm": 12.689871788024902,
21
+ "learning_rate": 5.714285714285714e-05,
22
+ "loss": 4.687,
23
+ "step": 4
24
+ },
25
+ {
26
+ "epoch": 0.049586776859504134,
27
+ "grad_norm": 11.848198890686035,
28
+ "learning_rate": 8.571428571428571e-05,
29
+ "loss": 4.4922,
30
+ "step": 6
31
+ },
32
+ {
33
+ "epoch": 0.06611570247933884,
34
+ "grad_norm": 10.378408432006836,
35
+ "learning_rate": 0.00011428571428571428,
36
+ "loss": 3.4839,
37
+ "step": 8
38
+ },
39
+ {
40
+ "epoch": 0.08264462809917356,
41
+ "grad_norm": 9.543920516967773,
42
+ "learning_rate": 0.00014285714285714287,
43
+ "loss": 2.4164,
44
+ "step": 10
45
+ },
46
+ {
47
+ "epoch": 0.09917355371900827,
48
+ "grad_norm": 5.602456092834473,
49
+ "learning_rate": 0.00017142857142857143,
50
+ "loss": 1.838,
51
+ "step": 12
52
+ },
53
+ {
54
+ "epoch": 0.11570247933884298,
55
+ "grad_norm": 1.7414193153381348,
56
+ "learning_rate": 0.0002,
57
+ "loss": 1.4404,
58
+ "step": 14
59
+ },
60
+ {
61
+ "epoch": 0.1322314049586777,
62
+ "grad_norm": 1.0191770792007446,
63
+ "learning_rate": 0.0001999910643210378,
64
+ "loss": 1.3474,
65
+ "step": 16
66
+ },
67
+ {
68
+ "epoch": 0.1487603305785124,
69
+ "grad_norm": 3.509352922439575,
70
+ "learning_rate": 0.0001999642588810784,
71
+ "loss": 1.3789,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 0.1652892561983471,
76
+ "grad_norm": 2.426680088043213,
77
+ "learning_rate": 0.00019991958847061784,
78
+ "loss": 1.2245,
79
+ "step": 20
80
+ },
81
+ {
82
+ "epoch": 0.18181818181818182,
83
+ "grad_norm": 0.8060101270675659,
84
+ "learning_rate": 0.00019985706107286514,
85
+ "loss": 1.2259,
86
+ "step": 22
87
+ },
88
+ {
89
+ "epoch": 0.19834710743801653,
90
+ "grad_norm": 1.6215940713882446,
91
+ "learning_rate": 0.00019977668786231534,
92
+ "loss": 1.1992,
93
+ "step": 24
94
+ },
95
+ {
96
+ "epoch": 0.21487603305785125,
97
+ "grad_norm": 2.476259231567383,
98
+ "learning_rate": 0.0001996784832027525,
99
+ "loss": 1.2782,
100
+ "step": 26
101
+ },
102
+ {
103
+ "epoch": 0.23140495867768596,
104
+ "grad_norm": 2.04018235206604,
105
+ "learning_rate": 0.00019956246464468294,
106
+ "loss": 1.23,
107
+ "step": 28
108
+ },
109
+ {
110
+ "epoch": 0.24793388429752067,
111
+ "grad_norm": 1.8799017667770386,
112
+ "learning_rate": 0.00019942865292219838,
113
+ "loss": 1.2164,
114
+ "step": 30
115
+ },
116
+ {
117
+ "epoch": 0.2644628099173554,
118
+ "grad_norm": 1.2413148880004883,
119
+ "learning_rate": 0.00019927707194927066,
120
+ "loss": 1.2924,
121
+ "step": 32
122
+ },
123
+ {
124
+ "epoch": 0.2809917355371901,
125
+ "grad_norm": 1.8723937273025513,
126
+ "learning_rate": 0.000199107748815478,
127
+ "loss": 1.2302,
128
+ "step": 34
129
+ },
130
+ {
131
+ "epoch": 0.2975206611570248,
132
+ "grad_norm": 1.393110752105713,
133
+ "learning_rate": 0.00019892071378116376,
134
+ "loss": 1.2276,
135
+ "step": 36
136
+ },
137
+ {
138
+ "epoch": 0.3140495867768595,
139
+ "grad_norm": 1.1459721326828003,
140
+ "learning_rate": 0.0001987160002720283,
141
+ "loss": 1.1504,
142
+ "step": 38
143
+ },
144
+ {
145
+ "epoch": 0.3305785123966942,
146
+ "grad_norm": 1.4680942296981812,
147
+ "learning_rate": 0.00019849364487315558,
148
+ "loss": 1.1623,
149
+ "step": 40
150
+ },
151
+ {
152
+ "epoch": 0.34710743801652894,
153
+ "grad_norm": 1.8715866804122925,
154
+ "learning_rate": 0.0001982536873224748,
155
+ "loss": 1.2155,
156
+ "step": 42
157
+ },
158
+ {
159
+ "epoch": 0.36363636363636365,
160
+ "grad_norm": 0.871064305305481,
161
+ "learning_rate": 0.0001979961705036587,
162
+ "loss": 1.1594,
163
+ "step": 44
164
+ },
165
+ {
166
+ "epoch": 0.38016528925619836,
167
+ "grad_norm": 0.8239800930023193,
168
+ "learning_rate": 0.00019772114043845965,
169
+ "loss": 1.1501,
170
+ "step": 46
171
+ },
172
+ {
173
+ "epoch": 0.39669421487603307,
174
+ "grad_norm": 0.9587319493293762,
175
+ "learning_rate": 0.0001974286462784851,
176
+ "loss": 1.1195,
177
+ "step": 48
178
+ },
179
+ {
180
+ "epoch": 0.4132231404958678,
181
+ "grad_norm": 1.1645926237106323,
182
+ "learning_rate": 0.0001971187402964132,
183
+ "loss": 1.1417,
184
+ "step": 50
185
+ },
186
+ {
187
+ "epoch": 0.4297520661157025,
188
+ "grad_norm": 0.576813817024231,
189
+ "learning_rate": 0.00019679147787665126,
190
+ "loss": 1.1445,
191
+ "step": 52
192
+ },
193
+ {
194
+ "epoch": 0.4462809917355372,
195
+ "grad_norm": 1.0733133554458618,
196
+ "learning_rate": 0.00019644691750543767,
197
+ "loss": 1.0979,
198
+ "step": 54
199
+ },
200
+ {
201
+ "epoch": 0.4628099173553719,
202
+ "grad_norm": 0.5801639556884766,
203
+ "learning_rate": 0.00019608512076038962,
204
+ "loss": 1.0977,
205
+ "step": 56
206
+ },
207
+ {
208
+ "epoch": 0.4793388429752066,
209
+ "grad_norm": 1.6796538829803467,
210
+ "learning_rate": 0.00019570615229949842,
211
+ "loss": 1.1925,
212
+ "step": 58
213
+ },
214
+ {
215
+ "epoch": 0.49586776859504134,
216
+ "grad_norm": 1.0563887357711792,
217
+ "learning_rate": 0.00019531007984957408,
218
+ "loss": 1.0657,
219
+ "step": 60
220
+ },
221
+ {
222
+ "epoch": 0.512396694214876,
223
+ "grad_norm": 0.9109811186790466,
224
+ "learning_rate": 0.00019489697419414182,
225
+ "loss": 1.1098,
226
+ "step": 62
227
+ },
228
+ {
229
+ "epoch": 0.5289256198347108,
230
+ "grad_norm": 0.7321667671203613,
231
+ "learning_rate": 0.0001944669091607919,
232
+ "loss": 1.0929,
233
+ "step": 64
234
+ },
235
+ {
236
+ "epoch": 0.5454545454545454,
237
+ "grad_norm": 0.685366690158844,
238
+ "learning_rate": 0.00019401996160798573,
239
+ "loss": 1.1242,
240
+ "step": 66
241
+ },
242
+ {
243
+ "epoch": 0.5619834710743802,
244
+ "grad_norm": 0.8959838151931763,
245
+ "learning_rate": 0.0001935562114113202,
246
+ "loss": 1.181,
247
+ "step": 68
248
+ },
249
+ {
250
+ "epoch": 0.5785123966942148,
251
+ "grad_norm": 0.9717262983322144,
252
+ "learning_rate": 0.00019307574144925287,
253
+ "loss": 1.2295,
254
+ "step": 70
255
+ },
256
+ {
257
+ "epoch": 0.5950413223140496,
258
+ "grad_norm": 1.0358582735061646,
259
+ "learning_rate": 0.00019257863758829035,
260
+ "loss": 1.1431,
261
+ "step": 72
262
+ },
263
+ {
264
+ "epoch": 0.6115702479338843,
265
+ "grad_norm": 0.7998526096343994,
266
+ "learning_rate": 0.00019206498866764288,
267
+ "loss": 1.1032,
268
+ "step": 74
269
+ },
270
+ {
271
+ "epoch": 0.628099173553719,
272
+ "grad_norm": 1.1496188640594482,
273
+ "learning_rate": 0.0001915348864833476,
274
+ "loss": 1.057,
275
+ "step": 76
276
+ },
277
+ {
278
+ "epoch": 0.6446280991735537,
279
+ "grad_norm": 0.652406632900238,
280
+ "learning_rate": 0.00019098842577186314,
281
+ "loss": 1.146,
282
+ "step": 78
283
+ },
284
+ {
285
+ "epoch": 0.6611570247933884,
286
+ "grad_norm": 0.9454944729804993,
287
+ "learning_rate": 0.00019042570419313925,
288
+ "loss": 1.1543,
289
+ "step": 80
290
+ },
291
+ {
292
+ "epoch": 0.6776859504132231,
293
+ "grad_norm": 0.7456652522087097,
294
+ "learning_rate": 0.00018984682231316333,
295
+ "loss": 1.1189,
296
+ "step": 82
297
+ },
298
+ {
299
+ "epoch": 0.6942148760330579,
300
+ "grad_norm": 0.7312512397766113,
301
+ "learning_rate": 0.00018925188358598813,
302
+ "loss": 1.0873,
303
+ "step": 84
304
+ },
305
+ {
306
+ "epoch": 0.7107438016528925,
307
+ "grad_norm": 0.8474765419960022,
308
+ "learning_rate": 0.000188640994335243,
309
+ "loss": 1.1698,
310
+ "step": 86
311
+ },
312
+ {
313
+ "epoch": 0.7272727272727273,
314
+ "grad_norm": 0.6979633569717407,
315
+ "learning_rate": 0.0001880142637351325,
316
+ "loss": 1.1417,
317
+ "step": 88
318
+ },
319
+ {
320
+ "epoch": 0.743801652892562,
321
+ "grad_norm": 0.5989161133766174,
322
+ "learning_rate": 0.00018737180379092537,
323
+ "loss": 1.0479,
324
+ "step": 90
325
+ },
326
+ {
327
+ "epoch": 0.7603305785123967,
328
+ "grad_norm": 0.5765272378921509,
329
+ "learning_rate": 0.00018671372931893773,
330
+ "loss": 1.1336,
331
+ "step": 92
332
+ },
333
+ {
334
+ "epoch": 0.7768595041322314,
335
+ "grad_norm": 0.6709849834442139,
336
+ "learning_rate": 0.00018604015792601396,
337
+ "loss": 1.1157,
338
+ "step": 94
339
+ },
340
+ {
341
+ "epoch": 0.7933884297520661,
342
+ "grad_norm": 0.8181343674659729,
343
+ "learning_rate": 0.00018535120998850848,
344
+ "loss": 1.0927,
345
+ "step": 96
346
+ },
347
+ {
348
+ "epoch": 0.8099173553719008,
349
+ "grad_norm": 0.6146332621574402,
350
+ "learning_rate": 0.00018464700863077312,
351
+ "loss": 1.0739,
352
+ "step": 98
353
+ },
354
+ {
355
+ "epoch": 0.8264462809917356,
356
+ "grad_norm": 0.9904415011405945,
357
+ "learning_rate": 0.00018392767970315313,
358
+ "loss": 1.0331,
359
+ "step": 100
360
+ },
361
+ {
362
+ "epoch": 0.8429752066115702,
363
+ "grad_norm": 0.6186695694923401,
364
+ "learning_rate": 0.0001831933517594957,
365
+ "loss": 1.0513,
366
+ "step": 102
367
+ },
368
+ {
369
+ "epoch": 0.859504132231405,
370
+ "grad_norm": 1.1912785768508911,
371
+ "learning_rate": 0.00018244415603417603,
372
+ "loss": 1.1567,
373
+ "step": 104
374
+ },
375
+ {
376
+ "epoch": 0.8760330578512396,
377
+ "grad_norm": 1.3681318759918213,
378
+ "learning_rate": 0.00018168022641864377,
379
+ "loss": 1.1497,
380
+ "step": 106
381
+ },
382
+ {
383
+ "epoch": 0.8925619834710744,
384
+ "grad_norm": 0.619476318359375,
385
+ "learning_rate": 0.00018090169943749476,
386
+ "loss": 1.1546,
387
+ "step": 108
388
+ },
389
+ {
390
+ "epoch": 0.9090909090909091,
391
+ "grad_norm": 0.7421219348907471,
392
+ "learning_rate": 0.00018010871422407236,
393
+ "loss": 1.1458,
394
+ "step": 110
395
+ },
396
+ {
397
+ "epoch": 0.9256198347107438,
398
+ "grad_norm": 0.6569286584854126,
399
+ "learning_rate": 0.00017930141249560233,
400
+ "loss": 1.12,
401
+ "step": 112
402
+ },
403
+ {
404
+ "epoch": 0.9421487603305785,
405
+ "grad_norm": 0.4168110191822052,
406
+ "learning_rate": 0.0001784799385278661,
407
+ "loss": 1.1682,
408
+ "step": 114
409
+ },
410
+ {
411
+ "epoch": 0.9586776859504132,
412
+ "grad_norm": 0.5620162487030029,
413
+ "learning_rate": 0.00017764443912941672,
414
+ "loss": 1.1828,
415
+ "step": 116
416
+ },
417
+ {
418
+ "epoch": 0.9752066115702479,
419
+ "grad_norm": 0.8095484375953674,
420
+ "learning_rate": 0.00017679506361534215,
421
+ "loss": 1.1953,
422
+ "step": 118
423
+ },
424
+ {
425
+ "epoch": 0.9917355371900827,
426
+ "grad_norm": 0.7646257281303406,
427
+ "learning_rate": 0.0001759319637805806,
428
+ "loss": 1.2148,
429
+ "step": 120
430
+ },
431
+ {
432
+ "epoch": 1.0082644628099173,
433
+ "grad_norm": 0.5254501104354858,
434
+ "learning_rate": 0.00017505529387279277,
435
+ "loss": 1.1359,
436
+ "step": 122
437
+ },
438
+ {
439
+ "epoch": 1.024793388429752,
440
+ "grad_norm": 0.6001765727996826,
441
+ "learning_rate": 0.00017416521056479577,
442
+ "loss": 1.1336,
443
+ "step": 124
444
+ },
445
+ {
446
+ "epoch": 1.0413223140495869,
447
+ "grad_norm": 0.35407504439353943,
448
+ "learning_rate": 0.00017326187292656333,
449
+ "loss": 1.1833,
450
+ "step": 126
451
+ },
452
+ {
453
+ "epoch": 1.0578512396694215,
454
+ "grad_norm": 0.414528489112854,
455
+ "learning_rate": 0.00017234544239679806,
456
+ "loss": 1.1301,
457
+ "step": 128
458
+ },
459
+ {
460
+ "epoch": 1.0743801652892562,
461
+ "grad_norm": 0.46355852484703064,
462
+ "learning_rate": 0.00017141608275408006,
463
+ "loss": 1.213,
464
+ "step": 130
465
+ },
466
+ {
467
+ "epoch": 1.0909090909090908,
468
+ "grad_norm": 0.5040593147277832,
469
+ "learning_rate": 0.00017047396008759754,
470
+ "loss": 1.132,
471
+ "step": 132
472
+ },
473
+ {
474
+ "epoch": 1.1074380165289257,
475
+ "grad_norm": 0.4813704192638397,
476
+ "learning_rate": 0.00016951924276746425,
477
+ "loss": 1.0831,
478
+ "step": 134
479
+ },
480
+ {
481
+ "epoch": 1.1239669421487604,
482
+ "grad_norm": 0.5174686312675476,
483
+ "learning_rate": 0.00016855210141462963,
484
+ "loss": 1.0514,
485
+ "step": 136
486
+ },
487
+ {
488
+ "epoch": 1.140495867768595,
489
+ "grad_norm": 0.4712466299533844,
490
+ "learning_rate": 0.00016757270887038654,
491
+ "loss": 1.1334,
492
+ "step": 138
493
+ },
494
+ {
495
+ "epoch": 1.1570247933884297,
496
+ "grad_norm": 0.5912173390388489,
497
+ "learning_rate": 0.00016658124016548197,
498
+ "loss": 1.1011,
499
+ "step": 140
500
+ },
501
+ {
502
+ "epoch": 1.1735537190082646,
503
+ "grad_norm": 0.6392802000045776,
504
+ "learning_rate": 0.00016557787248883696,
505
+ "loss": 1.1361,
506
+ "step": 142
507
+ },
508
+ {
509
+ "epoch": 1.1900826446280992,
510
+ "grad_norm": 0.7376368045806885,
511
+ "learning_rate": 0.00016456278515588024,
512
+ "loss": 1.109,
513
+ "step": 144
514
+ },
515
+ {
516
+ "epoch": 1.2066115702479339,
517
+ "grad_norm": 0.5020875930786133,
518
+ "learning_rate": 0.00016353615957650236,
519
+ "loss": 1.0925,
520
+ "step": 146
521
+ },
522
+ {
523
+ "epoch": 1.2231404958677685,
524
+ "grad_norm": 0.8081740736961365,
525
+ "learning_rate": 0.00016249817922263517,
526
+ "loss": 1.047,
527
+ "step": 148
528
+ },
529
+ {
530
+ "epoch": 1.2396694214876034,
531
+ "grad_norm": 0.6371219754219055,
532
+ "learning_rate": 0.00016144902959546286,
533
+ "loss": 1.113,
534
+ "step": 150
535
+ },
536
+ {
537
+ "epoch": 1.256198347107438,
538
+ "grad_norm": 0.7588189840316772,
539
+ "learning_rate": 0.00016038889819227045,
540
+ "loss": 1.1179,
541
+ "step": 152
542
+ },
543
+ {
544
+ "epoch": 1.2727272727272727,
545
+ "grad_norm": 0.6286205053329468,
546
+ "learning_rate": 0.00015931797447293552,
547
+ "loss": 1.1209,
548
+ "step": 154
549
+ },
550
+ {
551
+ "epoch": 1.2892561983471074,
552
+ "grad_norm": 0.797656238079071,
553
+ "learning_rate": 0.00015823644982606905,
554
+ "loss": 1.1698,
555
+ "step": 156
556
+ },
557
+ {
558
+ "epoch": 1.3057851239669422,
559
+ "grad_norm": 0.5368632078170776,
560
+ "learning_rate": 0.00015714451753481168,
561
+ "loss": 1.1973,
562
+ "step": 158
563
+ },
564
+ {
565
+ "epoch": 1.322314049586777,
566
+ "grad_norm": 0.4135212302207947,
567
+ "learning_rate": 0.00015604237274229147,
568
+ "loss": 1.1452,
569
+ "step": 160
570
+ },
571
+ {
572
+ "epoch": 1.3388429752066116,
573
+ "grad_norm": 0.5289668440818787,
574
+ "learning_rate": 0.00015493021241674918,
575
+ "loss": 1.1954,
576
+ "step": 162
577
+ },
578
+ {
579
+ "epoch": 1.3553719008264462,
580
+ "grad_norm": 0.4092061221599579,
581
+ "learning_rate": 0.00015380823531633729,
582
+ "loss": 1.1226,
583
+ "step": 164
584
+ },
585
+ {
586
+ "epoch": 1.3719008264462809,
587
+ "grad_norm": 0.7049645781517029,
588
+ "learning_rate": 0.00015267664195359917,
589
+ "loss": 1.0948,
590
+ "step": 166
591
+ },
592
+ {
593
+ "epoch": 1.3884297520661157,
594
+ "grad_norm": 0.47164198756217957,
595
+ "learning_rate": 0.00015153563455963499,
596
+ "loss": 1.0977,
597
+ "step": 168
598
+ },
599
+ {
600
+ "epoch": 1.4049586776859504,
601
+ "grad_norm": 0.7871695160865784,
602
+ "learning_rate": 0.00015038541704796003,
603
+ "loss": 1.1674,
604
+ "step": 170
605
+ },
606
+ {
607
+ "epoch": 1.421487603305785,
608
+ "grad_norm": 0.5381121635437012,
609
+ "learning_rate": 0.00014922619497806277,
610
+ "loss": 1.1415,
611
+ "step": 172
612
+ },
613
+ {
614
+ "epoch": 1.43801652892562,
615
+ "grad_norm": 0.39419299364089966,
616
+ "learning_rate": 0.00014805817551866838,
617
+ "loss": 1.0747,
618
+ "step": 174
619
+ },
620
+ {
621
+ "epoch": 1.4545454545454546,
622
+ "grad_norm": 0.38382914662361145,
623
+ "learning_rate": 0.00014688156741071514,
624
+ "loss": 1.1278,
625
+ "step": 176
626
+ },
627
+ {
628
+ "epoch": 1.4710743801652892,
629
+ "grad_norm": 0.32674962282180786,
630
+ "learning_rate": 0.00014569658093004935,
631
+ "loss": 0.9774,
632
+ "step": 178
633
+ },
634
+ {
635
+ "epoch": 1.487603305785124,
636
+ "grad_norm": 0.5443088412284851,
637
+ "learning_rate": 0.00014450342784984633,
638
+ "loss": 1.034,
639
+ "step": 180
640
+ },
641
+ {
642
+ "epoch": 1.5041322314049586,
643
+ "grad_norm": 0.6682401895523071,
644
+ "learning_rate": 0.00014330232140276366,
645
+ "loss": 1.1732,
646
+ "step": 182
647
+ },
648
+ {
649
+ "epoch": 1.5206611570247934,
650
+ "grad_norm": 0.5696044564247131,
651
+ "learning_rate": 0.0001420934762428335,
652
+ "loss": 1.0384,
653
+ "step": 184
654
+ },
655
+ {
656
+ "epoch": 1.537190082644628,
657
+ "grad_norm": 0.6782551407814026,
658
+ "learning_rate": 0.0001408771084071012,
659
+ "loss": 1.1107,
660
+ "step": 186
661
+ },
662
+ {
663
+ "epoch": 1.553719008264463,
664
+ "grad_norm": 0.8336123824119568,
665
+ "learning_rate": 0.00013965343527701628,
666
+ "loss": 1.0737,
667
+ "step": 188
668
+ },
669
+ {
670
+ "epoch": 1.5702479338842976,
671
+ "grad_norm": 0.539226233959198,
672
+ "learning_rate": 0.00013842267553958371,
673
+ "loss": 1.1665,
674
+ "step": 190
675
+ },
676
+ {
677
+ "epoch": 1.5867768595041323,
678
+ "grad_norm": 0.566620409488678,
679
+ "learning_rate": 0.00013718504914828135,
680
+ "loss": 1.1333,
681
+ "step": 192
682
+ },
683
+ {
684
+ "epoch": 1.603305785123967,
685
+ "grad_norm": 0.4735005795955658,
686
+ "learning_rate": 0.00013594077728375128,
687
+ "loss": 1.1709,
688
+ "step": 194
689
+ },
690
+ {
691
+ "epoch": 1.6198347107438016,
692
+ "grad_norm": 0.534383237361908,
693
+ "learning_rate": 0.00013469008231427207,
694
+ "loss": 1.0783,
695
+ "step": 196
696
+ },
697
+ {
698
+ "epoch": 1.6363636363636362,
699
+ "grad_norm": 0.8410363793373108,
700
+ "learning_rate": 0.0001334331877560182,
701
+ "loss": 1.0708,
702
+ "step": 198
703
+ },
704
+ {
705
+ "epoch": 1.6528925619834711,
706
+ "grad_norm": 0.6392219662666321,
707
+ "learning_rate": 0.00013217031823311488,
708
+ "loss": 1.0329,
709
+ "step": 200
710
+ },
711
+ {
712
+ "epoch": 1.6694214876033058,
713
+ "grad_norm": 0.5770404934883118,
714
+ "learning_rate": 0.00013090169943749476,
715
+ "loss": 1.0404,
716
+ "step": 202
717
+ },
718
+ {
719
+ "epoch": 1.6859504132231407,
720
+ "grad_norm": 0.6814575791358948,
721
+ "learning_rate": 0.00012962755808856342,
722
+ "loss": 1.0702,
723
+ "step": 204
724
+ },
725
+ {
726
+ "epoch": 1.7024793388429753,
727
+ "grad_norm": 0.673312783241272,
728
+ "learning_rate": 0.0001283481218926818,
729
+ "loss": 1.0529,
730
+ "step": 206
731
+ },
732
+ {
733
+ "epoch": 1.71900826446281,
734
+ "grad_norm": 0.6180073618888855,
735
+ "learning_rate": 0.0001270636195024719,
736
+ "loss": 1.0257,
737
+ "step": 208
738
+ },
739
+ {
740
+ "epoch": 1.7355371900826446,
741
+ "grad_norm": 0.5565724968910217,
742
+ "learning_rate": 0.00012577428047595344,
743
+ "loss": 1.1102,
744
+ "step": 210
745
+ },
746
+ {
747
+ "epoch": 1.7520661157024793,
748
+ "grad_norm": 0.5586270689964294,
749
+ "learning_rate": 0.00012448033523551865,
750
+ "loss": 1.0277,
751
+ "step": 212
752
+ },
753
+ {
754
+ "epoch": 1.768595041322314,
755
+ "grad_norm": 0.542448878288269,
756
+ "learning_rate": 0.00012318201502675285,
757
+ "loss": 1.0988,
758
+ "step": 214
759
+ },
760
+ {
761
+ "epoch": 1.7851239669421488,
762
+ "grad_norm": 0.513042151927948,
763
+ "learning_rate": 0.0001218795518771075,
764
+ "loss": 1.0828,
765
+ "step": 216
766
+ },
767
+ {
768
+ "epoch": 1.8016528925619835,
769
+ "grad_norm": 0.7613060474395752,
770
+ "learning_rate": 0.00012057317855443395,
771
+ "loss": 1.1962,
772
+ "step": 218
773
+ },
774
+ {
775
+ "epoch": 1.8181818181818183,
776
+ "grad_norm": 0.7522129416465759,
777
+ "learning_rate": 0.00011926312852538455,
778
+ "loss": 1.1339,
779
+ "step": 220
780
+ },
781
+ {
782
+ "epoch": 1.834710743801653,
783
+ "grad_norm": 0.4655594825744629,
784
+ "learning_rate": 0.00011794963591368893,
785
+ "loss": 1.0967,
786
+ "step": 222
787
+ },
788
+ {
789
+ "epoch": 1.8512396694214877,
790
+ "grad_norm": 0.5036570429801941,
791
+ "learning_rate": 0.00011663293545831302,
792
+ "loss": 1.0361,
793
+ "step": 224
794
+ },
795
+ {
796
+ "epoch": 1.8677685950413223,
797
+ "grad_norm": 0.43016380071640015,
798
+ "learning_rate": 0.00011531326247150803,
799
+ "loss": 1.1281,
800
+ "step": 226
801
+ },
802
+ {
803
+ "epoch": 1.884297520661157,
804
+ "grad_norm": 0.5184316635131836,
805
+ "learning_rate": 0.00011399085279675687,
806
+ "loss": 1.2083,
807
+ "step": 228
808
+ },
809
+ {
810
+ "epoch": 1.9008264462809916,
811
+ "grad_norm": 0.6556355357170105,
812
+ "learning_rate": 0.0001126659427666257,
813
+ "loss": 1.0266,
814
+ "step": 230
815
+ },
816
+ {
817
+ "epoch": 1.9173553719008265,
818
+ "grad_norm": 0.515681803226471,
819
+ "learning_rate": 0.00011133876916052821,
820
+ "loss": 1.0472,
821
+ "step": 232
822
+ },
823
+ {
824
+ "epoch": 1.9338842975206612,
825
+ "grad_norm": 0.4592064321041107,
826
+ "learning_rate": 0.00011000956916240985,
827
+ "loss": 1.054,
828
+ "step": 234
829
+ },
830
+ {
831
+ "epoch": 1.950413223140496,
832
+ "grad_norm": 0.5623230338096619,
833
+ "learning_rate": 0.00010867858031835975,
834
+ "loss": 1.1571,
835
+ "step": 236
836
+ },
837
+ {
838
+ "epoch": 1.9669421487603307,
839
+ "grad_norm": 0.5241667032241821,
840
+ "learning_rate": 0.00010734604049415822,
841
+ "loss": 1.0985,
842
+ "step": 238
843
+ },
844
+ {
845
+ "epoch": 1.9834710743801653,
846
+ "grad_norm": 0.54905104637146,
847
+ "learning_rate": 0.00010601218783276672,
848
+ "loss": 1.1088,
849
+ "step": 240
850
+ },
851
+ {
852
+ "epoch": 2.0,
853
+ "grad_norm": 0.8823345303535461,
854
+ "learning_rate": 0.00010467726071176853,
855
+ "loss": 1.0991,
856
+ "step": 242
857
+ },
858
+ {
859
+ "epoch": 2.0165289256198347,
860
+ "grad_norm": 0.5789780020713806,
861
+ "learning_rate": 0.00010334149770076747,
862
+ "loss": 1.1429,
863
+ "step": 244
864
+ },
865
+ {
866
+ "epoch": 2.0330578512396693,
867
+ "grad_norm": 0.6136715412139893,
868
+ "learning_rate": 0.00010200513751875227,
869
+ "loss": 1.0347,
870
+ "step": 246
871
+ },
872
+ {
873
+ "epoch": 2.049586776859504,
874
+ "grad_norm": 0.6462894082069397,
875
+ "learning_rate": 0.00010066841899143425,
876
+ "loss": 1.1008,
877
+ "step": 248
878
+ },
879
+ {
880
+ "epoch": 2.0661157024793386,
881
+ "grad_norm": 0.7272156476974487,
882
+ "learning_rate": 9.93315810085658e-05,
883
+ "loss": 1.1642,
884
+ "step": 250
885
+ },
886
+ {
887
+ "epoch": 2.0826446280991737,
888
+ "grad_norm": 0.5664356350898743,
889
+ "learning_rate": 9.799486248124775e-05,
890
+ "loss": 1.0799,
891
+ "step": 252
892
+ },
893
+ {
894
+ "epoch": 2.0991735537190084,
895
+ "grad_norm": 0.559697687625885,
896
+ "learning_rate": 9.665850229923258e-05,
897
+ "loss": 1.0536,
898
+ "step": 254
899
+ },
900
+ {
901
+ "epoch": 2.115702479338843,
902
+ "grad_norm": 0.7208287715911865,
903
+ "learning_rate": 9.532273928823151e-05,
904
+ "loss": 1.0595,
905
+ "step": 256
906
+ },
907
+ {
908
+ "epoch": 2.1322314049586777,
909
+ "grad_norm": 0.6985677480697632,
910
+ "learning_rate": 9.398781216723331e-05,
911
+ "loss": 1.0626,
912
+ "step": 258
913
+ },
914
+ {
915
+ "epoch": 2.1487603305785123,
916
+ "grad_norm": 0.7583006024360657,
917
+ "learning_rate": 9.26539595058418e-05,
918
+ "loss": 0.9919,
919
+ "step": 260
920
+ },
921
+ {
922
+ "epoch": 2.165289256198347,
923
+ "grad_norm": 0.6283029317855835,
924
+ "learning_rate": 9.132141968164026e-05,
925
+ "loss": 1.1292,
926
+ "step": 262
927
+ },
928
+ {
929
+ "epoch": 2.1818181818181817,
930
+ "grad_norm": 1.4162311553955078,
931
+ "learning_rate": 8.999043083759017e-05,
932
+ "loss": 1.1834,
933
+ "step": 264
934
+ },
935
+ {
936
+ "epoch": 2.1983471074380168,
937
+ "grad_norm": 0.5169870853424072,
938
+ "learning_rate": 8.866123083947182e-05,
939
+ "loss": 1.049,
940
+ "step": 266
941
+ },
942
+ {
943
+ "epoch": 2.2148760330578514,
944
+ "grad_norm": 0.9136308431625366,
945
+ "learning_rate": 8.733405723337432e-05,
946
+ "loss": 1.1336,
947
+ "step": 268
948
+ },
949
+ {
950
+ "epoch": 2.231404958677686,
951
+ "grad_norm": 0.6882078647613525,
952
+ "learning_rate": 8.600914720324316e-05,
953
+ "loss": 1.0953,
954
+ "step": 270
955
+ },
956
+ {
957
+ "epoch": 2.2479338842975207,
958
+ "grad_norm": 0.505447506904602,
959
+ "learning_rate": 8.4686737528492e-05,
960
+ "loss": 1.1014,
961
+ "step": 272
962
+ },
963
+ {
964
+ "epoch": 2.2644628099173554,
965
+ "grad_norm": 0.40685173869132996,
966
+ "learning_rate": 8.336706454168701e-05,
967
+ "loss": 1.0878,
968
+ "step": 274
969
+ },
970
+ {
971
+ "epoch": 2.28099173553719,
972
+ "grad_norm": 0.5715451836585999,
973
+ "learning_rate": 8.20503640863111e-05,
974
+ "loss": 1.0555,
975
+ "step": 276
976
+ },
977
+ {
978
+ "epoch": 2.2975206611570247,
979
+ "grad_norm": 0.5256667733192444,
980
+ "learning_rate": 8.073687147461547e-05,
981
+ "loss": 1.0678,
982
+ "step": 278
983
+ },
984
+ {
985
+ "epoch": 2.3140495867768593,
986
+ "grad_norm": 0.46319133043289185,
987
+ "learning_rate": 7.942682144556604e-05,
988
+ "loss": 1.0699,
989
+ "step": 280
990
+ },
991
+ {
992
+ "epoch": 2.330578512396694,
993
+ "grad_norm": 0.6000749468803406,
994
+ "learning_rate": 7.812044812289249e-05,
995
+ "loss": 1.1389,
996
+ "step": 282
997
+ },
998
+ {
999
+ "epoch": 2.347107438016529,
1000
+ "grad_norm": 0.5024062395095825,
1001
+ "learning_rate": 7.681798497324716e-05,
1002
+ "loss": 1.1033,
1003
+ "step": 284
1004
+ },
1005
+ {
1006
+ "epoch": 2.3636363636363638,
1007
+ "grad_norm": 0.6761860251426697,
1008
+ "learning_rate": 7.55196647644814e-05,
1009
+ "loss": 1.0665,
1010
+ "step": 286
1011
+ },
1012
+ {
1013
+ "epoch": 2.3801652892561984,
1014
+ "grad_norm": 0.7175805568695068,
1015
+ "learning_rate": 7.422571952404663e-05,
1016
+ "loss": 1.0511,
1017
+ "step": 288
1018
+ },
1019
+ {
1020
+ "epoch": 2.396694214876033,
1021
+ "grad_norm": 0.6883595585823059,
1022
+ "learning_rate": 7.293638049752812e-05,
1023
+ "loss": 1.0765,
1024
+ "step": 290
1025
+ },
1026
+ {
1027
+ "epoch": 2.4132231404958677,
1028
+ "grad_norm": 0.6382430791854858,
1029
+ "learning_rate": 7.165187810731823e-05,
1030
+ "loss": 1.1036,
1031
+ "step": 292
1032
+ },
1033
+ {
1034
+ "epoch": 2.4297520661157024,
1035
+ "grad_norm": 0.9323483109474182,
1036
+ "learning_rate": 7.037244191143661e-05,
1037
+ "loss": 1.1113,
1038
+ "step": 294
1039
+ },
1040
+ {
1041
+ "epoch": 2.446280991735537,
1042
+ "grad_norm": 0.9768907427787781,
1043
+ "learning_rate": 6.909830056250527e-05,
1044
+ "loss": 1.0983,
1045
+ "step": 296
1046
+ },
1047
+ {
1048
+ "epoch": 2.462809917355372,
1049
+ "grad_norm": 0.6783613562583923,
1050
+ "learning_rate": 6.782968176688514e-05,
1051
+ "loss": 1.1141,
1052
+ "step": 298
1053
+ },
1054
+ {
1055
+ "epoch": 2.479338842975207,
1056
+ "grad_norm": 0.753887951374054,
1057
+ "learning_rate": 6.656681224398183e-05,
1058
+ "loss": 1.1234,
1059
+ "step": 300
1060
+ },
1061
+ {
1062
+ "epoch": 2.4958677685950414,
1063
+ "grad_norm": 0.6788628697395325,
1064
+ "learning_rate": 6.530991768572794e-05,
1065
+ "loss": 1.0407,
1066
+ "step": 302
1067
+ },
1068
+ {
1069
+ "epoch": 2.512396694214876,
1070
+ "grad_norm": 0.6727221012115479,
1071
+ "learning_rate": 6.405922271624874e-05,
1072
+ "loss": 1.091,
1073
+ "step": 304
1074
+ },
1075
+ {
1076
+ "epoch": 2.5289256198347108,
1077
+ "grad_norm": 0.7418019771575928,
1078
+ "learning_rate": 6.281495085171869e-05,
1079
+ "loss": 1.0884,
1080
+ "step": 306
1081
+ },
1082
+ {
1083
+ "epoch": 2.5454545454545454,
1084
+ "grad_norm": 0.7063189148902893,
1085
+ "learning_rate": 6.15773244604163e-05,
1086
+ "loss": 1.0218,
1087
+ "step": 308
1088
+ },
1089
+ {
1090
+ "epoch": 2.56198347107438,
1091
+ "grad_norm": 1.0706840753555298,
1092
+ "learning_rate": 6.0346564722983736e-05,
1093
+ "loss": 1.0873,
1094
+ "step": 310
1095
+ },
1096
+ {
1097
+ "epoch": 2.5785123966942147,
1098
+ "grad_norm": 0.683253288269043,
1099
+ "learning_rate": 5.912289159289883e-05,
1100
+ "loss": 1.0346,
1101
+ "step": 312
1102
+ },
1103
+ {
1104
+ "epoch": 2.5950413223140494,
1105
+ "grad_norm": 0.5816308856010437,
1106
+ "learning_rate": 5.790652375716652e-05,
1107
+ "loss": 1.0113,
1108
+ "step": 314
1109
+ },
1110
+ {
1111
+ "epoch": 2.6115702479338845,
1112
+ "grad_norm": 0.7935141324996948,
1113
+ "learning_rate": 5.6697678597236356e-05,
1114
+ "loss": 1.1771,
1115
+ "step": 316
1116
+ },
1117
+ {
1118
+ "epoch": 2.628099173553719,
1119
+ "grad_norm": 0.6827735900878906,
1120
+ "learning_rate": 5.549657215015367e-05,
1121
+ "loss": 1.0899,
1122
+ "step": 318
1123
+ },
1124
+ {
1125
+ "epoch": 2.644628099173554,
1126
+ "grad_norm": 0.5408580899238586,
1127
+ "learning_rate": 5.430341906995065e-05,
1128
+ "loss": 1.1056,
1129
+ "step": 320
1130
+ },
1131
+ {
1132
+ "epoch": 2.6611570247933884,
1133
+ "grad_norm": 0.6996527910232544,
1134
+ "learning_rate": 5.31184325892849e-05,
1135
+ "loss": 1.0494,
1136
+ "step": 322
1137
+ },
1138
+ {
1139
+ "epoch": 2.677685950413223,
1140
+ "grad_norm": 0.6700767874717712,
1141
+ "learning_rate": 5.1941824481331626e-05,
1142
+ "loss": 1.1206,
1143
+ "step": 324
1144
+ },
1145
+ {
1146
+ "epoch": 2.6942148760330578,
1147
+ "grad_norm": 0.7690112590789795,
1148
+ "learning_rate": 5.077380502193725e-05,
1149
+ "loss": 1.1541,
1150
+ "step": 326
1151
+ },
1152
+ {
1153
+ "epoch": 2.7107438016528924,
1154
+ "grad_norm": 0.7921670079231262,
1155
+ "learning_rate": 4.961458295203999e-05,
1156
+ "loss": 1.0684,
1157
+ "step": 328
1158
+ },
1159
+ {
1160
+ "epoch": 2.7272727272727275,
1161
+ "grad_norm": 0.6989784240722656,
1162
+ "learning_rate": 4.8464365440365044e-05,
1163
+ "loss": 1.0097,
1164
+ "step": 330
1165
+ },
1166
+ {
1167
+ "epoch": 2.7438016528925617,
1168
+ "grad_norm": 0.6038585901260376,
1169
+ "learning_rate": 4.7323358046400844e-05,
1170
+ "loss": 1.0751,
1171
+ "step": 332
1172
+ },
1173
+ {
1174
+ "epoch": 2.760330578512397,
1175
+ "grad_norm": 0.8160024881362915,
1176
+ "learning_rate": 4.6191764683662744e-05,
1177
+ "loss": 0.9734,
1178
+ "step": 334
1179
+ },
1180
+ {
1181
+ "epoch": 2.7768595041322315,
1182
+ "grad_norm": 0.7741029858589172,
1183
+ "learning_rate": 4.506978758325081e-05,
1184
+ "loss": 1.1186,
1185
+ "step": 336
1186
+ },
1187
+ {
1188
+ "epoch": 2.793388429752066,
1189
+ "grad_norm": 0.8177542090415955,
1190
+ "learning_rate": 4.395762725770852e-05,
1191
+ "loss": 1.1538,
1192
+ "step": 338
1193
+ },
1194
+ {
1195
+ "epoch": 2.809917355371901,
1196
+ "grad_norm": 0.7617602944374084,
1197
+ "learning_rate": 4.285548246518837e-05,
1198
+ "loss": 1.1162,
1199
+ "step": 340
1200
+ },
1201
+ {
1202
+ "epoch": 2.8264462809917354,
1203
+ "grad_norm": 0.6426255106925964,
1204
+ "learning_rate": 4.176355017393099e-05,
1205
+ "loss": 1.0404,
1206
+ "step": 342
1207
+ },
1208
+ {
1209
+ "epoch": 2.84297520661157,
1210
+ "grad_norm": 0.6394379138946533,
1211
+ "learning_rate": 4.0682025527064486e-05,
1212
+ "loss": 1.1107,
1213
+ "step": 344
1214
+ },
1215
+ {
1216
+ "epoch": 2.8595041322314048,
1217
+ "grad_norm": 0.6632633209228516,
1218
+ "learning_rate": 3.961110180772955e-05,
1219
+ "loss": 1.1085,
1220
+ "step": 346
1221
+ },
1222
+ {
1223
+ "epoch": 2.87603305785124,
1224
+ "grad_norm": 0.5437737703323364,
1225
+ "learning_rate": 3.8550970404537144e-05,
1226
+ "loss": 1.0072,
1227
+ "step": 348
1228
+ },
1229
+ {
1230
+ "epoch": 2.8925619834710745,
1231
+ "grad_norm": 0.8178601264953613,
1232
+ "learning_rate": 3.750182077736486e-05,
1233
+ "loss": 1.0606,
1234
+ "step": 350
1235
+ },
1236
+ {
1237
+ "epoch": 2.909090909090909,
1238
+ "grad_norm": 0.8268756866455078,
1239
+ "learning_rate": 3.646384042349764e-05,
1240
+ "loss": 0.9642,
1241
+ "step": 352
1242
+ },
1243
+ {
1244
+ "epoch": 2.925619834710744,
1245
+ "grad_norm": 1.064460277557373,
1246
+ "learning_rate": 3.543721484411976e-05,
1247
+ "loss": 1.0881,
1248
+ "step": 354
1249
+ },
1250
+ {
1251
+ "epoch": 2.9421487603305785,
1252
+ "grad_norm": 0.7875952124595642,
1253
+ "learning_rate": 3.442212751116305e-05,
1254
+ "loss": 1.0141,
1255
+ "step": 356
1256
+ },
1257
+ {
1258
+ "epoch": 2.958677685950413,
1259
+ "grad_norm": 0.6071433424949646,
1260
+ "learning_rate": 3.3418759834518056e-05,
1261
+ "loss": 1.0692,
1262
+ "step": 358
1263
+ },
1264
+ {
1265
+ "epoch": 2.975206611570248,
1266
+ "grad_norm": 0.6311193704605103,
1267
+ "learning_rate": 3.24272911296135e-05,
1268
+ "loss": 0.9481,
1269
+ "step": 360
1270
+ },
1271
+ {
1272
+ "epoch": 2.991735537190083,
1273
+ "grad_norm": 0.7501534223556519,
1274
+ "learning_rate": 3.1447898585370384e-05,
1275
+ "loss": 1.1183,
1276
+ "step": 362
1277
+ },
1278
+ {
1279
+ "epoch": 3.0082644628099175,
1280
+ "grad_norm": 0.6752927899360657,
1281
+ "learning_rate": 3.0480757232535772e-05,
1282
+ "loss": 1.0628,
1283
+ "step": 364
1284
+ },
1285
+ {
1286
+ "epoch": 3.024793388429752,
1287
+ "grad_norm": 0.9237536787986755,
1288
+ "learning_rate": 2.9526039912402503e-05,
1289
+ "loss": 1.1539,
1290
+ "step": 366
1291
+ },
1292
+ {
1293
+ "epoch": 3.041322314049587,
1294
+ "grad_norm": 0.5888502597808838,
1295
+ "learning_rate": 2.8583917245919945e-05,
1296
+ "loss": 1.0937,
1297
+ "step": 368
1298
+ },
1299
+ {
1300
+ "epoch": 3.0578512396694215,
1301
+ "grad_norm": 0.8892382383346558,
1302
+ "learning_rate": 2.7654557603201957e-05,
1303
+ "loss": 1.0385,
1304
+ "step": 370
1305
+ },
1306
+ {
1307
+ "epoch": 3.074380165289256,
1308
+ "grad_norm": 0.7525384426116943,
1309
+ "learning_rate": 2.673812707343669e-05,
1310
+ "loss": 0.9868,
1311
+ "step": 372
1312
+ },
1313
+ {
1314
+ "epoch": 3.090909090909091,
1315
+ "grad_norm": 0.8219236731529236,
1316
+ "learning_rate": 2.5834789435204243e-05,
1317
+ "loss": 1.0349,
1318
+ "step": 374
1319
+ },
1320
+ {
1321
+ "epoch": 3.1074380165289255,
1322
+ "grad_norm": 1.2317075729370117,
1323
+ "learning_rate": 2.494470612720725e-05,
1324
+ "loss": 1.0493,
1325
+ "step": 376
1326
+ },
1327
+ {
1328
+ "epoch": 3.12396694214876,
1329
+ "grad_norm": 0.9741241931915283,
1330
+ "learning_rate": 2.4068036219419432e-05,
1331
+ "loss": 1.0685,
1332
+ "step": 378
1333
+ },
1334
+ {
1335
+ "epoch": 3.1404958677685952,
1336
+ "grad_norm": 0.910832405090332,
1337
+ "learning_rate": 2.3204936384657872e-05,
1338
+ "loss": 1.0191,
1339
+ "step": 380
1340
+ },
1341
+ {
1342
+ "epoch": 3.15702479338843,
1343
+ "grad_norm": 0.8739452958106995,
1344
+ "learning_rate": 2.235556087058328e-05,
1345
+ "loss": 1.0511,
1346
+ "step": 382
1347
+ },
1348
+ {
1349
+ "epoch": 3.1735537190082646,
1350
+ "grad_norm": 1.0163301229476929,
1351
+ "learning_rate": 2.1520061472133902e-05,
1352
+ "loss": 1.0358,
1353
+ "step": 384
1354
+ },
1355
+ {
1356
+ "epoch": 3.190082644628099,
1357
+ "grad_norm": 0.8788334131240845,
1358
+ "learning_rate": 2.069858750439768e-05,
1359
+ "loss": 1.0087,
1360
+ "step": 386
1361
+ },
1362
+ {
1363
+ "epoch": 3.206611570247934,
1364
+ "grad_norm": 0.8024322390556335,
1365
+ "learning_rate": 1.9891285775927682e-05,
1366
+ "loss": 0.973,
1367
+ "step": 388
1368
+ },
1369
+ {
1370
+ "epoch": 3.2231404958677685,
1371
+ "grad_norm": 1.0479772090911865,
1372
+ "learning_rate": 1.9098300562505266e-05,
1373
+ "loss": 1.0334,
1374
+ "step": 390
1375
+ },
1376
+ {
1377
+ "epoch": 3.239669421487603,
1378
+ "grad_norm": 0.8988475203514099,
1379
+ "learning_rate": 1.831977358135625e-05,
1380
+ "loss": 1.0007,
1381
+ "step": 392
1382
+ },
1383
+ {
1384
+ "epoch": 3.256198347107438,
1385
+ "grad_norm": 0.9283938407897949,
1386
+ "learning_rate": 1.7555843965823992e-05,
1387
+ "loss": 1.0458,
1388
+ "step": 394
1389
+ },
1390
+ {
1391
+ "epoch": 3.2727272727272725,
1392
+ "grad_norm": 1.1304008960723877,
1393
+ "learning_rate": 1.680664824050432e-05,
1394
+ "loss": 1.0623,
1395
+ "step": 396
1396
+ },
1397
+ {
1398
+ "epoch": 3.2892561983471076,
1399
+ "grad_norm": 0.9807791113853455,
1400
+ "learning_rate": 1.6072320296846898e-05,
1401
+ "loss": 1.0569,
1402
+ "step": 398
1403
+ },
1404
+ {
1405
+ "epoch": 3.3057851239669422,
1406
+ "grad_norm": 1.191261887550354,
1407
+ "learning_rate": 1.5352991369226865e-05,
1408
+ "loss": 0.9944,
1409
+ "step": 400
1410
+ },
1411
+ {
1412
+ "epoch": 3.322314049586777,
1413
+ "grad_norm": 1.0459243059158325,
1414
+ "learning_rate": 1.4648790011491542e-05,
1415
+ "loss": 1.1106,
1416
+ "step": 402
1417
+ },
1418
+ {
1419
+ "epoch": 3.3388429752066116,
1420
+ "grad_norm": 1.2179893255233765,
1421
+ "learning_rate": 1.3959842073986085e-05,
1422
+ "loss": 0.8982,
1423
+ "step": 404
1424
+ },
1425
+ {
1426
+ "epoch": 3.355371900826446,
1427
+ "grad_norm": 1.219767689704895,
1428
+ "learning_rate": 1.3286270681062274e-05,
1429
+ "loss": 1.0193,
1430
+ "step": 406
1431
+ },
1432
+ {
1433
+ "epoch": 3.371900826446281,
1434
+ "grad_norm": 1.2486698627471924,
1435
+ "learning_rate": 1.262819620907465e-05,
1436
+ "loss": 0.9492,
1437
+ "step": 408
1438
+ },
1439
+ {
1440
+ "epoch": 3.3884297520661155,
1441
+ "grad_norm": 1.2574692964553833,
1442
+ "learning_rate": 1.1985736264867509e-05,
1443
+ "loss": 1.0077,
1444
+ "step": 410
1445
+ },
1446
+ {
1447
+ "epoch": 3.4049586776859506,
1448
+ "grad_norm": 1.1906545162200928,
1449
+ "learning_rate": 1.1359005664756994e-05,
1450
+ "loss": 1.014,
1451
+ "step": 412
1452
+ },
1453
+ {
1454
+ "epoch": 3.4214876033057853,
1455
+ "grad_norm": 1.3691320419311523,
1456
+ "learning_rate": 1.0748116414011888e-05,
1457
+ "loss": 1.0101,
1458
+ "step": 414
1459
+ },
1460
+ {
1461
+ "epoch": 3.43801652892562,
1462
+ "grad_norm": 1.3103783130645752,
1463
+ "learning_rate": 1.0153177686836691e-05,
1464
+ "loss": 1.0953,
1465
+ "step": 416
1466
+ },
1467
+ {
1468
+ "epoch": 3.4545454545454546,
1469
+ "grad_norm": 1.1507749557495117,
1470
+ "learning_rate": 9.574295806860767e-06,
1471
+ "loss": 1.0044,
1472
+ "step": 418
1473
+ },
1474
+ {
1475
+ "epoch": 3.4710743801652892,
1476
+ "grad_norm": 1.3561640977859497,
1477
+ "learning_rate": 9.011574228136865e-06,
1478
+ "loss": 1.1707,
1479
+ "step": 420
1480
+ },
1481
+ {
1482
+ "epoch": 3.487603305785124,
1483
+ "grad_norm": 1.31363046169281,
1484
+ "learning_rate": 8.465113516652424e-06,
1485
+ "loss": 1.0806,
1486
+ "step": 422
1487
+ },
1488
+ {
1489
+ "epoch": 3.5041322314049586,
1490
+ "grad_norm": 1.0729997158050537,
1491
+ "learning_rate": 7.935011332357112e-06,
1492
+ "loss": 1.0657,
1493
+ "step": 424
1494
+ },
1495
+ {
1496
+ "epoch": 3.5206611570247937,
1497
+ "grad_norm": 1.6132832765579224,
1498
+ "learning_rate": 7.4213624117096755e-06,
1499
+ "loss": 1.0328,
1500
+ "step": 426
1501
+ },
1502
+ {
1503
+ "epoch": 3.537190082644628,
1504
+ "grad_norm": 0.8563976287841797,
1505
+ "learning_rate": 6.924258550747154e-06,
1506
+ "loss": 0.9862,
1507
+ "step": 428
1508
+ },
1509
+ {
1510
+ "epoch": 3.553719008264463,
1511
+ "grad_norm": 1.1601394414901733,
1512
+ "learning_rate": 6.4437885886798224e-06,
1513
+ "loss": 0.957,
1514
+ "step": 430
1515
+ },
1516
+ {
1517
+ "epoch": 3.5702479338842976,
1518
+ "grad_norm": 1.1800570487976074,
1519
+ "learning_rate": 5.980038392014309e-06,
1520
+ "loss": 1.0435,
1521
+ "step": 432
1522
+ },
1523
+ {
1524
+ "epoch": 3.5867768595041323,
1525
+ "grad_norm": 1.334128737449646,
1526
+ "learning_rate": 5.533090839208133e-06,
1527
+ "loss": 1.0272,
1528
+ "step": 434
1529
+ },
1530
+ {
1531
+ "epoch": 3.603305785123967,
1532
+ "grad_norm": 1.2104339599609375,
1533
+ "learning_rate": 5.103025805858197e-06,
1534
+ "loss": 0.9882,
1535
+ "step": 436
1536
+ },
1537
+ {
1538
+ "epoch": 3.6198347107438016,
1539
+ "grad_norm": 1.1799639463424683,
1540
+ "learning_rate": 4.68992015042592e-06,
1541
+ "loss": 1.0944,
1542
+ "step": 438
1543
+ },
1544
+ {
1545
+ "epoch": 3.6363636363636362,
1546
+ "grad_norm": 1.5897610187530518,
1547
+ "learning_rate": 4.293847700501585e-06,
1548
+ "loss": 1.0863,
1549
+ "step": 440
1550
+ },
1551
+ {
1552
+ "epoch": 3.652892561983471,
1553
+ "grad_norm": 1.3954260349273682,
1554
+ "learning_rate": 3.914879239610392e-06,
1555
+ "loss": 1.0532,
1556
+ "step": 442
1557
+ },
1558
+ {
1559
+ "epoch": 3.669421487603306,
1560
+ "grad_norm": 1.1017087697982788,
1561
+ "learning_rate": 3.5530824945623542e-06,
1562
+ "loss": 1.0719,
1563
+ "step": 444
1564
+ },
1565
+ {
1566
+ "epoch": 3.6859504132231407,
1567
+ "grad_norm": 0.9582527279853821,
1568
+ "learning_rate": 3.2085221233487562e-06,
1569
+ "loss": 1.0178,
1570
+ "step": 446
1571
+ },
1572
+ {
1573
+ "epoch": 3.7024793388429753,
1574
+ "grad_norm": 1.5694618225097656,
1575
+ "learning_rate": 2.8812597035868137e-06,
1576
+ "loss": 1.0657,
1577
+ "step": 448
1578
+ },
1579
+ {
1580
+ "epoch": 3.71900826446281,
1581
+ "grad_norm": 0.8071503639221191,
1582
+ "learning_rate": 2.5713537215149132e-06,
1583
+ "loss": 1.0048,
1584
+ "step": 450
1585
+ },
1586
+ {
1587
+ "epoch": 3.7355371900826446,
1588
+ "grad_norm": 1.3347952365875244,
1589
+ "learning_rate": 2.2788595615403474e-06,
1590
+ "loss": 1.0384,
1591
+ "step": 452
1592
+ },
1593
+ {
1594
+ "epoch": 3.7520661157024793,
1595
+ "grad_norm": 1.4382898807525635,
1596
+ "learning_rate": 2.003829496341325e-06,
1597
+ "loss": 1.0574,
1598
+ "step": 454
1599
+ },
1600
+ {
1601
+ "epoch": 3.768595041322314,
1602
+ "grad_norm": 1.1163910627365112,
1603
+ "learning_rate": 1.7463126775252191e-06,
1604
+ "loss": 1.1162,
1605
+ "step": 456
1606
+ },
1607
+ {
1608
+ "epoch": 3.785123966942149,
1609
+ "grad_norm": 1.3733882904052734,
1610
+ "learning_rate": 1.5063551268444276e-06,
1611
+ "loss": 1.0597,
1612
+ "step": 458
1613
+ },
1614
+ {
1615
+ "epoch": 3.8016528925619832,
1616
+ "grad_norm": 0.9412918090820312,
1617
+ "learning_rate": 1.2839997279717076e-06,
1618
+ "loss": 1.0342,
1619
+ "step": 460
1620
+ },
1621
+ {
1622
+ "epoch": 3.8181818181818183,
1623
+ "grad_norm": 1.3428821563720703,
1624
+ "learning_rate": 1.0792862188362396e-06,
1625
+ "loss": 1.0588,
1626
+ "step": 462
1627
+ },
1628
+ {
1629
+ "epoch": 3.834710743801653,
1630
+ "grad_norm": 1.326529860496521,
1631
+ "learning_rate": 8.922511845219971e-07,
1632
+ "loss": 1.0857,
1633
+ "step": 464
1634
+ },
1635
+ {
1636
+ "epoch": 3.8512396694214877,
1637
+ "grad_norm": 1.1826655864715576,
1638
+ "learning_rate": 7.229280507293657e-07,
1639
+ "loss": 1.0606,
1640
+ "step": 466
1641
+ },
1642
+ {
1643
+ "epoch": 3.8677685950413223,
1644
+ "grad_norm": 1.266386866569519,
1645
+ "learning_rate": 5.713470778016538e-07,
1646
+ "loss": 1.0195,
1647
+ "step": 468
1648
+ },
1649
+ {
1650
+ "epoch": 3.884297520661157,
1651
+ "grad_norm": 1.2093831300735474,
1652
+ "learning_rate": 4.375353553170647e-07,
1653
+ "loss": 1.0039,
1654
+ "step": 470
1655
+ },
1656
+ {
1657
+ "epoch": 3.9008264462809916,
1658
+ "grad_norm": 1.0445263385772705,
1659
+ "learning_rate": 3.2151679724748975e-07,
1660
+ "loss": 1.0612,
1661
+ "step": 472
1662
+ },
1663
+ {
1664
+ "epoch": 3.9173553719008263,
1665
+ "grad_norm": 1.331885814666748,
1666
+ "learning_rate": 2.2331213768468363e-07,
1667
+ "loss": 1.1568,
1668
+ "step": 474
1669
+ },
1670
+ {
1671
+ "epoch": 3.9338842975206614,
1672
+ "grad_norm": 1.254831075668335,
1673
+ "learning_rate": 1.4293892713486134e-07,
1674
+ "loss": 1.0544,
1675
+ "step": 476
1676
+ },
1677
+ {
1678
+ "epoch": 3.950413223140496,
1679
+ "grad_norm": 0.9736830592155457,
1680
+ "learning_rate": 8.041152938216278e-08,
1681
+ "loss": 0.976,
1682
+ "step": 478
1683
+ },
1684
+ {
1685
+ "epoch": 3.9669421487603307,
1686
+ "grad_norm": 1.0081415176391602,
1687
+ "learning_rate": 3.5741118921628346e-08,
1688
+ "loss": 1.0152,
1689
+ "step": 480
1690
+ },
1691
+ {
1692
+ "epoch": 3.9834710743801653,
1693
+ "grad_norm": 1.2260342836380005,
1694
+ "learning_rate": 8.93567896219638e-09,
1695
+ "loss": 1.0639,
1696
+ "step": 482
1697
+ },
1698
+ {
1699
+ "epoch": 4.0,
1700
+ "grad_norm": 3.486053466796875,
1701
+ "learning_rate": 0.0,
1702
+ "loss": 1.1032,
1703
+ "step": 484
1704
+ }
1705
+ ],
1706
+ "logging_steps": 2,
1707
+ "max_steps": 484,
1708
+ "num_input_tokens_seen": 0,
1709
+ "num_train_epochs": 4,
1710
+ "save_steps": 500,
1711
+ "stateful_callbacks": {
1712
+ "TrainerControl": {
1713
+ "args": {
1714
+ "should_epoch_stop": false,
1715
+ "should_evaluate": false,
1716
+ "should_log": false,
1717
+ "should_save": true,
1718
+ "should_training_stop": true
1719
+ },
1720
+ "attributes": {}
1721
+ }
1722
+ },
1723
+ "total_flos": 9157352821358592.0,
1724
+ "train_batch_size": 16,
1725
+ "trial_name": null,
1726
+ "trial_params": null
1727
+ }