gijs committed on
Commit 203d3ff · verified · 1 Parent(s): 8de5419

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +17 -0
  2. gama/gama-20250422_171856/checkpoint-2350/README.md +202 -0
  3. gama/gama-20250422_171856/checkpoint-2350/adapter_config.json +34 -0
  4. gama/gama-20250422_171856/checkpoint-2350/special_tokens_map.json +24 -0
  5. gama/gama-20250422_171856/checkpoint-2350/tokenizer.json +0 -0
  6. gama/gama-20250422_171856/checkpoint-2350/tokenizer_config.json +44 -0
  7. gama/gama-20250422_171856/checkpoint-2350/trainer_state.json +1914 -0
  8. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/README.md +202 -0
  9. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/adapter_config.json +34 -0
  10. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/special_tokens_map.json +24 -0
  11. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer.json +0 -0
  12. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer_config.json +44 -0
  13. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/trainer_state.json +1378 -0
  14. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/README.md +202 -0
  15. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/adapter_config.json +34 -0
  16. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/special_tokens_map.json +24 -0
  17. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer.json +0 -0
  18. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer_config.json +44 -0
  19. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/trainer_state.json +1378 -0
  20. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/README.md +202 -0
  21. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/adapter_config.json +34 -0
  22. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/special_tokens_map.json +24 -0
  23. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer.json +0 -0
  24. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer_config.json +44 -0
  25. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/trainer_state.json +1826 -0
  26. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/README.md +202 -0
  27. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/adapter_config.json +34 -0
  28. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/special_tokens_map.json +24 -0
  29. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer.json +0 -0
  30. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer_config.json +44 -0
  31. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/trainer_state.json +0 -0
  32. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/README.md +202 -0
  33. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/adapter_config.json +34 -0
  34. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/rng_state.pth +3 -0
  35. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/scheduler.pt +3 -0
  36. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/special_tokens_map.json +24 -0
  37. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer.json +0 -0
  38. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer_config.json +44 -0
  39. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/trainer_state.json +0 -0
  40. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/README.md +202 -0
  41. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/adapter_config.json +34 -0
  42. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/special_tokens_map.json +24 -0
  43. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer.json +0 -0
  44. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer_config.json +44 -0
  45. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/trainer_state.json +0 -0
  46. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/README.md +202 -0
  47. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/adapter_config.json +34 -0
  48. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/special_tokens_map.json +24 -0
  49. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer.json +0 -0
  50. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer_config.json +44 -0
.gitattributes CHANGED
@@ -668,3 +668,20 @@ grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-2700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-2100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-1416/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-14514/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-12036/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-6372/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-11328/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-354/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-4602/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-12744/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-9204/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-3894/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-4956/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-2124/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-13452/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-12390/tokenizer.json filter=lfs diff=lfs merge=lfs -text
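These entries route each checkpoint's tokenizer.json through Git LFS, so the regular Git history stores only a pointer file. As a minimal sketch (not part of this commit), the entries being exact paths makes a coverage check trivial; real gitattributes matching also handles glob patterns and negations:

```python
# Minimal sketch: check whether a path is LFS-tracked by exact-path
# .gitattributes entries like the ones added in this commit.
# Real gitattributes matching also supports globs and negations.
def lfs_tracked(path: str, gitattributes_text: str) -> bool:
    for line in gitattributes_text.splitlines():
        parts = line.split()
        if parts and parts[0] == path and "filter=lfs" in parts[1:]:
            return True
    return False

with open(".gitattributes") as f:
    attrs = f.read()

print(lfs_tracked(
    "grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-2700/tokenizer.json",
    attrs,
))  # True once this commit's entries are present
```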
gama/gama-20250422_171856/checkpoint-2350/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.15.0
gama/gama-20250422_171856/checkpoint-2350/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+ }
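This adapter_config.json describes a small LoRA adapter (r=8, lora_alpha=16) on the q_proj and v_proj attention projections, saved in inference mode. A minimal loading sketch with PEFT follows; it assumes the base checkpoint loads through AutoModelForCausalLM, which may not hold for GAMA's Q-Former variant, and the paths are simply the ones recorded in this commit:

```python
# Minimal sketch: attach this checkpoint's LoRA adapter to its base model.
# Assumes the base model is loadable with AutoModelForCausalLM; the GAMA
# Q-Former variant may instead require the project's own loading code.
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_path = "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/"
adapter_path = "gama/gama-20250422_171856/checkpoint-2350"

base = AutoModelForCausalLM.from_pretrained(base_path)
model = PeftModel.from_pretrained(base, adapter_path)  # config sets inference_mode: true
model.eval()
```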
gama/gama-20250422_171856/checkpoint-2350/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
gama/gama-20250422_171856/checkpoint-2350/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-20250422_171856/checkpoint-2350/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
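The chat_template above is the standard Llama-2 [INST]/&lt;&lt;SYS&gt;&gt; format. A short sketch of how it renders a conversation (the local checkpoint path is illustrative):

```python
# Sketch: render a conversation through the Llama-2 chat template stored
# in this tokenizer_config.json. The path is illustrative.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gama/gama-20250422_171856/checkpoint-2350")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe the sound in this clip."},
]
print(tok.apply_chat_template(messages, tokenize=False))
# -> "<s>[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n\nDescribe the sound in this clip. [/INST]"
```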
gama/gama-20250422_171856/checkpoint-2350/trainer_state.json ADDED
@@ -0,0 +1,1914 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.033332387733681315,
+ "eval_steps": 500,
+ "global_step": 2350,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0001418399478028992,
+ "learning_rate": 0.0001999744688093955,
+ "loss": 1.4644,
+ "mean_token_accuracy": 0.6479970395565033,
+ "num_tokens": 10375.0,
+ "step": 10
+ },
+ {
+ "epoch": 0.0002836798956057984,
+ "learning_rate": 0.0001999461008198349,
+ "loss": 1.3344,
+ "mean_token_accuracy": 0.6637877106666565,
+ "num_tokens": 20535.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.0004255198434086976,
+ "learning_rate": 0.00019991773283027433,
+ "loss": 1.3386,
+ "mean_token_accuracy": 0.6674083709716797,
+ "num_tokens": 30845.0,
+ "step": 30
+ },
+ {
+ "epoch": 0.0005673597912115968,
+ "learning_rate": 0.00019988936484071373,
+ "loss": 1.2811,
+ "mean_token_accuracy": 0.6720114409923553,
+ "num_tokens": 41159.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.0007091997390144961,
+ "learning_rate": 0.00019986099685115316,
+ "loss": 1.3092,
+ "mean_token_accuracy": 0.6668342292308808,
+ "num_tokens": 51351.0,
+ "step": 50
+ },
+ {
+ "epoch": 0.0008510396868173952,
+ "learning_rate": 0.0001998326288615926,
+ "loss": 1.2572,
+ "mean_token_accuracy": 0.678832185268402,
+ "num_tokens": 61567.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.0009928796346202944,
+ "learning_rate": 0.00019980426087203202,
+ "loss": 1.3006,
+ "mean_token_accuracy": 0.6674412608146667,
+ "num_tokens": 71722.0,
+ "step": 70
+ },
+ {
+ "epoch": 0.0011347195824231936,
+ "learning_rate": 0.00019977589288247143,
+ "loss": 1.248,
+ "mean_token_accuracy": 0.6744147539138794,
+ "num_tokens": 81984.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.0012765595302260929,
+ "learning_rate": 0.00019974752489291083,
+ "loss": 1.2643,
+ "mean_token_accuracy": 0.6711925864219666,
+ "num_tokens": 92376.0,
+ "step": 90
+ },
+ {
+ "epoch": 0.0014183994780289921,
+ "learning_rate": 0.0001997191569033503,
+ "loss": 1.28,
+ "mean_token_accuracy": 0.67249955534935,
+ "num_tokens": 102699.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.0015602394258318914,
+ "learning_rate": 0.0001996907889137897,
+ "loss": 1.2525,
+ "mean_token_accuracy": 0.6758616745471955,
+ "num_tokens": 112760.0,
+ "step": 110
+ },
+ {
+ "epoch": 0.0017020793736347904,
+ "learning_rate": 0.00019966242092422912,
+ "loss": 1.2454,
+ "mean_token_accuracy": 0.6850893795490265,
+ "num_tokens": 122946.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.0018439193214376897,
+ "learning_rate": 0.00019963405293466852,
+ "loss": 1.2673,
+ "mean_token_accuracy": 0.6743516206741333,
+ "num_tokens": 133225.0,
+ "step": 130
+ },
+ {
+ "epoch": 0.0019857592692405887,
+ "learning_rate": 0.00019960568494510795,
+ "loss": 1.2961,
+ "mean_token_accuracy": 0.672141146659851,
+ "num_tokens": 143602.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.002127599217043488,
+ "learning_rate": 0.00019957731695554738,
+ "loss": 1.2268,
+ "mean_token_accuracy": 0.6834299504756928,
+ "num_tokens": 154040.0,
+ "step": 150
+ },
+ {
+ "epoch": 0.0022694391648463872,
+ "learning_rate": 0.00019954894896598679,
+ "loss": 1.2483,
+ "mean_token_accuracy": 0.6738203048706055,
+ "num_tokens": 164237.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.0024112791126492867,
+ "learning_rate": 0.00019952058097642622,
+ "loss": 1.2644,
+ "mean_token_accuracy": 0.6698677897453308,
+ "num_tokens": 174418.0,
+ "step": 170
+ },
+ {
+ "epoch": 0.0025531190604521858,
+ "learning_rate": 0.00019949221298686562,
+ "loss": 1.2569,
+ "mean_token_accuracy": 0.6776521384716034,
+ "num_tokens": 184818.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.002694959008255085,
+ "learning_rate": 0.00019946384499730505,
+ "loss": 1.218,
+ "mean_token_accuracy": 0.6825460493564606,
+ "num_tokens": 194989.0,
+ "step": 190
+ },
+ {
+ "epoch": 0.0028367989560579843,
+ "learning_rate": 0.00019943547700774448,
+ "loss": 1.1975,
+ "mean_token_accuracy": 0.6864433467388154,
+ "num_tokens": 205418.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.0029786389038608833,
+ "learning_rate": 0.00019940710901818388,
+ "loss": 1.1862,
+ "mean_token_accuracy": 0.6890658676624298,
+ "num_tokens": 215645.0,
+ "step": 210
+ },
+ {
+ "epoch": 0.0031204788516637828,
+ "learning_rate": 0.0001993787410286233,
+ "loss": 1.2141,
+ "mean_token_accuracy": 0.684105110168457,
+ "num_tokens": 225986.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.003262318799466682,
+ "learning_rate": 0.00019935037303906271,
+ "loss": 1.2246,
+ "mean_token_accuracy": 0.6754761219024659,
+ "num_tokens": 236317.0,
+ "step": 230
+ },
+ {
+ "epoch": 0.003404158747269581,
+ "learning_rate": 0.00019932200504950217,
+ "loss": 1.1578,
+ "mean_token_accuracy": 0.6994408905506134,
+ "num_tokens": 246467.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.0035459986950724803,
+ "learning_rate": 0.00019929363705994157,
+ "loss": 1.1978,
+ "mean_token_accuracy": 0.6848730027675629,
+ "num_tokens": 256900.0,
+ "step": 250
+ },
+ {
+ "epoch": 0.0036878386428753794,
+ "learning_rate": 0.00019926526907038098,
+ "loss": 1.1915,
+ "mean_token_accuracy": 0.6893563270568848,
+ "num_tokens": 267130.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.003829678590678279,
+ "learning_rate": 0.0001992369010808204,
+ "loss": 1.1796,
+ "mean_token_accuracy": 0.6892049252986908,
+ "num_tokens": 277453.0,
+ "step": 270
+ },
+ {
+ "epoch": 0.0039715185384811775,
+ "learning_rate": 0.00019920853309125984,
+ "loss": 1.1481,
+ "mean_token_accuracy": 0.696202689409256,
+ "num_tokens": 287556.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.004113358486284077,
+ "learning_rate": 0.00019918016510169927,
+ "loss": 1.1762,
+ "mean_token_accuracy": 0.6873701572418213,
+ "num_tokens": 297903.0,
+ "step": 290
+ },
+ {
+ "epoch": 0.004255198434086976,
+ "learning_rate": 0.00019915179711213867,
+ "loss": 1.1251,
+ "mean_token_accuracy": 0.6990953743457794,
+ "num_tokens": 307835.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.0043970383818898754,
+ "learning_rate": 0.0001991234291225781,
+ "loss": 1.1808,
+ "mean_token_accuracy": 0.6895361363887786,
+ "num_tokens": 318327.0,
+ "step": 310
+ },
+ {
+ "epoch": 0.0045388783296927745,
+ "learning_rate": 0.0001990950611330175,
+ "loss": 1.1756,
+ "mean_token_accuracy": 0.6966897130012513,
+ "num_tokens": 328720.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.0046807182774956735,
+ "learning_rate": 0.00019906669314345693,
+ "loss": 1.1887,
+ "mean_token_accuracy": 0.682871812582016,
+ "num_tokens": 338831.0,
+ "step": 330
+ },
+ {
+ "epoch": 0.004822558225298573,
+ "learning_rate": 0.00019903832515389636,
+ "loss": 1.1953,
+ "mean_token_accuracy": 0.6855618298053742,
+ "num_tokens": 349098.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.0049643981731014725,
+ "learning_rate": 0.00019900995716433577,
+ "loss": 1.1808,
+ "mean_token_accuracy": 0.6865513443946838,
+ "num_tokens": 359364.0,
+ "step": 350
+ },
+ {
+ "epoch": 0.0051062381209043715,
+ "learning_rate": 0.0001989815891747752,
+ "loss": 1.1186,
+ "mean_token_accuracy": 0.7019730627536773,
+ "num_tokens": 369582.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.0052480780687072705,
+ "learning_rate": 0.00019895322118521463,
+ "loss": 1.1432,
+ "mean_token_accuracy": 0.6962596535682678,
+ "num_tokens": 379501.0,
+ "step": 370
+ },
+ {
+ "epoch": 0.00538991801651017,
+ "learning_rate": 0.00019892485319565403,
+ "loss": 1.1904,
+ "mean_token_accuracy": 0.6854041993618012,
+ "num_tokens": 389789.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.0055317579643130695,
+ "learning_rate": 0.00019889648520609346,
+ "loss": 1.1778,
+ "mean_token_accuracy": 0.6948013961315155,
+ "num_tokens": 400015.0,
+ "step": 390
+ },
+ {
+ "epoch": 0.0056735979121159685,
+ "learning_rate": 0.00019886811721653286,
+ "loss": 1.2114,
+ "mean_token_accuracy": 0.6789448976516723,
+ "num_tokens": 410407.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.005815437859918868,
+ "learning_rate": 0.00019883974922697232,
+ "loss": 1.1815,
+ "mean_token_accuracy": 0.6859633207321167,
+ "num_tokens": 420563.0,
+ "step": 410
+ },
+ {
+ "epoch": 0.005957277807721767,
+ "learning_rate": 0.00019881138123741172,
+ "loss": 1.1739,
+ "mean_token_accuracy": 0.6891894340515137,
+ "num_tokens": 430802.0,
+ "step": 420
+ },
+ {
+ "epoch": 0.006099117755524666,
+ "learning_rate": 0.00019878301324785112,
+ "loss": 1.132,
+ "mean_token_accuracy": 0.7001429855823517,
+ "num_tokens": 440998.0,
+ "step": 430
+ },
+ {
+ "epoch": 0.0062409577033275656,
+ "learning_rate": 0.00019875464525829055,
+ "loss": 1.1474,
+ "mean_token_accuracy": 0.693991506099701,
+ "num_tokens": 451216.0,
+ "step": 440
+ },
+ {
+ "epoch": 0.006382797651130465,
+ "learning_rate": 0.00019872627726872996,
+ "loss": 1.199,
+ "mean_token_accuracy": 0.6871366202831268,
+ "num_tokens": 461276.0,
+ "step": 450
+ },
+ {
+ "epoch": 0.006524637598933364,
+ "learning_rate": 0.0001986979092791694,
+ "loss": 1.1204,
+ "mean_token_accuracy": 0.700880628824234,
+ "num_tokens": 471433.0,
+ "step": 460
+ },
+ {
+ "epoch": 0.006666477546736263,
+ "learning_rate": 0.00019866954128960882,
+ "loss": 1.2102,
+ "mean_token_accuracy": 0.6879798650741578,
+ "num_tokens": 481531.0,
+ "step": 470
+ },
+ {
+ "epoch": 0.006808317494539162,
+ "learning_rate": 0.00019864117330004825,
+ "loss": 1.1195,
+ "mean_token_accuracy": 0.6937784433364869,
+ "num_tokens": 491954.0,
+ "step": 480
+ },
+ {
+ "epoch": 0.006950157442342062,
+ "learning_rate": 0.00019861280531048765,
+ "loss": 1.1455,
+ "mean_token_accuracy": 0.7021094024181366,
+ "num_tokens": 502274.0,
+ "step": 490
+ },
+ {
+ "epoch": 0.007091997390144961,
+ "learning_rate": 0.00019858443732092708,
+ "loss": 1.1608,
+ "mean_token_accuracy": 0.6934002816677094,
+ "num_tokens": 512604.0,
+ "step": 500
+ },
+ {
+ "epoch": 0.00723383733794786,
+ "learning_rate": 0.0001985560693313665,
+ "loss": 1.1705,
+ "mean_token_accuracy": 0.695448386669159,
+ "num_tokens": 522792.0,
+ "step": 510
+ },
+ {
+ "epoch": 0.007375677285750759,
+ "learning_rate": 0.0001985277013418059,
+ "loss": 1.1575,
+ "mean_token_accuracy": 0.693002599477768,
+ "num_tokens": 532886.0,
+ "step": 520
+ },
+ {
+ "epoch": 0.007517517233553658,
+ "learning_rate": 0.00019849933335224534,
+ "loss": 1.1261,
+ "mean_token_accuracy": 0.6963518977165222,
+ "num_tokens": 542956.0,
+ "step": 530
+ },
+ {
+ "epoch": 0.007659357181356558,
+ "learning_rate": 0.00019847096536268474,
+ "loss": 1.1214,
+ "mean_token_accuracy": 0.700639396905899,
+ "num_tokens": 553042.0,
+ "step": 540
+ },
+ {
+ "epoch": 0.007801197129159457,
+ "learning_rate": 0.00019844259737312417,
+ "loss": 1.1725,
+ "mean_token_accuracy": 0.6955362856388092,
+ "num_tokens": 563291.0,
+ "step": 550
+ },
+ {
+ "epoch": 0.007943037076962355,
+ "learning_rate": 0.0001984142293835636,
+ "loss": 1.1567,
+ "mean_token_accuracy": 0.6907780110836029,
+ "num_tokens": 573513.0,
+ "step": 560
+ },
+ {
+ "epoch": 0.008084877024765255,
+ "learning_rate": 0.000198385861394003,
+ "loss": 1.1467,
+ "mean_token_accuracy": 0.6990721464157105,
+ "num_tokens": 583892.0,
+ "step": 570
+ },
+ {
+ "epoch": 0.008226716972568155,
+ "learning_rate": 0.00019835749340444244,
+ "loss": 1.1326,
+ "mean_token_accuracy": 0.6969013214111328,
+ "num_tokens": 593838.0,
+ "step": 580
+ },
+ {
+ "epoch": 0.008368556920371053,
+ "learning_rate": 0.00019832912541488187,
+ "loss": 1.1711,
+ "mean_token_accuracy": 0.6954678654670715,
+ "num_tokens": 603881.0,
+ "step": 590
+ },
+ {
+ "epoch": 0.008510396868173953,
+ "learning_rate": 0.00019830075742532127,
+ "loss": 1.0689,
+ "mean_token_accuracy": 0.7051353633403779,
+ "num_tokens": 614246.0,
+ "step": 600
+ },
+ {
+ "epoch": 0.008652236815976851,
+ "learning_rate": 0.0001982723894357607,
+ "loss": 1.1453,
+ "mean_token_accuracy": 0.6947243511676788,
+ "num_tokens": 624741.0,
+ "step": 610
+ },
+ {
+ "epoch": 0.008794076763779751,
+ "learning_rate": 0.0001982440214462001,
+ "loss": 1.1411,
+ "mean_token_accuracy": 0.6965227723121643,
+ "num_tokens": 634891.0,
+ "step": 620
+ },
+ {
+ "epoch": 0.00893591671158265,
+ "learning_rate": 0.00019821565345663953,
+ "loss": 1.1215,
+ "mean_token_accuracy": 0.703746247291565,
+ "num_tokens": 645040.0,
+ "step": 630
+ },
+ {
+ "epoch": 0.009077756659385549,
+ "learning_rate": 0.00019818728546707896,
+ "loss": 1.1233,
+ "mean_token_accuracy": 0.6995523154735566,
+ "num_tokens": 655495.0,
+ "step": 640
+ },
+ {
+ "epoch": 0.009219596607188449,
+ "learning_rate": 0.0001981589174775184,
+ "loss": 1.1817,
+ "mean_token_accuracy": 0.6966328859329224,
+ "num_tokens": 665731.0,
+ "step": 650
+ },
+ {
+ "epoch": 0.009361436554991347,
+ "learning_rate": 0.0001981305494879578,
+ "loss": 1.1228,
+ "mean_token_accuracy": 0.6941804051399231,
+ "num_tokens": 676297.0,
+ "step": 660
+ },
+ {
+ "epoch": 0.009503276502794247,
+ "learning_rate": 0.0001981021814983972,
+ "loss": 1.1338,
+ "mean_token_accuracy": 0.6983364522457123,
+ "num_tokens": 686502.0,
+ "step": 670
+ },
+ {
+ "epoch": 0.009645116450597147,
+ "learning_rate": 0.00019807381350883666,
+ "loss": 1.1052,
+ "mean_token_accuracy": 0.7062141060829162,
+ "num_tokens": 696824.0,
+ "step": 680
+ },
+ {
+ "epoch": 0.009786956398400045,
+ "learning_rate": 0.00019804544551927606,
+ "loss": 1.0801,
+ "mean_token_accuracy": 0.7161077737808228,
+ "num_tokens": 707051.0,
+ "step": 690
+ },
+ {
+ "epoch": 0.009928796346202945,
+ "learning_rate": 0.0001980170775297155,
+ "loss": 1.1189,
+ "mean_token_accuracy": 0.7090901613235474,
+ "num_tokens": 717106.0,
+ "step": 700
+ },
+ {
+ "epoch": 0.010070636294005843,
+ "learning_rate": 0.0001979887095401549,
+ "loss": 1.0914,
+ "mean_token_accuracy": 0.709138709306717,
+ "num_tokens": 727375.0,
+ "step": 710
+ },
+ {
+ "epoch": 0.010212476241808743,
+ "learning_rate": 0.00019796034155059432,
+ "loss": 1.1572,
+ "mean_token_accuracy": 0.6868231952190399,
+ "num_tokens": 737794.0,
+ "step": 720
+ },
+ {
+ "epoch": 0.010354316189611643,
+ "learning_rate": 0.00019793197356103375,
+ "loss": 1.1172,
+ "mean_token_accuracy": 0.6967993915081024,
+ "num_tokens": 748005.0,
+ "step": 730
+ },
+ {
+ "epoch": 0.010496156137414541,
+ "learning_rate": 0.00019790360557147315,
+ "loss": 1.1412,
+ "mean_token_accuracy": 0.7041096150875091,
+ "num_tokens": 758273.0,
+ "step": 740
+ },
+ {
+ "epoch": 0.010637996085217441,
+ "learning_rate": 0.00019787523758191258,
+ "loss": 1.1151,
+ "mean_token_accuracy": 0.701738464832306,
+ "num_tokens": 768532.0,
+ "step": 750
+ },
+ {
+ "epoch": 0.01077983603302034,
+ "learning_rate": 0.000197846869592352,
+ "loss": 1.0879,
+ "mean_token_accuracy": 0.7017017006874084,
+ "num_tokens": 778726.0,
+ "step": 760
+ },
+ {
+ "epoch": 0.010921675980823239,
+ "learning_rate": 0.00019781850160279142,
+ "loss": 1.1212,
+ "mean_token_accuracy": 0.6997404515743255,
+ "num_tokens": 788976.0,
+ "step": 770
+ },
+ {
+ "epoch": 0.011063515928626139,
+ "learning_rate": 0.00019779013361323085,
+ "loss": 1.11,
+ "mean_token_accuracy": 0.6939224183559418,
+ "num_tokens": 799236.0,
+ "step": 780
+ },
+ {
+ "epoch": 0.011205355876429037,
+ "learning_rate": 0.00019776176562367025,
+ "loss": 1.1234,
+ "mean_token_accuracy": 0.7009412169456481,
+ "num_tokens": 809407.0,
+ "step": 790
+ },
+ {
+ "epoch": 0.011347195824231937,
+ "learning_rate": 0.00019773339763410968,
+ "loss": 1.1241,
+ "mean_token_accuracy": 0.700713324546814,
+ "num_tokens": 819778.0,
+ "step": 800
+ },
+ {
+ "epoch": 0.011489035772034835,
+ "learning_rate": 0.0001977050296445491,
+ "loss": 1.1238,
+ "mean_token_accuracy": 0.7003967940807343,
+ "num_tokens": 829656.0,
+ "step": 810
+ },
+ {
+ "epoch": 0.011630875719837735,
+ "learning_rate": 0.00019767666165498854,
+ "loss": 1.121,
+ "mean_token_accuracy": 0.703648030757904,
+ "num_tokens": 839892.0,
+ "step": 820
+ },
+ {
+ "epoch": 0.011772715667640635,
+ "learning_rate": 0.00019764829366542794,
+ "loss": 1.1255,
+ "mean_token_accuracy": 0.7010339736938477,
+ "num_tokens": 849962.0,
+ "step": 830
+ },
+ {
+ "epoch": 0.011914555615443533,
+ "learning_rate": 0.00019761992567586734,
+ "loss": 1.1608,
+ "mean_token_accuracy": 0.691221284866333,
+ "num_tokens": 860382.0,
+ "step": 840
+ },
+ {
+ "epoch": 0.012056395563246433,
+ "learning_rate": 0.00019759155768630677,
+ "loss": 1.1149,
+ "mean_token_accuracy": 0.702646654844284,
+ "num_tokens": 870754.0,
+ "step": 850
+ },
+ {
+ "epoch": 0.012198235511049331,
+ "learning_rate": 0.0001975631896967462,
+ "loss": 1.1507,
+ "mean_token_accuracy": 0.6957122564315796,
+ "num_tokens": 880951.0,
+ "step": 860
+ },
+ {
+ "epoch": 0.012340075458852231,
+ "learning_rate": 0.00019753482170718563,
+ "loss": 1.1001,
+ "mean_token_accuracy": 0.7044150590896606,
+ "num_tokens": 891008.0,
+ "step": 870
+ },
+ {
+ "epoch": 0.012481915406655131,
+ "learning_rate": 0.00019750645371762504,
+ "loss": 1.1419,
+ "mean_token_accuracy": 0.7001836776733399,
+ "num_tokens": 901300.0,
+ "step": 880
+ },
+ {
+ "epoch": 0.01262375535445803,
+ "learning_rate": 0.00019747808572806447,
+ "loss": 1.1985,
+ "mean_token_accuracy": 0.6821943819522858,
+ "num_tokens": 911677.0,
+ "step": 890
+ },
+ {
+ "epoch": 0.01276559530226093,
+ "learning_rate": 0.0001974497177385039,
+ "loss": 1.1275,
+ "mean_token_accuracy": 0.6965928733348846,
+ "num_tokens": 921937.0,
+ "step": 900
+ },
+ {
+ "epoch": 0.012907435250063827,
+ "learning_rate": 0.0001974213497489433,
+ "loss": 1.1085,
+ "mean_token_accuracy": 0.7022442996501923,
+ "num_tokens": 932391.0,
+ "step": 910
+ },
+ {
+ "epoch": 0.013049275197866727,
+ "learning_rate": 0.00019739298175938273,
+ "loss": 1.1387,
+ "mean_token_accuracy": 0.7010680258274078,
+ "num_tokens": 942696.0,
+ "step": 920
+ },
+ {
+ "epoch": 0.013191115145669627,
+ "learning_rate": 0.00019736461376982213,
+ "loss": 1.1503,
+ "mean_token_accuracy": 0.693393486738205,
+ "num_tokens": 953051.0,
+ "step": 930
+ },
+ {
+ "epoch": 0.013332955093472525,
+ "learning_rate": 0.00019733624578026156,
+ "loss": 1.1153,
+ "mean_token_accuracy": 0.6921448647975922,
+ "num_tokens": 963299.0,
+ "step": 940
+ },
+ {
+ "epoch": 0.013474795041275425,
+ "learning_rate": 0.000197307877790701,
+ "loss": 1.1478,
+ "mean_token_accuracy": 0.6896324157714844,
+ "num_tokens": 973501.0,
+ "step": 950
+ },
+ {
+ "epoch": 0.013616634989078323,
+ "learning_rate": 0.0001972795098011404,
+ "loss": 1.1163,
+ "mean_token_accuracy": 0.7062079787254334,
+ "num_tokens": 983819.0,
+ "step": 960
+ },
+ {
+ "epoch": 0.013758474936881223,
+ "learning_rate": 0.00019725114181157983,
+ "loss": 1.134,
+ "mean_token_accuracy": 0.6980367541313172,
+ "num_tokens": 994251.0,
+ "step": 970
+ },
+ {
+ "epoch": 0.013900314884684123,
+ "learning_rate": 0.00019722277382201923,
+ "loss": 1.1761,
+ "mean_token_accuracy": 0.6891869902610779,
+ "num_tokens": 1004567.0,
+ "step": 980
+ },
+ {
+ "epoch": 0.014042154832487021,
+ "learning_rate": 0.00019719440583245869,
+ "loss": 1.1048,
+ "mean_token_accuracy": 0.7076132833957672,
+ "num_tokens": 1014866.0,
+ "step": 990
+ },
+ {
+ "epoch": 0.014183994780289921,
+ "learning_rate": 0.0001971660378428981,
+ "loss": 1.1435,
+ "mean_token_accuracy": 0.6922993123531341,
+ "num_tokens": 1025085.0,
+ "step": 1000
+ },
+ {
+ "epoch": 0.01432583472809282,
+ "learning_rate": 0.0001971376698533375,
+ "loss": 1.0938,
+ "mean_token_accuracy": 0.7052319467067718,
+ "num_tokens": 1035538.0,
+ "step": 1010
+ },
+ {
+ "epoch": 0.01446767467589572,
+ "learning_rate": 0.00019710930186377692,
+ "loss": 1.1406,
+ "mean_token_accuracy": 0.6931207239627838,
+ "num_tokens": 1045628.0,
+ "step": 1020
+ },
+ {
+ "epoch": 0.01460951462369862,
+ "learning_rate": 0.00019708093387421632,
+ "loss": 1.1303,
+ "mean_token_accuracy": 0.698544704914093,
+ "num_tokens": 1055971.0,
+ "step": 1030
+ },
+ {
+ "epoch": 0.014751354571501517,
+ "learning_rate": 0.00019705256588465578,
+ "loss": 1.1573,
+ "mean_token_accuracy": 0.689105898141861,
+ "num_tokens": 1066042.0,
+ "step": 1040
+ },
+ {
+ "epoch": 0.014893194519304417,
+ "learning_rate": 0.00019702419789509518,
+ "loss": 1.0628,
+ "mean_token_accuracy": 0.7112344741821289,
+ "num_tokens": 1076048.0,
+ "step": 1050
+ },
+ {
+ "epoch": 0.015035034467107316,
+ "learning_rate": 0.00019699582990553461,
+ "loss": 1.1377,
+ "mean_token_accuracy": 0.6986138761043549,
+ "num_tokens": 1086480.0,
+ "step": 1060
+ },
+ {
+ "epoch": 0.015176874414910215,
+ "learning_rate": 0.00019696746191597402,
+ "loss": 1.1203,
+ "mean_token_accuracy": 0.7030752837657929,
+ "num_tokens": 1096900.0,
+ "step": 1070
+ },
+ {
+ "epoch": 0.015318714362713115,
+ "learning_rate": 0.00019693909392641345,
+ "loss": 1.1438,
+ "mean_token_accuracy": 0.6927322566509246,
+ "num_tokens": 1107480.0,
+ "step": 1080
+ },
+ {
+ "epoch": 0.015460554310516014,
+ "learning_rate": 0.00019691072593685288,
+ "loss": 1.0853,
+ "mean_token_accuracy": 0.7137106359004974,
+ "num_tokens": 1117902.0,
+ "step": 1090
+ },
+ {
+ "epoch": 0.015602394258318913,
+ "learning_rate": 0.00019688235794729228,
+ "loss": 1.1405,
+ "mean_token_accuracy": 0.6944672584533691,
+ "num_tokens": 1128045.0,
+ "step": 1100
+ },
+ {
+ "epoch": 0.015744234206121813,
+ "learning_rate": 0.0001968539899577317,
+ "loss": 1.1166,
+ "mean_token_accuracy": 0.6966266989707947,
+ "num_tokens": 1138338.0,
+ "step": 1110
+ },
+ {
+ "epoch": 0.01588607415392471,
+ "learning_rate": 0.0001968256219681711,
+ "loss": 1.1176,
+ "mean_token_accuracy": 0.7043495059013367,
+ "num_tokens": 1148428.0,
+ "step": 1120
+ },
+ {
+ "epoch": 0.01602791410172761,
+ "learning_rate": 0.00019679725397861054,
+ "loss": 1.0583,
+ "mean_token_accuracy": 0.7091330707073211,
+ "num_tokens": 1158359.0,
+ "step": 1130
+ },
+ {
+ "epoch": 0.01616975404953051,
+ "learning_rate": 0.00019676888598904997,
+ "loss": 1.2325,
+ "mean_token_accuracy": 0.6778323352336884,
+ "num_tokens": 1168601.0,
+ "step": 1140
+ },
+ {
+ "epoch": 0.01631159399733341,
+ "learning_rate": 0.00019674051799948938,
+ "loss": 1.0737,
+ "mean_token_accuracy": 0.7033764302730561,
+ "num_tokens": 1178900.0,
+ "step": 1150
+ },
+ {
+ "epoch": 0.01645343394513631,
+ "learning_rate": 0.0001967121500099288,
+ "loss": 1.1099,
+ "mean_token_accuracy": 0.7059059202671051,
+ "num_tokens": 1189152.0,
+ "step": 1160
+ },
+ {
+ "epoch": 0.016595273892939206,
+ "learning_rate": 0.00019668378202036824,
+ "loss": 1.1593,
+ "mean_token_accuracy": 0.6904339075088501,
+ "num_tokens": 1199505.0,
+ "step": 1170
+ },
+ {
+ "epoch": 0.016737113840742106,
+ "learning_rate": 0.00019665541403080764,
+ "loss": 1.0748,
+ "mean_token_accuracy": 0.708430927991867,
+ "num_tokens": 1209675.0,
+ "step": 1180
+ },
+ {
+ "epoch": 0.016878953788545006,
+ "learning_rate": 0.00019662704604124707,
+ "loss": 1.1252,
+ "mean_token_accuracy": 0.7042509257793427,
+ "num_tokens": 1219595.0,
+ "step": 1190
+ },
+ {
+ "epoch": 0.017020793736347906,
+ "learning_rate": 0.00019659867805168647,
+ "loss": 1.0726,
+ "mean_token_accuracy": 0.6974501132965087,
+ "num_tokens": 1229844.0,
+ "step": 1200
+ },
+ {
+ "epoch": 0.017162633684150806,
+ "learning_rate": 0.00019657031006212593,
+ "loss": 1.0663,
+ "mean_token_accuracy": 0.7197851002216339,
+ "num_tokens": 1239909.0,
+ "step": 1210
+ },
+ {
+ "epoch": 0.017304473631953702,
+ "learning_rate": 0.00019654194207256533,
+ "loss": 1.0802,
+ "mean_token_accuracy": 0.7073646426200867,
+ "num_tokens": 1250098.0,
+ "step": 1220
+ },
+ {
+ "epoch": 0.017446313579756602,
+ "learning_rate": 0.00019651357408300476,
+ "loss": 1.1082,
+ "mean_token_accuracy": 0.7057863056659699,
+ "num_tokens": 1260400.0,
+ "step": 1230
+ },
+ {
+ "epoch": 0.017588153527559502,
+ "learning_rate": 0.00019648520609344416,
+ "loss": 1.1259,
+ "mean_token_accuracy": 0.699841320514679,
+ "num_tokens": 1270621.0,
+ "step": 1240
+ },
+ {
+ "epoch": 0.0177299934753624,
+ "learning_rate": 0.00019645683810388357,
+ "loss": 1.1123,
+ "mean_token_accuracy": 0.6950526118278504,
+ "num_tokens": 1280810.0,
+ "step": 1250
+ },
+ {
+ "epoch": 0.0178718334231653,
+ "learning_rate": 0.00019642847011432302,
+ "loss": 1.132,
+ "mean_token_accuracy": 0.692725783586502,
+ "num_tokens": 1291192.0,
+ "step": 1260
+ },
+ {
+ "epoch": 0.018013673370968198,
+ "learning_rate": 0.00019640010212476243,
+ "loss": 1.1279,
+ "mean_token_accuracy": 0.703784042596817,
+ "num_tokens": 1301253.0,
+ "step": 1270
+ },
+ {
+ "epoch": 0.018155513318771098,
+ "learning_rate": 0.00019637173413520186,
+ "loss": 1.064,
+ "mean_token_accuracy": 0.7076753437519073,
+ "num_tokens": 1311455.0,
+ "step": 1280
+ },
+ {
+ "epoch": 0.018297353266573998,
+ "learning_rate": 0.00019634336614564126,
+ "loss": 1.1056,
+ "mean_token_accuracy": 0.7009041368961334,
+ "num_tokens": 1322049.0,
+ "step": 1290
+ },
+ {
+ "epoch": 0.018439193214376898,
+ "learning_rate": 0.0001963149981560807,
+ "loss": 1.1307,
+ "mean_token_accuracy": 0.6981959402561188,
+ "num_tokens": 1332529.0,
+ "step": 1300
+ },
+ {
+ "epoch": 0.018581033162179798,
+ "learning_rate": 0.00019628663016652012,
+ "loss": 1.1149,
+ "mean_token_accuracy": 0.6912563383579254,
+ "num_tokens": 1342800.0,
+ "step": 1310
+ },
+ {
+ "epoch": 0.018722873109982694,
+ "learning_rate": 0.00019625826217695952,
+ "loss": 1.0897,
+ "mean_token_accuracy": 0.7025132238864898,
+ "num_tokens": 1353110.0,
+ "step": 1320
+ },
+ {
+ "epoch": 0.018864713057785594,
+ "learning_rate": 0.00019622989418739895,
+ "loss": 1.1089,
+ "mean_token_accuracy": 0.7050705254077911,
+ "num_tokens": 1363266.0,
+ "step": 1330
+ },
+ {
+ "epoch": 0.019006553005588494,
+ "learning_rate": 0.00019620152619783835,
+ "loss": 1.1038,
+ "mean_token_accuracy": 0.7059103548526764,
+ "num_tokens": 1373500.0,
+ "step": 1340
+ },
+ {
+ "epoch": 0.019148392953391394,
+ "learning_rate": 0.00019617315820827778,
+ "loss": 1.1222,
+ "mean_token_accuracy": 0.7083336532115936,
+ "num_tokens": 1383687.0,
+ "step": 1350
+ },
+ {
+ "epoch": 0.019290232901194294,
+ "learning_rate": 0.00019614479021871721,
+ "loss": 1.1291,
+ "mean_token_accuracy": 0.6995004296302796,
+ "num_tokens": 1394120.0,
+ "step": 1360
+ },
+ {
+ "epoch": 0.01943207284899719,
+ "learning_rate": 0.00019611642222915662,
+ "loss": 1.0957,
+ "mean_token_accuracy": 0.7055183351039886,
+ "num_tokens": 1404491.0,
+ "step": 1370
+ },
+ {
+ "epoch": 0.01957391279680009,
+ "learning_rate": 0.00019608805423959605,
+ "loss": 1.1187,
+ "mean_token_accuracy": 0.6980840861797333,
+ "num_tokens": 1414654.0,
+ "step": 1380
+ },
+ {
+ "epoch": 0.01971575274460299,
+ "learning_rate": 0.00019605968625003548,
+ "loss": 1.1484,
+ "mean_token_accuracy": 0.6943079948425293,
+ "num_tokens": 1424822.0,
+ "step": 1390
+ },
+ {
+ "epoch": 0.01985759269240589,
+ "learning_rate": 0.0001960313182604749,
+ "loss": 1.1231,
+ "mean_token_accuracy": 0.7027773916721344,
+ "num_tokens": 1435210.0,
+ "step": 1400
+ },
+ {
+ "epoch": 0.01999943264020879,
+ "learning_rate": 0.0001960029502709143,
+ "loss": 1.0682,
+ "mean_token_accuracy": 0.7105901122093201,
+ "num_tokens": 1445483.0,
+ "step": 1410
+ },
+ {
+ "epoch": 0.020141272588011686,
+ "learning_rate": 0.0001959745822813537,
+ "loss": 1.0946,
+ "mean_token_accuracy": 0.7032223284244538,
+ "num_tokens": 1455942.0,
+ "step": 1420
+ },
+ {
+ "epoch": 0.020283112535814586,
+ "learning_rate": 0.00019594621429179314,
+ "loss": 1.0769,
+ "mean_token_accuracy": 0.7099736094474792,
+ "num_tokens": 1465992.0,
+ "step": 1430
+ },
+ {
+ "epoch": 0.020424952483617486,
+ "learning_rate": 0.00019591784630223257,
+ "loss": 1.044,
+ "mean_token_accuracy": 0.7143494069576264,
+ "num_tokens": 1476070.0,
+ "step": 1440
+ },
+ {
+ "epoch": 0.020566792431420386,
+ "learning_rate": 0.000195889478312672,
+ "loss": 1.0988,
+ "mean_token_accuracy": 0.6990650355815887,
+ "num_tokens": 1486388.0,
+ "step": 1450
+ },
+ {
+ "epoch": 0.020708632379223286,
+ "learning_rate": 0.0001958611103231114,
+ "loss": 1.0812,
+ "mean_token_accuracy": 0.7099774420261383,
+ "num_tokens": 1496689.0,
+ "step": 1460
+ },
+ {
+ "epoch": 0.020850472327026182,
+ "learning_rate": 0.00019583274233355084,
+ "loss": 1.0747,
+ "mean_token_accuracy": 0.7068913519382477,
+ "num_tokens": 1506676.0,
+ "step": 1470
+ },
+ {
+ "epoch": 0.020992312274829082,
+ "learning_rate": 0.00019580437434399027,
+ "loss": 1.1216,
+ "mean_token_accuracy": 0.6996153056621551,
+ "num_tokens": 1516996.0,
+ "step": 1480
+ },
+ {
+ "epoch": 0.021134152222631982,
+ "learning_rate": 0.00019577600635442967,
+ "loss": 1.0814,
+ "mean_token_accuracy": 0.7133045315742492,
+ "num_tokens": 1526981.0,
+ "step": 1490
+ },
+ {
+ "epoch": 0.021275992170434882,
+ "learning_rate": 0.0001957476383648691,
+ "loss": 1.042,
+ "mean_token_accuracy": 0.7156777441501617,
+ "num_tokens": 1536967.0,
+ "step": 1500
+ },
+ {
+ "epoch": 0.021417832118237782,
+ "learning_rate": 0.0001957192703753085,
+ "loss": 1.0735,
+ "mean_token_accuracy": 0.7105422735214233,
+ "num_tokens": 1547354.0,
+ "step": 1510
+ },
+ {
+ "epoch": 0.02155967206604068,
+ "learning_rate": 0.00019569090238574793,
+ "loss": 1.1221,
+ "mean_token_accuracy": 0.6900959551334381,
+ "num_tokens": 1557285.0,
+ "step": 1520
+ },
+ {
+ "epoch": 0.021701512013843578,
+ "learning_rate": 0.00019566253439618736,
+ "loss": 1.1432,
+ "mean_token_accuracy": 0.7033149361610412,
+ "num_tokens": 1567450.0,
+ "step": 1530
+ },
+ {
+ "epoch": 0.021843351961646478,
+ "learning_rate": 0.00019563416640662676,
+ "loss": 1.0863,
+ "mean_token_accuracy": 0.7135837018489838,
+ "num_tokens": 1577684.0,
+ "step": 1540
+ },
+ {
+ "epoch": 0.021985191909449378,
+ "learning_rate": 0.0001956057984170662,
+ "loss": 1.0564,
+ "mean_token_accuracy": 0.7110729515552521,
+ "num_tokens": 1588410.0,
+ "step": 1550
+ },
+ {
+ "epoch": 0.022127031857252278,
+ "learning_rate": 0.0001955774304275056,
+ "loss": 1.1056,
+ "mean_token_accuracy": 0.7048493981361389,
+ "num_tokens": 1598595.0,
+ "step": 1560
+ },
+ {
+ "epoch": 0.022268871805055174,
+ "learning_rate": 0.00019554906243794505,
+ "loss": 1.0904,
+ "mean_token_accuracy": 0.707161259651184,
+ "num_tokens": 1608782.0,
+ "step": 1570
+ },
+ {
+ "epoch": 0.022410711752858074,
+ "learning_rate": 0.00019552069444838446,
+ "loss": 1.0611,
+ "mean_token_accuracy": 0.7107515692710876,
+ "num_tokens": 1618985.0,
+ "step": 1580
+ },
+ {
+ "epoch": 0.022552551700660974,
+ "learning_rate": 0.00019549232645882386,
+ "loss": 1.0864,
+ "mean_token_accuracy": 0.7008066534996032,
+ "num_tokens": 1629402.0,
+ "step": 1590
+ },
+ {
+ "epoch": 0.022694391648463874,
+ "learning_rate": 0.0001954639584692633,
+ "loss": 1.1076,
+ "mean_token_accuracy": 0.6996842324733734,
+ "num_tokens": 1639522.0,
+ "step": 1600
+ },
+ {
+ "epoch": 0.022836231596266774,
+ "learning_rate": 0.00019543559047970272,
+ "loss": 1.1011,
+ "mean_token_accuracy": 0.7056106328964233,
+ "num_tokens": 1649778.0,
+ "step": 1610
+ },
+ {
+ "epoch": 0.02297807154406967,
+ "learning_rate": 0.00019540722249014215,
+ "loss": 1.118,
+ "mean_token_accuracy": 0.696357262134552,
+ "num_tokens": 1660207.0,
+ "step": 1620
+ },
+ {
+ "epoch": 0.02311991149187257,
+ "learning_rate": 0.00019537885450058155,
+ "loss": 1.0676,
+ "mean_token_accuracy": 0.7104279041290283,
+ "num_tokens": 1670377.0,
+ "step": 1630
+ },
+ {
+ "epoch": 0.02326175143967547,
+ "learning_rate": 0.00019535048651102098,
+ "loss": 1.0716,
+ "mean_token_accuracy": 0.7117774069309235,
+ "num_tokens": 1680672.0,
+ "step": 1640
+ },
+ {
+ "epoch": 0.02340359138747837,
+ "learning_rate": 0.00019532211852146038,
+ "loss": 1.088,
+ "mean_token_accuracy": 0.7042657971382141,
+ "num_tokens": 1691141.0,
+ "step": 1650
+ },
+ {
+ "epoch": 0.02354543133528127,
+ "learning_rate": 0.00019529375053189981,
+ "loss": 1.1076,
+ "mean_token_accuracy": 0.7016879081726074,
+ "num_tokens": 1701472.0,
+ "step": 1660
+ },
+ {
+ "epoch": 0.023687271283084167,
+ "learning_rate": 0.00019526538254233924,
+ "loss": 1.0842,
+ "mean_token_accuracy": 0.7067211866378784,
+ "num_tokens": 1711678.0,
+ "step": 1670
+ },
+ {
+ "epoch": 0.023829111230887066,
+ "learning_rate": 0.00019523701455277865,
1351
+ "loss": 1.0875,
1352
+ "mean_token_accuracy": 0.7065619647502899,
1353
+ "num_tokens": 1722146.0,
1354
+ "step": 1680
1355
+ },
1356
+ {
1357
+ "epoch": 0.023970951178689966,
1358
+ "learning_rate": 0.00019520864656321808,
1359
+ "loss": 1.0687,
1360
+ "mean_token_accuracy": 0.7062947809696197,
1361
+ "num_tokens": 1732546.0,
1362
+ "step": 1690
1363
+ },
1364
+ {
1365
+ "epoch": 0.024112791126492866,
1366
+ "learning_rate": 0.0001951802785736575,
1367
+ "loss": 1.0985,
1368
+ "mean_token_accuracy": 0.7017097353935242,
1369
+ "num_tokens": 1742873.0,
1370
+ "step": 1700
1371
+ },
1372
+ {
1373
+ "epoch": 0.024254631074295766,
1374
+ "learning_rate": 0.0001951519105840969,
1375
+ "loss": 1.0768,
1376
+ "mean_token_accuracy": 0.7073511421680451,
1377
+ "num_tokens": 1753232.0,
1378
+ "step": 1710
1379
+ },
1380
+ {
1381
+ "epoch": 0.024396471022098663,
1382
+ "learning_rate": 0.00019512354259453634,
1383
+ "loss": 1.0708,
1384
+ "mean_token_accuracy": 0.7018490791320801,
1385
+ "num_tokens": 1763622.0,
1386
+ "step": 1720
1387
+ },
1388
+ {
1389
+ "epoch": 0.024538310969901563,
1390
+ "learning_rate": 0.00019509517460497574,
1391
+ "loss": 1.1175,
1392
+ "mean_token_accuracy": 0.7016505122184753,
1393
+ "num_tokens": 1774027.0,
1394
+ "step": 1730
1395
+ },
1396
+ {
1397
+ "epoch": 0.024680150917704462,
1398
+ "learning_rate": 0.00019506680661541517,
1399
+ "loss": 1.0774,
1400
+ "mean_token_accuracy": 0.7039576828479767,
1401
+ "num_tokens": 1784277.0,
1402
+ "step": 1740
1403
+ },
1404
+ {
1405
+ "epoch": 0.024821990865507362,
1406
+ "learning_rate": 0.0001950384386258546,
1407
+ "loss": 1.092,
1408
+ "mean_token_accuracy": 0.7060896992683411,
1409
+ "num_tokens": 1794436.0,
1410
+ "step": 1750
1411
+ },
1412
+ {
1413
+ "epoch": 0.024963830813310262,
1414
+ "learning_rate": 0.000195010070636294,
1415
+ "loss": 1.0478,
1416
+ "mean_token_accuracy": 0.708050674200058,
1417
+ "num_tokens": 1804575.0,
1418
+ "step": 1760
1419
+ },
1420
+ {
1421
+ "epoch": 0.02510567076111316,
1422
+ "learning_rate": 0.00019498170264673344,
1423
+ "loss": 1.0737,
1424
+ "mean_token_accuracy": 0.7079983413219452,
1425
+ "num_tokens": 1814603.0,
1426
+ "step": 1770
1427
+ },
1428
+ {
1429
+ "epoch": 0.02524751070891606,
1430
+ "learning_rate": 0.00019495333465717284,
1431
+ "loss": 1.0445,
1432
+ "mean_token_accuracy": 0.7188371956348419,
1433
+ "num_tokens": 1824990.0,
1434
+ "step": 1780
1435
+ },
1436
+ {
1437
+ "epoch": 0.02538935065671896,
1438
+ "learning_rate": 0.0001949249666676123,
1439
+ "loss": 1.0557,
1440
+ "mean_token_accuracy": 0.7108985543251037,
1441
+ "num_tokens": 1835477.0,
1442
+ "step": 1790
1443
+ },
1444
+ {
1445
+ "epoch": 0.02553119060452186,
1446
+ "learning_rate": 0.0001948965986780517,
1447
+ "loss": 1.054,
1448
+ "mean_token_accuracy": 0.7082186043262482,
1449
+ "num_tokens": 1845736.0,
1450
+ "step": 1800
1451
+ },
1452
+ {
1453
+ "epoch": 0.02567303055232476,
1454
+ "learning_rate": 0.00019486823068849113,
1455
+ "loss": 1.0705,
1456
+ "mean_token_accuracy": 0.7082441449165344,
1457
+ "num_tokens": 1855966.0,
1458
+ "step": 1810
1459
+ },
1460
+ {
1461
+ "epoch": 0.025814870500127655,
1462
+ "learning_rate": 0.00019483986269893053,
1463
+ "loss": 1.0473,
1464
+ "mean_token_accuracy": 0.7133744478225708,
1465
+ "num_tokens": 1866194.0,
1466
+ "step": 1820
1467
+ },
1468
+ {
1469
+ "epoch": 0.025956710447930555,
1470
+ "learning_rate": 0.00019481149470936993,
1471
+ "loss": 1.0913,
1472
+ "mean_token_accuracy": 0.700226366519928,
1473
+ "num_tokens": 1876551.0,
1474
+ "step": 1830
1475
+ },
1476
+ {
1477
+ "epoch": 0.026098550395733455,
1478
+ "learning_rate": 0.0001947831267198094,
1479
+ "loss": 1.0627,
1480
+ "mean_token_accuracy": 0.7126640200614929,
1481
+ "num_tokens": 1886799.0,
1482
+ "step": 1840
1483
+ },
1484
+ {
1485
+ "epoch": 0.026240390343536354,
1486
+ "learning_rate": 0.0001947547587302488,
1487
+ "loss": 1.1058,
1488
+ "mean_token_accuracy": 0.7105142951011658,
1489
+ "num_tokens": 1897204.0,
1490
+ "step": 1850
1491
+ },
1492
+ {
1493
+ "epoch": 0.026382230291339254,
1494
+ "learning_rate": 0.00019472639074068822,
1495
+ "loss": 1.1202,
1496
+ "mean_token_accuracy": 0.6887533903121948,
1497
+ "num_tokens": 1907365.0,
1498
+ "step": 1860
1499
+ },
1500
+ {
1501
+ "epoch": 0.02652407023914215,
1502
+ "learning_rate": 0.00019469802275112763,
1503
+ "loss": 1.0668,
1504
+ "mean_token_accuracy": 0.7144553422927856,
1505
+ "num_tokens": 1917702.0,
1506
+ "step": 1870
1507
+ },
1508
+ {
1509
+ "epoch": 0.02666591018694505,
1510
+ "learning_rate": 0.00019466965476156706,
1511
+ "loss": 1.1356,
1512
+ "mean_token_accuracy": 0.6965939939022064,
1513
+ "num_tokens": 1928031.0,
1514
+ "step": 1880
1515
+ },
1516
+ {
1517
+ "epoch": 0.02680775013474795,
1518
+ "learning_rate": 0.0001946412867720065,
1519
+ "loss": 1.0778,
1520
+ "mean_token_accuracy": 0.7089079439640045,
1521
+ "num_tokens": 1938084.0,
1522
+ "step": 1890
1523
+ },
1524
+ {
1525
+ "epoch": 0.02694959008255085,
1526
+ "learning_rate": 0.0001946129187824459,
1527
+ "loss": 1.0483,
1528
+ "mean_token_accuracy": 0.7225513160228729,
1529
+ "num_tokens": 1948205.0,
1530
+ "step": 1900
1531
+ },
1532
+ {
1533
+ "epoch": 0.02709143003035375,
1534
+ "learning_rate": 0.00019458455079288532,
1535
+ "loss": 1.0154,
1536
+ "mean_token_accuracy": 0.7130086362361908,
1537
+ "num_tokens": 1958368.0,
1538
+ "step": 1910
1539
+ },
1540
+ {
1541
+ "epoch": 0.027233269978156647,
1542
+ "learning_rate": 0.00019455618280332472,
1543
+ "loss": 1.1258,
1544
+ "mean_token_accuracy": 0.695936119556427,
1545
+ "num_tokens": 1968738.0,
1546
+ "step": 1920
1547
+ },
1548
+ {
1549
+ "epoch": 0.027375109925959547,
1550
+ "learning_rate": 0.00019452781481376415,
1551
+ "loss": 1.0626,
1552
+ "mean_token_accuracy": 0.7156413078308106,
1553
+ "num_tokens": 1979045.0,
1554
+ "step": 1930
1555
+ },
1556
+ {
1557
+ "epoch": 0.027516949873762447,
1558
+ "learning_rate": 0.00019449944682420358,
1559
+ "loss": 1.0519,
1560
+ "mean_token_accuracy": 0.7116637229919434,
1561
+ "num_tokens": 1989218.0,
1562
+ "step": 1940
1563
+ },
1564
+ {
1565
+ "epoch": 0.027658789821565347,
1566
+ "learning_rate": 0.00019447107883464299,
1567
+ "loss": 1.0712,
1568
+ "mean_token_accuracy": 0.711160945892334,
1569
+ "num_tokens": 1999563.0,
1570
+ "step": 1950
1571
+ },
1572
+ {
1573
+ "epoch": 0.027800629769368247,
1574
+ "learning_rate": 0.00019444271084508242,
1575
+ "loss": 1.1045,
1576
+ "mean_token_accuracy": 0.7045223116874695,
1577
+ "num_tokens": 2009911.0,
1578
+ "step": 1960
1579
+ },
1580
+ {
1581
+ "epoch": 0.027942469717171143,
1582
+ "learning_rate": 0.00019441434285552184,
1583
+ "loss": 1.0463,
1584
+ "mean_token_accuracy": 0.7148343741893768,
1585
+ "num_tokens": 2020120.0,
1586
+ "step": 1970
1587
+ },
1588
+ {
1589
+ "epoch": 0.028084309664974043,
1590
+ "learning_rate": 0.00019438597486596127,
1591
+ "loss": 1.0957,
1592
+ "mean_token_accuracy": 0.7087677419185638,
1593
+ "num_tokens": 2030445.0,
1594
+ "step": 1980
1595
+ },
1596
+ {
1597
+ "epoch": 0.028226149612776943,
1598
+ "learning_rate": 0.00019435760687640068,
1599
+ "loss": 1.0338,
1600
+ "mean_token_accuracy": 0.7160651028156281,
1601
+ "num_tokens": 2040755.0,
1602
+ "step": 1990
1603
+ },
1604
+ {
1605
+ "epoch": 0.028367989560579843,
1606
+ "learning_rate": 0.00019432923888684008,
1607
+ "loss": 1.0508,
1608
+ "mean_token_accuracy": 0.710547685623169,
1609
+ "num_tokens": 2050888.0,
1610
+ "step": 2000
1611
+ },
1612
+ {
1613
+ "epoch": 0.028509829508382743,
1614
+ "learning_rate": 0.00019430087089727954,
1615
+ "loss": 1.0991,
1616
+ "mean_token_accuracy": 0.6983346939086914,
1617
+ "num_tokens": 2061216.0,
1618
+ "step": 2010
1619
+ },
1620
+ {
1621
+ "epoch": 0.02865166945618564,
1622
+ "learning_rate": 0.00019427250290771894,
1623
+ "loss": 1.0335,
1624
+ "mean_token_accuracy": 0.7186195015907287,
1625
+ "num_tokens": 2071685.0,
1626
+ "step": 2020
1627
+ },
1628
+ {
1629
+ "epoch": 0.02879350940398854,
1630
+ "learning_rate": 0.00019424413491815837,
1631
+ "loss": 1.0356,
1632
+ "mean_token_accuracy": 0.707346785068512,
1633
+ "num_tokens": 2081856.0,
1634
+ "step": 2030
1635
+ },
1636
+ {
1637
+ "epoch": 0.02893534935179144,
1638
+ "learning_rate": 0.00019421576692859777,
1639
+ "loss": 1.0796,
1640
+ "mean_token_accuracy": 0.713982081413269,
1641
+ "num_tokens": 2092164.0,
1642
+ "step": 2040
1643
+ },
1644
+ {
1645
+ "epoch": 0.02907718929959434,
1646
+ "learning_rate": 0.0001941873989390372,
1647
+ "loss": 1.0606,
1648
+ "mean_token_accuracy": 0.7008832335472107,
1649
+ "num_tokens": 2102352.0,
1650
+ "step": 2050
1651
+ },
1652
+ {
1653
+ "epoch": 0.02921902924739724,
1654
+ "learning_rate": 0.00019415903094947663,
1655
+ "loss": 1.0889,
1656
+ "mean_token_accuracy": 0.7124337434768677,
1657
+ "num_tokens": 2112565.0,
1658
+ "step": 2060
1659
+ },
1660
+ {
1661
+ "epoch": 0.029360869195200135,
1662
+ "learning_rate": 0.00019413066295991604,
1663
+ "loss": 1.1011,
1664
+ "mean_token_accuracy": 0.7071171522140502,
1665
+ "num_tokens": 2122888.0,
1666
+ "step": 2070
1667
+ },
1668
+ {
1669
+ "epoch": 0.029502709143003035,
1670
+ "learning_rate": 0.00019410229497035547,
1671
+ "loss": 1.1143,
1672
+ "mean_token_accuracy": 0.706645280122757,
1673
+ "num_tokens": 2133066.0,
1674
+ "step": 2080
1675
+ },
1676
+ {
1677
+ "epoch": 0.029644549090805935,
1678
+ "learning_rate": 0.00019407392698079487,
1679
+ "loss": 1.0579,
1680
+ "mean_token_accuracy": 0.7105309844017029,
1681
+ "num_tokens": 2143320.0,
1682
+ "step": 2090
1683
+ },
1684
+ {
1685
+ "epoch": 0.029786389038608835,
1686
+ "learning_rate": 0.0001940455589912343,
1687
+ "loss": 1.0705,
1688
+ "mean_token_accuracy": 0.7104713022708893,
1689
+ "num_tokens": 2153668.0,
1690
+ "step": 2100
1691
+ },
1692
+ {
1693
+ "epoch": 0.029928228986411735,
1694
+ "learning_rate": 0.00019401719100167373,
1695
+ "loss": 1.0957,
1696
+ "mean_token_accuracy": 0.7042783737182617,
1697
+ "num_tokens": 2164139.0,
1698
+ "step": 2110
1699
+ },
1700
+ {
1701
+ "epoch": 0.03007006893421463,
1702
+ "learning_rate": 0.00019398882301211313,
1703
+ "loss": 1.0888,
1704
+ "mean_token_accuracy": 0.7090662837028503,
1705
+ "num_tokens": 2174254.0,
1706
+ "step": 2120
1707
+ },
1708
+ {
1709
+ "epoch": 0.03021190888201753,
1710
+ "learning_rate": 0.00019396045502255256,
1711
+ "loss": 1.085,
1712
+ "mean_token_accuracy": 0.7122257769107818,
1713
+ "num_tokens": 2184390.0,
1714
+ "step": 2130
1715
+ },
1716
+ {
1717
+ "epoch": 0.03035374882982043,
1718
+ "learning_rate": 0.00019393208703299196,
1719
+ "loss": 1.0785,
1720
+ "mean_token_accuracy": 0.7105566322803497,
1721
+ "num_tokens": 2194501.0,
1722
+ "step": 2140
1723
+ },
1724
+ {
1725
+ "epoch": 0.03049558877762333,
1726
+ "learning_rate": 0.00019390371904343142,
1727
+ "loss": 1.078,
1728
+ "mean_token_accuracy": 0.7077171504497528,
1729
+ "num_tokens": 2204777.0,
1730
+ "step": 2150
1731
+ },
1732
+ {
1733
+ "epoch": 0.03063742872542623,
1734
+ "learning_rate": 0.00019387535105387082,
1735
+ "loss": 1.0877,
1736
+ "mean_token_accuracy": 0.7124951481819153,
1737
+ "num_tokens": 2215021.0,
1738
+ "step": 2160
1739
+ },
1740
+ {
1741
+ "epoch": 0.030779268673229127,
1742
+ "learning_rate": 0.00019384698306431023,
1743
+ "loss": 1.114,
1744
+ "mean_token_accuracy": 0.6971140921115875,
1745
+ "num_tokens": 2225350.0,
1746
+ "step": 2170
1747
+ },
1748
+ {
1749
+ "epoch": 0.030921108621032027,
1750
+ "learning_rate": 0.00019381861507474966,
1751
+ "loss": 1.081,
1752
+ "mean_token_accuracy": 0.7068113803863525,
1753
+ "num_tokens": 2235521.0,
1754
+ "step": 2180
1755
+ },
1756
+ {
1757
+ "epoch": 0.031062948568834927,
1758
+ "learning_rate": 0.0001937902470851891,
1759
+ "loss": 1.0834,
1760
+ "mean_token_accuracy": 0.7073126614093781,
1761
+ "num_tokens": 2245925.0,
1762
+ "step": 2190
1763
+ },
1764
+ {
1765
+ "epoch": 0.031204788516637827,
1766
+ "learning_rate": 0.00019376187909562852,
1767
+ "loss": 1.0519,
1768
+ "mean_token_accuracy": 0.7111847221851348,
1769
+ "num_tokens": 2256364.0,
1770
+ "step": 2200
1771
+ },
1772
+ {
1773
+ "epoch": 0.03134662846444072,
1774
+ "learning_rate": 0.00019373351110606792,
1775
+ "loss": 1.0787,
1776
+ "mean_token_accuracy": 0.709247374534607,
1777
+ "num_tokens": 2266801.0,
1778
+ "step": 2210
1779
+ },
1780
+ {
1781
+ "epoch": 0.03148846841224363,
1782
+ "learning_rate": 0.00019370514311650735,
1783
+ "loss": 1.0879,
1784
+ "mean_token_accuracy": 0.7051020622253418,
1785
+ "num_tokens": 2276883.0,
1786
+ "step": 2220
1787
+ },
1788
+ {
1789
+ "epoch": 0.03163030836004652,
1790
+ "learning_rate": 0.00019367677512694675,
1791
+ "loss": 1.0577,
1792
+ "mean_token_accuracy": 0.7003655672073364,
1793
+ "num_tokens": 2287178.0,
1794
+ "step": 2230
1795
+ },
1796
+ {
1797
+ "epoch": 0.03177214830784942,
1798
+ "learning_rate": 0.00019364840713738618,
1799
+ "loss": 1.1319,
1800
+ "mean_token_accuracy": 0.6982253730297089,
1801
+ "num_tokens": 2297269.0,
1802
+ "step": 2240
1803
+ },
1804
+ {
1805
+ "epoch": 0.03191398825565232,
1806
+ "learning_rate": 0.0001936200391478256,
1807
+ "loss": 1.0966,
1808
+ "mean_token_accuracy": 0.7028747737407685,
1809
+ "num_tokens": 2307553.0,
1810
+ "step": 2250
1811
+ },
1812
+ {
1813
+ "epoch": 0.03205582820345522,
1814
+ "learning_rate": 0.00019359167115826502,
1815
+ "loss": 1.0234,
1816
+ "mean_token_accuracy": 0.7202287912368774,
1817
+ "num_tokens": 2317537.0,
1818
+ "step": 2260
1819
+ },
1820
+ {
1821
+ "epoch": 0.03219766815125812,
1822
+ "learning_rate": 0.00019356330316870445,
1823
+ "loss": 1.0733,
1824
+ "mean_token_accuracy": 0.7071494162082672,
1825
+ "num_tokens": 2327652.0,
1826
+ "step": 2270
1827
+ },
1828
+ {
1829
+ "epoch": 0.03233950809906102,
1830
+ "learning_rate": 0.00019353493517914388,
1831
+ "loss": 1.0692,
1832
+ "mean_token_accuracy": 0.704166728258133,
1833
+ "num_tokens": 2337948.0,
1834
+ "step": 2280
1835
+ },
1836
+ {
1837
+ "epoch": 0.032481348046863916,
1838
+ "learning_rate": 0.00019350656718958328,
1839
+ "loss": 1.0926,
1840
+ "mean_token_accuracy": 0.7022884428501129,
1841
+ "num_tokens": 2348363.0,
1842
+ "step": 2290
1843
+ },
1844
+ {
1845
+ "epoch": 0.03262318799466682,
1846
+ "learning_rate": 0.0001934781992000227,
1847
+ "loss": 1.1009,
1848
+ "mean_token_accuracy": 0.7048872351646424,
1849
+ "num_tokens": 2358709.0,
1850
+ "step": 2300
1851
+ },
1852
+ {
1853
+ "epoch": 0.032765027942469716,
1854
+ "learning_rate": 0.0001934498312104621,
1855
+ "loss": 1.0722,
1856
+ "mean_token_accuracy": 0.7133905410766601,
1857
+ "num_tokens": 2368896.0,
1858
+ "step": 2310
1859
+ },
1860
+ {
1861
+ "epoch": 0.03290686789027262,
1862
+ "learning_rate": 0.00019342146322090154,
1863
+ "loss": 1.0448,
1864
+ "mean_token_accuracy": 0.7166795194149017,
1865
+ "num_tokens": 2379133.0,
1866
+ "step": 2320
1867
+ },
1868
+ {
1869
+ "epoch": 0.033048707838075515,
1870
+ "learning_rate": 0.00019339309523134097,
1871
+ "loss": 1.07,
1872
+ "mean_token_accuracy": 0.7065820157527923,
1873
+ "num_tokens": 2389292.0,
1874
+ "step": 2330
1875
+ },
1876
+ {
1877
+ "epoch": 0.03319054778587841,
1878
+ "learning_rate": 0.00019336472724178037,
1879
+ "loss": 1.084,
1880
+ "mean_token_accuracy": 0.7133280396461487,
1881
+ "num_tokens": 2399640.0,
1882
+ "step": 2340
1883
+ },
1884
+ {
1885
+ "epoch": 0.033332387733681315,
1886
+ "learning_rate": 0.0001933363592522198,
1887
+ "loss": 1.0789,
1888
+ "mean_token_accuracy": 0.70665163397789,
1889
+ "num_tokens": 2410039.0,
1890
+ "step": 2350
1891
+ }
1892
+ ],
1893
+ "logging_steps": 10,
1894
+ "max_steps": 70502,
1895
+ "num_input_tokens_seen": 0,
1896
+ "num_train_epochs": 9223372036854775807,
1897
+ "save_steps": 2350,
1898
+ "stateful_callbacks": {
1899
+ "TrainerControl": {
1900
+ "args": {
1901
+ "should_epoch_stop": false,
1902
+ "should_evaluate": false,
1903
+ "should_log": false,
1904
+ "should_save": true,
1905
+ "should_training_stop": false
1906
+ },
1907
+ "attributes": {}
1908
+ }
1909
+ },
1910
+ "total_flos": 1.2857716965310464e+17,
1911
+ "train_batch_size": 16,
1912
+ "trial_name": null,
1913
+ "trial_params": null
1914
+ }
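The trainer_state.json above is the standard Hugging Face Trainer checkpoint state: `log_history` holds one entry per `logging_steps` (10) optimizer steps, and the trailing keys record the run configuration (save interval 2350, batch size 16). A minimal sketch for inspecting it with only the standard library, assuming a local clone of this repository at the path shown:

```python
import json

# Hypothetical local path; adjust to wherever this checkpoint was downloaded.
state_path = "gama/gama-20250422_171856/checkpoint-2350/trainer_state.json"

with open(state_path) as f:
    state = json.load(f)

# Each log_history entry carries the metrics logged every `logging_steps` steps.
for entry in state["log_history"][-5:]:
    print(entry["step"], entry["loss"], entry["mean_token_accuracy"])
```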
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.0
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/adapter_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 8,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "q_proj",
28
+ "v_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "trainable_token_indices": null,
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
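This adapter_config.json describes a rank-8 LoRA adapter (alpha 16, no dropout) applied to the `q_proj` and `v_proj` projections of the base model. A minimal sketch of attaching it with PEFT; note that `base_model_name_or_path` points at a cluster-local path (`/scratch-shared/...`), so you would substitute your own copy of the GAMA/Llama-2 base model:

```python
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM

# Hypothetical local path to this checkpoint directory.
adapter_dir = "gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685"

config = PeftConfig.from_pretrained(adapter_dir)
# Replace config.base_model_name_or_path with a reachable base-model path if needed.
base = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(base, adapter_dir)  # attaches the r=8 LoRA weights
```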
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "padding_side": "right",
40
+ "sp_model_kwargs": {},
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
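The `chat_template` embedded above is the Llama-2 `[INST] ... [/INST]` format, and, per the special_tokens_map.json earlier, `</s>` doubles as the pad token with right-side padding. A minimal sketch of rendering a prompt with this template, assuming the same local checkpoint directory as before:

```python
from transformers import AutoTokenizer

tok_dir = "gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685"
tokenizer = AutoTokenizer.from_pretrained(tok_dir)

messages = [
    {"role": "system", "content": "You answer questions about audio clips."},
    {"role": "user", "content": "Which sound event occurs first?"},
]
# Renders the Llama-2 [INST] prompt defined by chat_template in this config.
print(tokenizer.apply_chat_template(messages, tokenize=False))
```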
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/trainer_state.json ADDED
@@ -0,0 +1,1378 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.03332542225386654,
6
+ "eval_steps": 500,
7
+ "global_step": 1685,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0001977769866698311,
14
+ "learning_rate": 0.00019996440014239942,
15
+ "loss": 0.3352,
16
+ "mean_token_accuracy": 0.9023264050483704,
17
+ "num_tokens": 19163.0,
18
+ "step": 10
19
+ },
20
+ {
21
+ "epoch": 0.0003955539733396622,
22
+ "learning_rate": 0.0001999248447450655,
23
+ "loss": 0.1524,
24
+ "mean_token_accuracy": 0.947095412015915,
25
+ "num_tokens": 38071.0,
26
+ "step": 20
27
+ },
28
+ {
29
+ "epoch": 0.0005933309600094933,
30
+ "learning_rate": 0.0001998852893477315,
31
+ "loss": 0.1461,
32
+ "mean_token_accuracy": 0.9463379800319671,
33
+ "num_tokens": 57017.0,
34
+ "step": 30
35
+ },
36
+ {
37
+ "epoch": 0.0007911079466793244,
38
+ "learning_rate": 0.00019984573395039754,
39
+ "loss": 0.1024,
40
+ "mean_token_accuracy": 0.9588611423969269,
41
+ "num_tokens": 76047.0,
42
+ "step": 40
43
+ },
44
+ {
45
+ "epoch": 0.0009888849333491555,
46
+ "learning_rate": 0.00019980617855306357,
47
+ "loss": 0.117,
48
+ "mean_token_accuracy": 0.9553473949432373,
49
+ "num_tokens": 94824.0,
50
+ "step": 50
51
+ },
52
+ {
53
+ "epoch": 0.0011866619200189867,
54
+ "learning_rate": 0.0001997666231557296,
55
+ "loss": 0.1281,
56
+ "mean_token_accuracy": 0.9538084208965302,
57
+ "num_tokens": 113467.0,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.0013844389066888178,
62
+ "learning_rate": 0.00019972706775839565,
63
+ "loss": 0.1075,
64
+ "mean_token_accuracy": 0.961921775341034,
65
+ "num_tokens": 132229.0,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.0015822158933586489,
70
+ "learning_rate": 0.00019968751236106166,
71
+ "loss": 0.1057,
72
+ "mean_token_accuracy": 0.9676864743232727,
73
+ "num_tokens": 150811.0,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.00177999288002848,
78
+ "learning_rate": 0.00019964795696372772,
79
+ "loss": 0.1048,
80
+ "mean_token_accuracy": 0.9597055971622467,
81
+ "num_tokens": 169804.0,
82
+ "step": 90
83
+ },
84
+ {
85
+ "epoch": 0.001977769866698311,
86
+ "learning_rate": 0.00019960840156639376,
87
+ "loss": 0.1147,
88
+ "mean_token_accuracy": 0.9561040580272675,
89
+ "num_tokens": 188503.0,
90
+ "step": 100
91
+ },
92
+ {
93
+ "epoch": 0.002175546853368142,
94
+ "learning_rate": 0.00019956884616905977,
95
+ "loss": 0.0904,
96
+ "mean_token_accuracy": 0.9663344562053681,
97
+ "num_tokens": 207370.0,
98
+ "step": 110
99
+ },
100
+ {
101
+ "epoch": 0.0023733238400379733,
102
+ "learning_rate": 0.0001995292907717258,
103
+ "loss": 0.085,
104
+ "mean_token_accuracy": 0.9692767798900604,
105
+ "num_tokens": 226639.0,
106
+ "step": 120
107
+ },
108
+ {
109
+ "epoch": 0.0025711008267078044,
110
+ "learning_rate": 0.00019948973537439185,
111
+ "loss": 0.1055,
112
+ "mean_token_accuracy": 0.962373024225235,
113
+ "num_tokens": 245318.0,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.0027688778133776355,
118
+ "learning_rate": 0.00019945017997705788,
119
+ "loss": 0.1023,
120
+ "mean_token_accuracy": 0.9603676617145538,
121
+ "num_tokens": 264089.0,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.0029666548000474666,
126
+ "learning_rate": 0.0001994106245797239,
127
+ "loss": 0.106,
128
+ "mean_token_accuracy": 0.9619979500770569,
129
+ "num_tokens": 282977.0,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 0.0031644317867172977,
134
+ "learning_rate": 0.00019937106918238996,
135
+ "loss": 0.1021,
136
+ "mean_token_accuracy": 0.9600433588027955,
137
+ "num_tokens": 301615.0,
138
+ "step": 160
139
+ },
140
+ {
141
+ "epoch": 0.003362208773387129,
142
+ "learning_rate": 0.000199331513785056,
143
+ "loss": 0.095,
144
+ "mean_token_accuracy": 0.9641285121440888,
145
+ "num_tokens": 320502.0,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 0.00355998576005696,
150
+ "learning_rate": 0.000199291958387722,
151
+ "loss": 0.0722,
152
+ "mean_token_accuracy": 0.9708887040615082,
153
+ "num_tokens": 339616.0,
154
+ "step": 180
155
+ },
156
+ {
157
+ "epoch": 0.003757762746726791,
158
+ "learning_rate": 0.00019925240299038804,
159
+ "loss": 0.0951,
160
+ "mean_token_accuracy": 0.9712291181087493,
161
+ "num_tokens": 358474.0,
162
+ "step": 190
163
+ },
164
+ {
165
+ "epoch": 0.003955539733396622,
166
+ "learning_rate": 0.00019921284759305408,
167
+ "loss": 0.1194,
168
+ "mean_token_accuracy": 0.9630812525749206,
169
+ "num_tokens": 377094.0,
170
+ "step": 200
171
+ },
172
+ {
173
+ "epoch": 0.004153316720066453,
174
+ "learning_rate": 0.00019917329219572012,
175
+ "loss": 0.1002,
176
+ "mean_token_accuracy": 0.9660979807376862,
177
+ "num_tokens": 396000.0,
178
+ "step": 210
179
+ },
180
+ {
181
+ "epoch": 0.004351093706736284,
182
+ "learning_rate": 0.00019913373679838613,
183
+ "loss": 0.0954,
184
+ "mean_token_accuracy": 0.9636943399906158,
185
+ "num_tokens": 415019.0,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.0045488706934061155,
190
+ "learning_rate": 0.0001990941814010522,
191
+ "loss": 0.1114,
192
+ "mean_token_accuracy": 0.9662698566913605,
193
+ "num_tokens": 433711.0,
194
+ "step": 230
195
+ },
196
+ {
197
+ "epoch": 0.004746647680075947,
198
+ "learning_rate": 0.00019905462600371823,
199
+ "loss": 0.0915,
200
+ "mean_token_accuracy": 0.9679243505001068,
201
+ "num_tokens": 452483.0,
202
+ "step": 240
203
+ },
204
+ {
205
+ "epoch": 0.004944424666745778,
206
+ "learning_rate": 0.00019901507060638424,
207
+ "loss": 0.095,
208
+ "mean_token_accuracy": 0.9688079237937928,
209
+ "num_tokens": 471395.0,
210
+ "step": 250
211
+ },
212
+ {
213
+ "epoch": 0.005142201653415609,
214
+ "learning_rate": 0.00019897551520905028,
215
+ "loss": 0.1123,
216
+ "mean_token_accuracy": 0.962276142835617,
217
+ "num_tokens": 489983.0,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 0.00533997864008544,
222
+ "learning_rate": 0.00019893595981171632,
223
+ "loss": 0.0855,
224
+ "mean_token_accuracy": 0.9698696434497833,
225
+ "num_tokens": 509148.0,
226
+ "step": 270
227
+ },
228
+ {
229
+ "epoch": 0.005537755626755271,
230
+ "learning_rate": 0.00019889640441438235,
231
+ "loss": 0.0777,
232
+ "mean_token_accuracy": 0.9697826623916626,
233
+ "num_tokens": 528042.0,
234
+ "step": 280
235
+ },
236
+ {
237
+ "epoch": 0.005735532613425102,
238
+ "learning_rate": 0.0001988568490170484,
239
+ "loss": 0.0944,
240
+ "mean_token_accuracy": 0.9690817773342133,
241
+ "num_tokens": 546656.0,
242
+ "step": 290
243
+ },
244
+ {
245
+ "epoch": 0.005933309600094933,
246
+ "learning_rate": 0.00019881729361971443,
247
+ "loss": 0.0872,
248
+ "mean_token_accuracy": 0.9661558032035827,
249
+ "num_tokens": 565279.0,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.006131086586764764,
254
+ "learning_rate": 0.00019877773822238047,
255
+ "loss": 0.09,
256
+ "mean_token_accuracy": 0.9669564247131348,
257
+ "num_tokens": 584196.0,
258
+ "step": 310
259
+ },
260
+ {
261
+ "epoch": 0.0063288635734345955,
262
+ "learning_rate": 0.00019873818282504648,
263
+ "loss": 0.0702,
264
+ "mean_token_accuracy": 0.9722951114177704,
265
+ "num_tokens": 603050.0,
266
+ "step": 320
267
+ },
268
+ {
269
+ "epoch": 0.006526640560104427,
270
+ "learning_rate": 0.00019869862742771251,
271
+ "loss": 0.0923,
272
+ "mean_token_accuracy": 0.9684451401233674,
273
+ "num_tokens": 621880.0,
274
+ "step": 330
275
+ },
276
+ {
277
+ "epoch": 0.006724417546774258,
278
+ "learning_rate": 0.00019865907203037855,
279
+ "loss": 0.0976,
280
+ "mean_token_accuracy": 0.9660769879817963,
281
+ "num_tokens": 640657.0,
282
+ "step": 340
283
+ },
284
+ {
285
+ "epoch": 0.006922194533444089,
286
+ "learning_rate": 0.0001986195166330446,
287
+ "loss": 0.107,
288
+ "mean_token_accuracy": 0.9633386790752411,
289
+ "num_tokens": 659503.0,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 0.00711997152011392,
294
+ "learning_rate": 0.00019857996123571063,
295
+ "loss": 0.1058,
296
+ "mean_token_accuracy": 0.9641251742839814,
297
+ "num_tokens": 678570.0,
298
+ "step": 360
299
+ },
300
+ {
301
+ "epoch": 0.007317748506783751,
302
+ "learning_rate": 0.00019854040583837666,
303
+ "loss": 0.097,
304
+ "mean_token_accuracy": 0.9664280533790588,
305
+ "num_tokens": 697294.0,
306
+ "step": 370
307
+ },
308
+ {
309
+ "epoch": 0.007515525493453582,
310
+ "learning_rate": 0.0001985008504410427,
311
+ "loss": 0.0677,
312
+ "mean_token_accuracy": 0.9754213869571686,
313
+ "num_tokens": 716458.0,
314
+ "step": 380
315
+ },
316
+ {
317
+ "epoch": 0.007713302480123413,
318
+ "learning_rate": 0.0001984612950437087,
319
+ "loss": 0.0622,
320
+ "mean_token_accuracy": 0.9724574089050293,
321
+ "num_tokens": 735437.0,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 0.007911079466793244,
326
+ "learning_rate": 0.00019842173964637475,
327
+ "loss": 0.1004,
328
+ "mean_token_accuracy": 0.9710333228111268,
329
+ "num_tokens": 754416.0,
330
+ "step": 400
331
+ },
332
+ {
333
+ "epoch": 0.008108856453463075,
334
+ "learning_rate": 0.0001983821842490408,
335
+ "loss": 0.0922,
336
+ "mean_token_accuracy": 0.9718518137931824,
337
+ "num_tokens": 773091.0,
338
+ "step": 410
339
+ },
340
+ {
341
+ "epoch": 0.008306633440132907,
342
+ "learning_rate": 0.00019834262885170682,
343
+ "loss": 0.0835,
344
+ "mean_token_accuracy": 0.9694978713989257,
345
+ "num_tokens": 791900.0,
346
+ "step": 420
347
+ },
348
+ {
349
+ "epoch": 0.008504410426802738,
350
+ "learning_rate": 0.00019830307345437286,
351
+ "loss": 0.0822,
352
+ "mean_token_accuracy": 0.974584549665451,
353
+ "num_tokens": 810547.0,
354
+ "step": 430
355
+ },
356
+ {
357
+ "epoch": 0.008702187413472569,
358
+ "learning_rate": 0.0001982635180570389,
359
+ "loss": 0.0733,
360
+ "mean_token_accuracy": 0.9770256340503692,
361
+ "num_tokens": 829753.0,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 0.0088999644001424,
366
+ "learning_rate": 0.00019822396265970494,
367
+ "loss": 0.0882,
368
+ "mean_token_accuracy": 0.9668483734130859,
369
+ "num_tokens": 848294.0,
370
+ "step": 450
371
+ },
372
+ {
373
+ "epoch": 0.009097741386812231,
374
+ "learning_rate": 0.00019818440726237095,
375
+ "loss": 0.0921,
376
+ "mean_token_accuracy": 0.9715612173080445,
377
+ "num_tokens": 867635.0,
378
+ "step": 460
379
+ },
380
+ {
381
+ "epoch": 0.009295518373482062,
382
+ "learning_rate": 0.00019814485186503699,
383
+ "loss": 0.0668,
384
+ "mean_token_accuracy": 0.9765562832355499,
385
+ "num_tokens": 886596.0,
386
+ "step": 470
387
+ },
388
+ {
389
+ "epoch": 0.009493295360151893,
390
+ "learning_rate": 0.00019810529646770302,
391
+ "loss": 0.0797,
392
+ "mean_token_accuracy": 0.972789865732193,
393
+ "num_tokens": 905312.0,
394
+ "step": 480
395
+ },
396
+ {
397
+ "epoch": 0.009691072346821724,
398
+ "learning_rate": 0.00019806574107036906,
399
+ "loss": 0.0792,
400
+ "mean_token_accuracy": 0.976115608215332,
401
+ "num_tokens": 924072.0,
402
+ "step": 490
403
+ },
404
+ {
405
+ "epoch": 0.009888849333491555,
406
+ "learning_rate": 0.0001980261856730351,
407
+ "loss": 0.0672,
408
+ "mean_token_accuracy": 0.9746761620044708,
409
+ "num_tokens": 942844.0,
410
+ "step": 500
411
+ },
412
+ {
413
+ "epoch": 0.010086626320161387,
414
+ "learning_rate": 0.00019798663027570113,
415
+ "loss": 0.1031,
416
+ "mean_token_accuracy": 0.9693170130252838,
417
+ "num_tokens": 961524.0,
418
+ "step": 510
419
+ },
420
+ {
421
+ "epoch": 0.010284403306831218,
422
+ "learning_rate": 0.00019794707487836717,
423
+ "loss": 0.0899,
424
+ "mean_token_accuracy": 0.9669535160064697,
425
+ "num_tokens": 980332.0,
426
+ "step": 520
427
+ },
428
+ {
429
+ "epoch": 0.010482180293501049,
430
+ "learning_rate": 0.00019790751948103318,
431
+ "loss": 0.0861,
432
+ "mean_token_accuracy": 0.9700904309749603,
433
+ "num_tokens": 999163.0,
434
+ "step": 530
435
+ },
436
+ {
437
+ "epoch": 0.01067995728017088,
438
+ "learning_rate": 0.00019786796408369922,
439
+ "loss": 0.0997,
440
+ "mean_token_accuracy": 0.9640280067920685,
441
+ "num_tokens": 1018053.0,
442
+ "step": 540
443
+ },
444
+ {
445
+ "epoch": 0.010877734266840711,
446
+ "learning_rate": 0.00019782840868636528,
447
+ "loss": 0.0795,
448
+ "mean_token_accuracy": 0.9717990577220916,
449
+ "num_tokens": 1036828.0,
450
+ "step": 550
451
+ },
452
+ {
453
+ "epoch": 0.011075511253510542,
454
+ "learning_rate": 0.0001977888532890313,
455
+ "loss": 0.0885,
456
+ "mean_token_accuracy": 0.9719638526439667,
457
+ "num_tokens": 1055843.0,
458
+ "step": 560
459
+ },
460
+ {
461
+ "epoch": 0.011273288240180373,
462
+ "learning_rate": 0.00019774929789169733,
463
+ "loss": 0.0823,
464
+ "mean_token_accuracy": 0.9698925375938415,
465
+ "num_tokens": 1074733.0,
466
+ "step": 570
467
+ },
468
+ {
469
+ "epoch": 0.011471065226850204,
470
+ "learning_rate": 0.00019770974249436337,
471
+ "loss": 0.0945,
472
+ "mean_token_accuracy": 0.9694046437740326,
473
+ "num_tokens": 1093519.0,
474
+ "step": 580
475
+ },
476
+ {
477
+ "epoch": 0.011668842213520035,
478
+ "learning_rate": 0.0001976701870970294,
479
+ "loss": 0.0797,
480
+ "mean_token_accuracy": 0.975245076417923,
481
+ "num_tokens": 1112394.0,
482
+ "step": 590
483
+ },
484
+ {
485
+ "epoch": 0.011866619200189867,
486
+ "learning_rate": 0.00019763063169969542,
487
+ "loss": 0.0816,
488
+ "mean_token_accuracy": 0.9766307890415191,
489
+ "num_tokens": 1131435.0,
490
+ "step": 600
491
+ },
492
+ {
493
+ "epoch": 0.012064396186859698,
494
+ "learning_rate": 0.00019759107630236146,
495
+ "loss": 0.089,
496
+ "mean_token_accuracy": 0.9677313148975373,
497
+ "num_tokens": 1150274.0,
498
+ "step": 610
499
+ },
500
+ {
501
+ "epoch": 0.012262173173529529,
502
+ "learning_rate": 0.00019755152090502752,
503
+ "loss": 0.096,
504
+ "mean_token_accuracy": 0.9663532435894012,
505
+ "num_tokens": 1169076.0,
506
+ "step": 620
507
+ },
508
+ {
509
+ "epoch": 0.01245995016019936,
510
+ "learning_rate": 0.00019751196550769353,
511
+ "loss": 0.0686,
512
+ "mean_token_accuracy": 0.9754896223545074,
513
+ "num_tokens": 1187908.0,
514
+ "step": 630
515
+ },
516
+ {
517
+ "epoch": 0.012657727146869191,
518
+ "learning_rate": 0.00019747241011035957,
519
+ "loss": 0.0902,
520
+ "mean_token_accuracy": 0.9659872591495514,
521
+ "num_tokens": 1206872.0,
522
+ "step": 640
523
+ },
524
+ {
525
+ "epoch": 0.012855504133539022,
526
+ "learning_rate": 0.0001974328547130256,
527
+ "loss": 0.0757,
528
+ "mean_token_accuracy": 0.9724668145179749,
529
+ "num_tokens": 1225767.0,
530
+ "step": 650
531
+ },
532
+ {
533
+ "epoch": 0.013053281120208853,
534
+ "learning_rate": 0.00019739329931569164,
535
+ "loss": 0.0798,
536
+ "mean_token_accuracy": 0.9697497248649597,
537
+ "num_tokens": 1244340.0,
538
+ "step": 660
539
+ },
540
+ {
541
+ "epoch": 0.013251058106878684,
542
+ "learning_rate": 0.00019735374391835765,
543
+ "loss": 0.0874,
544
+ "mean_token_accuracy": 0.9725019693374634,
545
+ "num_tokens": 1263304.0,
546
+ "step": 670
547
+ },
548
+ {
549
+ "epoch": 0.013448835093548515,
550
+ "learning_rate": 0.0001973141885210237,
551
+ "loss": 0.0894,
552
+ "mean_token_accuracy": 0.9702820897102356,
553
+ "num_tokens": 1282028.0,
554
+ "step": 680
555
+ },
556
+ {
557
+ "epoch": 0.013646612080218347,
558
+ "learning_rate": 0.00019727463312368976,
559
+ "loss": 0.0958,
560
+ "mean_token_accuracy": 0.9666474103927613,
561
+ "num_tokens": 1300780.0,
562
+ "step": 690
563
+ },
564
+ {
565
+ "epoch": 0.013844389066888178,
566
+ "learning_rate": 0.00019723507772635577,
567
+ "loss": 0.0808,
568
+ "mean_token_accuracy": 0.9751847207546234,
569
+ "num_tokens": 1319535.0,
570
+ "step": 700
571
+ },
572
+ {
573
+ "epoch": 0.014042166053558009,
574
+ "learning_rate": 0.0001971955223290218,
575
+ "loss": 0.1044,
576
+ "mean_token_accuracy": 0.9665757656097412,
577
+ "num_tokens": 1338391.0,
578
+ "step": 710
579
+ },
580
+ {
581
+ "epoch": 0.01423994304022784,
582
+ "learning_rate": 0.00019715596693168784,
583
+ "loss": 0.0798,
584
+ "mean_token_accuracy": 0.9741999268531799,
585
+ "num_tokens": 1357072.0,
586
+ "step": 720
587
+ },
588
+ {
589
+ "epoch": 0.014437720026897671,
590
+ "learning_rate": 0.00019711641153435388,
591
+ "loss": 0.0661,
592
+ "mean_token_accuracy": 0.9795381426811218,
593
+ "num_tokens": 1375999.0,
594
+ "step": 730
595
+ },
596
+ {
597
+ "epoch": 0.014635497013567502,
598
+ "learning_rate": 0.00019707685613701992,
599
+ "loss": 0.082,
600
+ "mean_token_accuracy": 0.9710344612598419,
601
+ "num_tokens": 1394654.0,
602
+ "step": 740
603
+ },
604
+ {
605
+ "epoch": 0.014833274000237333,
606
+ "learning_rate": 0.00019703730073968593,
607
+ "loss": 0.0825,
608
+ "mean_token_accuracy": 0.9703740537166595,
609
+ "num_tokens": 1413323.0,
610
+ "step": 750
611
+ },
612
+ {
613
+ "epoch": 0.015031050986907164,
614
+ "learning_rate": 0.000196997745342352,
615
+ "loss": 0.0797,
616
+ "mean_token_accuracy": 0.9717476069927216,
617
+ "num_tokens": 1432274.0,
618
+ "step": 760
619
+ },
620
+ {
621
+ "epoch": 0.015228827973576995,
622
+ "learning_rate": 0.000196958189945018,
623
+ "loss": 0.0811,
624
+ "mean_token_accuracy": 0.9687287509441376,
625
+ "num_tokens": 1451009.0,
626
+ "step": 770
627
+ },
628
+ {
629
+ "epoch": 0.015426604960246827,
630
+ "learning_rate": 0.00019691863454768404,
631
+ "loss": 0.0802,
632
+ "mean_token_accuracy": 0.9698404908180237,
633
+ "num_tokens": 1469741.0,
634
+ "step": 780
635
+ },
636
+ {
637
+ "epoch": 0.015624381946916658,
638
+ "learning_rate": 0.00019687907915035008,
639
+ "loss": 0.0698,
640
+ "mean_token_accuracy": 0.9755463302135468,
641
+ "num_tokens": 1488644.0,
642
+ "step": 790
643
+ },
644
+ {
645
+ "epoch": 0.01582215893358649,
646
+ "learning_rate": 0.0001968395237530161,
647
+ "loss": 0.0698,
648
+ "mean_token_accuracy": 0.9777807116508483,
649
+ "num_tokens": 1507384.0,
650
+ "step": 800
651
+ },
652
+ {
653
+ "epoch": 0.016019935920256318,
654
+ "learning_rate": 0.00019679996835568215,
655
+ "loss": 0.085,
656
+ "mean_token_accuracy": 0.9731873035430908,
657
+ "num_tokens": 1526208.0,
658
+ "step": 810
659
+ },
660
+ {
661
+ "epoch": 0.01621771290692615,
662
+ "learning_rate": 0.00019676041295834816,
663
+ "loss": 0.0682,
664
+ "mean_token_accuracy": 0.9786826431751251,
665
+ "num_tokens": 1545086.0,
666
+ "step": 820
667
+ },
668
+ {
669
+ "epoch": 0.01641548989359598,
670
+ "learning_rate": 0.00019672085756101423,
671
+ "loss": 0.0687,
672
+ "mean_token_accuracy": 0.9754264533519745,
673
+ "num_tokens": 1563972.0,
674
+ "step": 830
675
+ },
676
+ {
677
+ "epoch": 0.016613266880265813,
678
+ "learning_rate": 0.00019668130216368024,
679
+ "loss": 0.0847,
680
+ "mean_token_accuracy": 0.9757691383361816,
681
+ "num_tokens": 1582893.0,
682
+ "step": 840
683
+ },
684
+ {
685
+ "epoch": 0.016811043866935643,
686
+ "learning_rate": 0.00019664174676634627,
687
+ "loss": 0.0978,
688
+ "mean_token_accuracy": 0.9691124320030212,
689
+ "num_tokens": 1601719.0,
690
+ "step": 850
691
+ },
692
+ {
693
+ "epoch": 0.017008820853605475,
694
+ "learning_rate": 0.0001966021913690123,
695
+ "loss": 0.0834,
696
+ "mean_token_accuracy": 0.9732567369937897,
697
+ "num_tokens": 1620493.0,
698
+ "step": 860
699
+ },
700
+ {
701
+ "epoch": 0.017206597840275305,
702
+ "learning_rate": 0.00019656263597167835,
703
+ "loss": 0.0703,
704
+ "mean_token_accuracy": 0.9783392190933228,
705
+ "num_tokens": 1639336.0,
706
+ "step": 870
707
+ },
708
+ {
709
+ "epoch": 0.017404374826945138,
710
+ "learning_rate": 0.00019652308057434439,
711
+ "loss": 0.0916,
712
+ "mean_token_accuracy": 0.9702894032001496,
713
+ "num_tokens": 1658213.0,
714
+ "step": 880
715
+ },
716
+ {
717
+ "epoch": 0.017602151813614967,
718
+ "learning_rate": 0.0001964835251770104,
719
+ "loss": 0.0631,
720
+ "mean_token_accuracy": 0.9804218530654907,
721
+ "num_tokens": 1677260.0,
722
+ "step": 890
723
+ },
724
+ {
725
+ "epoch": 0.0177999288002848,
726
+ "learning_rate": 0.00019644396977967646,
727
+ "loss": 0.0942,
728
+ "mean_token_accuracy": 0.9701037347316742,
729
+ "num_tokens": 1696174.0,
730
+ "step": 900
731
+ },
732
+ {
733
+ "epoch": 0.01799770578695463,
734
+ "learning_rate": 0.00019640441438234247,
735
+ "loss": 0.0827,
736
+ "mean_token_accuracy": 0.9717662394046783,
737
+ "num_tokens": 1714822.0,
738
+ "step": 910
739
+ },
740
+ {
741
+ "epoch": 0.018195482773624462,
742
+ "learning_rate": 0.0001963648589850085,
743
+ "loss": 0.0728,
744
+ "mean_token_accuracy": 0.9710807025432586,
745
+ "num_tokens": 1733526.0,
746
+ "step": 920
747
+ },
748
+ {
749
+ "epoch": 0.01839325976029429,
750
+ "learning_rate": 0.00019632530358767455,
751
+ "loss": 0.0689,
752
+ "mean_token_accuracy": 0.9746571719646454,
753
+ "num_tokens": 1752469.0,
754
+ "step": 930
755
+ },
756
+ {
757
+ "epoch": 0.018591036746964124,
758
+ "learning_rate": 0.00019628574819034058,
759
+ "loss": 0.0648,
760
+ "mean_token_accuracy": 0.9815335392951965,
761
+ "num_tokens": 1771439.0,
762
+ "step": 940
763
+ },
764
+ {
765
+ "epoch": 0.018788813733633954,
766
+ "learning_rate": 0.00019624619279300662,
767
+ "loss": 0.0773,
768
+ "mean_token_accuracy": 0.9698800563812255,
769
+ "num_tokens": 1790105.0,
770
+ "step": 950
771
+ },
772
+ {
773
+ "epoch": 0.018986590720303786,
774
+ "learning_rate": 0.00019620663739567263,
775
+ "loss": 0.067,
776
+ "mean_token_accuracy": 0.9776150286197662,
777
+ "num_tokens": 1809000.0,
778
+ "step": 960
779
+ },
780
+ {
781
+ "epoch": 0.019184367706973616,
782
+ "learning_rate": 0.0001961670819983387,
783
+ "loss": 0.0771,
784
+ "mean_token_accuracy": 0.9723577439785004,
785
+ "num_tokens": 1827743.0,
786
+ "step": 970
787
+ },
788
+ {
789
+ "epoch": 0.01938214469364345,
790
+ "learning_rate": 0.0001961275266010047,
791
+ "loss": 0.0829,
792
+ "mean_token_accuracy": 0.9695183992385864,
793
+ "num_tokens": 1846483.0,
794
+ "step": 980
795
+ },
796
+ {
797
+ "epoch": 0.019579921680313278,
798
+ "learning_rate": 0.00019608797120367074,
799
+ "loss": 0.0544,
800
+ "mean_token_accuracy": 0.9838368058204651,
801
+ "num_tokens": 1865230.0,
802
+ "step": 990
803
+ },
804
+ {
805
+ "epoch": 0.01977769866698311,
806
+ "learning_rate": 0.00019604841580633678,
807
+ "loss": 0.0913,
808
+ "mean_token_accuracy": 0.9702223718166352,
809
+ "num_tokens": 1884295.0,
810
+ "step": 1000
811
+ },
812
+ {
813
+ "epoch": 0.01997547565365294,
814
+ "learning_rate": 0.00019600886040900282,
815
+ "loss": 0.0794,
816
+ "mean_token_accuracy": 0.9764923632144928,
817
+ "num_tokens": 1903297.0,
818
+ "step": 1010
819
+ },
820
+ {
821
+ "epoch": 0.020173252640322773,
822
+ "learning_rate": 0.00019596930501166886,
823
+ "loss": 0.0673,
824
+ "mean_token_accuracy": 0.9780684530735015,
825
+ "num_tokens": 1922020.0,
826
+ "step": 1020
827
+ },
828
+ {
829
+ "epoch": 0.020371029626992603,
830
+ "learning_rate": 0.00019592974961433487,
831
+ "loss": 0.0626,
832
+ "mean_token_accuracy": 0.9800810873508453,
833
+ "num_tokens": 1940771.0,
834
+ "step": 1030
835
+ },
836
+ {
837
+ "epoch": 0.020568806613662435,
838
+ "learning_rate": 0.00019589019421700093,
839
+ "loss": 0.0765,
840
+ "mean_token_accuracy": 0.9712104678153992,
841
+ "num_tokens": 1959790.0,
842
+ "step": 1040
843
+ },
844
+ {
845
+ "epoch": 0.020766583600332265,
846
+ "learning_rate": 0.00019585063881966694,
847
+ "loss": 0.0648,
848
+ "mean_token_accuracy": 0.978104192018509,
849
+ "num_tokens": 1978766.0,
850
+ "step": 1050
851
+ },
852
+ {
853
+ "epoch": 0.020964360587002098,
854
+ "learning_rate": 0.00019581108342233298,
855
+ "loss": 0.0938,
856
+ "mean_token_accuracy": 0.9689741492271423,
857
+ "num_tokens": 1997785.0,
858
+ "step": 1060
859
+ },
860
+ {
861
+ "epoch": 0.021162137573671927,
862
+ "learning_rate": 0.00019577152802499902,
863
+ "loss": 0.0891,
864
+ "mean_token_accuracy": 0.9727595269680023,
865
+ "num_tokens": 2016334.0,
866
+ "step": 1070
867
+ },
868
+ {
869
+ "epoch": 0.02135991456034176,
870
+ "learning_rate": 0.00019573197262766505,
871
+ "loss": 0.0818,
872
+ "mean_token_accuracy": 0.9725301325321197,
873
+ "num_tokens": 2035271.0,
874
+ "step": 1080
875
+ },
876
+ {
877
+ "epoch": 0.02155769154701159,
878
+ "learning_rate": 0.0001956924172303311,
879
+ "loss": 0.0721,
880
+ "mean_token_accuracy": 0.9739335179328918,
881
+ "num_tokens": 2054158.0,
882
+ "step": 1090
883
+ },
884
+ {
885
+ "epoch": 0.021755468533681422,
886
+ "learning_rate": 0.0001956528618329971,
887
+ "loss": 0.0785,
888
+ "mean_token_accuracy": 0.971097469329834,
889
+ "num_tokens": 2072794.0,
890
+ "step": 1100
891
+ },
892
+ {
893
+ "epoch": 0.02195324552035125,
894
+ "learning_rate": 0.00019561330643566317,
895
+ "loss": 0.0759,
896
+ "mean_token_accuracy": 0.9685551345348358,
897
+ "num_tokens": 2091583.0,
898
+ "step": 1110
899
+ },
900
+ {
901
+ "epoch": 0.022151022507021084,
902
+ "learning_rate": 0.0001955737510383292,
903
+ "loss": 0.0584,
904
+ "mean_token_accuracy": 0.9821391940116883,
905
+ "num_tokens": 2110207.0,
906
+ "step": 1120
907
+ },
908
+ {
909
+ "epoch": 0.022348799493690914,
910
+ "learning_rate": 0.00019553419564099521,
911
+ "loss": 0.0841,
912
+ "mean_token_accuracy": 0.9727806925773621,
913
+ "num_tokens": 2129006.0,
914
+ "step": 1130
915
+ },
916
+ {
917
+ "epoch": 0.022546576480360746,
918
+ "learning_rate": 0.00019549464024366125,
919
+ "loss": 0.0708,
920
+ "mean_token_accuracy": 0.9772637248039245,
921
+ "num_tokens": 2147866.0,
922
+ "step": 1140
923
+ },
924
+ {
925
+ "epoch": 0.022744353467030576,
926
+ "learning_rate": 0.0001954550848463273,
927
+ "loss": 0.0615,
928
+ "mean_token_accuracy": 0.9771132528781891,
929
+ "num_tokens": 2166678.0,
930
+ "step": 1150
931
+ },
932
+ {
933
+ "epoch": 0.02294213045370041,
934
+ "learning_rate": 0.00019541552944899333,
935
+ "loss": 0.0716,
936
+ "mean_token_accuracy": 0.97862588763237,
937
+ "num_tokens": 2185491.0,
938
+ "step": 1160
939
+ },
940
+ {
941
+ "epoch": 0.023139907440370238,
942
+ "learning_rate": 0.00019537597405165934,
943
+ "loss": 0.0835,
944
+ "mean_token_accuracy": 0.9729329645633698,
945
+ "num_tokens": 2204270.0,
946
+ "step": 1170
947
+ },
948
+ {
949
+ "epoch": 0.02333768442704007,
950
+ "learning_rate": 0.0001953364186543254,
951
+ "loss": 0.0825,
952
+ "mean_token_accuracy": 0.975412392616272,
953
+ "num_tokens": 2223262.0,
954
+ "step": 1180
955
+ },
956
+ {
957
+ "epoch": 0.0235354614137099,
958
+ "learning_rate": 0.00019529686325699144,
959
+ "loss": 0.0694,
960
+ "mean_token_accuracy": 0.9787419438362122,
961
+ "num_tokens": 2242082.0,
962
+ "step": 1190
963
+ },
964
+ {
965
+ "epoch": 0.023733238400379733,
966
+ "learning_rate": 0.00019525730785965745,
967
+ "loss": 0.0971,
968
+ "mean_token_accuracy": 0.9742420554161072,
969
+ "num_tokens": 2260874.0,
970
+ "step": 1200
971
+ },
972
+ {
973
+ "epoch": 0.023931015387049562,
974
+ "learning_rate": 0.00019521775246232349,
975
+ "loss": 0.0813,
976
+ "mean_token_accuracy": 0.973499870300293,
977
+ "num_tokens": 2279620.0,
978
+ "step": 1210
979
+ },
980
+ {
981
+ "epoch": 0.024128792373719395,
982
+ "learning_rate": 0.00019517819706498952,
983
+ "loss": 0.0591,
984
+ "mean_token_accuracy": 0.9806219100952148,
985
+ "num_tokens": 2298359.0,
986
+ "step": 1220
987
+ },
988
+ {
989
+ "epoch": 0.024326569360389225,
990
+ "learning_rate": 0.00019513864166765556,
991
+ "loss": 0.0784,
992
+ "mean_token_accuracy": 0.9767163157463074,
993
+ "num_tokens": 2317171.0,
994
+ "step": 1230
995
+ },
996
+ {
997
+ "epoch": 0.024524346347059058,
998
+ "learning_rate": 0.00019509908627032157,
999
+ "loss": 0.0867,
1000
+ "mean_token_accuracy": 0.9671182572841645,
1001
+ "num_tokens": 2335825.0,
1002
+ "step": 1240
1003
+ },
1004
+ {
1005
+ "epoch": 0.024722123333728887,
1006
+ "learning_rate": 0.00019505953087298764,
1007
+ "loss": 0.0727,
1008
+ "mean_token_accuracy": 0.9711700201034545,
1009
+ "num_tokens": 2354806.0,
1010
+ "step": 1250
1011
+ },
1012
+ {
1013
+ "epoch": 0.02491990032039872,
1014
+ "learning_rate": 0.00019501997547565367,
1015
+ "loss": 0.0901,
1016
+ "mean_token_accuracy": 0.9734555304050445,
1017
+ "num_tokens": 2373919.0,
1018
+ "step": 1260
1019
+ },
1020
+ {
1021
+ "epoch": 0.02511767730706855,
1022
+ "learning_rate": 0.00019498042007831968,
1023
+ "loss": 0.0579,
1024
+ "mean_token_accuracy": 0.9764434218406677,
1025
+ "num_tokens": 2393043.0,
1026
+ "step": 1270
1027
+ },
1028
+ {
1029
+ "epoch": 0.025315454293738382,
1030
+ "learning_rate": 0.00019494086468098575,
1031
+ "loss": 0.0623,
1032
+ "mean_token_accuracy": 0.9802065193653107,
1033
+ "num_tokens": 2411888.0,
1034
+ "step": 1280
1035
+ },
1036
+ {
1037
+ "epoch": 0.02551323128040821,
1038
+ "learning_rate": 0.00019490130928365176,
1039
+ "loss": 0.0739,
1040
+ "mean_token_accuracy": 0.9758767008781433,
1041
+ "num_tokens": 2430790.0,
1042
+ "step": 1290
1043
+ },
1044
+ {
1045
+ "epoch": 0.025711008267078044,
1046
+ "learning_rate": 0.0001948617538863178,
1047
+ "loss": 0.0676,
1048
+ "mean_token_accuracy": 0.9723714172840119,
1049
+ "num_tokens": 2449534.0,
1050
+ "step": 1300
1051
+ },
1052
+ {
1053
+ "epoch": 0.025908785253747874,
1054
+ "learning_rate": 0.0001948221984889838,
1055
+ "loss": 0.0842,
1056
+ "mean_token_accuracy": 0.9739417016506196,
1057
+ "num_tokens": 2468500.0,
1058
+ "step": 1310
1059
+ },
1060
+ {
1061
+ "epoch": 0.026106562240417706,
1062
+ "learning_rate": 0.00019478264309164987,
1063
+ "loss": 0.0812,
1064
+ "mean_token_accuracy": 0.9725883424282074,
1065
+ "num_tokens": 2487522.0,
1066
+ "step": 1320
1067
+ },
1068
+ {
1069
+ "epoch": 0.026304339227087536,
1070
+ "learning_rate": 0.0001947430876943159,
1071
+ "loss": 0.081,
1072
+ "mean_token_accuracy": 0.9718135535717011,
1073
+ "num_tokens": 2506228.0,
1074
+ "step": 1330
1075
+ },
1076
+ {
1077
+ "epoch": 0.02650211621375737,
1078
+ "learning_rate": 0.00019470353229698192,
1079
+ "loss": 0.0913,
1080
+ "mean_token_accuracy": 0.9727324843406677,
1081
+ "num_tokens": 2524993.0,
1082
+ "step": 1340
1083
+ },
1084
+ {
1085
+ "epoch": 0.026699893200427198,
1086
+ "learning_rate": 0.00019466397689964798,
1087
+ "loss": 0.0961,
1088
+ "mean_token_accuracy": 0.9686046600341797,
1089
+ "num_tokens": 2543814.0,
1090
+ "step": 1350
1091
+ },
1092
+ {
1093
+ "epoch": 0.02689767018709703,
1094
+ "learning_rate": 0.000194624421502314,
1095
+ "loss": 0.0961,
1096
+ "mean_token_accuracy": 0.9635930359363556,
1097
+ "num_tokens": 2562624.0,
1098
+ "step": 1360
1099
+ },
1100
+ {
1101
+ "epoch": 0.02709544717376686,
1102
+ "learning_rate": 0.00019458486610498003,
1103
+ "loss": 0.0882,
1104
+ "mean_token_accuracy": 0.971700656414032,
1105
+ "num_tokens": 2581229.0,
1106
+ "step": 1370
1107
+ },
1108
+ {
1109
+ "epoch": 0.027293224160436693,
1110
+ "learning_rate": 0.00019454531070764607,
1111
+ "loss": 0.0875,
1112
+ "mean_token_accuracy": 0.9696930944919586,
1113
+ "num_tokens": 2600073.0,
1114
+ "step": 1380
1115
+ },
1116
+ {
1117
+ "epoch": 0.027491001147106522,
1118
+ "learning_rate": 0.0001945057553103121,
1119
+ "loss": 0.0732,
1120
+ "mean_token_accuracy": 0.9732912659645081,
1121
+ "num_tokens": 2618837.0,
1122
+ "step": 1390
1123
+ },
1124
+ {
1125
+ "epoch": 0.027688778133776355,
1126
+ "learning_rate": 0.00019446619991297814,
1127
+ "loss": 0.0711,
1128
+ "mean_token_accuracy": 0.975910484790802,
1129
+ "num_tokens": 2637487.0,
1130
+ "step": 1400
1131
+ },
1132
+ {
1133
+ "epoch": 0.027886555120446185,
1134
+ "learning_rate": 0.00019442664451564415,
1135
+ "loss": 0.0944,
1136
+ "mean_token_accuracy": 0.96585413813591,
1137
+ "num_tokens": 2656004.0,
1138
+ "step": 1410
1139
+ },
1140
+ {
1141
+ "epoch": 0.028084332107116017,
1142
+ "learning_rate": 0.00019438708911831022,
1143
+ "loss": 0.0553,
1144
+ "mean_token_accuracy": 0.980397754907608,
1145
+ "num_tokens": 2675089.0,
1146
+ "step": 1420
1147
+ },
1148
+ {
1149
+ "epoch": 0.028282109093785847,
1150
+ "learning_rate": 0.00019434753372097623,
1151
+ "loss": 0.062,
1152
+ "mean_token_accuracy": 0.9789350926876068,
1153
+ "num_tokens": 2693904.0,
1154
+ "step": 1430
1155
+ },
1156
+ {
1157
+ "epoch": 0.02847988608045568,
1158
+ "learning_rate": 0.00019430797832364227,
1159
+ "loss": 0.08,
1160
+ "mean_token_accuracy": 0.9751649796962738,
1161
+ "num_tokens": 2712395.0,
1162
+ "step": 1440
1163
+ },
1164
+ {
1165
+ "epoch": 0.02867766306712551,
1166
+ "learning_rate": 0.0001942684229263083,
1167
+ "loss": 0.0662,
1168
+ "mean_token_accuracy": 0.9778707563877106,
1169
+ "num_tokens": 2731103.0,
1170
+ "step": 1450
1171
+ },
1172
+ {
1173
+ "epoch": 0.028875440053795342,
1174
+ "learning_rate": 0.00019422886752897434,
1175
+ "loss": 0.0834,
1176
+ "mean_token_accuracy": 0.9672891080379487,
1177
+ "num_tokens": 2749801.0,
1178
+ "step": 1460
1179
+ },
1180
+ {
1181
+ "epoch": 0.02907321704046517,
1182
+ "learning_rate": 0.00019418931213164038,
1183
+ "loss": 0.0719,
1184
+ "mean_token_accuracy": 0.9755631804466247,
1185
+ "num_tokens": 2768551.0,
1186
+ "step": 1470
1187
+ },
1188
+ {
1189
+ "epoch": 0.029270994027135004,
1190
+ "learning_rate": 0.0001941497567343064,
1191
+ "loss": 0.0499,
1192
+ "mean_token_accuracy": 0.9837284207344055,
1193
+ "num_tokens": 2787591.0,
1194
+ "step": 1480
1195
+ },
1196
+ {
1197
+ "epoch": 0.029468771013804834,
1198
+ "learning_rate": 0.00019411020133697245,
1199
+ "loss": 0.066,
1200
+ "mean_token_accuracy": 0.9761005103588104,
1201
+ "num_tokens": 2806455.0,
1202
+ "step": 1490
1203
+ },
1204
+ {
1205
+ "epoch": 0.029666548000474666,
1206
+ "learning_rate": 0.00019407064593963846,
1207
+ "loss": 0.0623,
1208
+ "mean_token_accuracy": 0.9766542613506317,
1209
+ "num_tokens": 2825280.0,
1210
+ "step": 1500
1211
+ },
1212
+ {
1213
+ "epoch": 0.029864324987144496,
1214
+ "learning_rate": 0.0001940310905423045,
1215
+ "loss": 0.0643,
1216
+ "mean_token_accuracy": 0.9814446032047272,
1217
+ "num_tokens": 2844178.0,
1218
+ "step": 1510
1219
+ },
1220
+ {
1221
+ "epoch": 0.03006210197381433,
1222
+ "learning_rate": 0.00019399153514497054,
1223
+ "loss": 0.0559,
1224
+ "mean_token_accuracy": 0.9753939032554626,
1225
+ "num_tokens": 2863373.0,
1226
+ "step": 1520
1227
+ },
1228
+ {
1229
+ "epoch": 0.030259878960484158,
1230
+ "learning_rate": 0.00019395197974763658,
1231
+ "loss": 0.0633,
1232
+ "mean_token_accuracy": 0.9776888847351074,
1233
+ "num_tokens": 2882166.0,
1234
+ "step": 1530
1235
+ },
1236
+ {
1237
+ "epoch": 0.03045765594715399,
1238
+ "learning_rate": 0.00019391242435030261,
1239
+ "loss": 0.0568,
1240
+ "mean_token_accuracy": 0.9781621396541595,
1241
+ "num_tokens": 2901050.0,
1242
+ "step": 1540
1243
+ },
1244
+ {
1245
+ "epoch": 0.03065543293382382,
1246
+ "learning_rate": 0.00019387286895296862,
1247
+ "loss": 0.0748,
1248
+ "mean_token_accuracy": 0.9726321280002594,
1249
+ "num_tokens": 2919953.0,
1250
+ "step": 1550
1251
+ },
1252
+ {
1253
+ "epoch": 0.030853209920493653,
1254
+ "learning_rate": 0.0001938333135556347,
1255
+ "loss": 0.0717,
1256
+ "mean_token_accuracy": 0.9703111469745636,
1257
+ "num_tokens": 2939003.0,
1258
+ "step": 1560
1259
+ },
1260
+ {
1261
+ "epoch": 0.031050986907163482,
1262
+ "learning_rate": 0.00019379375815830073,
1263
+ "loss": 0.065,
1264
+ "mean_token_accuracy": 0.97583766579628,
1265
+ "num_tokens": 2957853.0,
1266
+ "step": 1570
1267
+ },
1268
+ {
1269
+ "epoch": 0.031248763893833315,
1270
+ "learning_rate": 0.00019375420276096674,
1271
+ "loss": 0.0667,
1272
+ "mean_token_accuracy": 0.9808376967906952,
1273
+ "num_tokens": 2976557.0,
1274
+ "step": 1580
1275
+ },
1276
+ {
1277
+ "epoch": 0.031446540880503145,
1278
+ "learning_rate": 0.00019371464736363277,
1279
+ "loss": 0.0777,
1280
+ "mean_token_accuracy": 0.9767092704772949,
1281
+ "num_tokens": 2995058.0,
1282
+ "step": 1590
1283
+ },
1284
+ {
1285
+ "epoch": 0.03164431786717298,
1286
+ "learning_rate": 0.0001936750919662988,
1287
+ "loss": 0.0644,
1288
+ "mean_token_accuracy": 0.9776780724525451,
1289
+ "num_tokens": 3014165.0,
1290
+ "step": 1600
1291
+ },
1292
+ {
1293
+ "epoch": 0.03184209485384281,
1294
+ "learning_rate": 0.00019363553656896485,
1295
+ "loss": 0.0566,
1296
+ "mean_token_accuracy": 0.9828783690929412,
1297
+ "num_tokens": 3032924.0,
1298
+ "step": 1610
1299
+ },
1300
+ {
1301
+ "epoch": 0.032039871840512636,
1302
+ "learning_rate": 0.00019359598117163086,
1303
+ "loss": 0.0805,
1304
+ "mean_token_accuracy": 0.972636216878891,
1305
+ "num_tokens": 3051960.0,
1306
+ "step": 1620
1307
+ },
1308
+ {
1309
+ "epoch": 0.03223764882718247,
1310
+ "learning_rate": 0.00019355642577429692,
1311
+ "loss": 0.0831,
1312
+ "mean_token_accuracy": 0.9726630806922912,
1313
+ "num_tokens": 3070644.0,
1314
+ "step": 1630
1315
+ },
1316
+ {
1317
+ "epoch": 0.0324354258138523,
1318
+ "learning_rate": 0.00019351687037696296,
1319
+ "loss": 0.0766,
1320
+ "mean_token_accuracy": 0.9787626624107361,
1321
+ "num_tokens": 3089393.0,
1322
+ "step": 1640
1323
+ },
1324
+ {
1325
+ "epoch": 0.03263320280052213,
1326
+ "learning_rate": 0.00019347731497962897,
1327
+ "loss": 0.0665,
1328
+ "mean_token_accuracy": 0.9728036403656006,
1329
+ "num_tokens": 3108260.0,
1330
+ "step": 1650
1331
+ },
1332
+ {
1333
+ "epoch": 0.03283097978719196,
1334
+ "learning_rate": 0.000193437759582295,
1335
+ "loss": 0.0842,
1336
+ "mean_token_accuracy": 0.969732540845871,
1337
+ "num_tokens": 3127157.0,
1338
+ "step": 1660
1339
+ },
1340
+ {
1341
+ "epoch": 0.033028756773861793,
1342
+ "learning_rate": 0.00019339820418496105,
1343
+ "loss": 0.072,
1344
+ "mean_token_accuracy": 0.9739607870578766,
1345
+ "num_tokens": 3146126.0,
1346
+ "step": 1670
1347
+ },
1348
+ {
1349
+ "epoch": 0.033226533760531626,
1350
+ "learning_rate": 0.00019335864878762708,
1351
+ "loss": 0.0835,
1352
+ "mean_token_accuracy": 0.9699565410614014,
1353
+ "num_tokens": 3165091.0,
1354
+ "step": 1680
1355
+ }
1356
+ ],
1357
+ "logging_steps": 10,
1358
+ "max_steps": 50562,
1359
+ "num_input_tokens_seen": 0,
1360
+ "num_train_epochs": 9223372036854775807,
1361
+ "save_steps": 1685,
1362
+ "stateful_callbacks": {
1363
+ "TrainerControl": {
1364
+ "args": {
1365
+ "should_epoch_stop": false,
1366
+ "should_evaluate": false,
1367
+ "should_log": false,
1368
+ "should_save": true,
1369
+ "should_training_stop": false
1370
+ },
1371
+ "attributes": {}
1372
+ }
1373
+ },
1374
+ "total_flos": 1.5391152511647744e+17,
1375
+ "train_batch_size": 16,
1376
+ "trial_name": null,
1377
+ "trial_params": null
1378
+ }
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.0
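
The card above is the unfilled PEFT template; only the frontmatter (base model path, `library_name: peft`) and the framework version carry information. Since its "How to Get Started" section is left as a placeholder, the following is a generic, hedged sketch of the usual PEFT adapter-loading pattern, not the authors' code: whether the custom Llama-2 q-former base loads through `AutoModelForCausalLM` at all is an assumption, and GAMA's own model-loading code may be required instead.

```python
# Generic peft loading sketch; NOT taken from the model card. Paths are
# illustrative, and the custom q-former base may need GAMA's own loader
# rather than AutoModelForCausalLM.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_path = "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/"
adapter_path = "gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685"

base = AutoModelForCausalLM.from_pretrained(base_path)
model = PeftModel.from_pretrained(base, adapter_path)  # attaches the LoRA weights
tokenizer = AutoTokenizer.from_pretrained(adapter_path)
```
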
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/adapter_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 8,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "q_proj",
28
+ "v_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "trainable_token_indices": null,
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
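
For readers reconstructing the training setup, the JSON above maps field for field onto a `peft.LoraConfig`; the sketch below is inferred from the config file (PEFT 0.15.0 per the model card), not taken from the repository's training code. With `r=8` and `lora_alpha=16` the effective LoRA scaling is alpha/r = 2.0, and only the attention query and value projections are adapted.

```python
# Sketch inferred from adapter_config.json above; not the repository's code.
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,                                  # LoRA rank
    lora_alpha=16,                        # scaling: lora_alpha / r = 2.0
    lora_dropout=0.0,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # attention query/value projections
    task_type="CAUSAL_LM",
)
```
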
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
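
One detail worth noting above: Llama-2 ships without a dedicated padding token, so this checkpoint reuses the EOS token `</s>` as `pad_token`. An illustrative check after loading (the directory name is a placeholder):

```python
# Illustrative only; the checkpoint directory is a placeholder.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-1685")
assert tok.pad_token == tok.eos_token == "</s>"
```

A common caveat of this aliasing is that collators which mask `pad_token_id` in the labels will also mask genuine EOS tokens unless they distinguish the two.
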
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "padding_side": "right",
40
+ "sp_model_kwargs": {},
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
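
The `chat_template` above is the standard Llama-2 chat format: each user turn is wrapped in `[INST] ... [/INST]`, with an optional `<<SYS>>` block folded into the first user turn. A minimal sketch of what it renders (hypothetical messages, placeholder path):

```python
# Hypothetical messages; the tokenizer directory is a placeholder.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-1685")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe the audio clip."},
]
print(tok.apply_chat_template(messages, tokenize=False))
# <s>[INST] <<SYS>>
# You are a helpful assistant.
# <</SYS>>
#
# Describe the audio clip. [/INST]
```
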
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/trainer_state.json ADDED
@@ -0,0 +1,1378 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.03332542225386654,
6
+ "eval_steps": 500,
7
+ "global_step": 1685,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0001977769866698311,
14
+ "learning_rate": 0.00019996440014239942,
15
+ "loss": 0.3352,
16
+ "mean_token_accuracy": 0.9023264050483704,
17
+ "num_tokens": 19163.0,
18
+ "step": 10
19
+ },
20
+ {
21
+ "epoch": 0.0003955539733396622,
22
+ "learning_rate": 0.0001999248447450655,
23
+ "loss": 0.1524,
24
+ "mean_token_accuracy": 0.947095412015915,
25
+ "num_tokens": 38071.0,
26
+ "step": 20
27
+ },
28
+ {
29
+ "epoch": 0.0005933309600094933,
30
+ "learning_rate": 0.0001998852893477315,
31
+ "loss": 0.1461,
32
+ "mean_token_accuracy": 0.9463379800319671,
33
+ "num_tokens": 57017.0,
34
+ "step": 30
35
+ },
36
+ {
37
+ "epoch": 0.0007911079466793244,
38
+ "learning_rate": 0.00019984573395039754,
39
+ "loss": 0.1024,
40
+ "mean_token_accuracy": 0.9588611423969269,
41
+ "num_tokens": 76047.0,
42
+ "step": 40
43
+ },
44
+ {
45
+ "epoch": 0.0009888849333491555,
46
+ "learning_rate": 0.00019980617855306357,
47
+ "loss": 0.117,
48
+ "mean_token_accuracy": 0.9553473949432373,
49
+ "num_tokens": 94824.0,
50
+ "step": 50
51
+ },
52
+ {
53
+ "epoch": 0.0011866619200189867,
54
+ "learning_rate": 0.0001997666231557296,
55
+ "loss": 0.1281,
56
+ "mean_token_accuracy": 0.9538084208965302,
57
+ "num_tokens": 113467.0,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.0013844389066888178,
62
+ "learning_rate": 0.00019972706775839565,
63
+ "loss": 0.1075,
64
+ "mean_token_accuracy": 0.961921775341034,
65
+ "num_tokens": 132229.0,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.0015822158933586489,
70
+ "learning_rate": 0.00019968751236106166,
71
+ "loss": 0.1057,
72
+ "mean_token_accuracy": 0.9676864743232727,
73
+ "num_tokens": 150811.0,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.00177999288002848,
78
+ "learning_rate": 0.00019964795696372772,
79
+ "loss": 0.1048,
80
+ "mean_token_accuracy": 0.9597055971622467,
81
+ "num_tokens": 169804.0,
82
+ "step": 90
83
+ },
84
+ {
85
+ "epoch": 0.001977769866698311,
86
+ "learning_rate": 0.00019960840156639376,
87
+ "loss": 0.1147,
88
+ "mean_token_accuracy": 0.9561040580272675,
89
+ "num_tokens": 188503.0,
90
+ "step": 100
91
+ },
92
+ {
93
+ "epoch": 0.002175546853368142,
94
+ "learning_rate": 0.00019956884616905977,
95
+ "loss": 0.0904,
96
+ "mean_token_accuracy": 0.9663344562053681,
97
+ "num_tokens": 207370.0,
98
+ "step": 110
99
+ },
100
+ {
101
+ "epoch": 0.0023733238400379733,
102
+ "learning_rate": 0.0001995292907717258,
103
+ "loss": 0.085,
104
+ "mean_token_accuracy": 0.9692767798900604,
105
+ "num_tokens": 226639.0,
106
+ "step": 120
107
+ },
108
+ {
109
+ "epoch": 0.0025711008267078044,
110
+ "learning_rate": 0.00019948973537439185,
111
+ "loss": 0.1055,
112
+ "mean_token_accuracy": 0.962373024225235,
113
+ "num_tokens": 245318.0,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.0027688778133776355,
118
+ "learning_rate": 0.00019945017997705788,
119
+ "loss": 0.1023,
120
+ "mean_token_accuracy": 0.9603676617145538,
121
+ "num_tokens": 264089.0,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.0029666548000474666,
126
+ "learning_rate": 0.0001994106245797239,
127
+ "loss": 0.1061,
128
+ "mean_token_accuracy": 0.9619979500770569,
129
+ "num_tokens": 282977.0,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 0.0031644317867172977,
134
+ "learning_rate": 0.00019937106918238996,
135
+ "loss": 0.1021,
136
+ "mean_token_accuracy": 0.9600433588027955,
137
+ "num_tokens": 301615.0,
138
+ "step": 160
139
+ },
140
+ {
141
+ "epoch": 0.003362208773387129,
142
+ "learning_rate": 0.000199331513785056,
143
+ "loss": 0.095,
144
+ "mean_token_accuracy": 0.9641285121440888,
145
+ "num_tokens": 320502.0,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 0.00355998576005696,
150
+ "learning_rate": 0.000199291958387722,
151
+ "loss": 0.0722,
152
+ "mean_token_accuracy": 0.9708887040615082,
153
+ "num_tokens": 339616.0,
154
+ "step": 180
155
+ },
156
+ {
157
+ "epoch": 0.003757762746726791,
158
+ "learning_rate": 0.00019925240299038804,
159
+ "loss": 0.0951,
160
+ "mean_token_accuracy": 0.9712291181087493,
161
+ "num_tokens": 358474.0,
162
+ "step": 190
163
+ },
164
+ {
165
+ "epoch": 0.003955539733396622,
166
+ "learning_rate": 0.00019921284759305408,
167
+ "loss": 0.1194,
168
+ "mean_token_accuracy": 0.9630812525749206,
169
+ "num_tokens": 377094.0,
170
+ "step": 200
171
+ },
172
+ {
173
+ "epoch": 0.004153316720066453,
174
+ "learning_rate": 0.00019917329219572012,
175
+ "loss": 0.1002,
176
+ "mean_token_accuracy": 0.9660979807376862,
177
+ "num_tokens": 396000.0,
178
+ "step": 210
179
+ },
180
+ {
181
+ "epoch": 0.004351093706736284,
182
+ "learning_rate": 0.00019913373679838613,
183
+ "loss": 0.0954,
184
+ "mean_token_accuracy": 0.9636943399906158,
185
+ "num_tokens": 415019.0,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.0045488706934061155,
190
+ "learning_rate": 0.0001990941814010522,
191
+ "loss": 0.1114,
192
+ "mean_token_accuracy": 0.9662698566913605,
193
+ "num_tokens": 433711.0,
194
+ "step": 230
195
+ },
196
+ {
197
+ "epoch": 0.004746647680075947,
198
+ "learning_rate": 0.00019905462600371823,
199
+ "loss": 0.0915,
200
+ "mean_token_accuracy": 0.9679243505001068,
201
+ "num_tokens": 452483.0,
202
+ "step": 240
203
+ },
204
+ {
205
+ "epoch": 0.004944424666745778,
206
+ "learning_rate": 0.00019901507060638424,
207
+ "loss": 0.0951,
208
+ "mean_token_accuracy": 0.9688079237937928,
209
+ "num_tokens": 471395.0,
210
+ "step": 250
211
+ },
212
+ {
213
+ "epoch": 0.005142201653415609,
214
+ "learning_rate": 0.00019897551520905028,
215
+ "loss": 0.1123,
216
+ "mean_token_accuracy": 0.962276142835617,
217
+ "num_tokens": 489983.0,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 0.00533997864008544,
222
+ "learning_rate": 0.00019893595981171632,
223
+ "loss": 0.0855,
224
+ "mean_token_accuracy": 0.9698696434497833,
225
+ "num_tokens": 509148.0,
226
+ "step": 270
227
+ },
228
+ {
229
+ "epoch": 0.005537755626755271,
230
+ "learning_rate": 0.00019889640441438235,
231
+ "loss": 0.0777,
232
+ "mean_token_accuracy": 0.9697826623916626,
233
+ "num_tokens": 528042.0,
234
+ "step": 280
235
+ },
236
+ {
237
+ "epoch": 0.005735532613425102,
238
+ "learning_rate": 0.0001988568490170484,
239
+ "loss": 0.0944,
240
+ "mean_token_accuracy": 0.9690817773342133,
241
+ "num_tokens": 546656.0,
242
+ "step": 290
243
+ },
244
+ {
245
+ "epoch": 0.005933309600094933,
246
+ "learning_rate": 0.00019881729361971443,
247
+ "loss": 0.0872,
248
+ "mean_token_accuracy": 0.9661558032035827,
249
+ "num_tokens": 565279.0,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.006131086586764764,
254
+ "learning_rate": 0.00019877773822238047,
255
+ "loss": 0.09,
256
+ "mean_token_accuracy": 0.9669564247131348,
257
+ "num_tokens": 584196.0,
258
+ "step": 310
259
+ },
260
+ {
261
+ "epoch": 0.0063288635734345955,
262
+ "learning_rate": 0.00019873818282504648,
263
+ "loss": 0.0701,
264
+ "mean_token_accuracy": 0.9722951114177704,
265
+ "num_tokens": 603050.0,
266
+ "step": 320
267
+ },
268
+ {
269
+ "epoch": 0.006526640560104427,
270
+ "learning_rate": 0.00019869862742771251,
271
+ "loss": 0.0922,
272
+ "mean_token_accuracy": 0.9692854762077332,
273
+ "num_tokens": 621880.0,
274
+ "step": 330
275
+ },
276
+ {
277
+ "epoch": 0.006724417546774258,
278
+ "learning_rate": 0.00019865907203037855,
279
+ "loss": 0.0976,
280
+ "mean_token_accuracy": 0.9660769879817963,
281
+ "num_tokens": 640657.0,
282
+ "step": 340
283
+ },
284
+ {
285
+ "epoch": 0.006922194533444089,
286
+ "learning_rate": 0.0001986195166330446,
287
+ "loss": 0.1071,
288
+ "mean_token_accuracy": 0.9633386790752411,
289
+ "num_tokens": 659503.0,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 0.00711997152011392,
294
+ "learning_rate": 0.00019857996123571063,
295
+ "loss": 0.1058,
296
+ "mean_token_accuracy": 0.9641251742839814,
297
+ "num_tokens": 678570.0,
298
+ "step": 360
299
+ },
300
+ {
301
+ "epoch": 0.007317748506783751,
302
+ "learning_rate": 0.00019854040583837666,
303
+ "loss": 0.097,
304
+ "mean_token_accuracy": 0.9664280533790588,
305
+ "num_tokens": 697294.0,
306
+ "step": 370
307
+ },
308
+ {
309
+ "epoch": 0.007515525493453582,
310
+ "learning_rate": 0.0001985008504410427,
311
+ "loss": 0.0677,
312
+ "mean_token_accuracy": 0.9754213869571686,
313
+ "num_tokens": 716458.0,
314
+ "step": 380
315
+ },
316
+ {
317
+ "epoch": 0.007713302480123413,
318
+ "learning_rate": 0.0001984612950437087,
319
+ "loss": 0.0622,
320
+ "mean_token_accuracy": 0.9724574089050293,
321
+ "num_tokens": 735437.0,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 0.007911079466793244,
326
+ "learning_rate": 0.00019842173964637475,
327
+ "loss": 0.1003,
328
+ "mean_token_accuracy": 0.9710333228111268,
329
+ "num_tokens": 754416.0,
330
+ "step": 400
331
+ },
332
+ {
333
+ "epoch": 0.008108856453463075,
334
+ "learning_rate": 0.0001983821842490408,
335
+ "loss": 0.0921,
336
+ "mean_token_accuracy": 0.9718518137931824,
337
+ "num_tokens": 773091.0,
338
+ "step": 410
339
+ },
340
+ {
341
+ "epoch": 0.008306633440132907,
342
+ "learning_rate": 0.00019834262885170682,
343
+ "loss": 0.0836,
344
+ "mean_token_accuracy": 0.9694978713989257,
345
+ "num_tokens": 791900.0,
346
+ "step": 420
347
+ },
348
+ {
349
+ "epoch": 0.008504410426802738,
350
+ "learning_rate": 0.00019830307345437286,
351
+ "loss": 0.0822,
352
+ "mean_token_accuracy": 0.974584549665451,
353
+ "num_tokens": 810547.0,
354
+ "step": 430
355
+ },
356
+ {
357
+ "epoch": 0.008702187413472569,
358
+ "learning_rate": 0.0001982635180570389,
359
+ "loss": 0.0735,
360
+ "mean_token_accuracy": 0.9770256340503692,
361
+ "num_tokens": 829753.0,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 0.0088999644001424,
366
+ "learning_rate": 0.00019822396265970494,
367
+ "loss": 0.0882,
368
+ "mean_token_accuracy": 0.9668483734130859,
369
+ "num_tokens": 848294.0,
370
+ "step": 450
371
+ },
372
+ {
373
+ "epoch": 0.009097741386812231,
374
+ "learning_rate": 0.00019818440726237095,
375
+ "loss": 0.0923,
376
+ "mean_token_accuracy": 0.9715612173080445,
377
+ "num_tokens": 867635.0,
378
+ "step": 460
379
+ },
380
+ {
381
+ "epoch": 0.009295518373482062,
382
+ "learning_rate": 0.00019814485186503699,
383
+ "loss": 0.0665,
384
+ "mean_token_accuracy": 0.9765240132808686,
385
+ "num_tokens": 886596.0,
386
+ "step": 470
387
+ },
388
+ {
389
+ "epoch": 0.009493295360151893,
390
+ "learning_rate": 0.00019810529646770302,
391
+ "loss": 0.08,
392
+ "mean_token_accuracy": 0.972789865732193,
393
+ "num_tokens": 905312.0,
394
+ "step": 480
395
+ },
396
+ {
397
+ "epoch": 0.009691072346821724,
398
+ "learning_rate": 0.00019806574107036906,
399
+ "loss": 0.0794,
400
+ "mean_token_accuracy": 0.9753748655319214,
401
+ "num_tokens": 924072.0,
402
+ "step": 490
403
+ },
404
+ {
405
+ "epoch": 0.009888849333491555,
406
+ "learning_rate": 0.0001980261856730351,
407
+ "loss": 0.0673,
408
+ "mean_token_accuracy": 0.9732649922370911,
409
+ "num_tokens": 942844.0,
410
+ "step": 500
411
+ },
412
+ {
413
+ "epoch": 0.010086626320161387,
414
+ "learning_rate": 0.00019798663027570113,
415
+ "loss": 0.1029,
416
+ "mean_token_accuracy": 0.9693170130252838,
417
+ "num_tokens": 961524.0,
418
+ "step": 510
419
+ },
420
+ {
421
+ "epoch": 0.010284403306831218,
422
+ "learning_rate": 0.00019794707487836717,
423
+ "loss": 0.0901,
424
+ "mean_token_accuracy": 0.9669535160064697,
425
+ "num_tokens": 980332.0,
426
+ "step": 520
427
+ },
428
+ {
429
+ "epoch": 0.010482180293501049,
430
+ "learning_rate": 0.00019790751948103318,
431
+ "loss": 0.0858,
432
+ "mean_token_accuracy": 0.9700904309749603,
433
+ "num_tokens": 999163.0,
434
+ "step": 530
435
+ },
436
+ {
437
+ "epoch": 0.01067995728017088,
438
+ "learning_rate": 0.00019786796408369922,
439
+ "loss": 0.1001,
440
+ "mean_token_accuracy": 0.9630756258964539,
441
+ "num_tokens": 1018053.0,
442
+ "step": 540
443
+ },
444
+ {
445
+ "epoch": 0.010877734266840711,
446
+ "learning_rate": 0.00019782840868636528,
447
+ "loss": 0.0798,
448
+ "mean_token_accuracy": 0.9710527896881104,
449
+ "num_tokens": 1036828.0,
450
+ "step": 550
451
+ },
452
+ {
453
+ "epoch": 0.011075511253510542,
454
+ "learning_rate": 0.0001977888532890313,
455
+ "loss": 0.0883,
456
+ "mean_token_accuracy": 0.9719638526439667,
457
+ "num_tokens": 1055843.0,
458
+ "step": 560
459
+ },
460
+ {
461
+ "epoch": 0.011273288240180373,
462
+ "learning_rate": 0.00019774929789169733,
463
+ "loss": 0.0817,
464
+ "mean_token_accuracy": 0.9706017553806305,
465
+ "num_tokens": 1074733.0,
466
+ "step": 570
467
+ },
468
+ {
469
+ "epoch": 0.011471065226850204,
470
+ "learning_rate": 0.00019770974249436337,
471
+ "loss": 0.0949,
472
+ "mean_token_accuracy": 0.9684431076049804,
473
+ "num_tokens": 1093519.0,
474
+ "step": 580
475
+ },
476
+ {
477
+ "epoch": 0.011668842213520035,
478
+ "learning_rate": 0.0001976701870970294,
479
+ "loss": 0.0797,
480
+ "mean_token_accuracy": 0.975245076417923,
481
+ "num_tokens": 1112394.0,
482
+ "step": 590
483
+ },
484
+ {
485
+ "epoch": 0.011866619200189867,
486
+ "learning_rate": 0.00019763063169969542,
487
+ "loss": 0.0814,
488
+ "mean_token_accuracy": 0.9766307890415191,
489
+ "num_tokens": 1131435.0,
490
+ "step": 600
491
+ },
492
+ {
493
+ "epoch": 0.012064396186859698,
494
+ "learning_rate": 0.00019759107630236146,
495
+ "loss": 0.0893,
496
+ "mean_token_accuracy": 0.9671396017074585,
497
+ "num_tokens": 1150274.0,
498
+ "step": 610
499
+ },
500
+ {
501
+ "epoch": 0.012262173173529529,
502
+ "learning_rate": 0.00019755152090502752,
503
+ "loss": 0.0965,
504
+ "mean_token_accuracy": 0.9663532435894012,
505
+ "num_tokens": 1169076.0,
506
+ "step": 620
507
+ },
508
+ {
509
+ "epoch": 0.01245995016019936,
510
+ "learning_rate": 0.00019751196550769353,
511
+ "loss": 0.0692,
512
+ "mean_token_accuracy": 0.9754896223545074,
513
+ "num_tokens": 1187908.0,
514
+ "step": 630
515
+ },
516
+ {
517
+ "epoch": 0.012657727146869191,
518
+ "learning_rate": 0.00019747241011035957,
519
+ "loss": 0.0893,
520
+ "mean_token_accuracy": 0.9667335331439972,
521
+ "num_tokens": 1206872.0,
522
+ "step": 640
523
+ },
524
+ {
525
+ "epoch": 0.012855504133539022,
526
+ "learning_rate": 0.0001974328547130256,
527
+ "loss": 0.0762,
528
+ "mean_token_accuracy": 0.9724668145179749,
529
+ "num_tokens": 1225767.0,
530
+ "step": 650
531
+ },
532
+ {
533
+ "epoch": 0.013053281120208853,
534
+ "learning_rate": 0.00019739329931569164,
535
+ "loss": 0.08,
536
+ "mean_token_accuracy": 0.9690144300460816,
537
+ "num_tokens": 1244340.0,
538
+ "step": 660
539
+ },
540
+ {
541
+ "epoch": 0.013251058106878684,
542
+ "learning_rate": 0.00019735374391835765,
543
+ "loss": 0.0872,
544
+ "mean_token_accuracy": 0.9725019693374634,
545
+ "num_tokens": 1263304.0,
546
+ "step": 670
547
+ },
548
+ {
549
+ "epoch": 0.013448835093548515,
550
+ "learning_rate": 0.0001973141885210237,
551
+ "loss": 0.0894,
552
+ "mean_token_accuracy": 0.9694939434528351,
553
+ "num_tokens": 1282028.0,
554
+ "step": 680
555
+ },
556
+ {
557
+ "epoch": 0.013646612080218347,
558
+ "learning_rate": 0.00019727463312368976,
559
+ "loss": 0.096,
560
+ "mean_token_accuracy": 0.9666180431842804,
561
+ "num_tokens": 1300780.0,
562
+ "step": 690
563
+ },
564
+ {
565
+ "epoch": 0.013844389066888178,
566
+ "learning_rate": 0.00019723507772635577,
567
+ "loss": 0.0804,
568
+ "mean_token_accuracy": 0.9751847207546234,
569
+ "num_tokens": 1319535.0,
570
+ "step": 700
571
+ },
572
+ {
573
+ "epoch": 0.014042166053558009,
574
+ "learning_rate": 0.0001971955223290218,
575
+ "loss": 0.1046,
576
+ "mean_token_accuracy": 0.9681148648262023,
577
+ "num_tokens": 1338391.0,
578
+ "step": 710
579
+ },
580
+ {
581
+ "epoch": 0.01423994304022784,
582
+ "learning_rate": 0.00019715596693168784,
583
+ "loss": 0.0804,
584
+ "mean_token_accuracy": 0.9726056635379792,
585
+ "num_tokens": 1357072.0,
586
+ "step": 720
587
+ },
588
+ {
589
+ "epoch": 0.014437720026897671,
590
+ "learning_rate": 0.00019711641153435388,
591
+ "loss": 0.0664,
592
+ "mean_token_accuracy": 0.9787862658500671,
593
+ "num_tokens": 1375999.0,
594
+ "step": 730
595
+ },
596
+ {
597
+ "epoch": 0.014635497013567502,
598
+ "learning_rate": 0.00019707685613701992,
599
+ "loss": 0.0823,
600
+ "mean_token_accuracy": 0.9705499827861785,
601
+ "num_tokens": 1394654.0,
602
+ "step": 740
603
+ },
604
+ {
605
+ "epoch": 0.014833274000237333,
606
+ "learning_rate": 0.00019703730073968593,
607
+ "loss": 0.0828,
608
+ "mean_token_accuracy": 0.9703374147415161,
609
+ "num_tokens": 1413323.0,
610
+ "step": 750
611
+ },
612
+ {
613
+ "epoch": 0.015031050986907164,
614
+ "learning_rate": 0.000196997745342352,
615
+ "loss": 0.0797,
616
+ "mean_token_accuracy": 0.9717476069927216,
617
+ "num_tokens": 1432274.0,
618
+ "step": 760
619
+ },
620
+ {
621
+ "epoch": 0.015228827973576995,
622
+ "learning_rate": 0.000196958189945018,
623
+ "loss": 0.0811,
624
+ "mean_token_accuracy": 0.9687287509441376,
625
+ "num_tokens": 1451009.0,
626
+ "step": 770
627
+ },
628
+ {
629
+ "epoch": 0.015426604960246827,
630
+ "learning_rate": 0.00019691863454768404,
631
+ "loss": 0.0799,
632
+ "mean_token_accuracy": 0.9712490200996399,
633
+ "num_tokens": 1469741.0,
634
+ "step": 780
635
+ },
636
+ {
637
+ "epoch": 0.015624381946916658,
638
+ "learning_rate": 0.00019687907915035008,
639
+ "loss": 0.0698,
640
+ "mean_token_accuracy": 0.9755463302135468,
641
+ "num_tokens": 1488644.0,
642
+ "step": 790
643
+ },
644
+ {
645
+ "epoch": 0.01582215893358649,
646
+ "learning_rate": 0.0001968395237530161,
647
+ "loss": 0.07,
648
+ "mean_token_accuracy": 0.9786295354366302,
649
+ "num_tokens": 1507384.0,
650
+ "step": 800
651
+ },
652
+ {
653
+ "epoch": 0.016019935920256318,
654
+ "learning_rate": 0.00019679996835568215,
655
+ "loss": 0.0842,
656
+ "mean_token_accuracy": 0.9716646075248718,
657
+ "num_tokens": 1526208.0,
658
+ "step": 810
659
+ },
660
+ {
661
+ "epoch": 0.01621771290692615,
662
+ "learning_rate": 0.00019676041295834816,
663
+ "loss": 0.0681,
664
+ "mean_token_accuracy": 0.9786826431751251,
665
+ "num_tokens": 1545086.0,
666
+ "step": 820
667
+ },
668
+ {
669
+ "epoch": 0.01641548989359598,
670
+ "learning_rate": 0.00019672085756101423,
671
+ "loss": 0.068,
672
+ "mean_token_accuracy": 0.9776684403419494,
673
+ "num_tokens": 1563972.0,
674
+ "step": 830
675
+ },
676
+ {
677
+ "epoch": 0.016613266880265813,
678
+ "learning_rate": 0.00019668130216368024,
679
+ "loss": 0.0855,
680
+ "mean_token_accuracy": 0.9758803486824036,
681
+ "num_tokens": 1582893.0,
682
+ "step": 840
683
+ },
684
+ {
685
+ "epoch": 0.016811043866935643,
686
+ "learning_rate": 0.00019664174676634627,
687
+ "loss": 0.0969,
688
+ "mean_token_accuracy": 0.9676210284233093,
689
+ "num_tokens": 1601719.0,
690
+ "step": 850
691
+ },
692
+ {
693
+ "epoch": 0.017008820853605475,
694
+ "learning_rate": 0.0001966021913690123,
695
+ "loss": 0.0836,
696
+ "mean_token_accuracy": 0.973316353559494,
697
+ "num_tokens": 1620493.0,
698
+ "step": 860
699
+ },
700
+ {
701
+ "epoch": 0.017206597840275305,
702
+ "learning_rate": 0.00019656263597167835,
703
+ "loss": 0.0707,
704
+ "mean_token_accuracy": 0.9776813209056854,
705
+ "num_tokens": 1639336.0,
706
+ "step": 870
707
+ },
708
+ {
709
+ "epoch": 0.017404374826945138,
710
+ "learning_rate": 0.00019652308057434439,
711
+ "loss": 0.0921,
712
+ "mean_token_accuracy": 0.9702894032001496,
713
+ "num_tokens": 1658213.0,
714
+ "step": 880
715
+ },
716
+ {
717
+ "epoch": 0.017602151813614967,
718
+ "learning_rate": 0.0001964835251770104,
719
+ "loss": 0.0629,
720
+ "mean_token_accuracy": 0.9804218530654907,
721
+ "num_tokens": 1677260.0,
722
+ "step": 890
723
+ },
724
+ {
725
+ "epoch": 0.0177999288002848,
726
+ "learning_rate": 0.00019644396977967646,
727
+ "loss": 0.0946,
728
+ "mean_token_accuracy": 0.9700180232524872,
729
+ "num_tokens": 1696174.0,
730
+ "step": 900
731
+ },
732
+ {
733
+ "epoch": 0.01799770578695463,
734
+ "learning_rate": 0.00019640441438234247,
735
+ "loss": 0.0822,
736
+ "mean_token_accuracy": 0.9717112898826599,
737
+ "num_tokens": 1714822.0,
738
+ "step": 910
739
+ },
740
+ {
741
+ "epoch": 0.018195482773624462,
742
+ "learning_rate": 0.0001963648589850085,
743
+ "loss": 0.0732,
744
+ "mean_token_accuracy": 0.9710357069969178,
745
+ "num_tokens": 1733526.0,
746
+ "step": 920
747
+ },
748
+ {
749
+ "epoch": 0.01839325976029429,
750
+ "learning_rate": 0.00019632530358767455,
751
+ "loss": 0.0697,
752
+ "mean_token_accuracy": 0.9731245815753937,
753
+ "num_tokens": 1752469.0,
754
+ "step": 930
755
+ },
756
+ {
757
+ "epoch": 0.018591036746964124,
758
+ "learning_rate": 0.00019628574819034058,
759
+ "loss": 0.065,
760
+ "mean_token_accuracy": 0.9799518942832947,
761
+ "num_tokens": 1771439.0,
762
+ "step": 940
763
+ },
764
+ {
765
+ "epoch": 0.018788813733633954,
766
+ "learning_rate": 0.00019624619279300662,
767
+ "loss": 0.0769,
768
+ "mean_token_accuracy": 0.9691688776016235,
769
+ "num_tokens": 1790105.0,
770
+ "step": 950
771
+ },
772
+ {
773
+ "epoch": 0.018986590720303786,
774
+ "learning_rate": 0.00019620663739567263,
775
+ "loss": 0.0675,
776
+ "mean_token_accuracy": 0.9775746822357178,
777
+ "num_tokens": 1809000.0,
778
+ "step": 960
779
+ },
780
+ {
781
+ "epoch": 0.019184367706973616,
782
+ "learning_rate": 0.0001961670819983387,
783
+ "loss": 0.0761,
784
+ "mean_token_accuracy": 0.9722946584224701,
785
+ "num_tokens": 1827743.0,
786
+ "step": 970
787
+ },
788
+ {
789
+ "epoch": 0.01938214469364345,
790
+ "learning_rate": 0.0001961275266010047,
791
+ "loss": 0.0827,
792
+ "mean_token_accuracy": 0.9710648536682129,
793
+ "num_tokens": 1846483.0,
794
+ "step": 980
795
+ },
796
+ {
797
+ "epoch": 0.019579921680313278,
798
+ "learning_rate": 0.00019608797120367074,
799
+ "loss": 0.0548,
800
+ "mean_token_accuracy": 0.9855950713157654,
801
+ "num_tokens": 1865230.0,
802
+ "step": 990
803
+ },
804
+ {
805
+ "epoch": 0.01977769866698311,
806
+ "learning_rate": 0.00019604841580633678,
807
+ "loss": 0.0934,
808
+ "mean_token_accuracy": 0.9695265829563141,
809
+ "num_tokens": 1884295.0,
810
+ "step": 1000
811
+ },
812
+ {
813
+ "epoch": 0.01997547565365294,
814
+ "learning_rate": 0.00019600886040900282,
815
+ "loss": 0.0806,
816
+ "mean_token_accuracy": 0.9750065445899964,
817
+ "num_tokens": 1903297.0,
818
+ "step": 1010
819
+ },
820
+ {
821
+ "epoch": 0.020173252640322773,
822
+ "learning_rate": 0.00019596930501166886,
823
+ "loss": 0.0683,
824
+ "mean_token_accuracy": 0.9779970288276673,
825
+ "num_tokens": 1922020.0,
826
+ "step": 1020
827
+ },
828
+ {
829
+ "epoch": 0.020371029626992603,
830
+ "learning_rate": 0.00019592974961433487,
831
+ "loss": 0.0636,
832
+ "mean_token_accuracy": 0.9801307320594788,
833
+ "num_tokens": 1940771.0,
834
+ "step": 1030
835
+ },
836
+ {
837
+ "epoch": 0.020568806613662435,
838
+ "learning_rate": 0.00019589019421700093,
839
+ "loss": 0.078,
840
+ "mean_token_accuracy": 0.9704040169715882,
841
+ "num_tokens": 1959790.0,
842
+ "step": 1040
843
+ },
844
+ {
845
+ "epoch": 0.020766583600332265,
846
+ "learning_rate": 0.00019585063881966694,
847
+ "loss": 0.0641,
848
+ "mean_token_accuracy": 0.978104192018509,
849
+ "num_tokens": 1978766.0,
850
+ "step": 1050
851
+ },
852
+ {
853
+ "epoch": 0.020964360587002098,
854
+ "learning_rate": 0.00019581108342233298,
855
+ "loss": 0.0931,
856
+ "mean_token_accuracy": 0.967569786310196,
857
+ "num_tokens": 1997785.0,
858
+ "step": 1060
859
+ },
860
+ {
861
+ "epoch": 0.021162137573671927,
862
+ "learning_rate": 0.00019577152802499902,
863
+ "loss": 0.0877,
864
+ "mean_token_accuracy": 0.9735791981220245,
865
+ "num_tokens": 2016334.0,
866
+ "step": 1070
867
+ },
868
+ {
869
+ "epoch": 0.02135991456034176,
870
+ "learning_rate": 0.00019573197262766505,
871
+ "loss": 0.0833,
872
+ "mean_token_accuracy": 0.9717927515506745,
873
+ "num_tokens": 2035271.0,
874
+ "step": 1080
875
+ },
876
+ {
877
+ "epoch": 0.02155769154701159,
878
+ "learning_rate": 0.0001956924172303311,
879
+ "loss": 0.0732,
880
+ "mean_token_accuracy": 0.97304065823555,
881
+ "num_tokens": 2054158.0,
882
+ "step": 1090
883
+ },
884
+ {
885
+ "epoch": 0.021755468533681422,
886
+ "learning_rate": 0.0001956528618329971,
887
+ "loss": 0.0782,
888
+ "mean_token_accuracy": 0.971097469329834,
889
+ "num_tokens": 2072794.0,
890
+ "step": 1100
891
+ },
892
+ {
893
+ "epoch": 0.02195324552035125,
894
+ "learning_rate": 0.00019561330643566317,
895
+ "loss": 0.0745,
896
+ "mean_token_accuracy": 0.9693615853786468,
897
+ "num_tokens": 2091583.0,
898
+ "step": 1110
899
+ },
900
+ {
901
+ "epoch": 0.022151022507021084,
902
+ "learning_rate": 0.0001955737510383292,
903
+ "loss": 0.0575,
904
+ "mean_token_accuracy": 0.9811938166618347,
905
+ "num_tokens": 2110207.0,
906
+ "step": 1120
907
+ },
908
+ {
909
+ "epoch": 0.022348799493690914,
910
+ "learning_rate": 0.00019553419564099521,
911
+ "loss": 0.0838,
912
+ "mean_token_accuracy": 0.9719610214233398,
913
+ "num_tokens": 2129006.0,
914
+ "step": 1130
915
+ },
916
+ {
917
+ "epoch": 0.022546576480360746,
918
+ "learning_rate": 0.00019549464024366125,
919
+ "loss": 0.0704,
920
+ "mean_token_accuracy": 0.9780212998390198,
921
+ "num_tokens": 2147866.0,
922
+ "step": 1140
923
+ },
924
+ {
925
+ "epoch": 0.022744353467030576,
926
+ "learning_rate": 0.0001954550848463273,
927
+ "loss": 0.0626,
928
+ "mean_token_accuracy": 0.9765089929103852,
929
+ "num_tokens": 2166678.0,
930
+ "step": 1150
931
+ },
932
+ {
933
+ "epoch": 0.02294213045370041,
934
+ "learning_rate": 0.00019541552944899333,
935
+ "loss": 0.071,
936
+ "mean_token_accuracy": 0.9786867916584014,
937
+ "num_tokens": 2185491.0,
938
+ "step": 1160
939
+ },
940
+ {
941
+ "epoch": 0.023139907440370238,
942
+ "learning_rate": 0.00019537597405165934,
943
+ "loss": 0.0855,
944
+ "mean_token_accuracy": 0.9737950384616851,
945
+ "num_tokens": 2204270.0,
946
+ "step": 1170
947
+ },
948
+ {
949
+ "epoch": 0.02333768442704007,
950
+ "learning_rate": 0.0001953364186543254,
951
+ "loss": 0.0817,
952
+ "mean_token_accuracy": 0.9746371984481812,
953
+ "num_tokens": 2223262.0,
954
+ "step": 1180
955
+ },
956
+ {
957
+ "epoch": 0.0235354614137099,
958
+ "learning_rate": 0.00019529686325699144,
959
+ "loss": 0.069,
960
+ "mean_token_accuracy": 0.9796195566654206,
961
+ "num_tokens": 2242082.0,
962
+ "step": 1190
963
+ },
964
+ {
965
+ "epoch": 0.023733238400379733,
966
+ "learning_rate": 0.00019525730785965745,
967
+ "loss": 0.0976,
968
+ "mean_token_accuracy": 0.9742420554161072,
969
+ "num_tokens": 2260874.0,
970
+ "step": 1200
971
+ },
972
+ {
973
+ "epoch": 0.023931015387049562,
974
+ "learning_rate": 0.00019521775246232349,
975
+ "loss": 0.0816,
976
+ "mean_token_accuracy": 0.972790652513504,
977
+ "num_tokens": 2279620.0,
978
+ "step": 1210
979
+ },
980
+ {
981
+ "epoch": 0.024128792373719395,
982
+ "learning_rate": 0.00019517819706498952,
983
+ "loss": 0.0602,
984
+ "mean_token_accuracy": 0.9790996849536896,
985
+ "num_tokens": 2298359.0,
986
+ "step": 1220
987
+ },
988
+ {
989
+ "epoch": 0.024326569360389225,
990
+ "learning_rate": 0.00019513864166765556,
991
+ "loss": 0.0773,
992
+ "mean_token_accuracy": 0.9750834167003631,
993
+ "num_tokens": 2317171.0,
994
+ "step": 1230
995
+ },
996
+ {
997
+ "epoch": 0.024524346347059058,
998
+ "learning_rate": 0.00019509908627032157,
999
+ "loss": 0.0863,
1000
+ "mean_token_accuracy": 0.9679585933685303,
1001
+ "num_tokens": 2335825.0,
1002
+ "step": 1240
1003
+ },
1004
+ {
1005
+ "epoch": 0.024722123333728887,
1006
+ "learning_rate": 0.00019505953087298764,
1007
+ "loss": 0.0712,
1008
+ "mean_token_accuracy": 0.9711700201034545,
1009
+ "num_tokens": 2354806.0,
1010
+ "step": 1250
1011
+ },
1012
+ {
1013
+ "epoch": 0.02491990032039872,
1014
+ "learning_rate": 0.00019501997547565367,
1015
+ "loss": 0.0905,
1016
+ "mean_token_accuracy": 0.9734555304050445,
1017
+ "num_tokens": 2373919.0,
1018
+ "step": 1260
1019
+ },
1020
+ {
1021
+ "epoch": 0.02511767730706855,
1022
+ "learning_rate": 0.00019498042007831968,
1023
+ "loss": 0.0582,
1024
+ "mean_token_accuracy": 0.9755668938159943,
1025
+ "num_tokens": 2393043.0,
1026
+ "step": 1270
1027
+ },
1028
+ {
1029
+ "epoch": 0.025315454293738382,
1030
+ "learning_rate": 0.00019494086468098575,
1031
+ "loss": 0.0627,
1032
+ "mean_token_accuracy": 0.9795541882514953,
1033
+ "num_tokens": 2411888.0,
1034
+ "step": 1280
1035
+ },
1036
+ {
1037
+ "epoch": 0.02551323128040821,
1038
+ "learning_rate": 0.00019490130928365176,
1039
+ "loss": 0.0729,
1040
+ "mean_token_accuracy": 0.976466304063797,
1041
+ "num_tokens": 2430790.0,
1042
+ "step": 1290
1043
+ },
1044
+ {
1045
+ "epoch": 0.025711008267078044,
1046
+ "learning_rate": 0.0001948617538863178,
1047
+ "loss": 0.0676,
1048
+ "mean_token_accuracy": 0.9730806350708008,
1049
+ "num_tokens": 2449534.0,
1050
+ "step": 1300
1051
+ },
1052
+ {
1053
+ "epoch": 0.025908785253747874,
1054
+ "learning_rate": 0.0001948221984889838,
1055
+ "loss": 0.0849,
1056
+ "mean_token_accuracy": 0.9746219754219055,
1057
+ "num_tokens": 2468500.0,
1058
+ "step": 1310
1059
+ },
1060
+ {
1061
+ "epoch": 0.026106562240417706,
1062
+ "learning_rate": 0.00019478264309164987,
1063
+ "loss": 0.082,
1064
+ "mean_token_accuracy": 0.9718350946903229,
1065
+ "num_tokens": 2487522.0,
1066
+ "step": 1320
1067
+ },
1068
+ {
1069
+ "epoch": 0.026304339227087536,
1070
+ "learning_rate": 0.0001947430876943159,
1071
+ "loss": 0.0814,
1072
+ "mean_token_accuracy": 0.9727078378200531,
1073
+ "num_tokens": 2506228.0,
1074
+ "step": 1330
1075
+ },
1076
+ {
1077
+ "epoch": 0.02650211621375737,
1078
+ "learning_rate": 0.00019470353229698192,
1079
+ "loss": 0.0908,
1080
+ "mean_token_accuracy": 0.972163724899292,
1081
+ "num_tokens": 2524993.0,
1082
+ "step": 1340
1083
+ },
1084
+ {
1085
+ "epoch": 0.026699893200427198,
1086
+ "learning_rate": 0.00019466397689964798,
1087
+ "loss": 0.0967,
1088
+ "mean_token_accuracy": 0.9677857100963593,
1089
+ "num_tokens": 2543814.0,
1090
+ "step": 1350
1091
+ },
1092
+ {
1093
+ "epoch": 0.02689767018709703,
1094
+ "learning_rate": 0.000194624421502314,
1095
+ "loss": 0.0965,
1096
+ "mean_token_accuracy": 0.9627455770969391,
1097
+ "num_tokens": 2562624.0,
1098
+ "step": 1360
1099
+ },
1100
+ {
1101
+ "epoch": 0.02709544717376686,
1102
+ "learning_rate": 0.00019458486610498003,
1103
+ "loss": 0.0866,
1104
+ "mean_token_accuracy": 0.9717130541801453,
1105
+ "num_tokens": 2581229.0,
1106
+ "step": 1370
1107
+ },
1108
+ {
1109
+ "epoch": 0.027293224160436693,
1110
+ "learning_rate": 0.00019454531070764607,
1111
+ "loss": 0.0864,
1112
+ "mean_token_accuracy": 0.9688866436481476,
1113
+ "num_tokens": 2600073.0,
1114
+ "step": 1380
1115
+ },
1116
+ {
1117
+ "epoch": 0.027491001147106522,
1118
+ "learning_rate": 0.0001945057553103121,
1119
+ "loss": 0.0743,
1120
+ "mean_token_accuracy": 0.9723983883857727,
1121
+ "num_tokens": 2618837.0,
1122
+ "step": 1390
1123
+ },
1124
+ {
1125
+ "epoch": 0.027688778133776355,
1126
+ "learning_rate": 0.00019446619991297814,
1127
+ "loss": 0.0719,
1128
+ "mean_token_accuracy": 0.9758836448192596,
1129
+ "num_tokens": 2637487.0,
1130
+ "step": 1400
1131
+ },
1132
+ {
1133
+ "epoch": 0.027886555120446185,
1134
+ "learning_rate": 0.00019442664451564415,
1135
+ "loss": 0.0925,
1136
+ "mean_token_accuracy": 0.969123649597168,
1137
+ "num_tokens": 2656004.0,
1138
+ "step": 1410
1139
+ },
1140
+ {
1141
+ "epoch": 0.028084332107116017,
1142
+ "learning_rate": 0.00019438708911831022,
1143
+ "loss": 0.0566,
1144
+ "mean_token_accuracy": 0.9795430541038513,
1145
+ "num_tokens": 2675089.0,
1146
+ "step": 1420
1147
+ },
1148
+ {
1149
+ "epoch": 0.028282109093785847,
1150
+ "learning_rate": 0.00019434753372097623,
1151
+ "loss": 0.0618,
1152
+ "mean_token_accuracy": 0.9796153604984283,
1153
+ "num_tokens": 2693904.0,
1154
+ "step": 1430
1155
+ },
1156
+ {
1157
+ "epoch": 0.02847988608045568,
1158
+ "learning_rate": 0.00019430797832364227,
1159
+ "loss": 0.0808,
1160
+ "mean_token_accuracy": 0.9735188663005829,
1161
+ "num_tokens": 2712395.0,
1162
+ "step": 1440
1163
+ },
1164
+ {
1165
+ "epoch": 0.02867766306712551,
1166
+ "learning_rate": 0.0001942684229263083,
1167
+ "loss": 0.0657,
1168
+ "mean_token_accuracy": 0.9771307945251465,
1169
+ "num_tokens": 2731103.0,
1170
+ "step": 1450
1171
+ },
1172
+ {
1173
+ "epoch": 0.028875440053795342,
1174
+ "learning_rate": 0.00019422886752897434,
1175
+ "loss": 0.083,
1176
+ "mean_token_accuracy": 0.9664826571941376,
1177
+ "num_tokens": 2749801.0,
1178
+ "step": 1460
1179
+ },
1180
+ {
1181
+ "epoch": 0.02907321704046517,
1182
+ "learning_rate": 0.00019418931213164038,
1183
+ "loss": 0.0721,
1184
+ "mean_token_accuracy": 0.9747998178005218,
1185
+ "num_tokens": 2768551.0,
1186
+ "step": 1470
1187
+ },
1188
+ {
1189
+ "epoch": 0.029270994027135004,
1190
+ "learning_rate": 0.0001941497567343064,
1191
+ "loss": 0.0499,
1192
+ "mean_token_accuracy": 0.9845096707344055,
1193
+ "num_tokens": 2787591.0,
1194
+ "step": 1480
1195
+ },
1196
+ {
1197
+ "epoch": 0.029468771013804834,
1198
+ "learning_rate": 0.00019411020133697245,
1199
+ "loss": 0.0665,
1200
+ "mean_token_accuracy": 0.9782804071903228,
1201
+ "num_tokens": 2806455.0,
1202
+ "step": 1490
1203
+ },
1204
+ {
1205
+ "epoch": 0.029666548000474666,
1206
+ "learning_rate": 0.00019407064593963846,
1207
+ "loss": 0.0633,
1208
+ "mean_token_accuracy": 0.9759399771690369,
1209
+ "num_tokens": 2825280.0,
1210
+ "step": 1500
1211
+ },
1212
+ {
1213
+ "epoch": 0.029864324987144496,
1214
+ "learning_rate": 0.0001940310905423045,
1215
+ "loss": 0.0641,
1216
+ "mean_token_accuracy": 0.980692720413208,
1217
+ "num_tokens": 2844178.0,
1218
+ "step": 1510
1219
+ },
1220
+ {
1221
+ "epoch": 0.03006210197381433,
1222
+ "learning_rate": 0.00019399153514497054,
1223
+ "loss": 0.0568,
1224
+ "mean_token_accuracy": 0.9760835587978363,
1225
+ "num_tokens": 2863373.0,
1226
+ "step": 1520
1227
+ },
1228
+ {
1229
+ "epoch": 0.030259878960484158,
1230
+ "learning_rate": 0.00019395197974763658,
1231
+ "loss": 0.0611,
1232
+ "mean_token_accuracy": 0.9767693936824798,
1233
+ "num_tokens": 2882166.0,
1234
+ "step": 1530
1235
+ },
1236
+ {
1237
+ "epoch": 0.03045765594715399,
1238
+ "learning_rate": 0.00019391242435030261,
1239
+ "loss": 0.0557,
1240
+ "mean_token_accuracy": 0.9800930917263031,
1241
+ "num_tokens": 2901050.0,
1242
+ "step": 1540
1243
+ },
1244
+ {
1245
+ "epoch": 0.03065543293382382,
1246
+ "learning_rate": 0.00019387286895296862,
1247
+ "loss": 0.0739,
1248
+ "mean_token_accuracy": 0.9733968496322631,
1249
+ "num_tokens": 2919953.0,
1250
+ "step": 1550
1251
+ },
1252
+ {
1253
+ "epoch": 0.030853209920493653,
1254
+ "learning_rate": 0.0001938333135556347,
1255
+ "loss": 0.0706,
1256
+ "mean_token_accuracy": 0.9725773215293885,
1257
+ "num_tokens": 2939003.0,
1258
+ "step": 1560
1259
+ },
1260
+ {
1261
+ "epoch": 0.031050986907163482,
1262
+ "learning_rate": 0.00019379375815830073,
1263
+ "loss": 0.0655,
1264
+ "mean_token_accuracy": 0.9753368616104126,
1265
+ "num_tokens": 2957853.0,
1266
+ "step": 1570
1267
+ },
1268
+ {
1269
+ "epoch": 0.031248763893833315,
1270
+ "learning_rate": 0.00019375420276096674,
1271
+ "loss": 0.0667,
1272
+ "mean_token_accuracy": 0.9791980743408203,
1273
+ "num_tokens": 2976557.0,
1274
+ "step": 1580
1275
+ },
1276
+ {
1277
+ "epoch": 0.031446540880503145,
1278
+ "learning_rate": 0.00019371464736363277,
1279
+ "loss": 0.0748,
1280
+ "mean_token_accuracy": 0.9767062723636627,
1281
+ "num_tokens": 2995058.0,
1282
+ "step": 1590
1283
+ },
1284
+ {
1285
+ "epoch": 0.03164431786717298,
1286
+ "learning_rate": 0.0001936750919662988,
1287
+ "loss": 0.0628,
1288
+ "mean_token_accuracy": 0.9783797085285186,
1289
+ "num_tokens": 3014165.0,
1290
+ "step": 1600
1291
+ },
1292
+ {
1293
+ "epoch": 0.03184209485384281,
1294
+ "learning_rate": 0.00019363553656896485,
1295
+ "loss": 0.0564,
1296
+ "mean_token_accuracy": 0.9830330014228821,
1297
+ "num_tokens": 3032924.0,
1298
+ "step": 1610
1299
+ },
1300
+ {
1301
+ "epoch": 0.032039871840512636,
1302
+ "learning_rate": 0.00019359598117163086,
1303
+ "loss": 0.0783,
1304
+ "mean_token_accuracy": 0.9734647035598755,
1305
+ "num_tokens": 3051960.0,
1306
+ "step": 1620
1307
+ },
1308
+ {
1309
+ "epoch": 0.03223764882718247,
1310
+ "learning_rate": 0.00019355642577429692,
1311
+ "loss": 0.0827,
1312
+ "mean_token_accuracy": 0.9718622207641602,
1313
+ "num_tokens": 3070644.0,
1314
+ "step": 1630
1315
+ },
1316
+ {
1317
+ "epoch": 0.0324354258138523,
1318
+ "learning_rate": 0.00019351687037696296,
1319
+ "loss": 0.071,
1320
+ "mean_token_accuracy": 0.980438482761383,
1321
+ "num_tokens": 3089393.0,
1322
+ "step": 1640
1323
+ },
1324
+ {
1325
+ "epoch": 0.03263320280052213,
1326
+ "learning_rate": 0.00019347731497962897,
1327
+ "loss": 0.0656,
1328
+ "mean_token_accuracy": 0.9727302372455597,
1329
+ "num_tokens": 3108260.0,
1330
+ "step": 1650
1331
+ },
1332
+ {
1333
+ "epoch": 0.03283097978719196,
1334
+ "learning_rate": 0.000193437759582295,
1335
+ "loss": 0.0847,
1336
+ "mean_token_accuracy": 0.9706348955631257,
1337
+ "num_tokens": 3127157.0,
1338
+ "step": 1660
1339
+ },
1340
+ {
1341
+ "epoch": 0.033028756773861793,
1342
+ "learning_rate": 0.00019339820418496105,
1343
+ "loss": 0.0721,
1344
+ "mean_token_accuracy": 0.9732413589954376,
1345
+ "num_tokens": 3146126.0,
1346
+ "step": 1670
1347
+ },
1348
+ {
1349
+ "epoch": 0.033226533760531626,
1350
+ "learning_rate": 0.00019335864878762708,
1351
+ "loss": 0.0836,
1352
+ "mean_token_accuracy": 0.9706507205963135,
1353
+ "num_tokens": 3165091.0,
1354
+ "step": 1680
1355
+ }
1356
+ ],
1357
+ "logging_steps": 10,
1358
+ "max_steps": 50562,
1359
+ "num_input_tokens_seen": 0,
1360
+ "num_train_epochs": 9223372036854775807,
1361
+ "save_steps": 1685,
1362
+ "stateful_callbacks": {
1363
+ "TrainerControl": {
1364
+ "args": {
1365
+ "should_epoch_stop": false,
1366
+ "should_evaluate": false,
1367
+ "should_log": false,
1368
+ "should_save": true,
1369
+ "should_training_stop": false
1370
+ },
1371
+ "attributes": {}
1372
+ }
1373
+ },
1374
+ "total_flos": 1.5391152511647744e+17,
1375
+ "train_batch_size": 16,
1376
+ "trial_name": null,
1377
+ "trial_params": null
1378
+ }
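Since trainer_state.json stores its log_history as plain JSON, the loss and token-accuracy curves in a checkpoint like the one above can be inspected without TensorBoard. A minimal sketch using only the standard library; the checkpoint path is illustrative and would be replaced with a local copy:

```python
import json

# Illustrative path: point this at any checkpoint's trainer_state.json.
with open("checkpoint-1685/trainer_state.json") as f:
    state = json.load(f)

# Entries are appended every `logging_steps` (10 in this run) optimizer steps.
for entry in state["log_history"]:
    print(entry["step"], entry["loss"], entry["mean_token_accuracy"])
```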
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.15.0
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+ }
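The config above is a standard PEFT LoRA setup (r=8, lora_alpha=16, adapters on the attention q_proj/v_proj only), so the checkpoint can in principle be re-attached with the peft library. A minimal sketch, assuming a reachable copy of the base model (the recorded path is cluster-local scratch) and noting that GAMA wraps Llama-2 with a Q-Former audio front-end, so the text-only loader below is only indicative for the language-model weights:

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

# The recorded base path is cluster-local; substitute your own copy of the
# GAMA-IT Llama-2-7b-chat-hf-qformer weights.
base = AutoModelForCausalLM.from_pretrained(
    "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/"
)

# Attach the LoRA adapter stored in this checkpoint directory (illustrative path).
model = PeftModel.from_pretrained(base, "checkpoint-2247")
model.eval()
```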
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
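Note that pad_token reuses `</s>`: Llama-2 ships without a dedicated padding token, so the map above points padding at eos. A minimal sketch of the equivalent runtime setup (the path is illustrative):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-2247")  # illustrative path
# Mirrors special_tokens_map.json: pad with eos (</s>) since Llama-2
# defines no pad token of its own.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
```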
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
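The chat_template field above is the standard Llama-2 [INST]/<<SYS>> prompt format, which transformers can render directly via apply_chat_template. A minimal sketch, with an illustrative path and made-up messages:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-2247")  # illustrative path

messages = [
    {"role": "system", "content": "You answer questions about audio clips."},
    {"role": "user", "content": "Which semantic elements are present?"},
]
# Produces '<s>[INST] <<SYS>>\n...\n<</SYS>>\n\n... [/INST]' per the
# template stored in tokenizer_config.json above.
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)
```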
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/trainer_state.json ADDED
@@ -0,0 +1,1826 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.033329872287405256,
6
+ "eval_steps": 500,
7
+ "global_step": 2247,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00014833053977483424,
14
+ "learning_rate": 0.00019997330050284054,
15
+ "loss": 0.3338,
16
+ "mean_token_accuracy": 0.8992965638637542,
17
+ "num_tokens": 14338.0,
18
+ "step": 10
19
+ },
20
+ {
21
+ "epoch": 0.0002966610795496685,
22
+ "learning_rate": 0.00019994363439488556,
23
+ "loss": 0.1339,
24
+ "mean_token_accuracy": 0.9485014617443085,
25
+ "num_tokens": 28640.0,
26
+ "step": 20
27
+ },
28
+ {
29
+ "epoch": 0.0004449916193245027,
30
+ "learning_rate": 0.0001999139682869306,
31
+ "loss": 0.1615,
32
+ "mean_token_accuracy": 0.9449628531932831,
33
+ "num_tokens": 42898.0,
34
+ "step": 30
35
+ },
36
+ {
37
+ "epoch": 0.000593322159099337,
38
+ "learning_rate": 0.00019988430217897563,
39
+ "loss": 0.1449,
40
+ "mean_token_accuracy": 0.9448729813098907,
41
+ "num_tokens": 57046.0,
42
+ "step": 40
43
+ },
44
+ {
45
+ "epoch": 0.0007416526988741712,
46
+ "learning_rate": 0.00019985463607102066,
47
+ "loss": 0.1058,
48
+ "mean_token_accuracy": 0.95491161942482,
49
+ "num_tokens": 71338.0,
50
+ "step": 50
51
+ },
52
+ {
53
+ "epoch": 0.0008899832386490054,
54
+ "learning_rate": 0.0001998249699630657,
55
+ "loss": 0.0956,
56
+ "mean_token_accuracy": 0.9604991674423218,
57
+ "num_tokens": 85472.0,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.0010383137784238396,
62
+ "learning_rate": 0.00019979530385511073,
63
+ "loss": 0.121,
64
+ "mean_token_accuracy": 0.9559885621070862,
65
+ "num_tokens": 99522.0,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.001186644318198674,
70
+ "learning_rate": 0.00019976563774715578,
71
+ "loss": 0.136,
72
+ "mean_token_accuracy": 0.9489055037498474,
73
+ "num_tokens": 113467.0,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.0013349748579735083,
78
+ "learning_rate": 0.0001997359716392008,
79
+ "loss": 0.1069,
80
+ "mean_token_accuracy": 0.9645210564136505,
81
+ "num_tokens": 127613.0,
82
+ "step": 90
83
+ },
84
+ {
85
+ "epoch": 0.0014833053977483424,
86
+ "learning_rate": 0.00019970630553124583,
87
+ "loss": 0.102,
88
+ "mean_token_accuracy": 0.9665270745754242,
89
+ "num_tokens": 141419.0,
90
+ "step": 100
91
+ },
92
+ {
93
+ "epoch": 0.0016316359375231767,
94
+ "learning_rate": 0.00019967663942329088,
95
+ "loss": 0.1031,
96
+ "mean_token_accuracy": 0.9620752274990082,
97
+ "num_tokens": 155506.0,
98
+ "step": 110
99
+ },
100
+ {
101
+ "epoch": 0.0017799664772980108,
102
+ "learning_rate": 0.0001996469733153359,
103
+ "loss": 0.1227,
104
+ "mean_token_accuracy": 0.9612519204616546,
105
+ "num_tokens": 169783.0,
106
+ "step": 120
107
+ },
108
+ {
109
+ "epoch": 0.0019282970170728451,
110
+ "learning_rate": 0.00019961730720738095,
111
+ "loss": 0.1132,
112
+ "mean_token_accuracy": 0.9575821399688721,
113
+ "num_tokens": 183864.0,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.0020766275568476792,
118
+ "learning_rate": 0.00019958764109942597,
119
+ "loss": 0.099,
120
+ "mean_token_accuracy": 0.9662568092346191,
121
+ "num_tokens": 197941.0,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.0022249580966225138,
126
+ "learning_rate": 0.000199557974991471,
127
+ "loss": 0.0861,
128
+ "mean_token_accuracy": 0.9733373045921325,
129
+ "num_tokens": 212305.0,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 0.002373288636397348,
134
+ "learning_rate": 0.00019952830888351605,
135
+ "loss": 0.0786,
136
+ "mean_token_accuracy": 0.9724234759807586,
137
+ "num_tokens": 226639.0,
138
+ "step": 160
139
+ },
140
+ {
141
+ "epoch": 0.002521619176172182,
142
+ "learning_rate": 0.00019949864277556107,
143
+ "loss": 0.1153,
144
+ "mean_token_accuracy": 0.9579457581043244,
145
+ "num_tokens": 240608.0,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 0.0026699497159470165,
150
+ "learning_rate": 0.00019946897666760612,
151
+ "loss": 0.099,
152
+ "mean_token_accuracy": 0.9629672944545746,
153
+ "num_tokens": 254679.0,
154
+ "step": 180
155
+ },
156
+ {
157
+ "epoch": 0.0028182802557218506,
158
+ "learning_rate": 0.00019943931055965111,
159
+ "loss": 0.1052,
160
+ "mean_token_accuracy": 0.958859795331955,
161
+ "num_tokens": 268884.0,
162
+ "step": 190
163
+ },
164
+ {
165
+ "epoch": 0.0029666107954966848,
166
+ "learning_rate": 0.00019940964445169616,
167
+ "loss": 0.103,
168
+ "mean_token_accuracy": 0.9612604022026062,
169
+ "num_tokens": 283062.0,
170
+ "step": 200
171
+ },
172
+ {
173
+ "epoch": 0.003114941335271519,
174
+ "learning_rate": 0.00019937997834374121,
175
+ "loss": 0.0959,
176
+ "mean_token_accuracy": 0.9604475021362304,
177
+ "num_tokens": 297014.0,
178
+ "step": 210
179
+ },
180
+ {
181
+ "epoch": 0.0032632718750463534,
182
+ "learning_rate": 0.00019935031223578624,
183
+ "loss": 0.101,
184
+ "mean_token_accuracy": 0.9652803599834442,
185
+ "num_tokens": 311066.0,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.0034116024148211875,
190
+ "learning_rate": 0.0001993206461278313,
191
+ "loss": 0.0996,
192
+ "mean_token_accuracy": 0.957928591966629,
193
+ "num_tokens": 325289.0,
194
+ "step": 230
195
+ },
196
+ {
197
+ "epoch": 0.0035599329545960216,
198
+ "learning_rate": 0.00019929098001987628,
199
+ "loss": 0.0692,
200
+ "mean_token_accuracy": 0.9736397683620452,
201
+ "num_tokens": 339616.0,
202
+ "step": 240
203
+ },
204
+ {
205
+ "epoch": 0.003708263494370856,
206
+ "learning_rate": 0.00019926131391192133,
207
+ "loss": 0.095,
208
+ "mean_token_accuracy": 0.9670307815074921,
209
+ "num_tokens": 353777.0,
210
+ "step": 250
211
+ },
212
+ {
213
+ "epoch": 0.0038565940341456903,
214
+ "learning_rate": 0.00019923164780396636,
215
+ "loss": 0.0968,
216
+ "mean_token_accuracy": 0.9648948311805725,
217
+ "num_tokens": 367871.0,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 0.004004924573920525,
222
+ "learning_rate": 0.0001992019816960114,
223
+ "loss": 0.1252,
224
+ "mean_token_accuracy": 0.9623154461383819,
225
+ "num_tokens": 381768.0,
226
+ "step": 270
227
+ },
228
+ {
229
+ "epoch": 0.0041532551136953585,
230
+ "learning_rate": 0.00019917231558805643,
231
+ "loss": 0.0933,
232
+ "mean_token_accuracy": 0.9669747233390809,
233
+ "num_tokens": 396018.0,
234
+ "step": 280
235
+ },
236
+ {
237
+ "epoch": 0.004301585653470193,
238
+ "learning_rate": 0.00019914264948010145,
239
+ "loss": 0.0903,
240
+ "mean_token_accuracy": 0.9654496192932129,
241
+ "num_tokens": 410375.0,
242
+ "step": 290
243
+ },
244
+ {
245
+ "epoch": 0.0044499161932450276,
246
+ "learning_rate": 0.0001991129833721465,
247
+ "loss": 0.1308,
248
+ "mean_token_accuracy": 0.9515012204647064,
249
+ "num_tokens": 424339.0,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.004598246733019861,
254
+ "learning_rate": 0.00019908331726419153,
255
+ "loss": 0.0989,
256
+ "mean_token_accuracy": 0.9694083333015442,
257
+ "num_tokens": 438444.0,
258
+ "step": 310
259
+ },
260
+ {
261
+ "epoch": 0.004746577272794696,
262
+ "learning_rate": 0.00019905365115623658,
263
+ "loss": 0.093,
264
+ "mean_token_accuracy": 0.971742856502533,
265
+ "num_tokens": 452483.0,
266
+ "step": 320
267
+ },
268
+ {
269
+ "epoch": 0.00489490781256953,
270
+ "learning_rate": 0.0001990239850482816,
271
+ "loss": 0.102,
272
+ "mean_token_accuracy": 0.9643003046512604,
273
+ "num_tokens": 466532.0,
274
+ "step": 330
275
+ },
276
+ {
277
+ "epoch": 0.005043238352344364,
278
+ "learning_rate": 0.00019899431894032662,
279
+ "loss": 0.0964,
280
+ "mean_token_accuracy": 0.9672096133232116,
281
+ "num_tokens": 480771.0,
282
+ "step": 340
283
+ },
284
+ {
285
+ "epoch": 0.0051915688921191985,
286
+ "learning_rate": 0.00019896465283237167,
287
+ "loss": 0.1035,
288
+ "mean_token_accuracy": 0.9658852636814117,
289
+ "num_tokens": 494954.0,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 0.005339899431894033,
294
+ "learning_rate": 0.0001989349867244167,
295
+ "loss": 0.1085,
296
+ "mean_token_accuracy": 0.9633121192455292,
297
+ "num_tokens": 509115.0,
298
+ "step": 360
299
+ },
300
+ {
301
+ "epoch": 0.005488229971668867,
302
+ "learning_rate": 0.00019890532061646175,
303
+ "loss": 0.067,
304
+ "mean_token_accuracy": 0.9755404174327851,
305
+ "num_tokens": 523364.0,
306
+ "step": 370
307
+ },
308
+ {
309
+ "epoch": 0.005636560511443701,
310
+ "learning_rate": 0.00019887565450850677,
311
+ "loss": 0.0846,
312
+ "mean_token_accuracy": 0.9748851418495178,
313
+ "num_tokens": 537483.0,
314
+ "step": 380
315
+ },
316
+ {
317
+ "epoch": 0.005784891051218535,
318
+ "learning_rate": 0.0001988459884005518,
319
+ "loss": 0.09,
320
+ "mean_token_accuracy": 0.9707007527351379,
321
+ "num_tokens": 551329.0,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 0.0059332215909933695,
326
+ "learning_rate": 0.00019881632229259684,
327
+ "loss": 0.0923,
328
+ "mean_token_accuracy": 0.9627965211868286,
329
+ "num_tokens": 565279.0,
330
+ "step": 400
331
+ },
332
+ {
333
+ "epoch": 0.006081552130768204,
334
+ "learning_rate": 0.00019878665618464186,
335
+ "loss": 0.1026,
336
+ "mean_token_accuracy": 0.9648779332637787,
337
+ "num_tokens": 579408.0,
338
+ "step": 410
339
+ },
340
+ {
341
+ "epoch": 0.006229882670543038,
342
+ "learning_rate": 0.0001987569900766869,
343
+ "loss": 0.0626,
344
+ "mean_token_accuracy": 0.9769876301288605,
345
+ "num_tokens": 593666.0,
346
+ "step": 420
347
+ },
348
+ {
349
+ "epoch": 0.006378213210317872,
350
+ "learning_rate": 0.00019872732396873194,
351
+ "loss": 0.0824,
352
+ "mean_token_accuracy": 0.9722626864910126,
353
+ "num_tokens": 607862.0,
354
+ "step": 430
355
+ },
356
+ {
357
+ "epoch": 0.006526543750092707,
358
+ "learning_rate": 0.00019869765786077696,
359
+ "loss": 0.0815,
360
+ "mean_token_accuracy": 0.9756675064563751,
361
+ "num_tokens": 621860.0,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 0.0066748742898675405,
366
+ "learning_rate": 0.000198667991752822,
367
+ "loss": 0.0864,
368
+ "mean_token_accuracy": 0.9753140985965729,
369
+ "num_tokens": 635967.0,
370
+ "step": 450
371
+ },
372
+ {
373
+ "epoch": 0.006823204829642375,
374
+ "learning_rate": 0.00019863832564486703,
375
+ "loss": 0.1159,
376
+ "mean_token_accuracy": 0.9547139942646027,
377
+ "num_tokens": 650037.0,
378
+ "step": 460
379
+ },
380
+ {
381
+ "epoch": 0.0069715353694172096,
382
+ "learning_rate": 0.00019860865953691206,
383
+ "loss": 0.1035,
384
+ "mean_token_accuracy": 0.9647024512290955,
385
+ "num_tokens": 664415.0,
386
+ "step": 470
387
+ },
388
+ {
389
+ "epoch": 0.007119865909192043,
390
+ "learning_rate": 0.0001985789934289571,
391
+ "loss": 0.1115,
392
+ "mean_token_accuracy": 0.9622772753238678,
393
+ "num_tokens": 678570.0,
394
+ "step": 480
395
+ },
396
+ {
397
+ "epoch": 0.007268196448966878,
398
+ "learning_rate": 0.00019854932732100213,
399
+ "loss": 0.1018,
400
+ "mean_token_accuracy": 0.9635290026664733,
401
+ "num_tokens": 692639.0,
402
+ "step": 490
403
+ },
404
+ {
405
+ "epoch": 0.007416526988741712,
406
+ "learning_rate": 0.00019851966121304718,
407
+ "loss": 0.073,
408
+ "mean_token_accuracy": 0.9765343546867371,
409
+ "num_tokens": 706812.0,
410
+ "step": 500
411
+ },
412
+ {
413
+ "epoch": 0.007564857528516546,
414
+ "learning_rate": 0.0001984899951050922,
415
+ "loss": 0.0673,
416
+ "mean_token_accuracy": 0.9734417915344238,
417
+ "num_tokens": 721155.0,
418
+ "step": 510
419
+ },
420
+ {
421
+ "epoch": 0.0077131880682913805,
422
+ "learning_rate": 0.00019846032899713723,
423
+ "loss": 0.0523,
424
+ "mean_token_accuracy": 0.9798437833786011,
425
+ "num_tokens": 735500.0,
426
+ "step": 520
427
+ },
428
+ {
429
+ "epoch": 0.007861518608066215,
430
+ "learning_rate": 0.00019843066288918225,
431
+ "loss": 0.0972,
432
+ "mean_token_accuracy": 0.9699061930179596,
433
+ "num_tokens": 749718.0,
434
+ "step": 530
435
+ },
436
+ {
437
+ "epoch": 0.00800984914784105,
438
+ "learning_rate": 0.0001984009967812273,
439
+ "loss": 0.0948,
440
+ "mean_token_accuracy": 0.9745772778987885,
441
+ "num_tokens": 763762.0,
442
+ "step": 540
443
+ },
444
+ {
445
+ "epoch": 0.008158179687615882,
446
+ "learning_rate": 0.00019837133067327235,
447
+ "loss": 0.0879,
448
+ "mean_token_accuracy": 0.9725308179855346,
449
+ "num_tokens": 777808.0,
450
+ "step": 550
451
+ },
452
+ {
453
+ "epoch": 0.008306510227390717,
454
+ "learning_rate": 0.00019834166456531734,
455
+ "loss": 0.083,
456
+ "mean_token_accuracy": 0.9699824392795563,
457
+ "num_tokens": 791900.0,
458
+ "step": 560
459
+ },
460
+ {
461
+ "epoch": 0.008454840767165551,
462
+ "learning_rate": 0.0001983119984573624,
463
+ "loss": 0.0724,
464
+ "mean_token_accuracy": 0.9774831712245942,
465
+ "num_tokens": 805873.0,
466
+ "step": 570
467
+ },
468
+ {
469
+ "epoch": 0.008603171306940386,
470
+ "learning_rate": 0.00019828233234940742,
471
+ "loss": 0.0867,
472
+ "mean_token_accuracy": 0.9701934456825256,
473
+ "num_tokens": 820203.0,
474
+ "step": 580
475
+ },
476
+ {
477
+ "epoch": 0.00875150184671522,
478
+ "learning_rate": 0.00019825266624145247,
479
+ "loss": 0.0868,
480
+ "mean_token_accuracy": 0.9678167402744293,
481
+ "num_tokens": 834478.0,
482
+ "step": 590
483
+ },
484
+ {
485
+ "epoch": 0.008899832386490055,
486
+ "learning_rate": 0.00019822300013349752,
487
+ "loss": 0.087,
488
+ "mean_token_accuracy": 0.9668359816074371,
489
+ "num_tokens": 848321.0,
490
+ "step": 600
491
+ },
492
+ {
493
+ "epoch": 0.009048162926264888,
494
+ "learning_rate": 0.00019819333402554251,
495
+ "loss": 0.0579,
496
+ "mean_token_accuracy": 0.980604612827301,
497
+ "num_tokens": 862770.0,
498
+ "step": 610
499
+ },
500
+ {
501
+ "epoch": 0.009196493466039722,
502
+ "learning_rate": 0.00019816366791758756,
503
+ "loss": 0.1058,
504
+ "mean_token_accuracy": 0.9675537347793579,
505
+ "num_tokens": 877137.0,
506
+ "step": 620
507
+ },
508
+ {
509
+ "epoch": 0.009344824005814557,
510
+ "learning_rate": 0.0001981340018096326,
511
+ "loss": 0.0641,
512
+ "mean_token_accuracy": 0.9790356934070588,
513
+ "num_tokens": 891203.0,
514
+ "step": 630
515
+ },
516
+ {
517
+ "epoch": 0.009493154545589392,
518
+ "learning_rate": 0.00019810433570167764,
519
+ "loss": 0.0783,
520
+ "mean_token_accuracy": 0.9747758269309997,
521
+ "num_tokens": 905312.0,
522
+ "step": 640
523
+ },
524
+ {
525
+ "epoch": 0.009641485085364226,
526
+ "learning_rate": 0.00019807466959372266,
527
+ "loss": 0.0792,
528
+ "mean_token_accuracy": 0.9732525169849395,
529
+ "num_tokens": 919371.0,
530
+ "step": 650
531
+ },
532
+ {
533
+ "epoch": 0.00978981562513906,
534
+ "learning_rate": 0.00019804500348576768,
535
+ "loss": 0.065,
536
+ "mean_token_accuracy": 0.9778533697128295,
537
+ "num_tokens": 933687.0,
538
+ "step": 660
539
+ },
540
+ {
541
+ "epoch": 0.009938146164913893,
542
+ "learning_rate": 0.00019801533737781273,
543
+ "loss": 0.0754,
544
+ "mean_token_accuracy": 0.9709767520427703,
545
+ "num_tokens": 947526.0,
546
+ "step": 670
547
+ },
548
+ {
549
+ "epoch": 0.010086476704688728,
550
+ "learning_rate": 0.00019798567126985776,
551
+ "loss": 0.1087,
552
+ "mean_token_accuracy": 0.9693429231643677,
553
+ "num_tokens": 961548.0,
554
+ "step": 680
555
+ },
556
+ {
557
+ "epoch": 0.010234807244463563,
558
+ "learning_rate": 0.0001979560051619028,
559
+ "loss": 0.0875,
560
+ "mean_token_accuracy": 0.9671817898750306,
561
+ "num_tokens": 975556.0,
562
+ "step": 690
563
+ },
564
+ {
565
+ "epoch": 0.010383137784238397,
566
+ "learning_rate": 0.00019792633905394783,
567
+ "loss": 0.0931,
568
+ "mean_token_accuracy": 0.9703514873981476,
569
+ "num_tokens": 989813.0,
570
+ "step": 700
571
+ },
572
+ {
573
+ "epoch": 0.010531468324013232,
574
+ "learning_rate": 0.00019789667294599285,
575
+ "loss": 0.0768,
576
+ "mean_token_accuracy": 0.973369836807251,
577
+ "num_tokens": 1003889.0,
578
+ "step": 710
579
+ },
580
+ {
581
+ "epoch": 0.010679798863788066,
582
+ "learning_rate": 0.0001978670068380379,
583
+ "loss": 0.1042,
584
+ "mean_token_accuracy": 0.9632522821426391,
585
+ "num_tokens": 1018053.0,
586
+ "step": 720
587
+ },
588
+ {
589
+ "epoch": 0.010828129403562899,
590
+ "learning_rate": 0.00019783734073008293,
591
+ "loss": 0.0748,
592
+ "mean_token_accuracy": 0.9709040760993958,
593
+ "num_tokens": 1032016.0,
594
+ "step": 730
595
+ },
596
+ {
597
+ "epoch": 0.010976459943337733,
598
+ "learning_rate": 0.00019780767462212798,
599
+ "loss": 0.0944,
600
+ "mean_token_accuracy": 0.9654510498046875,
601
+ "num_tokens": 1046250.0,
602
+ "step": 740
603
+ },
604
+ {
605
+ "epoch": 0.011124790483112568,
606
+ "learning_rate": 0.000197778008514173,
607
+ "loss": 0.0982,
608
+ "mean_token_accuracy": 0.966586810350418,
609
+ "num_tokens": 1060619.0,
610
+ "step": 750
611
+ },
612
+ {
613
+ "epoch": 0.011273121022887403,
614
+ "learning_rate": 0.00019774834240621802,
615
+ "loss": 0.0694,
616
+ "mean_token_accuracy": 0.9705922186374665,
617
+ "num_tokens": 1074706.0,
618
+ "step": 760
619
+ },
620
+ {
621
+ "epoch": 0.011421451562662237,
622
+ "learning_rate": 0.00019771867629826307,
623
+ "loss": 0.0875,
624
+ "mean_token_accuracy": 0.9699962615966797,
625
+ "num_tokens": 1088843.0,
626
+ "step": 770
627
+ },
628
+ {
629
+ "epoch": 0.01156978210243707,
630
+ "learning_rate": 0.0001976890101903081,
631
+ "loss": 0.0962,
632
+ "mean_token_accuracy": 0.9701560854911804,
633
+ "num_tokens": 1102921.0,
634
+ "step": 780
635
+ },
636
+ {
637
+ "epoch": 0.011718112642211904,
638
+ "learning_rate": 0.00019765934408235312,
639
+ "loss": 0.0641,
640
+ "mean_token_accuracy": 0.9813205659389496,
641
+ "num_tokens": 1117155.0,
642
+ "step": 790
643
+ },
644
+ {
645
+ "epoch": 0.011866443181986739,
646
+ "learning_rate": 0.00019762967797439814,
647
+ "loss": 0.0923,
648
+ "mean_token_accuracy": 0.9718149483203888,
649
+ "num_tokens": 1131435.0,
650
+ "step": 800
651
+ },
652
+ {
653
+ "epoch": 0.012014773721761574,
654
+ "learning_rate": 0.0001976000118664432,
655
+ "loss": 0.0798,
656
+ "mean_token_accuracy": 0.9723540186882019,
657
+ "num_tokens": 1145444.0,
658
+ "step": 810
659
+ },
660
+ {
661
+ "epoch": 0.012163104261536408,
662
+ "learning_rate": 0.00019757034575848824,
663
+ "loss": 0.1028,
664
+ "mean_token_accuracy": 0.9645315170288086,
665
+ "num_tokens": 1159625.0,
666
+ "step": 820
667
+ },
668
+ {
669
+ "epoch": 0.012311434801311243,
670
+ "learning_rate": 0.00019754067965053326,
671
+ "loss": 0.0893,
672
+ "mean_token_accuracy": 0.9674223959445953,
673
+ "num_tokens": 1173790.0,
674
+ "step": 830
675
+ },
676
+ {
677
+ "epoch": 0.012459765341086075,
678
+ "learning_rate": 0.0001975110135425783,
679
+ "loss": 0.0703,
680
+ "mean_token_accuracy": 0.9750796139240265,
681
+ "num_tokens": 1187787.0,
682
+ "step": 840
683
+ },
684
+ {
685
+ "epoch": 0.01260809588086091,
686
+ "learning_rate": 0.0001974813474346233,
687
+ "loss": 0.0885,
688
+ "mean_token_accuracy": 0.9665023148059845,
689
+ "num_tokens": 1202159.0,
690
+ "step": 850
691
+ },
692
+ {
693
+ "epoch": 0.012756426420635745,
694
+ "learning_rate": 0.00019745168132666836,
695
+ "loss": 0.0793,
696
+ "mean_token_accuracy": 0.9742439985275269,
697
+ "num_tokens": 1216318.0,
698
+ "step": 860
699
+ },
700
+ {
701
+ "epoch": 0.012904756960410579,
702
+ "learning_rate": 0.0001974220152187134,
703
+ "loss": 0.0654,
704
+ "mean_token_accuracy": 0.9788648605346679,
705
+ "num_tokens": 1230383.0,
706
+ "step": 870
707
+ },
708
+ {
709
+ "epoch": 0.013053087500185414,
710
+ "learning_rate": 0.00019739234911075843,
711
+ "loss": 0.0862,
712
+ "mean_token_accuracy": 0.9693156242370605,
713
+ "num_tokens": 1244340.0,
714
+ "step": 880
715
+ },
716
+ {
717
+ "epoch": 0.013201418039960248,
718
+ "learning_rate": 0.00019736268300280346,
719
+ "loss": 0.091,
720
+ "mean_token_accuracy": 0.969611394405365,
721
+ "num_tokens": 1258576.0,
722
+ "step": 890
723
+ },
724
+ {
725
+ "epoch": 0.013349748579735081,
726
+ "learning_rate": 0.00019733301689484848,
727
+ "loss": 0.0907,
728
+ "mean_token_accuracy": 0.9679802298545838,
729
+ "num_tokens": 1272705.0,
730
+ "step": 900
731
+ },
732
+ {
733
+ "epoch": 0.013498079119509915,
734
+ "learning_rate": 0.00019730335078689353,
735
+ "loss": 0.098,
736
+ "mean_token_accuracy": 0.9632154762744903,
737
+ "num_tokens": 1286700.0,
738
+ "step": 910
739
+ },
740
+ {
741
+ "epoch": 0.01364640965928475,
742
+ "learning_rate": 0.00019727368467893855,
743
+ "loss": 0.0713,
744
+ "mean_token_accuracy": 0.973181027173996,
745
+ "num_tokens": 1300730.0,
746
+ "step": 920
747
+ },
748
+ {
749
+ "epoch": 0.013794740199059585,
750
+ "learning_rate": 0.00019724401857098357,
751
+ "loss": 0.0926,
752
+ "mean_token_accuracy": 0.9724487364292145,
753
+ "num_tokens": 1314843.0,
754
+ "step": 930
755
+ },
756
+ {
757
+ "epoch": 0.013943070738834419,
758
+ "learning_rate": 0.00019721435246302862,
759
+ "loss": 0.078,
760
+ "mean_token_accuracy": 0.9706206858158112,
761
+ "num_tokens": 1328967.0,
762
+ "step": 940
763
+ },
764
+ {
765
+ "epoch": 0.014091401278609254,
766
+ "learning_rate": 0.00019718468635507365,
767
+ "loss": 0.11,
768
+ "mean_token_accuracy": 0.962195897102356,
769
+ "num_tokens": 1343046.0,
770
+ "step": 950
771
+ },
772
+ {
773
+ "epoch": 0.014239731818384086,
774
+ "learning_rate": 0.0001971550202471187,
775
+ "loss": 0.0722,
776
+ "mean_token_accuracy": 0.9760941386222839,
777
+ "num_tokens": 1357072.0,
778
+ "step": 960
779
+ },
780
+ {
781
+ "epoch": 0.014388062358158921,
782
+ "learning_rate": 0.00019712535413916372,
783
+ "loss": 0.0626,
784
+ "mean_token_accuracy": 0.9810939252376556,
785
+ "num_tokens": 1371301.0,
786
+ "step": 970
787
+ },
788
+ {
789
+ "epoch": 0.014536392897933756,
790
+ "learning_rate": 0.00019709568803120874,
791
+ "loss": 0.0972,
792
+ "mean_token_accuracy": 0.9752714991569519,
793
+ "num_tokens": 1385224.0,
794
+ "step": 980
795
+ },
796
+ {
797
+ "epoch": 0.01468472343770859,
798
+ "learning_rate": 0.0001970660219232538,
799
+ "loss": 0.065,
800
+ "mean_token_accuracy": 0.9749573111534119,
801
+ "num_tokens": 1399236.0,
802
+ "step": 990
803
+ },
804
+ {
805
+ "epoch": 0.014833053977483425,
806
+ "learning_rate": 0.00019703635581529882,
807
+ "loss": 0.0733,
808
+ "mean_token_accuracy": 0.971584141254425,
809
+ "num_tokens": 1413329.0,
810
+ "step": 1000
811
+ },
812
+ {
813
+ "epoch": 0.01498138451725826,
814
+ "learning_rate": 0.00019700668970734387,
815
+ "loss": 0.0984,
816
+ "mean_token_accuracy": 0.9667350590229035,
817
+ "num_tokens": 1427449.0,
818
+ "step": 1010
819
+ },
820
+ {
821
+ "epoch": 0.015129715057033092,
822
+ "learning_rate": 0.0001969770235993889,
823
+ "loss": 0.0758,
824
+ "mean_token_accuracy": 0.9693971753120423,
825
+ "num_tokens": 1441671.0,
826
+ "step": 1020
827
+ },
828
+ {
829
+ "epoch": 0.015278045596807927,
830
+ "learning_rate": 0.0001969473574914339,
831
+ "loss": 0.0877,
832
+ "mean_token_accuracy": 0.9640409171581268,
833
+ "num_tokens": 1455599.0,
834
+ "step": 1030
835
+ },
836
+ {
837
+ "epoch": 0.015426376136582761,
838
+ "learning_rate": 0.00019691769138347896,
839
+ "loss": 0.078,
840
+ "mean_token_accuracy": 0.973029488325119,
841
+ "num_tokens": 1469741.0,
842
+ "step": 1040
843
+ },
844
+ {
845
+ "epoch": 0.015574706676357596,
846
+ "learning_rate": 0.00019688802527552399,
847
+ "loss": 0.0709,
848
+ "mean_token_accuracy": 0.97868133187294,
849
+ "num_tokens": 1483981.0,
850
+ "step": 1050
851
+ },
852
+ {
853
+ "epoch": 0.01572303721613243,
854
+ "learning_rate": 0.00019685835916756904,
855
+ "loss": 0.0718,
856
+ "mean_token_accuracy": 0.9745873928070068,
857
+ "num_tokens": 1497968.0,
858
+ "step": 1060
859
+ },
860
+ {
861
+ "epoch": 0.015871367755907265,
862
+ "learning_rate": 0.00019682869305961403,
863
+ "loss": 0.073,
864
+ "mean_token_accuracy": 0.9762903869152069,
865
+ "num_tokens": 1511999.0,
866
+ "step": 1070
867
+ },
868
+ {
869
+ "epoch": 0.0160196982956821,
870
+ "learning_rate": 0.00019679902695165908,
871
+ "loss": 0.0854,
872
+ "mean_token_accuracy": 0.9715777993202209,
873
+ "num_tokens": 1526208.0,
874
+ "step": 1080
875
+ },
876
+ {
877
+ "epoch": 0.016168028835456934,
878
+ "learning_rate": 0.00019676936084370413,
879
+ "loss": 0.0589,
880
+ "mean_token_accuracy": 0.9842264592647553,
881
+ "num_tokens": 1540482.0,
882
+ "step": 1090
883
+ },
884
+ {
885
+ "epoch": 0.016316359375231765,
886
+ "learning_rate": 0.00019673969473574916,
887
+ "loss": 0.0702,
888
+ "mean_token_accuracy": 0.9759678483009339,
889
+ "num_tokens": 1554508.0,
890
+ "step": 1100
891
+ },
892
+ {
893
+ "epoch": 0.0164646899150066,
894
+ "learning_rate": 0.0001967100286277942,
895
+ "loss": 0.0768,
896
+ "mean_token_accuracy": 0.9776164293289185,
897
+ "num_tokens": 1568694.0,
898
+ "step": 1110
899
+ },
900
+ {
901
+ "epoch": 0.016613020454781434,
902
+ "learning_rate": 0.0001966803625198392,
903
+ "loss": 0.0879,
904
+ "mean_token_accuracy": 0.973237669467926,
905
+ "num_tokens": 1582893.0,
906
+ "step": 1120
907
+ },
908
+ {
909
+ "epoch": 0.01676135099455627,
910
+ "learning_rate": 0.00019665069641188425,
911
+ "loss": 0.0974,
912
+ "mean_token_accuracy": 0.974377167224884,
913
+ "num_tokens": 1597044.0,
914
+ "step": 1130
915
+ },
916
+ {
917
+ "epoch": 0.016909681534331103,
918
+ "learning_rate": 0.00019662103030392927,
919
+ "loss": 0.0856,
920
+ "mean_token_accuracy": 0.9713896453380585,
921
+ "num_tokens": 1611013.0,
922
+ "step": 1140
923
+ },
924
+ {
925
+ "epoch": 0.017058012074105938,
926
+ "learning_rate": 0.00019659136419597432,
927
+ "loss": 0.079,
928
+ "mean_token_accuracy": 0.9759809911251068,
929
+ "num_tokens": 1625267.0,
930
+ "step": 1150
931
+ },
932
+ {
933
+ "epoch": 0.017206342613880772,
934
+ "learning_rate": 0.00019656169808801935,
935
+ "loss": 0.0837,
936
+ "mean_token_accuracy": 0.9738470792770386,
937
+ "num_tokens": 1639399.0,
938
+ "step": 1160
939
+ },
940
+ {
941
+ "epoch": 0.017354673153655607,
942
+ "learning_rate": 0.00019653203198006437,
943
+ "loss": 0.0851,
944
+ "mean_token_accuracy": 0.9707063674926758,
945
+ "num_tokens": 1653578.0,
946
+ "step": 1170
947
+ },
948
+ {
949
+ "epoch": 0.01750300369343044,
950
+ "learning_rate": 0.00019650236587210942,
951
+ "loss": 0.0848,
952
+ "mean_token_accuracy": 0.9718270719051361,
953
+ "num_tokens": 1667589.0,
954
+ "step": 1180
955
+ },
956
+ {
957
+ "epoch": 0.017651334233205276,
958
+ "learning_rate": 0.00019647269976415444,
959
+ "loss": 0.0594,
960
+ "mean_token_accuracy": 0.9799826085567475,
961
+ "num_tokens": 1682134.0,
962
+ "step": 1190
963
+ },
964
+ {
965
+ "epoch": 0.01779966477298011,
966
+ "learning_rate": 0.0001964430336561995,
967
+ "loss": 0.0945,
968
+ "mean_token_accuracy": 0.9730036854743958,
969
+ "num_tokens": 1696174.0,
970
+ "step": 1200
971
+ },
972
+ {
973
+ "epoch": 0.017947995312754945,
974
+ "learning_rate": 0.00019641336754824452,
975
+ "loss": 0.077,
976
+ "mean_token_accuracy": 0.9751695334911347,
977
+ "num_tokens": 1710134.0,
978
+ "step": 1210
979
+ },
980
+ {
981
+ "epoch": 0.018096325852529776,
982
+ "learning_rate": 0.00019638370144028954,
983
+ "loss": 0.0812,
984
+ "mean_token_accuracy": 0.9705005526542664,
985
+ "num_tokens": 1724066.0,
986
+ "step": 1220
987
+ },
988
+ {
989
+ "epoch": 0.01824465639230461,
990
+ "learning_rate": 0.0001963540353323346,
991
+ "loss": 0.0704,
992
+ "mean_token_accuracy": 0.9745881497859955,
993
+ "num_tokens": 1738123.0,
994
+ "step": 1230
995
+ },
996
+ {
997
+ "epoch": 0.018392986932079445,
998
+ "learning_rate": 0.0001963243692243796,
999
+ "loss": 0.0702,
1000
+ "mean_token_accuracy": 0.975337028503418,
1001
+ "num_tokens": 1752512.0,
1002
+ "step": 1240
1003
+ },
1004
+ {
1005
+ "epoch": 0.01854131747185428,
1006
+ "learning_rate": 0.00019629470311642466,
1007
+ "loss": 0.0658,
1008
+ "mean_token_accuracy": 0.9756161510944367,
1009
+ "num_tokens": 1766708.0,
1010
+ "step": 1250
1011
+ },
1012
+ {
1013
+ "epoch": 0.018689648011629114,
1014
+ "learning_rate": 0.00019626503700846969,
1015
+ "loss": 0.0774,
1016
+ "mean_token_accuracy": 0.9716881215572357,
1017
+ "num_tokens": 1780735.0,
1018
+ "step": 1260
1019
+ },
1020
+ {
1021
+ "epoch": 0.01883797855140395,
1022
+ "learning_rate": 0.0001962353709005147,
1023
+ "loss": 0.0714,
1024
+ "mean_token_accuracy": 0.9714816689491272,
1025
+ "num_tokens": 1794777.0,
1026
+ "step": 1270
1027
+ },
1028
+ {
1029
+ "epoch": 0.018986309091178783,
1030
+ "learning_rate": 0.00019620570479255976,
1031
+ "loss": 0.0668,
1032
+ "mean_token_accuracy": 0.9810837268829345,
1033
+ "num_tokens": 1809000.0,
1034
+ "step": 1280
1035
+ },
1036
+ {
1037
+ "epoch": 0.019134639630953618,
1038
+ "learning_rate": 0.00019617603868460478,
1039
+ "loss": 0.0793,
1040
+ "mean_token_accuracy": 0.9687060952186585,
1041
+ "num_tokens": 1822994.0,
1042
+ "step": 1290
1043
+ },
1044
+ {
1045
+ "epoch": 0.019282970170728452,
1046
+ "learning_rate": 0.0001961463725766498,
1047
+ "loss": 0.1032,
1048
+ "mean_token_accuracy": 0.9625421404838562,
1049
+ "num_tokens": 1836944.0,
1050
+ "step": 1300
1051
+ },
1052
+ {
1053
+ "epoch": 0.019431300710503287,
1054
+ "learning_rate": 0.00019611670646869485,
1055
+ "loss": 0.0492,
1056
+ "mean_token_accuracy": 0.9850762605667114,
1057
+ "num_tokens": 1851184.0,
1058
+ "step": 1310
1059
+ },
1060
+ {
1061
+ "epoch": 0.01957963125027812,
1062
+ "learning_rate": 0.00019608704036073988,
1063
+ "loss": 0.057,
1064
+ "mean_token_accuracy": 0.9841500043869018,
1065
+ "num_tokens": 1865381.0,
1066
+ "step": 1320
1067
+ },
1068
+ {
1069
+ "epoch": 0.019727961790052952,
1070
+ "learning_rate": 0.00019605737425278493,
1071
+ "loss": 0.0896,
1072
+ "mean_token_accuracy": 0.9729780375957489,
1073
+ "num_tokens": 1879523.0,
1074
+ "step": 1330
1075
+ },
1076
+ {
1077
+ "epoch": 0.019876292329827787,
1078
+ "learning_rate": 0.00019602770814482995,
1079
+ "loss": 0.0853,
1080
+ "mean_token_accuracy": 0.9706455945968628,
1081
+ "num_tokens": 1893719.0,
1082
+ "step": 1340
1083
+ },
1084
+ {
1085
+ "epoch": 0.02002462286960262,
1086
+ "learning_rate": 0.00019599804203687497,
1087
+ "loss": 0.0721,
1088
+ "mean_token_accuracy": 0.9733758211135864,
1089
+ "num_tokens": 1908020.0,
1090
+ "step": 1350
1091
+ },
1092
+ {
1093
+ "epoch": 0.020172953409377456,
1094
+ "learning_rate": 0.00019596837592892002,
1095
+ "loss": 0.0645,
1096
+ "mean_token_accuracy": 0.9787311971187591,
1097
+ "num_tokens": 1922020.0,
1098
+ "step": 1360
1099
+ },
1100
+ {
1101
+ "epoch": 0.02032128394915229,
1102
+ "learning_rate": 0.00019593870982096505,
1103
+ "loss": 0.0669,
1104
+ "mean_token_accuracy": 0.974337100982666,
1105
+ "num_tokens": 1936105.0,
1106
+ "step": 1370
1107
+ },
1108
+ {
1109
+ "epoch": 0.020469614488927125,
1110
+ "learning_rate": 0.0001959090437130101,
1111
+ "loss": 0.076,
1112
+ "mean_token_accuracy": 0.9723476529121399,
1113
+ "num_tokens": 1950298.0,
1114
+ "step": 1380
1115
+ },
1116
+ {
1117
+ "epoch": 0.02061794502870196,
1118
+ "learning_rate": 0.0001958793776050551,
1119
+ "loss": 0.0691,
1120
+ "mean_token_accuracy": 0.9764381349086761,
1121
+ "num_tokens": 1964529.0,
1122
+ "step": 1390
1123
+ },
1124
+ {
1125
+ "epoch": 0.020766275568476794,
1126
+ "learning_rate": 0.00019584971149710014,
1127
+ "loss": 0.0593,
1128
+ "mean_token_accuracy": 0.9796685576438904,
1129
+ "num_tokens": 1978773.0,
1130
+ "step": 1400
1131
+ },
1132
+ {
1133
+ "epoch": 0.02091460610825163,
1134
+ "learning_rate": 0.00019582004538914517,
1135
+ "loss": 0.0995,
1136
+ "mean_token_accuracy": 0.9613340020179748,
1137
+ "num_tokens": 1993002.0,
1138
+ "step": 1410
1139
+ },
1140
+ {
1141
+ "epoch": 0.021062936648026463,
1142
+ "learning_rate": 0.00019579037928119022,
1143
+ "loss": 0.1051,
1144
+ "mean_token_accuracy": 0.9684881687164306,
1145
+ "num_tokens": 2007048.0,
1146
+ "step": 1420
1147
+ },
1148
+ {
1149
+ "epoch": 0.021211267187801298,
1150
+ "learning_rate": 0.00019576071317323527,
1151
+ "loss": 0.0869,
1152
+ "mean_token_accuracy": 0.9721337735652924,
1153
+ "num_tokens": 2021156.0,
1154
+ "step": 1430
1155
+ },
1156
+ {
1157
+ "epoch": 0.021359597727576132,
1158
+ "learning_rate": 0.00019573104706528026,
1159
+ "loss": 0.0897,
1160
+ "mean_token_accuracy": 0.9715201795101166,
1161
+ "num_tokens": 2035271.0,
1162
+ "step": 1440
1163
+ },
1164
+ {
1165
+ "epoch": 0.021507928267350963,
1166
+ "learning_rate": 0.0001957013809573253,
1167
+ "loss": 0.0706,
1168
+ "mean_token_accuracy": 0.9721573770046235,
1169
+ "num_tokens": 2049388.0,
1170
+ "step": 1450
1171
+ },
1172
+ {
1173
+ "epoch": 0.021656258807125798,
1174
+ "learning_rate": 0.00019567171484937034,
1175
+ "loss": 0.0846,
1176
+ "mean_token_accuracy": 0.9714311838150025,
1177
+ "num_tokens": 2063480.0,
1178
+ "step": 1460
1179
+ },
1180
+ {
1181
+ "epoch": 0.021804589346900632,
1182
+ "learning_rate": 0.00019564204874141539,
1183
+ "loss": 0.0765,
1184
+ "mean_token_accuracy": 0.9730508744716644,
1185
+ "num_tokens": 2077370.0,
1186
+ "step": 1470
1187
+ },
1188
+ {
1189
+ "epoch": 0.021952919886675467,
1190
+ "learning_rate": 0.00019561238263346044,
1191
+ "loss": 0.0791,
1192
+ "mean_token_accuracy": 0.9674266993999481,
1193
+ "num_tokens": 2091582.0,
1194
+ "step": 1480
1195
+ },
1196
+ {
1197
+ "epoch": 0.0221012504264503,
1198
+ "learning_rate": 0.00019558271652550543,
1199
+ "loss": 0.0528,
1200
+ "mean_token_accuracy": 0.9833854794502258,
1201
+ "num_tokens": 2105530.0,
1202
+ "step": 1490
1203
+ },
1204
+ {
1205
+ "epoch": 0.022249580966225136,
1206
+ "learning_rate": 0.00019555305041755048,
1207
+ "loss": 0.0687,
1208
+ "mean_token_accuracy": 0.9794368088245392,
1209
+ "num_tokens": 2119632.0,
1210
+ "step": 1500
1211
+ },
1212
+ {
1213
+ "epoch": 0.02239791150599997,
1214
+ "learning_rate": 0.0001955233843095955,
1215
+ "loss": 0.0636,
1216
+ "mean_token_accuracy": 0.9763643145561218,
1217
+ "num_tokens": 2133725.0,
1218
+ "step": 1510
1219
+ },
1220
+ {
1221
+ "epoch": 0.022546242045774805,
1222
+ "learning_rate": 0.00019549371820164055,
1223
+ "loss": 0.0827,
1224
+ "mean_token_accuracy": 0.9748819410800934,
1225
+ "num_tokens": 2147866.0,
1226
+ "step": 1520
1227
+ },
1228
+ {
1229
+ "epoch": 0.02269457258554964,
1230
+ "learning_rate": 0.00019546405209368558,
1231
+ "loss": 0.0582,
1232
+ "mean_token_accuracy": 0.9838931798934937,
1233
+ "num_tokens": 2161994.0,
1234
+ "step": 1530
1235
+ },
1236
+ {
1237
+ "epoch": 0.022842903125324474,
1238
+ "learning_rate": 0.0001954343859857306,
1239
+ "loss": 0.059,
1240
+ "mean_token_accuracy": 0.9762311697006225,
1241
+ "num_tokens": 2176227.0,
1242
+ "step": 1540
1243
+ },
1244
+ {
1245
+ "epoch": 0.02299123366509931,
1246
+ "learning_rate": 0.00019540471987777565,
1247
+ "loss": 0.0773,
1248
+ "mean_token_accuracy": 0.977224487066269,
1249
+ "num_tokens": 2190154.0,
1250
+ "step": 1550
1251
+ },
1252
+ {
1253
+ "epoch": 0.02313956420487414,
1254
+ "learning_rate": 0.00019537505376982067,
1255
+ "loss": 0.0893,
1256
+ "mean_token_accuracy": 0.9703286468982697,
1257
+ "num_tokens": 2204261.0,
1258
+ "step": 1560
1259
+ },
1260
+ {
1261
+ "epoch": 0.023287894744648974,
1262
+ "learning_rate": 0.00019534538766186572,
1263
+ "loss": 0.0752,
1264
+ "mean_token_accuracy": 0.974709951877594,
1265
+ "num_tokens": 2218636.0,
1266
+ "step": 1570
1267
+ },
1268
+ {
1269
+ "epoch": 0.02343622528442381,
1270
+ "learning_rate": 0.00019531572155391075,
1271
+ "loss": 0.0841,
1272
+ "mean_token_accuracy": 0.9769960045814514,
1273
+ "num_tokens": 2232764.0,
1274
+ "step": 1580
1275
+ },
1276
+ {
1277
+ "epoch": 0.023584555824198643,
1278
+ "learning_rate": 0.00019528605544595577,
1279
+ "loss": 0.0642,
1280
+ "mean_token_accuracy": 0.977448046207428,
1281
+ "num_tokens": 2246804.0,
1282
+ "step": 1590
1283
+ },
1284
+ {
1285
+ "epoch": 0.023732886363973478,
1286
+ "learning_rate": 0.00019525638933800082,
1287
+ "loss": 0.1069,
1288
+ "mean_token_accuracy": 0.9677329897880554,
1289
+ "num_tokens": 2260874.0,
1290
+ "step": 1600
1291
+ },
1292
+ {
1293
+ "epoch": 0.023881216903748313,
1294
+ "learning_rate": 0.00019522672323004584,
1295
+ "loss": 0.0833,
1296
+ "mean_token_accuracy": 0.9727708697319031,
1297
+ "num_tokens": 2275017.0,
1298
+ "step": 1610
1299
+ },
1300
+ {
1301
+ "epoch": 0.024029547443523147,
1302
+ "learning_rate": 0.0001951970571220909,
1303
+ "loss": 0.0785,
1304
+ "mean_token_accuracy": 0.9746884107589722,
1305
+ "num_tokens": 2289036.0,
1306
+ "step": 1620
1307
+ },
1308
+ {
1309
+ "epoch": 0.02417787798329798,
1310
+ "learning_rate": 0.00019516739101413592,
1311
+ "loss": 0.0609,
1312
+ "mean_token_accuracy": 0.9793126463890076,
1313
+ "num_tokens": 2303029.0,
1314
+ "step": 1630
1315
+ },
1316
+ {
1317
+ "epoch": 0.024326208523072816,
1318
+ "learning_rate": 0.00019513772490618094,
1319
+ "loss": 0.0772,
1320
+ "mean_token_accuracy": 0.9758535027503967,
1321
+ "num_tokens": 2317190.0,
1322
+ "step": 1640
1323
+ },
1324
+ {
1325
+ "epoch": 0.02447453906284765,
1326
+ "learning_rate": 0.000195108058798226,
1327
+ "loss": 0.08,
1328
+ "mean_token_accuracy": 0.9721879363059998,
1329
+ "num_tokens": 2331217.0,
1330
+ "step": 1650
1331
+ },
1332
+ {
1333
+ "epoch": 0.024622869602622485,
1334
+ "learning_rate": 0.000195078392690271,
1335
+ "loss": 0.0801,
1336
+ "mean_token_accuracy": 0.9673756003379822,
1337
+ "num_tokens": 2345300.0,
1338
+ "step": 1660
1339
+ },
1340
+ {
1341
+ "epoch": 0.02477120014239732,
1342
+ "learning_rate": 0.00019504872658231603,
1343
+ "loss": 0.0749,
1344
+ "mean_token_accuracy": 0.9750915884971618,
1345
+ "num_tokens": 2359603.0,
1346
+ "step": 1670
1347
+ },
1348
+ {
1349
+ "epoch": 0.02491953068217215,
1350
+ "learning_rate": 0.00019501906047436106,
1351
+ "loss": 0.0913,
1352
+ "mean_token_accuracy": 0.975084537267685,
1353
+ "num_tokens": 2373919.0,
1354
+ "step": 1680
1355
+ },
1356
+ {
1357
+ "epoch": 0.025067861221946985,
1358
+ "learning_rate": 0.0001949893943664061,
1359
+ "loss": 0.057,
1360
+ "mean_token_accuracy": 0.9772068738937378,
1361
+ "num_tokens": 2388316.0,
1362
+ "step": 1690
1363
+ },
1364
+ {
1365
+ "epoch": 0.02521619176172182,
1366
+ "learning_rate": 0.00019495972825845116,
1367
+ "loss": 0.0667,
1368
+ "mean_token_accuracy": 0.9791467607021331,
1369
+ "num_tokens": 2402418.0,
1370
+ "step": 1700
1371
+ },
1372
+ {
1373
+ "epoch": 0.025364522301496654,
1374
+ "learning_rate": 0.00019493006215049618,
1375
+ "loss": 0.0688,
1376
+ "mean_token_accuracy": 0.9734372615814209,
1377
+ "num_tokens": 2416607.0,
1378
+ "step": 1710
1379
+ },
1380
+ {
1381
+ "epoch": 0.02551285284127149,
1382
+ "learning_rate": 0.0001949003960425412,
1383
+ "loss": 0.0738,
1384
+ "mean_token_accuracy": 0.9727434098720551,
1385
+ "num_tokens": 2430760.0,
1386
+ "step": 1720
1387
+ },
1388
+ {
1389
+ "epoch": 0.025661183381046324,
1390
+ "learning_rate": 0.00019487072993458623,
1391
+ "loss": 0.0631,
1392
+ "mean_token_accuracy": 0.9753291308879852,
1393
+ "num_tokens": 2444839.0,
1394
+ "step": 1730
1395
+ },
1396
+ {
1397
+ "epoch": 0.025809513920821158,
1398
+ "learning_rate": 0.00019484106382663128,
1399
+ "loss": 0.0688,
1400
+ "mean_token_accuracy": 0.9749854207038879,
1401
+ "num_tokens": 2459179.0,
1402
+ "step": 1740
1403
+ },
1404
+ {
1405
+ "epoch": 0.025957844460595993,
1406
+ "learning_rate": 0.00019481139771867633,
1407
+ "loss": 0.0798,
1408
+ "mean_token_accuracy": 0.9702626287937164,
1409
+ "num_tokens": 2473205.0,
1410
+ "step": 1750
1411
+ },
1412
+ {
1413
+ "epoch": 0.026106175000370827,
1414
+ "learning_rate": 0.00019478173161072132,
1415
+ "loss": 0.09,
1416
+ "mean_token_accuracy": 0.9717526614665986,
1417
+ "num_tokens": 2487522.0,
1418
+ "step": 1760
1419
+ },
1420
+ {
1421
+ "epoch": 0.026254505540145662,
1422
+ "learning_rate": 0.00019475206550276637,
1423
+ "loss": 0.0785,
1424
+ "mean_token_accuracy": 0.9719234526157379,
1425
+ "num_tokens": 2501550.0,
1426
+ "step": 1770
1427
+ },
1428
+ {
1429
+ "epoch": 0.026402836079920496,
1430
+ "learning_rate": 0.0001947223993948114,
1431
+ "loss": 0.0924,
1432
+ "mean_token_accuracy": 0.9727768480777741,
1433
+ "num_tokens": 2515563.0,
1434
+ "step": 1780
1435
+ },
1436
+ {
1437
+ "epoch": 0.026551166619695327,
1438
+ "learning_rate": 0.00019469273328685645,
1439
+ "loss": 0.0961,
1440
+ "mean_token_accuracy": 0.9694974541664123,
1441
+ "num_tokens": 2529739.0,
1442
+ "step": 1790
1443
+ },
1444
+ {
1445
+ "epoch": 0.026699497159470162,
1446
+ "learning_rate": 0.00019466306717890147,
1447
+ "loss": 0.0959,
1448
+ "mean_token_accuracy": 0.9663477540016174,
1449
+ "num_tokens": 2543804.0,
1450
+ "step": 1800
1451
+ },
1452
+ {
1453
+ "epoch": 0.026847827699244996,
1454
+ "learning_rate": 0.0001946334010709465,
1455
+ "loss": 0.0963,
1456
+ "mean_token_accuracy": 0.9670729875564575,
1457
+ "num_tokens": 2557916.0,
1458
+ "step": 1810
1459
+ },
1460
+ {
1461
+ "epoch": 0.02699615823901983,
1462
+ "learning_rate": 0.00019460373496299154,
1463
+ "loss": 0.0897,
1464
+ "mean_token_accuracy": 0.9735462188720703,
1465
+ "num_tokens": 2571905.0,
1466
+ "step": 1820
1467
+ },
1468
+ {
1469
+ "epoch": 0.027144488778794666,
1470
+ "learning_rate": 0.00019457406885503657,
1471
+ "loss": 0.0811,
1472
+ "mean_token_accuracy": 0.9717726945877075,
1473
+ "num_tokens": 2585903.0,
1474
+ "step": 1830
1475
+ },
1476
+ {
1477
+ "epoch": 0.0272928193185695,
1478
+ "learning_rate": 0.00019454440274708162,
1479
+ "loss": 0.0845,
1480
+ "mean_token_accuracy": 0.9706952691078186,
1481
+ "num_tokens": 2600073.0,
1482
+ "step": 1840
1483
+ },
1484
+ {
1485
+ "epoch": 0.027441149858344335,
1486
+ "learning_rate": 0.00019451473663912664,
1487
+ "loss": 0.0768,
1488
+ "mean_token_accuracy": 0.9728710412979126,
1489
+ "num_tokens": 2614150.0,
1490
+ "step": 1850
1491
+ },
1492
+ {
1493
+ "epoch": 0.02758948039811917,
1494
+ "learning_rate": 0.00019448507053117166,
1495
+ "loss": 0.0758,
1496
+ "mean_token_accuracy": 0.9708646655082702,
1497
+ "num_tokens": 2628224.0,
1498
+ "step": 1860
1499
+ },
1500
+ {
1501
+ "epoch": 0.027737810937894004,
1502
+ "learning_rate": 0.0001944554044232167,
1503
+ "loss": 0.0773,
1504
+ "mean_token_accuracy": 0.9729286253452301,
1505
+ "num_tokens": 2642113.0,
1506
+ "step": 1870
1507
+ },
1508
+ {
1509
+ "epoch": 0.027886141477668838,
1510
+ "learning_rate": 0.00019442573831526173,
1511
+ "loss": 0.0927,
1512
+ "mean_token_accuracy": 0.9710256516933441,
1513
+ "num_tokens": 2656078.0,
1514
+ "step": 1880
1515
+ },
1516
+ {
1517
+ "epoch": 0.028034472017443673,
1518
+ "learning_rate": 0.00019439607220730678,
1519
+ "loss": 0.0632,
1520
+ "mean_token_accuracy": 0.978791344165802,
1521
+ "num_tokens": 2670128.0,
1522
+ "step": 1890
1523
+ },
1524
+ {
1525
+ "epoch": 0.028182802557218507,
1526
+ "learning_rate": 0.0001943664060993518,
1527
+ "loss": 0.0543,
1528
+ "mean_token_accuracy": 0.9814699769020081,
1529
+ "num_tokens": 2684474.0,
1530
+ "step": 1900
1531
+ },
1532
+ {
1533
+ "epoch": 0.02833113309699334,
1534
+ "learning_rate": 0.00019433673999139683,
1535
+ "loss": 0.063,
1536
+ "mean_token_accuracy": 0.9756844103336334,
1537
+ "num_tokens": 2698436.0,
1538
+ "step": 1910
1539
+ },
1540
+ {
1541
+ "epoch": 0.028479463636768173,
1542
+ "learning_rate": 0.00019430707388344188,
1543
+ "loss": 0.0945,
1544
+ "mean_token_accuracy": 0.9691503286361695,
1545
+ "num_tokens": 2712395.0,
1546
+ "step": 1920
1547
+ },
1548
+ {
1549
+ "epoch": 0.028627794176543007,
1550
+ "learning_rate": 0.0001942774077754869,
1551
+ "loss": 0.0601,
1552
+ "mean_token_accuracy": 0.9762123942375183,
1553
+ "num_tokens": 2726506.0,
1554
+ "step": 1930
1555
+ },
1556
+ {
1557
+ "epoch": 0.028776124716317842,
1558
+ "learning_rate": 0.00019424774166753195,
1559
+ "loss": 0.0912,
1560
+ "mean_token_accuracy": 0.9622864723205566,
1561
+ "num_tokens": 2740293.0,
1562
+ "step": 1940
1563
+ },
1564
+ {
1565
+ "epoch": 0.028924455256092677,
1566
+ "learning_rate": 0.00019421807555957695,
1567
+ "loss": 0.0758,
1568
+ "mean_token_accuracy": 0.9752536177635193,
1569
+ "num_tokens": 2754509.0,
1570
+ "step": 1950
1571
+ },
1572
+ {
1573
+ "epoch": 0.02907278579586751,
1574
+ "learning_rate": 0.000194188409451622,
1575
+ "loss": 0.0666,
1576
+ "mean_token_accuracy": 0.9746933698654174,
1577
+ "num_tokens": 2768540.0,
1578
+ "step": 1960
1579
+ },
1580
+ {
1581
+ "epoch": 0.029221116335642346,
1582
+ "learning_rate": 0.00019415874334366705,
1583
+ "loss": 0.0546,
1584
+ "mean_token_accuracy": 0.9834832668304443,
1585
+ "num_tokens": 2782780.0,
1586
+ "step": 1970
1587
+ },
1588
+ {
1589
+ "epoch": 0.02936944687541718,
1590
+ "learning_rate": 0.00019412907723571207,
1591
+ "loss": 0.0671,
1592
+ "mean_token_accuracy": 0.9755673408508301,
1593
+ "num_tokens": 2796973.0,
1594
+ "step": 1980
1595
+ },
1596
+ {
1597
+ "epoch": 0.029517777415192015,
1598
+ "learning_rate": 0.00019409941112775712,
1599
+ "loss": 0.0639,
1600
+ "mean_token_accuracy": 0.9805159747600556,
1601
+ "num_tokens": 2811149.0,
1602
+ "step": 1990
1603
+ },
1604
+ {
1605
+ "epoch": 0.02966610795496685,
1606
+ "learning_rate": 0.00019406974501980212,
1607
+ "loss": 0.0627,
1608
+ "mean_token_accuracy": 0.9782679080963135,
1609
+ "num_tokens": 2825280.0,
1610
+ "step": 2000
1611
+ },
1612
+ {
1613
+ "epoch": 0.029814438494741684,
1614
+ "learning_rate": 0.00019404007891184717,
1615
+ "loss": 0.0661,
1616
+ "mean_token_accuracy": 0.9821899354457855,
1617
+ "num_tokens": 2839485.0,
1618
+ "step": 2010
1619
+ },
1620
+ {
1621
+ "epoch": 0.02996276903451652,
1622
+ "learning_rate": 0.00019401041280389222,
1623
+ "loss": 0.0564,
1624
+ "mean_token_accuracy": 0.9785571038722992,
1625
+ "num_tokens": 2853745.0,
1626
+ "step": 2020
1627
+ },
1628
+ {
1629
+ "epoch": 0.03011109957429135,
1630
+ "learning_rate": 0.00019398074669593724,
1631
+ "loss": 0.0562,
1632
+ "mean_token_accuracy": 0.9799038827419281,
1633
+ "num_tokens": 2867981.0,
1634
+ "step": 2030
1635
+ },
1636
+ {
1637
+ "epoch": 0.030259430114066184,
1638
+ "learning_rate": 0.00019395108058798226,
1639
+ "loss": 0.0535,
1640
+ "mean_token_accuracy": 0.9810381293296814,
1641
+ "num_tokens": 2882309.0,
1642
+ "step": 2040
1643
+ },
1644
+ {
1645
+ "epoch": 0.03040776065384102,
1646
+ "learning_rate": 0.0001939214144800273,
1647
+ "loss": 0.0713,
1648
+ "mean_token_accuracy": 0.9679700911045075,
1649
+ "num_tokens": 2896302.0,
1650
+ "step": 2050
1651
+ },
1652
+ {
1653
+ "epoch": 0.030556091193615853,
1654
+ "learning_rate": 0.00019389174837207234,
1655
+ "loss": 0.064,
1656
+ "mean_token_accuracy": 0.9801303565502166,
1657
+ "num_tokens": 2910539.0,
1658
+ "step": 2060
1659
+ },
1660
+ {
1661
+ "epoch": 0.030704421733390688,
1662
+ "learning_rate": 0.00019386208226411736,
1663
+ "loss": 0.0705,
1664
+ "mean_token_accuracy": 0.9750793814659119,
1665
+ "num_tokens": 2924693.0,
1666
+ "step": 2070
1667
+ },
1668
+ {
1669
+ "epoch": 0.030852752273165522,
1670
+ "learning_rate": 0.0001938324161561624,
1671
+ "loss": 0.0782,
1672
+ "mean_token_accuracy": 0.9702070772647857,
1673
+ "num_tokens": 2939003.0,
1674
+ "step": 2080
1675
+ },
1676
+ {
1677
+ "epoch": 0.031001082812940357,
1678
+ "learning_rate": 0.00019380275004820743,
1679
+ "loss": 0.0673,
1680
+ "mean_token_accuracy": 0.9760224461555481,
1681
+ "num_tokens": 2953090.0,
1682
+ "step": 2090
1683
+ },
1684
+ {
1685
+ "epoch": 0.03114941335271519,
1686
+ "learning_rate": 0.00019377308394025246,
1687
+ "loss": 0.0589,
1688
+ "mean_token_accuracy": 0.9770624697208404,
1689
+ "num_tokens": 2967200.0,
1690
+ "step": 2100
1691
+ },
1692
+ {
1693
+ "epoch": 0.03129774389249002,
1694
+ "learning_rate": 0.0001937434178322975,
1695
+ "loss": 0.0711,
1696
+ "mean_token_accuracy": 0.9783848404884339,
1697
+ "num_tokens": 2981214.0,
1698
+ "step": 2110
1699
+ },
1700
+ {
1701
+ "epoch": 0.03144607443226486,
1702
+ "learning_rate": 0.00019371375172434253,
1703
+ "loss": 0.074,
1704
+ "mean_token_accuracy": 0.9754434108734131,
1705
+ "num_tokens": 2995111.0,
1706
+ "step": 2120
1707
+ },
1708
+ {
1709
+ "epoch": 0.03159440497203969,
1710
+ "learning_rate": 0.00019368408561638758,
1711
+ "loss": 0.0717,
1712
+ "mean_token_accuracy": 0.9739742994308471,
1713
+ "num_tokens": 3009288.0,
1714
+ "step": 2130
1715
+ },
1716
+ {
1717
+ "epoch": 0.03174273551181453,
1718
+ "learning_rate": 0.0001936544195084326,
1719
+ "loss": 0.0722,
1720
+ "mean_token_accuracy": 0.9753423690795898,
1721
+ "num_tokens": 3023460.0,
1722
+ "step": 2140
1723
+ },
1724
+ {
1725
+ "epoch": 0.03189106605158936,
1726
+ "learning_rate": 0.00019362475340047763,
1727
+ "loss": 0.0463,
1728
+ "mean_token_accuracy": 0.982510793209076,
1729
+ "num_tokens": 3037765.0,
1730
+ "step": 2150
1731
+ },
1732
+ {
1733
+ "epoch": 0.0320393965913642,
1734
+ "learning_rate": 0.00019359508729252268,
1735
+ "loss": 0.0858,
1736
+ "mean_token_accuracy": 0.9690262913703919,
1737
+ "num_tokens": 3051960.0,
1738
+ "step": 2160
1739
+ },
1740
+ {
1741
+ "epoch": 0.03218772713113903,
1742
+ "learning_rate": 0.0001935654211845677,
1743
+ "loss": 0.0836,
1744
+ "mean_token_accuracy": 0.9711355090141296,
1745
+ "num_tokens": 3065890.0,
1746
+ "step": 2170
1747
+ },
1748
+ {
1749
+ "epoch": 0.03233605767091387,
1750
+ "learning_rate": 0.00019353575507661272,
1751
+ "loss": 0.0631,
1752
+ "mean_token_accuracy": 0.9817444443702698,
1753
+ "num_tokens": 3080088.0,
1754
+ "step": 2180
1755
+ },
1756
+ {
1757
+ "epoch": 0.0324843882106887,
1758
+ "learning_rate": 0.00019350608896865777,
1759
+ "loss": 0.076,
1760
+ "mean_token_accuracy": 0.974403315782547,
1761
+ "num_tokens": 3094137.0,
1762
+ "step": 2190
1763
+ },
1764
+ {
1765
+ "epoch": 0.03263271875046353,
1766
+ "learning_rate": 0.0001934764228607028,
1767
+ "loss": 0.0728,
1768
+ "mean_token_accuracy": 0.9692744731903076,
1769
+ "num_tokens": 3108250.0,
1770
+ "step": 2200
1771
+ },
1772
+ {
1773
+ "epoch": 0.03278104929023837,
1774
+ "learning_rate": 0.00019344675675274785,
1775
+ "loss": 0.0605,
1776
+ "mean_token_accuracy": 0.9794303894042968,
1777
+ "num_tokens": 3122535.0,
1778
+ "step": 2210
1779
+ },
1780
+ {
1781
+ "epoch": 0.0329293798300132,
1782
+ "learning_rate": 0.00019341709064479287,
1783
+ "loss": 0.0932,
1784
+ "mean_token_accuracy": 0.9737550437450408,
1785
+ "num_tokens": 3136693.0,
1786
+ "step": 2220
1787
+ },
1788
+ {
1789
+ "epoch": 0.03307771036978804,
1790
+ "learning_rate": 0.0001933874245368379,
1791
+ "loss": 0.0726,
1792
+ "mean_token_accuracy": 0.972658348083496,
1793
+ "num_tokens": 3150833.0,
1794
+ "step": 2230
1795
+ },
1796
+ {
1797
+ "epoch": 0.03322604090956287,
1798
+ "learning_rate": 0.00019335775842888294,
1799
+ "loss": 0.0894,
1800
+ "mean_token_accuracy": 0.9667915642261505,
1801
+ "num_tokens": 3165091.0,
1802
+ "step": 2240
1803
+ }
1804
+ ],
1805
+ "logging_steps": 10,
1806
+ "max_steps": 67417,
1807
+ "num_input_tokens_seen": 0,
1808
+ "num_train_epochs": 9223372036854775807,
1809
+ "save_steps": 2247,
1810
+ "stateful_callbacks": {
1811
+ "TrainerControl": {
1812
+ "args": {
1813
+ "should_epoch_stop": false,
1814
+ "should_evaluate": false,
1815
+ "should_log": false,
1816
+ "should_save": true,
1817
+ "should_training_stop": false
1818
+ },
1819
+ "attributes": {}
1820
+ }
1821
+ },
1822
+ "total_flos": 1.5068943178039296e+17,
1823
+ "train_batch_size": 12,
1824
+ "trial_name": null,
1825
+ "trial_params": null
1826
+ }
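
The JSON above is the tail of a standard `transformers` `trainer_state.json`: an array of per-interval metric records (one every 10 optimizer steps, per `"logging_steps": 10`) followed by run-level fields such as `max_steps`, `save_steps`, and `total_flos`. A minimal sketch of inspecting such a file offline; the checkpoint path is illustrative, and the `log_history` key is assumed from the standard `TrainerState` layout rather than shown in this diff:

```python
# Minimal sketch (not part of the commit): summarize the metrics logged in a
# trainer_state.json like the one above. The path and the "log_history" key
# are assumptions based on the standard transformers TrainerState layout.
import json

path = "checkpoint-2247/trainer_state.json"  # illustrative
with open(path) as f:
    state = json.load(f)

# Each record carries epoch, learning_rate, loss, mean_token_accuracy,
# num_tokens, and step; the same loop extends naturally to plotting loss
# against step.
for record in state["log_history"][-5:]:
    print(record["step"], record["loss"], record["mean_token_accuracy"])
```
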
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.15.0
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+ }
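
This `adapter_config.json` describes a rank-8 LoRA adapter (`lora_alpha` 16, dropout 0.0) on the `q_proj`/`v_proj` attention projections of the GAMA base model. A hedged sketch of attaching it with PEFT follows; the base path is local to the original cluster, and plain `AutoModelForCausalLM` is only a text-side approximation, since GAMA's audio Q-Former front-end needs the project's own loading code:

```python
# Hedged sketch (assumptions noted inline): attach this LoRA adapter to its
# base model with PEFT. The base path comes from adapter_config.json and only
# exists on the original cluster; substitute your own copy of the base model.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_path = "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/"
adapter_path = "gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494"

base = AutoModelForCausalLM.from_pretrained(base_path)   # text side only; no audio Q-Former here
tokenizer = AutoTokenizer.from_pretrained(adapter_path)  # tokenizer files ship with each checkpoint
model = PeftModel.from_pretrained(base, adapter_path)    # applies the r=8 LoRA to q_proj/v_proj
model.eval()
```
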
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
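
The `chat_template` in this `tokenizer_config.json` is the stock Llama-2 chat format: alternating `[INST] ... [/INST]` turns, with an optional `<<SYS>>` system block folded into the first user message. A minimal sketch of rendering it (checkpoint path illustrative):

```python
# Minimal sketch: render the Llama-2 chat template shipped in this
# tokenizer_config.json. The checkpoint directory path is illustrative.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-4494")  # any checkpoint dir holding these tokenizer files
messages = [
    {"role": "system", "content": "You answer questions about audio clips."},
    {"role": "user", "content": "Which sound event is most prominent?"},
]
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)  # -> "<s>[INST] <<SYS>>\n...\n<</SYS>>\n\n... [/INST]"
```
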
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/README.md ADDED
@@ -0,0 +1,202 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/adapter_config.json ADDED
@@ -0,0 +1,34 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
+ size 14244
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb8112171b5385c5b37366ef9bade4f4b9781d2d1470892eab36e63919d55a16
+ size 1064
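
The `rng_state.pth` and `scheduler.pt` entries above are stored through Git LFS, so the diff shows only the three-line pointer file (LFS spec version, SHA-256 of the payload, size in bytes), not the tensors themselves. A small sketch of reading such a pointer (path illustrative):

```python
# Minimal sketch: parse a Git LFS pointer file like the rng_state.pth /
# scheduler.pt entries above. Only the pointer lives in the repo; the binary
# payload is fetched from LFS storage on `git lfs pull`.
from pathlib import Path

pointer = Path("checkpoint-6741/scheduler.pt").read_text()  # illustrative path
fields = dict(line.split(" ", 1) for line in pointer.strip().splitlines())
print(fields["oid"], fields["size"])  # e.g. "sha256:bb81..." "1064"
```
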
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/README.md ADDED
@@ -0,0 +1,202 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/adapter_config.json ADDED
@@ -0,0 +1,34 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/README.md ADDED
@@ -0,0 +1,202 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
+   "bias": "none",
+   "corda_config": null,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 8,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "v_proj",
+     "q_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
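
For orientation, the sketch below reconstructs an equivalent LoRA configuration in code with `peft.LoraConfig`. The values mirror the JSON above; the snippet is illustrative only, assuming PEFT 0.15.x as listed in the model card.

```python
# Sketch (assumption): a LoraConfig equivalent to the adapter_config.json above.
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,                                  # LoRA rank
    lora_alpha=16,                        # scaling factor
    lora_dropout=0.0,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # attention query/value projections
    task_type="CAUSAL_LM",
)
print(lora_config.peft_type)  # PeftType.LORA
```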
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "</s>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
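
One practical consequence of this mapping is that padding reuses the EOS token, since the Llama-2 vocabulary ships no dedicated pad token. A small sketch (the checkpoint path is a local placeholder):

```python
# Sketch (assumption): confirm that padding falls back to </s> for this tokenizer.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480"
)
assert tok.pad_token == tok.eos_token == "</s>"
batch = tok(["short", "a longer example"], padding=True, return_tensors="pt")
print(batch["attention_mask"])  # zeros mark the padded (EOS-filled) positions
```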
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
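
The `chat_template` above is the standard Llama-2 `[INST]`/`<<SYS>>` format. As an illustration of how it renders (a sketch; the checkpoint path is a local placeholder):

```python
# Sketch (assumption): render a conversation through the stored Llama-2 chat template.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480"
)
messages = [
    {"role": "system", "content": "You answer questions about audio clips."},
    {"role": "user", "content": "Which semantic elements are audible?"},
]
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)  # "<s>[INST] <<SYS>>\n...system...\n<</SYS>>\n\n...user... [/INST]"
```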