Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +17 -0
- gama/gama-20250422_171856/checkpoint-2350/README.md +202 -0
- gama/gama-20250422_171856/checkpoint-2350/adapter_config.json +34 -0
- gama/gama-20250422_171856/checkpoint-2350/special_tokens_map.json +24 -0
- gama/gama-20250422_171856/checkpoint-2350/tokenizer.json +0 -0
- gama/gama-20250422_171856/checkpoint-2350/tokenizer_config.json +44 -0
- gama/gama-20250422_171856/checkpoint-2350/trainer_state.json +1914 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/README.md +202 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/adapter_config.json +34 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/special_tokens_map.json +24 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer.json +0 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer_config.json +44 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/trainer_state.json +1378 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/README.md +202 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/adapter_config.json +34 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/special_tokens_map.json +24 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer.json +0 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer_config.json +44 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/trainer_state.json +1378 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/README.md +202 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/adapter_config.json +34 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/special_tokens_map.json +24 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer.json +0 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer_config.json +44 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/trainer_state.json +1826 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/README.md +202 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/adapter_config.json +34 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/special_tokens_map.json +24 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer.json +0 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer_config.json +44 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/trainer_state.json +0 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/README.md +202 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/adapter_config.json +34 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/rng_state.pth +3 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/scheduler.pt +3 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/special_tokens_map.json +24 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer.json +0 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer_config.json +44 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/trainer_state.json +0 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/README.md +202 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/adapter_config.json +34 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/special_tokens_map.json +24 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer.json +0 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer_config.json +44 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/trainer_state.json +0 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/README.md +202 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/adapter_config.json +34 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/special_tokens_map.json +24 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer.json +0 -0
- gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer_config.json +44 -0
.gitattributes
CHANGED
|
@@ -668,3 +668,20 @@ grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-1500/tokenizer.json filter
|
|
| 668 |
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 669 |
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 670 |
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 669 |
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 670 |
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 671 |
+
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-2700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 672 |
+
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-2100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 673 |
+
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 674 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-1416/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 675 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-14514/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 676 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-12036/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 677 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-6372/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 678 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-11328/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 679 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-354/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 680 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-4602/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 681 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-12744/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 682 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-9204/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 683 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-3894/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 684 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-4956/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 685 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-2124/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 686 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-13452/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 687 |
+
grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-12390/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
gama/gama-20250422_171856/checkpoint-2350/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
|
| 3 |
+
library_name: peft
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Model Card for Model ID
|
| 7 |
+
|
| 8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## Model Details
|
| 13 |
+
|
| 14 |
+
### Model Description
|
| 15 |
+
|
| 16 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- **Developed by:** [More Information Needed]
|
| 21 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 22 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 23 |
+
- **Model type:** [More Information Needed]
|
| 24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 25 |
+
- **License:** [More Information Needed]
|
| 26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 27 |
+
|
| 28 |
+
### Model Sources [optional]
|
| 29 |
+
|
| 30 |
+
<!-- Provide the basic links for the model. -->
|
| 31 |
+
|
| 32 |
+
- **Repository:** [More Information Needed]
|
| 33 |
+
- **Paper [optional]:** [More Information Needed]
|
| 34 |
+
- **Demo [optional]:** [More Information Needed]
|
| 35 |
+
|
| 36 |
+
## Uses
|
| 37 |
+
|
| 38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 39 |
+
|
| 40 |
+
### Direct Use
|
| 41 |
+
|
| 42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 43 |
+
|
| 44 |
+
[More Information Needed]
|
| 45 |
+
|
| 46 |
+
### Downstream Use [optional]
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Out-of-Scope Use
|
| 53 |
+
|
| 54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
## Bias, Risks, and Limitations
|
| 59 |
+
|
| 60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
### Recommendations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 67 |
+
|
| 68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 69 |
+
|
| 70 |
+
## How to Get Started with the Model
|
| 71 |
+
|
| 72 |
+
Use the code below to get started with the model.
|
| 73 |
+
|
| 74 |
+
[More Information Needed]
|
| 75 |
+
|
| 76 |
+
## Training Details
|
| 77 |
+
|
| 78 |
+
### Training Data
|
| 79 |
+
|
| 80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 81 |
+
|
| 82 |
+
[More Information Needed]
|
| 83 |
+
|
| 84 |
+
### Training Procedure
|
| 85 |
+
|
| 86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 87 |
+
|
| 88 |
+
#### Preprocessing [optional]
|
| 89 |
+
|
| 90 |
+
[More Information Needed]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#### Training Hyperparameters
|
| 94 |
+
|
| 95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 96 |
+
|
| 97 |
+
#### Speeds, Sizes, Times [optional]
|
| 98 |
+
|
| 99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 100 |
+
|
| 101 |
+
[More Information Needed]
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 106 |
+
|
| 107 |
+
### Testing Data, Factors & Metrics
|
| 108 |
+
|
| 109 |
+
#### Testing Data
|
| 110 |
+
|
| 111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 112 |
+
|
| 113 |
+
[More Information Needed]
|
| 114 |
+
|
| 115 |
+
#### Factors
|
| 116 |
+
|
| 117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Metrics
|
| 122 |
+
|
| 123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
### Results
|
| 128 |
+
|
| 129 |
+
[More Information Needed]
|
| 130 |
+
|
| 131 |
+
#### Summary
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Model Examination [optional]
|
| 136 |
+
|
| 137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 138 |
+
|
| 139 |
+
[More Information Needed]
|
| 140 |
+
|
| 141 |
+
## Environmental Impact
|
| 142 |
+
|
| 143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 144 |
+
|
| 145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 146 |
+
|
| 147 |
+
- **Hardware Type:** [More Information Needed]
|
| 148 |
+
- **Hours used:** [More Information Needed]
|
| 149 |
+
- **Cloud Provider:** [More Information Needed]
|
| 150 |
+
- **Compute Region:** [More Information Needed]
|
| 151 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 152 |
+
|
| 153 |
+
## Technical Specifications [optional]
|
| 154 |
+
|
| 155 |
+
### Model Architecture and Objective
|
| 156 |
+
|
| 157 |
+
[More Information Needed]
|
| 158 |
+
|
| 159 |
+
### Compute Infrastructure
|
| 160 |
+
|
| 161 |
+
[More Information Needed]
|
| 162 |
+
|
| 163 |
+
#### Hardware
|
| 164 |
+
|
| 165 |
+
[More Information Needed]
|
| 166 |
+
|
| 167 |
+
#### Software
|
| 168 |
+
|
| 169 |
+
[More Information Needed]
|
| 170 |
+
|
| 171 |
+
## Citation [optional]
|
| 172 |
+
|
| 173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 174 |
+
|
| 175 |
+
**BibTeX:**
|
| 176 |
+
|
| 177 |
+
[More Information Needed]
|
| 178 |
+
|
| 179 |
+
**APA:**
|
| 180 |
+
|
| 181 |
+
[More Information Needed]
|
| 182 |
+
|
| 183 |
+
## Glossary [optional]
|
| 184 |
+
|
| 185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## More Information [optional]
|
| 190 |
+
|
| 191 |
+
[More Information Needed]
|
| 192 |
+
|
| 193 |
+
## Model Card Authors [optional]
|
| 194 |
+
|
| 195 |
+
[More Information Needed]
|
| 196 |
+
|
| 197 |
+
## Model Card Contact
|
| 198 |
+
|
| 199 |
+
[More Information Needed]
|
| 200 |
+
### Framework versions
|
| 201 |
+
|
| 202 |
+
- PEFT 0.15.0
|
gama/gama-20250422_171856/checkpoint-2350/adapter_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 16,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.0,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"r": 8,
|
| 24 |
+
"rank_pattern": {},
|
| 25 |
+
"revision": null,
|
| 26 |
+
"target_modules": [
|
| 27 |
+
"v_proj",
|
| 28 |
+
"q_proj"
|
| 29 |
+
],
|
| 30 |
+
"task_type": "CAUSAL_LM",
|
| 31 |
+
"trainable_token_indices": null,
|
| 32 |
+
"use_dora": false,
|
| 33 |
+
"use_rslora": false
|
| 34 |
+
}
|
gama/gama-20250422_171856/checkpoint-2350/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
gama/gama-20250422_171856/checkpoint-2350/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-20250422_171856/checkpoint-2350/tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
| 33 |
+
"clean_up_tokenization_spaces": false,
|
| 34 |
+
"eos_token": "</s>",
|
| 35 |
+
"extra_special_tokens": {},
|
| 36 |
+
"legacy": false,
|
| 37 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 38 |
+
"pad_token": "</s>",
|
| 39 |
+
"padding_side": "right",
|
| 40 |
+
"sp_model_kwargs": {},
|
| 41 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 42 |
+
"unk_token": "<unk>",
|
| 43 |
+
"use_default_system_prompt": false
|
| 44 |
+
}
|
gama/gama-20250422_171856/checkpoint-2350/trainer_state.json
ADDED
|
@@ -0,0 +1,1914 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.033332387733681315,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2350,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.0001418399478028992,
|
| 14 |
+
"learning_rate": 0.0001999744688093955,
|
| 15 |
+
"loss": 1.4644,
|
| 16 |
+
"mean_token_accuracy": 0.6479970395565033,
|
| 17 |
+
"num_tokens": 10375.0,
|
| 18 |
+
"step": 10
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"epoch": 0.0002836798956057984,
|
| 22 |
+
"learning_rate": 0.0001999461008198349,
|
| 23 |
+
"loss": 1.3344,
|
| 24 |
+
"mean_token_accuracy": 0.6637877106666565,
|
| 25 |
+
"num_tokens": 20535.0,
|
| 26 |
+
"step": 20
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"epoch": 0.0004255198434086976,
|
| 30 |
+
"learning_rate": 0.00019991773283027433,
|
| 31 |
+
"loss": 1.3386,
|
| 32 |
+
"mean_token_accuracy": 0.6674083709716797,
|
| 33 |
+
"num_tokens": 30845.0,
|
| 34 |
+
"step": 30
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"epoch": 0.0005673597912115968,
|
| 38 |
+
"learning_rate": 0.00019988936484071373,
|
| 39 |
+
"loss": 1.2811,
|
| 40 |
+
"mean_token_accuracy": 0.6720114409923553,
|
| 41 |
+
"num_tokens": 41159.0,
|
| 42 |
+
"step": 40
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"epoch": 0.0007091997390144961,
|
| 46 |
+
"learning_rate": 0.00019986099685115316,
|
| 47 |
+
"loss": 1.3092,
|
| 48 |
+
"mean_token_accuracy": 0.6668342292308808,
|
| 49 |
+
"num_tokens": 51351.0,
|
| 50 |
+
"step": 50
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"epoch": 0.0008510396868173952,
|
| 54 |
+
"learning_rate": 0.0001998326288615926,
|
| 55 |
+
"loss": 1.2572,
|
| 56 |
+
"mean_token_accuracy": 0.678832185268402,
|
| 57 |
+
"num_tokens": 61567.0,
|
| 58 |
+
"step": 60
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.0009928796346202944,
|
| 62 |
+
"learning_rate": 0.00019980426087203202,
|
| 63 |
+
"loss": 1.3006,
|
| 64 |
+
"mean_token_accuracy": 0.6674412608146667,
|
| 65 |
+
"num_tokens": 71722.0,
|
| 66 |
+
"step": 70
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.0011347195824231936,
|
| 70 |
+
"learning_rate": 0.00019977589288247143,
|
| 71 |
+
"loss": 1.248,
|
| 72 |
+
"mean_token_accuracy": 0.6744147539138794,
|
| 73 |
+
"num_tokens": 81984.0,
|
| 74 |
+
"step": 80
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"epoch": 0.0012765595302260929,
|
| 78 |
+
"learning_rate": 0.00019974752489291083,
|
| 79 |
+
"loss": 1.2643,
|
| 80 |
+
"mean_token_accuracy": 0.6711925864219666,
|
| 81 |
+
"num_tokens": 92376.0,
|
| 82 |
+
"step": 90
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"epoch": 0.0014183994780289921,
|
| 86 |
+
"learning_rate": 0.0001997191569033503,
|
| 87 |
+
"loss": 1.28,
|
| 88 |
+
"mean_token_accuracy": 0.67249955534935,
|
| 89 |
+
"num_tokens": 102699.0,
|
| 90 |
+
"step": 100
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"epoch": 0.0015602394258318914,
|
| 94 |
+
"learning_rate": 0.0001996907889137897,
|
| 95 |
+
"loss": 1.2525,
|
| 96 |
+
"mean_token_accuracy": 0.6758616745471955,
|
| 97 |
+
"num_tokens": 112760.0,
|
| 98 |
+
"step": 110
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"epoch": 0.0017020793736347904,
|
| 102 |
+
"learning_rate": 0.00019966242092422912,
|
| 103 |
+
"loss": 1.2454,
|
| 104 |
+
"mean_token_accuracy": 0.6850893795490265,
|
| 105 |
+
"num_tokens": 122946.0,
|
| 106 |
+
"step": 120
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"epoch": 0.0018439193214376897,
|
| 110 |
+
"learning_rate": 0.00019963405293466852,
|
| 111 |
+
"loss": 1.2673,
|
| 112 |
+
"mean_token_accuracy": 0.6743516206741333,
|
| 113 |
+
"num_tokens": 133225.0,
|
| 114 |
+
"step": 130
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.0019857592692405887,
|
| 118 |
+
"learning_rate": 0.00019960568494510795,
|
| 119 |
+
"loss": 1.2961,
|
| 120 |
+
"mean_token_accuracy": 0.672141146659851,
|
| 121 |
+
"num_tokens": 143602.0,
|
| 122 |
+
"step": 140
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.002127599217043488,
|
| 126 |
+
"learning_rate": 0.00019957731695554738,
|
| 127 |
+
"loss": 1.2268,
|
| 128 |
+
"mean_token_accuracy": 0.6834299504756928,
|
| 129 |
+
"num_tokens": 154040.0,
|
| 130 |
+
"step": 150
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"epoch": 0.0022694391648463872,
|
| 134 |
+
"learning_rate": 0.00019954894896598679,
|
| 135 |
+
"loss": 1.2483,
|
| 136 |
+
"mean_token_accuracy": 0.6738203048706055,
|
| 137 |
+
"num_tokens": 164237.0,
|
| 138 |
+
"step": 160
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 0.0024112791126492867,
|
| 142 |
+
"learning_rate": 0.00019952058097642622,
|
| 143 |
+
"loss": 1.2644,
|
| 144 |
+
"mean_token_accuracy": 0.6698677897453308,
|
| 145 |
+
"num_tokens": 174418.0,
|
| 146 |
+
"step": 170
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"epoch": 0.0025531190604521858,
|
| 150 |
+
"learning_rate": 0.00019949221298686562,
|
| 151 |
+
"loss": 1.2569,
|
| 152 |
+
"mean_token_accuracy": 0.6776521384716034,
|
| 153 |
+
"num_tokens": 184818.0,
|
| 154 |
+
"step": 180
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"epoch": 0.002694959008255085,
|
| 158 |
+
"learning_rate": 0.00019946384499730505,
|
| 159 |
+
"loss": 1.218,
|
| 160 |
+
"mean_token_accuracy": 0.6825460493564606,
|
| 161 |
+
"num_tokens": 194989.0,
|
| 162 |
+
"step": 190
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"epoch": 0.0028367989560579843,
|
| 166 |
+
"learning_rate": 0.00019943547700774448,
|
| 167 |
+
"loss": 1.1975,
|
| 168 |
+
"mean_token_accuracy": 0.6864433467388154,
|
| 169 |
+
"num_tokens": 205418.0,
|
| 170 |
+
"step": 200
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 0.0029786389038608833,
|
| 174 |
+
"learning_rate": 0.00019940710901818388,
|
| 175 |
+
"loss": 1.1862,
|
| 176 |
+
"mean_token_accuracy": 0.6890658676624298,
|
| 177 |
+
"num_tokens": 215645.0,
|
| 178 |
+
"step": 210
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.0031204788516637828,
|
| 182 |
+
"learning_rate": 0.0001993787410286233,
|
| 183 |
+
"loss": 1.2141,
|
| 184 |
+
"mean_token_accuracy": 0.684105110168457,
|
| 185 |
+
"num_tokens": 225986.0,
|
| 186 |
+
"step": 220
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"epoch": 0.003262318799466682,
|
| 190 |
+
"learning_rate": 0.00019935037303906271,
|
| 191 |
+
"loss": 1.2246,
|
| 192 |
+
"mean_token_accuracy": 0.6754761219024659,
|
| 193 |
+
"num_tokens": 236317.0,
|
| 194 |
+
"step": 230
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"epoch": 0.003404158747269581,
|
| 198 |
+
"learning_rate": 0.00019932200504950217,
|
| 199 |
+
"loss": 1.1578,
|
| 200 |
+
"mean_token_accuracy": 0.6994408905506134,
|
| 201 |
+
"num_tokens": 246467.0,
|
| 202 |
+
"step": 240
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"epoch": 0.0035459986950724803,
|
| 206 |
+
"learning_rate": 0.00019929363705994157,
|
| 207 |
+
"loss": 1.1978,
|
| 208 |
+
"mean_token_accuracy": 0.6848730027675629,
|
| 209 |
+
"num_tokens": 256900.0,
|
| 210 |
+
"step": 250
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"epoch": 0.0036878386428753794,
|
| 214 |
+
"learning_rate": 0.00019926526907038098,
|
| 215 |
+
"loss": 1.1915,
|
| 216 |
+
"mean_token_accuracy": 0.6893563270568848,
|
| 217 |
+
"num_tokens": 267130.0,
|
| 218 |
+
"step": 260
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"epoch": 0.003829678590678279,
|
| 222 |
+
"learning_rate": 0.0001992369010808204,
|
| 223 |
+
"loss": 1.1796,
|
| 224 |
+
"mean_token_accuracy": 0.6892049252986908,
|
| 225 |
+
"num_tokens": 277453.0,
|
| 226 |
+
"step": 270
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 0.0039715185384811775,
|
| 230 |
+
"learning_rate": 0.00019920853309125984,
|
| 231 |
+
"loss": 1.1481,
|
| 232 |
+
"mean_token_accuracy": 0.696202689409256,
|
| 233 |
+
"num_tokens": 287556.0,
|
| 234 |
+
"step": 280
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.004113358486284077,
|
| 238 |
+
"learning_rate": 0.00019918016510169927,
|
| 239 |
+
"loss": 1.1762,
|
| 240 |
+
"mean_token_accuracy": 0.6873701572418213,
|
| 241 |
+
"num_tokens": 297903.0,
|
| 242 |
+
"step": 290
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"epoch": 0.004255198434086976,
|
| 246 |
+
"learning_rate": 0.00019915179711213867,
|
| 247 |
+
"loss": 1.1251,
|
| 248 |
+
"mean_token_accuracy": 0.6990953743457794,
|
| 249 |
+
"num_tokens": 307835.0,
|
| 250 |
+
"step": 300
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"epoch": 0.0043970383818898754,
|
| 254 |
+
"learning_rate": 0.0001991234291225781,
|
| 255 |
+
"loss": 1.1808,
|
| 256 |
+
"mean_token_accuracy": 0.6895361363887786,
|
| 257 |
+
"num_tokens": 318327.0,
|
| 258 |
+
"step": 310
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"epoch": 0.0045388783296927745,
|
| 262 |
+
"learning_rate": 0.0001990950611330175,
|
| 263 |
+
"loss": 1.1756,
|
| 264 |
+
"mean_token_accuracy": 0.6966897130012513,
|
| 265 |
+
"num_tokens": 328720.0,
|
| 266 |
+
"step": 320
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"epoch": 0.0046807182774956735,
|
| 270 |
+
"learning_rate": 0.00019906669314345693,
|
| 271 |
+
"loss": 1.1887,
|
| 272 |
+
"mean_token_accuracy": 0.682871812582016,
|
| 273 |
+
"num_tokens": 338831.0,
|
| 274 |
+
"step": 330
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"epoch": 0.004822558225298573,
|
| 278 |
+
"learning_rate": 0.00019903832515389636,
|
| 279 |
+
"loss": 1.1953,
|
| 280 |
+
"mean_token_accuracy": 0.6855618298053742,
|
| 281 |
+
"num_tokens": 349098.0,
|
| 282 |
+
"step": 340
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 0.0049643981731014725,
|
| 286 |
+
"learning_rate": 0.00019900995716433577,
|
| 287 |
+
"loss": 1.1808,
|
| 288 |
+
"mean_token_accuracy": 0.6865513443946838,
|
| 289 |
+
"num_tokens": 359364.0,
|
| 290 |
+
"step": 350
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 0.0051062381209043715,
|
| 294 |
+
"learning_rate": 0.0001989815891747752,
|
| 295 |
+
"loss": 1.1186,
|
| 296 |
+
"mean_token_accuracy": 0.7019730627536773,
|
| 297 |
+
"num_tokens": 369582.0,
|
| 298 |
+
"step": 360
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"epoch": 0.0052480780687072705,
|
| 302 |
+
"learning_rate": 0.00019895322118521463,
|
| 303 |
+
"loss": 1.1432,
|
| 304 |
+
"mean_token_accuracy": 0.6962596535682678,
|
| 305 |
+
"num_tokens": 379501.0,
|
| 306 |
+
"step": 370
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"epoch": 0.00538991801651017,
|
| 310 |
+
"learning_rate": 0.00019892485319565403,
|
| 311 |
+
"loss": 1.1904,
|
| 312 |
+
"mean_token_accuracy": 0.6854041993618012,
|
| 313 |
+
"num_tokens": 389789.0,
|
| 314 |
+
"step": 380
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"epoch": 0.0055317579643130695,
|
| 318 |
+
"learning_rate": 0.00019889648520609346,
|
| 319 |
+
"loss": 1.1778,
|
| 320 |
+
"mean_token_accuracy": 0.6948013961315155,
|
| 321 |
+
"num_tokens": 400015.0,
|
| 322 |
+
"step": 390
|
| 323 |
+
},
|
| 324 |
+
{
|
| 325 |
+
"epoch": 0.0056735979121159685,
|
| 326 |
+
"learning_rate": 0.00019886811721653286,
|
| 327 |
+
"loss": 1.2114,
|
| 328 |
+
"mean_token_accuracy": 0.6789448976516723,
|
| 329 |
+
"num_tokens": 410407.0,
|
| 330 |
+
"step": 400
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"epoch": 0.005815437859918868,
|
| 334 |
+
"learning_rate": 0.00019883974922697232,
|
| 335 |
+
"loss": 1.1815,
|
| 336 |
+
"mean_token_accuracy": 0.6859633207321167,
|
| 337 |
+
"num_tokens": 420563.0,
|
| 338 |
+
"step": 410
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 0.005957277807721767,
|
| 342 |
+
"learning_rate": 0.00019881138123741172,
|
| 343 |
+
"loss": 1.1739,
|
| 344 |
+
"mean_token_accuracy": 0.6891894340515137,
|
| 345 |
+
"num_tokens": 430802.0,
|
| 346 |
+
"step": 420
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 0.006099117755524666,
|
| 350 |
+
"learning_rate": 0.00019878301324785112,
|
| 351 |
+
"loss": 1.132,
|
| 352 |
+
"mean_token_accuracy": 0.7001429855823517,
|
| 353 |
+
"num_tokens": 440998.0,
|
| 354 |
+
"step": 430
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"epoch": 0.0062409577033275656,
|
| 358 |
+
"learning_rate": 0.00019875464525829055,
|
| 359 |
+
"loss": 1.1474,
|
| 360 |
+
"mean_token_accuracy": 0.693991506099701,
|
| 361 |
+
"num_tokens": 451216.0,
|
| 362 |
+
"step": 440
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"epoch": 0.006382797651130465,
|
| 366 |
+
"learning_rate": 0.00019872627726872996,
|
| 367 |
+
"loss": 1.199,
|
| 368 |
+
"mean_token_accuracy": 0.6871366202831268,
|
| 369 |
+
"num_tokens": 461276.0,
|
| 370 |
+
"step": 450
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"epoch": 0.006524637598933364,
|
| 374 |
+
"learning_rate": 0.0001986979092791694,
|
| 375 |
+
"loss": 1.1204,
|
| 376 |
+
"mean_token_accuracy": 0.700880628824234,
|
| 377 |
+
"num_tokens": 471433.0,
|
| 378 |
+
"step": 460
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
"epoch": 0.006666477546736263,
|
| 382 |
+
"learning_rate": 0.00019866954128960882,
|
| 383 |
+
"loss": 1.2102,
|
| 384 |
+
"mean_token_accuracy": 0.6879798650741578,
|
| 385 |
+
"num_tokens": 481531.0,
|
| 386 |
+
"step": 470
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"epoch": 0.006808317494539162,
|
| 390 |
+
"learning_rate": 0.00019864117330004825,
|
| 391 |
+
"loss": 1.1195,
|
| 392 |
+
"mean_token_accuracy": 0.6937784433364869,
|
| 393 |
+
"num_tokens": 491954.0,
|
| 394 |
+
"step": 480
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 0.006950157442342062,
|
| 398 |
+
"learning_rate": 0.00019861280531048765,
|
| 399 |
+
"loss": 1.1455,
|
| 400 |
+
"mean_token_accuracy": 0.7021094024181366,
|
| 401 |
+
"num_tokens": 502274.0,
|
| 402 |
+
"step": 490
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 0.007091997390144961,
|
| 406 |
+
"learning_rate": 0.00019858443732092708,
|
| 407 |
+
"loss": 1.1608,
|
| 408 |
+
"mean_token_accuracy": 0.6934002816677094,
|
| 409 |
+
"num_tokens": 512604.0,
|
| 410 |
+
"step": 500
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"epoch": 0.00723383733794786,
|
| 414 |
+
"learning_rate": 0.0001985560693313665,
|
| 415 |
+
"loss": 1.1705,
|
| 416 |
+
"mean_token_accuracy": 0.695448386669159,
|
| 417 |
+
"num_tokens": 522792.0,
|
| 418 |
+
"step": 510
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"epoch": 0.007375677285750759,
|
| 422 |
+
"learning_rate": 0.0001985277013418059,
|
| 423 |
+
"loss": 1.1575,
|
| 424 |
+
"mean_token_accuracy": 0.693002599477768,
|
| 425 |
+
"num_tokens": 532886.0,
|
| 426 |
+
"step": 520
|
| 427 |
+
},
|
| 428 |
+
{
|
| 429 |
+
"epoch": 0.007517517233553658,
|
| 430 |
+
"learning_rate": 0.00019849933335224534,
|
| 431 |
+
"loss": 1.1261,
|
| 432 |
+
"mean_token_accuracy": 0.6963518977165222,
|
| 433 |
+
"num_tokens": 542956.0,
|
| 434 |
+
"step": 530
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"epoch": 0.007659357181356558,
|
| 438 |
+
"learning_rate": 0.00019847096536268474,
|
| 439 |
+
"loss": 1.1214,
|
| 440 |
+
"mean_token_accuracy": 0.700639396905899,
|
| 441 |
+
"num_tokens": 553042.0,
|
| 442 |
+
"step": 540
|
| 443 |
+
},
|
| 444 |
+
{
|
| 445 |
+
"epoch": 0.007801197129159457,
|
| 446 |
+
"learning_rate": 0.00019844259737312417,
|
| 447 |
+
"loss": 1.1725,
|
| 448 |
+
"mean_token_accuracy": 0.6955362856388092,
|
| 449 |
+
"num_tokens": 563291.0,
|
| 450 |
+
"step": 550
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"epoch": 0.007943037076962355,
|
| 454 |
+
"learning_rate": 0.0001984142293835636,
|
| 455 |
+
"loss": 1.1567,
|
| 456 |
+
"mean_token_accuracy": 0.6907780110836029,
|
| 457 |
+
"num_tokens": 573513.0,
|
| 458 |
+
"step": 560
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 0.008084877024765255,
|
| 462 |
+
"learning_rate": 0.000198385861394003,
|
| 463 |
+
"loss": 1.1467,
|
| 464 |
+
"mean_token_accuracy": 0.6990721464157105,
|
| 465 |
+
"num_tokens": 583892.0,
|
| 466 |
+
"step": 570
|
| 467 |
+
},
|
| 468 |
+
{
|
| 469 |
+
"epoch": 0.008226716972568155,
|
| 470 |
+
"learning_rate": 0.00019835749340444244,
|
| 471 |
+
"loss": 1.1326,
|
| 472 |
+
"mean_token_accuracy": 0.6969013214111328,
|
| 473 |
+
"num_tokens": 593838.0,
|
| 474 |
+
"step": 580
|
| 475 |
+
},
|
| 476 |
+
{
|
| 477 |
+
"epoch": 0.008368556920371053,
|
| 478 |
+
"learning_rate": 0.00019832912541488187,
|
| 479 |
+
"loss": 1.1711,
|
| 480 |
+
"mean_token_accuracy": 0.6954678654670715,
|
| 481 |
+
"num_tokens": 603881.0,
|
| 482 |
+
"step": 590
|
| 483 |
+
},
|
| 484 |
+
{
|
| 485 |
+
"epoch": 0.008510396868173953,
|
| 486 |
+
"learning_rate": 0.00019830075742532127,
|
| 487 |
+
"loss": 1.0689,
|
| 488 |
+
"mean_token_accuracy": 0.7051353633403779,
|
| 489 |
+
"num_tokens": 614246.0,
|
| 490 |
+
"step": 600
|
| 491 |
+
},
|
| 492 |
+
{
|
| 493 |
+
"epoch": 0.008652236815976851,
|
| 494 |
+
"learning_rate": 0.0001982723894357607,
|
| 495 |
+
"loss": 1.1453,
|
| 496 |
+
"mean_token_accuracy": 0.6947243511676788,
|
| 497 |
+
"num_tokens": 624741.0,
|
| 498 |
+
"step": 610
|
| 499 |
+
},
|
| 500 |
+
{
|
| 501 |
+
"epoch": 0.008794076763779751,
|
| 502 |
+
"learning_rate": 0.0001982440214462001,
|
| 503 |
+
"loss": 1.1411,
|
| 504 |
+
"mean_token_accuracy": 0.6965227723121643,
|
| 505 |
+
"num_tokens": 634891.0,
|
| 506 |
+
"step": 620
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"epoch": 0.00893591671158265,
|
| 510 |
+
"learning_rate": 0.00019821565345663953,
|
| 511 |
+
"loss": 1.1215,
|
| 512 |
+
"mean_token_accuracy": 0.703746247291565,
|
| 513 |
+
"num_tokens": 645040.0,
|
| 514 |
+
"step": 630
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 0.009077756659385549,
|
| 518 |
+
"learning_rate": 0.00019818728546707896,
|
| 519 |
+
"loss": 1.1233,
|
| 520 |
+
"mean_token_accuracy": 0.6995523154735566,
|
| 521 |
+
"num_tokens": 655495.0,
|
| 522 |
+
"step": 640
|
| 523 |
+
},
|
| 524 |
+
{
|
| 525 |
+
"epoch": 0.009219596607188449,
|
| 526 |
+
"learning_rate": 0.0001981589174775184,
|
| 527 |
+
"loss": 1.1817,
|
| 528 |
+
"mean_token_accuracy": 0.6966328859329224,
|
| 529 |
+
"num_tokens": 665731.0,
|
| 530 |
+
"step": 650
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"epoch": 0.009361436554991347,
|
| 534 |
+
"learning_rate": 0.0001981305494879578,
|
| 535 |
+
"loss": 1.1228,
|
| 536 |
+
"mean_token_accuracy": 0.6941804051399231,
|
| 537 |
+
"num_tokens": 676297.0,
|
| 538 |
+
"step": 660
|
| 539 |
+
},
|
| 540 |
+
{
|
| 541 |
+
"epoch": 0.009503276502794247,
|
| 542 |
+
"learning_rate": 0.0001981021814983972,
|
| 543 |
+
"loss": 1.1338,
|
| 544 |
+
"mean_token_accuracy": 0.6983364522457123,
|
| 545 |
+
"num_tokens": 686502.0,
|
| 546 |
+
"step": 670
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"epoch": 0.009645116450597147,
|
| 550 |
+
"learning_rate": 0.00019807381350883666,
|
| 551 |
+
"loss": 1.1052,
|
| 552 |
+
"mean_token_accuracy": 0.7062141060829162,
|
| 553 |
+
"num_tokens": 696824.0,
|
| 554 |
+
"step": 680
|
| 555 |
+
},
|
| 556 |
+
{
|
| 557 |
+
"epoch": 0.009786956398400045,
|
| 558 |
+
"learning_rate": 0.00019804544551927606,
|
| 559 |
+
"loss": 1.0801,
|
| 560 |
+
"mean_token_accuracy": 0.7161077737808228,
|
| 561 |
+
"num_tokens": 707051.0,
|
| 562 |
+
"step": 690
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"epoch": 0.009928796346202945,
|
| 566 |
+
"learning_rate": 0.0001980170775297155,
|
| 567 |
+
"loss": 1.1189,
|
| 568 |
+
"mean_token_accuracy": 0.7090901613235474,
|
| 569 |
+
"num_tokens": 717106.0,
|
| 570 |
+
"step": 700
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 0.010070636294005843,
|
| 574 |
+
"learning_rate": 0.0001979887095401549,
|
| 575 |
+
"loss": 1.0914,
|
| 576 |
+
"mean_token_accuracy": 0.709138709306717,
|
| 577 |
+
"num_tokens": 727375.0,
|
| 578 |
+
"step": 710
|
| 579 |
+
},
|
| 580 |
+
{
|
| 581 |
+
"epoch": 0.010212476241808743,
|
| 582 |
+
"learning_rate": 0.00019796034155059432,
|
| 583 |
+
"loss": 1.1572,
|
| 584 |
+
"mean_token_accuracy": 0.6868231952190399,
|
| 585 |
+
"num_tokens": 737794.0,
|
| 586 |
+
"step": 720
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"epoch": 0.010354316189611643,
|
| 590 |
+
"learning_rate": 0.00019793197356103375,
|
| 591 |
+
"loss": 1.1172,
|
| 592 |
+
"mean_token_accuracy": 0.6967993915081024,
|
| 593 |
+
"num_tokens": 748005.0,
|
| 594 |
+
"step": 730
|
| 595 |
+
},
|
| 596 |
+
{
|
| 597 |
+
"epoch": 0.010496156137414541,
|
| 598 |
+
"learning_rate": 0.00019790360557147315,
|
| 599 |
+
"loss": 1.1412,
|
| 600 |
+
"mean_token_accuracy": 0.7041096150875091,
|
| 601 |
+
"num_tokens": 758273.0,
|
| 602 |
+
"step": 740
|
| 603 |
+
},
|
| 604 |
+
{
|
| 605 |
+
"epoch": 0.010637996085217441,
|
| 606 |
+
"learning_rate": 0.00019787523758191258,
|
| 607 |
+
"loss": 1.1151,
|
| 608 |
+
"mean_token_accuracy": 0.701738464832306,
|
| 609 |
+
"num_tokens": 768532.0,
|
| 610 |
+
"step": 750
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"epoch": 0.01077983603302034,
|
| 614 |
+
"learning_rate": 0.000197846869592352,
|
| 615 |
+
"loss": 1.0879,
|
| 616 |
+
"mean_token_accuracy": 0.7017017006874084,
|
| 617 |
+
"num_tokens": 778726.0,
|
| 618 |
+
"step": 760
|
| 619 |
+
},
|
| 620 |
+
{
|
| 621 |
+
"epoch": 0.010921675980823239,
|
| 622 |
+
"learning_rate": 0.00019781850160279142,
|
| 623 |
+
"loss": 1.1212,
|
| 624 |
+
"mean_token_accuracy": 0.6997404515743255,
|
| 625 |
+
"num_tokens": 788976.0,
|
| 626 |
+
"step": 770
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 0.011063515928626139,
|
| 630 |
+
"learning_rate": 0.00019779013361323085,
|
| 631 |
+
"loss": 1.11,
|
| 632 |
+
"mean_token_accuracy": 0.6939224183559418,
|
| 633 |
+
"num_tokens": 799236.0,
|
| 634 |
+
"step": 780
|
| 635 |
+
},
|
| 636 |
+
{
|
| 637 |
+
"epoch": 0.011205355876429037,
|
| 638 |
+
"learning_rate": 0.00019776176562367025,
|
| 639 |
+
"loss": 1.1234,
|
| 640 |
+
"mean_token_accuracy": 0.7009412169456481,
|
| 641 |
+
"num_tokens": 809407.0,
|
| 642 |
+
"step": 790
|
| 643 |
+
},
|
| 644 |
+
{
|
| 645 |
+
"epoch": 0.011347195824231937,
|
| 646 |
+
"learning_rate": 0.00019773339763410968,
|
| 647 |
+
"loss": 1.1241,
|
| 648 |
+
"mean_token_accuracy": 0.700713324546814,
|
| 649 |
+
"num_tokens": 819778.0,
|
| 650 |
+
"step": 800
|
| 651 |
+
},
|
| 652 |
+
{
|
| 653 |
+
"epoch": 0.011489035772034835,
|
| 654 |
+
"learning_rate": 0.0001977050296445491,
|
| 655 |
+
"loss": 1.1238,
|
| 656 |
+
"mean_token_accuracy": 0.7003967940807343,
|
| 657 |
+
"num_tokens": 829656.0,
|
| 658 |
+
"step": 810
|
| 659 |
+
},
|
| 660 |
+
{
|
| 661 |
+
"epoch": 0.011630875719837735,
|
| 662 |
+
"learning_rate": 0.00019767666165498854,
|
| 663 |
+
"loss": 1.121,
|
| 664 |
+
"mean_token_accuracy": 0.703648030757904,
|
| 665 |
+
"num_tokens": 839892.0,
|
| 666 |
+
"step": 820
|
| 667 |
+
},
|
| 668 |
+
{
|
| 669 |
+
"epoch": 0.011772715667640635,
|
| 670 |
+
"learning_rate": 0.00019764829366542794,
|
| 671 |
+
"loss": 1.1255,
|
| 672 |
+
"mean_token_accuracy": 0.7010339736938477,
|
| 673 |
+
"num_tokens": 849962.0,
|
| 674 |
+
"step": 830
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"epoch": 0.011914555615443533,
|
| 678 |
+
"learning_rate": 0.00019761992567586734,
|
| 679 |
+
"loss": 1.1608,
|
| 680 |
+
"mean_token_accuracy": 0.691221284866333,
|
| 681 |
+
"num_tokens": 860382.0,
|
| 682 |
+
"step": 840
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"epoch": 0.012056395563246433,
|
| 686 |
+
"learning_rate": 0.00019759155768630677,
|
| 687 |
+
"loss": 1.1149,
|
| 688 |
+
"mean_token_accuracy": 0.702646654844284,
|
| 689 |
+
"num_tokens": 870754.0,
|
| 690 |
+
"step": 850
|
| 691 |
+
},
|
| 692 |
+
{
|
| 693 |
+
"epoch": 0.012198235511049331,
|
| 694 |
+
"learning_rate": 0.0001975631896967462,
|
| 695 |
+
"loss": 1.1507,
|
| 696 |
+
"mean_token_accuracy": 0.6957122564315796,
|
| 697 |
+
"num_tokens": 880951.0,
|
| 698 |
+
"step": 860
|
| 699 |
+
},
|
| 700 |
+
{
|
| 701 |
+
"epoch": 0.012340075458852231,
|
| 702 |
+
"learning_rate": 0.00019753482170718563,
|
| 703 |
+
"loss": 1.1001,
|
| 704 |
+
"mean_token_accuracy": 0.7044150590896606,
|
| 705 |
+
"num_tokens": 891008.0,
|
| 706 |
+
"step": 870
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"epoch": 0.012481915406655131,
|
| 710 |
+
"learning_rate": 0.00019750645371762504,
|
| 711 |
+
"loss": 1.1419,
|
| 712 |
+
"mean_token_accuracy": 0.7001836776733399,
|
| 713 |
+
"num_tokens": 901300.0,
|
| 714 |
+
"step": 880
|
| 715 |
+
},
|
| 716 |
+
{
|
| 717 |
+
"epoch": 0.01262375535445803,
|
| 718 |
+
"learning_rate": 0.00019747808572806447,
|
| 719 |
+
"loss": 1.1985,
|
| 720 |
+
"mean_token_accuracy": 0.6821943819522858,
|
| 721 |
+
"num_tokens": 911677.0,
|
| 722 |
+
"step": 890
|
| 723 |
+
},
|
| 724 |
+
{
|
| 725 |
+
"epoch": 0.01276559530226093,
|
| 726 |
+
"learning_rate": 0.0001974497177385039,
|
| 727 |
+
"loss": 1.1275,
|
| 728 |
+
"mean_token_accuracy": 0.6965928733348846,
|
| 729 |
+
"num_tokens": 921937.0,
|
| 730 |
+
"step": 900
|
| 731 |
+
},
|
| 732 |
+
{
|
| 733 |
+
"epoch": 0.012907435250063827,
|
| 734 |
+
"learning_rate": 0.0001974213497489433,
|
| 735 |
+
"loss": 1.1085,
|
| 736 |
+
"mean_token_accuracy": 0.7022442996501923,
|
| 737 |
+
"num_tokens": 932391.0,
|
| 738 |
+
"step": 910
|
| 739 |
+
},
|
| 740 |
+
{
|
| 741 |
+
"epoch": 0.013049275197866727,
|
| 742 |
+
"learning_rate": 0.00019739298175938273,
|
| 743 |
+
"loss": 1.1387,
|
| 744 |
+
"mean_token_accuracy": 0.7010680258274078,
|
| 745 |
+
"num_tokens": 942696.0,
|
| 746 |
+
"step": 920
|
| 747 |
+
},
|
| 748 |
+
{
|
| 749 |
+
"epoch": 0.013191115145669627,
|
| 750 |
+
"learning_rate": 0.00019736461376982213,
|
| 751 |
+
"loss": 1.1503,
|
| 752 |
+
"mean_token_accuracy": 0.693393486738205,
|
| 753 |
+
"num_tokens": 953051.0,
|
| 754 |
+
"step": 930
|
| 755 |
+
},
|
| 756 |
+
{
|
| 757 |
+
"epoch": 0.013332955093472525,
|
| 758 |
+
"learning_rate": 0.00019733624578026156,
|
| 759 |
+
"loss": 1.1153,
|
| 760 |
+
"mean_token_accuracy": 0.6921448647975922,
|
| 761 |
+
"num_tokens": 963299.0,
|
| 762 |
+
"step": 940
|
| 763 |
+
},
|
| 764 |
+
{
|
| 765 |
+
"epoch": 0.013474795041275425,
|
| 766 |
+
"learning_rate": 0.000197307877790701,
|
| 767 |
+
"loss": 1.1478,
|
| 768 |
+
"mean_token_accuracy": 0.6896324157714844,
|
| 769 |
+
"num_tokens": 973501.0,
|
| 770 |
+
"step": 950
|
| 771 |
+
},
|
| 772 |
+
{
|
| 773 |
+
"epoch": 0.013616634989078323,
|
| 774 |
+
"learning_rate": 0.0001972795098011404,
|
| 775 |
+
"loss": 1.1163,
|
| 776 |
+
"mean_token_accuracy": 0.7062079787254334,
|
| 777 |
+
"num_tokens": 983819.0,
|
| 778 |
+
"step": 960
|
| 779 |
+
},
|
| 780 |
+
{
|
| 781 |
+
"epoch": 0.013758474936881223,
|
| 782 |
+
"learning_rate": 0.00019725114181157983,
|
| 783 |
+
"loss": 1.134,
|
| 784 |
+
"mean_token_accuracy": 0.6980367541313172,
|
| 785 |
+
"num_tokens": 994251.0,
|
| 786 |
+
"step": 970
|
| 787 |
+
},
|
| 788 |
+
{
|
| 789 |
+
"epoch": 0.013900314884684123,
|
| 790 |
+
"learning_rate": 0.00019722277382201923,
|
| 791 |
+
"loss": 1.1761,
|
| 792 |
+
"mean_token_accuracy": 0.6891869902610779,
|
| 793 |
+
"num_tokens": 1004567.0,
|
| 794 |
+
"step": 980
|
| 795 |
+
},
|
| 796 |
+
{
|
| 797 |
+
"epoch": 0.014042154832487021,
|
| 798 |
+
"learning_rate": 0.00019719440583245869,
|
| 799 |
+
"loss": 1.1048,
|
| 800 |
+
"mean_token_accuracy": 0.7076132833957672,
|
| 801 |
+
"num_tokens": 1014866.0,
|
| 802 |
+
"step": 990
|
| 803 |
+
},
|
| 804 |
+
{
|
| 805 |
+
"epoch": 0.014183994780289921,
|
| 806 |
+
"learning_rate": 0.0001971660378428981,
|
| 807 |
+
"loss": 1.1435,
|
| 808 |
+
"mean_token_accuracy": 0.6922993123531341,
|
| 809 |
+
"num_tokens": 1025085.0,
|
| 810 |
+
"step": 1000
|
| 811 |
+
},
|
| 812 |
+
{
|
| 813 |
+
"epoch": 0.01432583472809282,
|
| 814 |
+
"learning_rate": 0.0001971376698533375,
|
| 815 |
+
"loss": 1.0938,
|
| 816 |
+
"mean_token_accuracy": 0.7052319467067718,
|
| 817 |
+
"num_tokens": 1035538.0,
|
| 818 |
+
"step": 1010
|
| 819 |
+
},
|
| 820 |
+
{
|
| 821 |
+
"epoch": 0.01446767467589572,
|
| 822 |
+
"learning_rate": 0.00019710930186377692,
|
| 823 |
+
"loss": 1.1406,
|
| 824 |
+
"mean_token_accuracy": 0.6931207239627838,
|
| 825 |
+
"num_tokens": 1045628.0,
|
| 826 |
+
"step": 1020
|
| 827 |
+
},
|
| 828 |
+
{
|
| 829 |
+
"epoch": 0.01460951462369862,
|
| 830 |
+
"learning_rate": 0.00019708093387421632,
|
| 831 |
+
"loss": 1.1303,
|
| 832 |
+
"mean_token_accuracy": 0.698544704914093,
|
| 833 |
+
"num_tokens": 1055971.0,
|
| 834 |
+
"step": 1030
|
| 835 |
+
},
|
| 836 |
+
{
|
| 837 |
+
"epoch": 0.014751354571501517,
|
| 838 |
+
"learning_rate": 0.00019705256588465578,
|
| 839 |
+
"loss": 1.1573,
|
| 840 |
+
"mean_token_accuracy": 0.689105898141861,
|
| 841 |
+
"num_tokens": 1066042.0,
|
| 842 |
+
"step": 1040
|
| 843 |
+
},
|
| 844 |
+
{
|
| 845 |
+
"epoch": 0.014893194519304417,
|
| 846 |
+
"learning_rate": 0.00019702419789509518,
|
| 847 |
+
"loss": 1.0628,
|
| 848 |
+
"mean_token_accuracy": 0.7112344741821289,
|
| 849 |
+
"num_tokens": 1076048.0,
|
| 850 |
+
"step": 1050
|
| 851 |
+
},
|
| 852 |
+
{
|
| 853 |
+
"epoch": 0.015035034467107316,
|
| 854 |
+
"learning_rate": 0.00019699582990553461,
|
| 855 |
+
"loss": 1.1377,
|
| 856 |
+
"mean_token_accuracy": 0.6986138761043549,
|
| 857 |
+
"num_tokens": 1086480.0,
|
| 858 |
+
"step": 1060
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"epoch": 0.015176874414910215,
|
| 862 |
+
"learning_rate": 0.00019696746191597402,
|
| 863 |
+
"loss": 1.1203,
|
| 864 |
+
"mean_token_accuracy": 0.7030752837657929,
|
| 865 |
+
"num_tokens": 1096900.0,
|
| 866 |
+
"step": 1070
|
| 867 |
+
},
|
| 868 |
+
{
|
| 869 |
+
"epoch": 0.015318714362713115,
|
| 870 |
+
"learning_rate": 0.00019693909392641345,
|
| 871 |
+
"loss": 1.1438,
|
| 872 |
+
"mean_token_accuracy": 0.6927322566509246,
|
| 873 |
+
"num_tokens": 1107480.0,
|
| 874 |
+
"step": 1080
|
| 875 |
+
},
|
| 876 |
+
{
|
| 877 |
+
"epoch": 0.015460554310516014,
|
| 878 |
+
"learning_rate": 0.00019691072593685288,
|
| 879 |
+
"loss": 1.0853,
|
| 880 |
+
"mean_token_accuracy": 0.7137106359004974,
|
| 881 |
+
"num_tokens": 1117902.0,
|
| 882 |
+
"step": 1090
|
| 883 |
+
},
|
| 884 |
+
{
|
| 885 |
+
"epoch": 0.015602394258318913,
|
| 886 |
+
"learning_rate": 0.00019688235794729228,
|
| 887 |
+
"loss": 1.1405,
|
| 888 |
+
"mean_token_accuracy": 0.6944672584533691,
|
| 889 |
+
"num_tokens": 1128045.0,
|
| 890 |
+
"step": 1100
|
| 891 |
+
},
|
| 892 |
+
{
|
| 893 |
+
"epoch": 0.015744234206121813,
|
| 894 |
+
"learning_rate": 0.0001968539899577317,
|
| 895 |
+
"loss": 1.1166,
|
| 896 |
+
"mean_token_accuracy": 0.6966266989707947,
|
| 897 |
+
"num_tokens": 1138338.0,
|
| 898 |
+
"step": 1110
|
| 899 |
+
},
|
| 900 |
+
{
|
| 901 |
+
"epoch": 0.01588607415392471,
|
| 902 |
+
"learning_rate": 0.0001968256219681711,
|
| 903 |
+
"loss": 1.1176,
|
| 904 |
+
"mean_token_accuracy": 0.7043495059013367,
|
| 905 |
+
"num_tokens": 1148428.0,
|
| 906 |
+
"step": 1120
|
| 907 |
+
},
|
| 908 |
+
{
|
| 909 |
+
"epoch": 0.01602791410172761,
|
| 910 |
+
"learning_rate": 0.00019679725397861054,
|
| 911 |
+
"loss": 1.0583,
|
| 912 |
+
"mean_token_accuracy": 0.7091330707073211,
|
| 913 |
+
"num_tokens": 1158359.0,
|
| 914 |
+
"step": 1130
|
| 915 |
+
},
|
| 916 |
+
{
|
| 917 |
+
"epoch": 0.01616975404953051,
|
| 918 |
+
"learning_rate": 0.00019676888598904997,
|
| 919 |
+
"loss": 1.2325,
|
| 920 |
+
"mean_token_accuracy": 0.6778323352336884,
|
| 921 |
+
"num_tokens": 1168601.0,
|
| 922 |
+
"step": 1140
|
| 923 |
+
},
|
| 924 |
+
{
|
| 925 |
+
"epoch": 0.01631159399733341,
|
| 926 |
+
"learning_rate": 0.00019674051799948938,
|
| 927 |
+
"loss": 1.0737,
|
| 928 |
+
"mean_token_accuracy": 0.7033764302730561,
|
| 929 |
+
"num_tokens": 1178900.0,
|
| 930 |
+
"step": 1150
|
| 931 |
+
},
|
| 932 |
+
{
|
| 933 |
+
"epoch": 0.01645343394513631,
|
| 934 |
+
"learning_rate": 0.0001967121500099288,
|
| 935 |
+
"loss": 1.1099,
|
| 936 |
+
"mean_token_accuracy": 0.7059059202671051,
|
| 937 |
+
"num_tokens": 1189152.0,
|
| 938 |
+
"step": 1160
|
| 939 |
+
},
|
| 940 |
+
{
|
| 941 |
+
"epoch": 0.016595273892939206,
|
| 942 |
+
"learning_rate": 0.00019668378202036824,
|
| 943 |
+
"loss": 1.1593,
|
| 944 |
+
"mean_token_accuracy": 0.6904339075088501,
|
| 945 |
+
"num_tokens": 1199505.0,
|
| 946 |
+
"step": 1170
|
| 947 |
+
},
|
| 948 |
+
{
|
| 949 |
+
"epoch": 0.016737113840742106,
|
| 950 |
+
"learning_rate": 0.00019665541403080764,
|
| 951 |
+
"loss": 1.0748,
|
| 952 |
+
"mean_token_accuracy": 0.708430927991867,
|
| 953 |
+
"num_tokens": 1209675.0,
|
| 954 |
+
"step": 1180
|
| 955 |
+
},
|
| 956 |
+
{
|
| 957 |
+
"epoch": 0.016878953788545006,
|
| 958 |
+
"learning_rate": 0.00019662704604124707,
|
| 959 |
+
"loss": 1.1252,
|
| 960 |
+
"mean_token_accuracy": 0.7042509257793427,
|
| 961 |
+
"num_tokens": 1219595.0,
|
| 962 |
+
"step": 1190
|
| 963 |
+
},
|
| 964 |
+
{
|
| 965 |
+
"epoch": 0.017020793736347906,
|
| 966 |
+
"learning_rate": 0.00019659867805168647,
|
| 967 |
+
"loss": 1.0726,
|
| 968 |
+
"mean_token_accuracy": 0.6974501132965087,
|
| 969 |
+
"num_tokens": 1229844.0,
|
| 970 |
+
"step": 1200
|
| 971 |
+
},
|
| 972 |
+
{
|
| 973 |
+
"epoch": 0.017162633684150806,
|
| 974 |
+
"learning_rate": 0.00019657031006212593,
|
| 975 |
+
"loss": 1.0663,
|
| 976 |
+
"mean_token_accuracy": 0.7197851002216339,
|
| 977 |
+
"num_tokens": 1239909.0,
|
| 978 |
+
"step": 1210
|
| 979 |
+
},
|
| 980 |
+
{
|
| 981 |
+
"epoch": 0.017304473631953702,
|
| 982 |
+
"learning_rate": 0.00019654194207256533,
|
| 983 |
+
"loss": 1.0802,
|
| 984 |
+
"mean_token_accuracy": 0.7073646426200867,
|
| 985 |
+
"num_tokens": 1250098.0,
|
| 986 |
+
"step": 1220
|
| 987 |
+
},
|
| 988 |
+
{
|
| 989 |
+
"epoch": 0.017446313579756602,
|
| 990 |
+
"learning_rate": 0.00019651357408300476,
|
| 991 |
+
"loss": 1.1082,
|
| 992 |
+
"mean_token_accuracy": 0.7057863056659699,
|
| 993 |
+
"num_tokens": 1260400.0,
|
| 994 |
+
"step": 1230
|
| 995 |
+
},
|
| 996 |
+
{
|
| 997 |
+
"epoch": 0.017588153527559502,
|
| 998 |
+
"learning_rate": 0.00019648520609344416,
|
| 999 |
+
"loss": 1.1259,
|
| 1000 |
+
"mean_token_accuracy": 0.699841320514679,
|
| 1001 |
+
"num_tokens": 1270621.0,
|
| 1002 |
+
"step": 1240
|
| 1003 |
+
},
|
| 1004 |
+
{
|
| 1005 |
+
"epoch": 0.0177299934753624,
|
| 1006 |
+
"learning_rate": 0.00019645683810388357,
|
| 1007 |
+
"loss": 1.1123,
|
| 1008 |
+
"mean_token_accuracy": 0.6950526118278504,
|
| 1009 |
+
"num_tokens": 1280810.0,
|
| 1010 |
+
"step": 1250
|
| 1011 |
+
},
|
| 1012 |
+
{
|
| 1013 |
+
"epoch": 0.0178718334231653,
|
| 1014 |
+
"learning_rate": 0.00019642847011432302,
|
| 1015 |
+
"loss": 1.132,
|
| 1016 |
+
"mean_token_accuracy": 0.692725783586502,
|
| 1017 |
+
"num_tokens": 1291192.0,
|
| 1018 |
+
"step": 1260
|
| 1019 |
+
},
|
| 1020 |
+
{
|
| 1021 |
+
"epoch": 0.018013673370968198,
|
| 1022 |
+
"learning_rate": 0.00019640010212476243,
|
| 1023 |
+
"loss": 1.1279,
|
| 1024 |
+
"mean_token_accuracy": 0.703784042596817,
|
| 1025 |
+
"num_tokens": 1301253.0,
|
| 1026 |
+
"step": 1270
|
| 1027 |
+
},
|
| 1028 |
+
{
|
| 1029 |
+
"epoch": 0.018155513318771098,
|
| 1030 |
+
"learning_rate": 0.00019637173413520186,
|
| 1031 |
+
"loss": 1.064,
|
| 1032 |
+
"mean_token_accuracy": 0.7076753437519073,
|
| 1033 |
+
"num_tokens": 1311455.0,
|
| 1034 |
+
"step": 1280
|
| 1035 |
+
},
|
| 1036 |
+
{
|
| 1037 |
+
"epoch": 0.018297353266573998,
|
| 1038 |
+
"learning_rate": 0.00019634336614564126,
|
| 1039 |
+
"loss": 1.1056,
|
| 1040 |
+
"mean_token_accuracy": 0.7009041368961334,
|
| 1041 |
+
"num_tokens": 1322049.0,
|
| 1042 |
+
"step": 1290
|
| 1043 |
+
},
|
| 1044 |
+
{
|
| 1045 |
+
"epoch": 0.018439193214376898,
|
| 1046 |
+
"learning_rate": 0.0001963149981560807,
|
| 1047 |
+
"loss": 1.1307,
|
| 1048 |
+
"mean_token_accuracy": 0.6981959402561188,
|
| 1049 |
+
"num_tokens": 1332529.0,
|
| 1050 |
+
"step": 1300
|
| 1051 |
+
},
|
| 1052 |
+
{
|
| 1053 |
+
"epoch": 0.018581033162179798,
|
| 1054 |
+
"learning_rate": 0.00019628663016652012,
|
| 1055 |
+
"loss": 1.1149,
|
| 1056 |
+
"mean_token_accuracy": 0.6912563383579254,
|
| 1057 |
+
"num_tokens": 1342800.0,
|
| 1058 |
+
"step": 1310
|
| 1059 |
+
},
|
| 1060 |
+
{
|
| 1061 |
+
"epoch": 0.018722873109982694,
|
| 1062 |
+
"learning_rate": 0.00019625826217695952,
|
| 1063 |
+
"loss": 1.0897,
|
| 1064 |
+
"mean_token_accuracy": 0.7025132238864898,
|
| 1065 |
+
"num_tokens": 1353110.0,
|
| 1066 |
+
"step": 1320
|
| 1067 |
+
},
|
| 1068 |
+
{
|
| 1069 |
+
"epoch": 0.018864713057785594,
|
| 1070 |
+
"learning_rate": 0.00019622989418739895,
|
| 1071 |
+
"loss": 1.1089,
|
| 1072 |
+
"mean_token_accuracy": 0.7050705254077911,
|
| 1073 |
+
"num_tokens": 1363266.0,
|
| 1074 |
+
"step": 1330
|
| 1075 |
+
},
|
| 1076 |
+
{
|
| 1077 |
+
"epoch": 0.019006553005588494,
|
| 1078 |
+
"learning_rate": 0.00019620152619783835,
|
| 1079 |
+
"loss": 1.1038,
|
| 1080 |
+
"mean_token_accuracy": 0.7059103548526764,
|
| 1081 |
+
"num_tokens": 1373500.0,
|
| 1082 |
+
"step": 1340
|
| 1083 |
+
},
|
| 1084 |
+
{
|
| 1085 |
+
"epoch": 0.019148392953391394,
|
| 1086 |
+
"learning_rate": 0.00019617315820827778,
|
| 1087 |
+
"loss": 1.1222,
|
| 1088 |
+
"mean_token_accuracy": 0.7083336532115936,
|
| 1089 |
+
"num_tokens": 1383687.0,
|
| 1090 |
+
"step": 1350
|
| 1091 |
+
},
|
| 1092 |
+
{
|
| 1093 |
+
"epoch": 0.019290232901194294,
|
| 1094 |
+
"learning_rate": 0.00019614479021871721,
|
| 1095 |
+
"loss": 1.1291,
|
| 1096 |
+
"mean_token_accuracy": 0.6995004296302796,
|
| 1097 |
+
"num_tokens": 1394120.0,
|
| 1098 |
+
"step": 1360
|
| 1099 |
+
},
|
| 1100 |
+
{
|
| 1101 |
+
"epoch": 0.01943207284899719,
|
| 1102 |
+
"learning_rate": 0.00019611642222915662,
|
| 1103 |
+
"loss": 1.0957,
|
| 1104 |
+
"mean_token_accuracy": 0.7055183351039886,
|
| 1105 |
+
"num_tokens": 1404491.0,
|
| 1106 |
+
"step": 1370
|
| 1107 |
+
},
|
| 1108 |
+
{
|
| 1109 |
+
"epoch": 0.01957391279680009,
|
| 1110 |
+
"learning_rate": 0.00019608805423959605,
|
| 1111 |
+
"loss": 1.1187,
|
| 1112 |
+
"mean_token_accuracy": 0.6980840861797333,
|
| 1113 |
+
"num_tokens": 1414654.0,
|
| 1114 |
+
"step": 1380
|
| 1115 |
+
},
|
| 1116 |
+
{
|
| 1117 |
+
"epoch": 0.01971575274460299,
|
| 1118 |
+
"learning_rate": 0.00019605968625003548,
|
| 1119 |
+
"loss": 1.1484,
|
| 1120 |
+
"mean_token_accuracy": 0.6943079948425293,
|
| 1121 |
+
"num_tokens": 1424822.0,
|
| 1122 |
+
"step": 1390
|
| 1123 |
+
},
|
| 1124 |
+
{
|
| 1125 |
+
"epoch": 0.01985759269240589,
|
| 1126 |
+
"learning_rate": 0.0001960313182604749,
|
| 1127 |
+
"loss": 1.1231,
|
| 1128 |
+
"mean_token_accuracy": 0.7027773916721344,
|
| 1129 |
+
"num_tokens": 1435210.0,
|
| 1130 |
+
"step": 1400
|
| 1131 |
+
},
|
| 1132 |
+
{
|
| 1133 |
+
"epoch": 0.01999943264020879,
|
| 1134 |
+
"learning_rate": 0.0001960029502709143,
|
| 1135 |
+
"loss": 1.0682,
|
| 1136 |
+
"mean_token_accuracy": 0.7105901122093201,
|
| 1137 |
+
"num_tokens": 1445483.0,
|
| 1138 |
+
"step": 1410
|
| 1139 |
+
},
|
| 1140 |
+
{
|
| 1141 |
+
"epoch": 0.020141272588011686,
|
| 1142 |
+
"learning_rate": 0.0001959745822813537,
|
| 1143 |
+
"loss": 1.0946,
|
| 1144 |
+
"mean_token_accuracy": 0.7032223284244538,
|
| 1145 |
+
"num_tokens": 1455942.0,
|
| 1146 |
+
"step": 1420
|
| 1147 |
+
},
|
| 1148 |
+
{
|
| 1149 |
+
"epoch": 0.020283112535814586,
|
| 1150 |
+
"learning_rate": 0.00019594621429179314,
|
| 1151 |
+
"loss": 1.0769,
|
| 1152 |
+
"mean_token_accuracy": 0.7099736094474792,
|
| 1153 |
+
"num_tokens": 1465992.0,
|
| 1154 |
+
"step": 1430
|
| 1155 |
+
},
|
| 1156 |
+
{
|
| 1157 |
+
"epoch": 0.020424952483617486,
|
| 1158 |
+
"learning_rate": 0.00019591784630223257,
|
| 1159 |
+
"loss": 1.044,
|
| 1160 |
+
"mean_token_accuracy": 0.7143494069576264,
|
| 1161 |
+
"num_tokens": 1476070.0,
|
| 1162 |
+
"step": 1440
|
| 1163 |
+
},
|
| 1164 |
+
{
|
| 1165 |
+
"epoch": 0.020566792431420386,
|
| 1166 |
+
"learning_rate": 0.000195889478312672,
|
| 1167 |
+
"loss": 1.0988,
|
| 1168 |
+
"mean_token_accuracy": 0.6990650355815887,
|
| 1169 |
+
"num_tokens": 1486388.0,
|
| 1170 |
+
"step": 1450
|
| 1171 |
+
},
|
| 1172 |
+
{
|
| 1173 |
+
"epoch": 0.020708632379223286,
|
| 1174 |
+
"learning_rate": 0.0001958611103231114,
|
| 1175 |
+
"loss": 1.0812,
|
| 1176 |
+
"mean_token_accuracy": 0.7099774420261383,
|
| 1177 |
+
"num_tokens": 1496689.0,
|
| 1178 |
+
"step": 1460
|
| 1179 |
+
},
|
| 1180 |
+
{
|
| 1181 |
+
"epoch": 0.020850472327026182,
|
| 1182 |
+
"learning_rate": 0.00019583274233355084,
|
| 1183 |
+
"loss": 1.0747,
|
| 1184 |
+
"mean_token_accuracy": 0.7068913519382477,
|
| 1185 |
+
"num_tokens": 1506676.0,
|
| 1186 |
+
"step": 1470
|
| 1187 |
+
},
|
| 1188 |
+
{
|
| 1189 |
+
"epoch": 0.020992312274829082,
|
| 1190 |
+
"learning_rate": 0.00019580437434399027,
|
| 1191 |
+
"loss": 1.1216,
|
| 1192 |
+
"mean_token_accuracy": 0.6996153056621551,
|
| 1193 |
+
"num_tokens": 1516996.0,
|
| 1194 |
+
"step": 1480
|
| 1195 |
+
},
|
| 1196 |
+
{
|
| 1197 |
+
"epoch": 0.021134152222631982,
|
| 1198 |
+
"learning_rate": 0.00019577600635442967,
|
| 1199 |
+
"loss": 1.0814,
|
| 1200 |
+
"mean_token_accuracy": 0.7133045315742492,
|
| 1201 |
+
"num_tokens": 1526981.0,
|
| 1202 |
+
"step": 1490
|
| 1203 |
+
},
|
| 1204 |
+
{
|
| 1205 |
+
"epoch": 0.021275992170434882,
|
| 1206 |
+
"learning_rate": 0.0001957476383648691,
|
| 1207 |
+
"loss": 1.042,
|
| 1208 |
+
"mean_token_accuracy": 0.7156777441501617,
|
| 1209 |
+
"num_tokens": 1536967.0,
|
| 1210 |
+
"step": 1500
|
| 1211 |
+
},
|
| 1212 |
+
{
|
| 1213 |
+
"epoch": 0.021417832118237782,
|
| 1214 |
+
"learning_rate": 0.0001957192703753085,
|
| 1215 |
+
"loss": 1.0735,
|
| 1216 |
+
"mean_token_accuracy": 0.7105422735214233,
|
| 1217 |
+
"num_tokens": 1547354.0,
|
| 1218 |
+
"step": 1510
|
| 1219 |
+
},
|
| 1220 |
+
{
|
| 1221 |
+
"epoch": 0.02155967206604068,
|
| 1222 |
+
"learning_rate": 0.00019569090238574793,
|
| 1223 |
+
"loss": 1.1221,
|
| 1224 |
+
"mean_token_accuracy": 0.6900959551334381,
|
| 1225 |
+
"num_tokens": 1557285.0,
|
| 1226 |
+
"step": 1520
|
| 1227 |
+
},
|
| 1228 |
+
{
|
| 1229 |
+
"epoch": 0.021701512013843578,
|
| 1230 |
+
"learning_rate": 0.00019566253439618736,
|
| 1231 |
+
"loss": 1.1432,
|
| 1232 |
+
"mean_token_accuracy": 0.7033149361610412,
|
| 1233 |
+
"num_tokens": 1567450.0,
|
| 1234 |
+
"step": 1530
|
| 1235 |
+
},
|
| 1236 |
+
{
|
| 1237 |
+
"epoch": 0.021843351961646478,
|
| 1238 |
+
"learning_rate": 0.00019563416640662676,
|
| 1239 |
+
"loss": 1.0863,
|
| 1240 |
+
"mean_token_accuracy": 0.7135837018489838,
|
| 1241 |
+
"num_tokens": 1577684.0,
|
| 1242 |
+
"step": 1540
|
| 1243 |
+
},
|
| 1244 |
+
{
|
| 1245 |
+
"epoch": 0.021985191909449378,
|
| 1246 |
+
"learning_rate": 0.0001956057984170662,
|
| 1247 |
+
"loss": 1.0564,
|
| 1248 |
+
"mean_token_accuracy": 0.7110729515552521,
|
| 1249 |
+
"num_tokens": 1588410.0,
|
| 1250 |
+
"step": 1550
|
| 1251 |
+
},
|
| 1252 |
+
{
|
| 1253 |
+
"epoch": 0.022127031857252278,
|
| 1254 |
+
"learning_rate": 0.0001955774304275056,
|
| 1255 |
+
"loss": 1.1056,
|
| 1256 |
+
"mean_token_accuracy": 0.7048493981361389,
|
| 1257 |
+
"num_tokens": 1598595.0,
|
| 1258 |
+
"step": 1560
|
| 1259 |
+
},
|
| 1260 |
+
{
|
| 1261 |
+
"epoch": 0.022268871805055174,
|
| 1262 |
+
"learning_rate": 0.00019554906243794505,
|
| 1263 |
+
"loss": 1.0904,
|
| 1264 |
+
"mean_token_accuracy": 0.707161259651184,
|
| 1265 |
+
"num_tokens": 1608782.0,
|
| 1266 |
+
"step": 1570
|
| 1267 |
+
},
|
| 1268 |
+
{
|
| 1269 |
+
"epoch": 0.022410711752858074,
|
| 1270 |
+
"learning_rate": 0.00019552069444838446,
|
| 1271 |
+
"loss": 1.0611,
|
| 1272 |
+
"mean_token_accuracy": 0.7107515692710876,
|
| 1273 |
+
"num_tokens": 1618985.0,
|
| 1274 |
+
"step": 1580
|
| 1275 |
+
},
|
| 1276 |
+
{
|
| 1277 |
+
"epoch": 0.022552551700660974,
|
| 1278 |
+
"learning_rate": 0.00019549232645882386,
|
| 1279 |
+
"loss": 1.0864,
|
| 1280 |
+
"mean_token_accuracy": 0.7008066534996032,
|
| 1281 |
+
"num_tokens": 1629402.0,
|
| 1282 |
+
"step": 1590
|
| 1283 |
+
},
|
| 1284 |
+
{
|
| 1285 |
+
"epoch": 0.022694391648463874,
|
| 1286 |
+
"learning_rate": 0.0001954639584692633,
|
| 1287 |
+
"loss": 1.1076,
|
| 1288 |
+
"mean_token_accuracy": 0.6996842324733734,
|
| 1289 |
+
"num_tokens": 1639522.0,
|
| 1290 |
+
"step": 1600
|
| 1291 |
+
},
|
| 1292 |
+
{
|
| 1293 |
+
"epoch": 0.022836231596266774,
|
| 1294 |
+
"learning_rate": 0.00019543559047970272,
|
| 1295 |
+
"loss": 1.1011,
|
| 1296 |
+
"mean_token_accuracy": 0.7056106328964233,
|
| 1297 |
+
"num_tokens": 1649778.0,
|
| 1298 |
+
"step": 1610
|
| 1299 |
+
},
|
| 1300 |
+
{
|
| 1301 |
+
"epoch": 0.02297807154406967,
|
| 1302 |
+
"learning_rate": 0.00019540722249014215,
|
| 1303 |
+
"loss": 1.118,
|
| 1304 |
+
"mean_token_accuracy": 0.696357262134552,
|
| 1305 |
+
"num_tokens": 1660207.0,
|
| 1306 |
+
"step": 1620
|
| 1307 |
+
},
|
| 1308 |
+
{
|
| 1309 |
+
"epoch": 0.02311991149187257,
|
| 1310 |
+
"learning_rate": 0.00019537885450058155,
|
| 1311 |
+
"loss": 1.0676,
|
| 1312 |
+
"mean_token_accuracy": 0.7104279041290283,
|
| 1313 |
+
"num_tokens": 1670377.0,
|
| 1314 |
+
"step": 1630
|
| 1315 |
+
},
|
| 1316 |
+
{
|
| 1317 |
+
"epoch": 0.02326175143967547,
|
| 1318 |
+
"learning_rate": 0.00019535048651102098,
|
| 1319 |
+
"loss": 1.0716,
|
| 1320 |
+
"mean_token_accuracy": 0.7117774069309235,
|
| 1321 |
+
"num_tokens": 1680672.0,
|
| 1322 |
+
"step": 1640
|
| 1323 |
+
},
|
| 1324 |
+
{
|
| 1325 |
+
"epoch": 0.02340359138747837,
|
| 1326 |
+
"learning_rate": 0.00019532211852146038,
|
| 1327 |
+
"loss": 1.088,
|
| 1328 |
+
"mean_token_accuracy": 0.7042657971382141,
|
| 1329 |
+
"num_tokens": 1691141.0,
|
| 1330 |
+
"step": 1650
|
| 1331 |
+
},
|
| 1332 |
+
{
|
| 1333 |
+
"epoch": 0.02354543133528127,
|
| 1334 |
+
"learning_rate": 0.00019529375053189981,
|
| 1335 |
+
"loss": 1.1076,
|
| 1336 |
+
"mean_token_accuracy": 0.7016879081726074,
|
| 1337 |
+
"num_tokens": 1701472.0,
|
| 1338 |
+
"step": 1660
|
| 1339 |
+
},
|
| 1340 |
+
{
|
| 1341 |
+
"epoch": 0.023687271283084167,
|
| 1342 |
+
"learning_rate": 0.00019526538254233924,
|
| 1343 |
+
"loss": 1.0842,
|
| 1344 |
+
"mean_token_accuracy": 0.7067211866378784,
|
| 1345 |
+
"num_tokens": 1711678.0,
|
| 1346 |
+
"step": 1670
|
| 1347 |
+
},
|
| 1348 |
+
{
|
| 1349 |
+
"epoch": 0.023829111230887066,
|
| 1350 |
+
"learning_rate": 0.00019523701455277865,
|
| 1351 |
+
"loss": 1.0875,
|
| 1352 |
+
"mean_token_accuracy": 0.7065619647502899,
|
| 1353 |
+
"num_tokens": 1722146.0,
|
| 1354 |
+
"step": 1680
|
| 1355 |
+
},
|
| 1356 |
+
{
|
| 1357 |
+
"epoch": 0.023970951178689966,
|
| 1358 |
+
"learning_rate": 0.00019520864656321808,
|
| 1359 |
+
"loss": 1.0687,
|
| 1360 |
+
"mean_token_accuracy": 0.7062947809696197,
|
| 1361 |
+
"num_tokens": 1732546.0,
|
| 1362 |
+
"step": 1690
|
| 1363 |
+
},
|
| 1364 |
+
{
|
| 1365 |
+
"epoch": 0.024112791126492866,
|
| 1366 |
+
"learning_rate": 0.0001951802785736575,
|
| 1367 |
+
"loss": 1.0985,
|
| 1368 |
+
"mean_token_accuracy": 0.7017097353935242,
|
| 1369 |
+
"num_tokens": 1742873.0,
|
| 1370 |
+
"step": 1700
|
| 1371 |
+
},
|
| 1372 |
+
{
|
| 1373 |
+
"epoch": 0.024254631074295766,
|
| 1374 |
+
"learning_rate": 0.0001951519105840969,
|
| 1375 |
+
"loss": 1.0768,
|
| 1376 |
+
"mean_token_accuracy": 0.7073511421680451,
|
| 1377 |
+
"num_tokens": 1753232.0,
|
| 1378 |
+
"step": 1710
|
| 1379 |
+
},
|
| 1380 |
+
{
|
| 1381 |
+
"epoch": 0.024396471022098663,
|
| 1382 |
+
"learning_rate": 0.00019512354259453634,
|
| 1383 |
+
"loss": 1.0708,
|
| 1384 |
+
"mean_token_accuracy": 0.7018490791320801,
|
| 1385 |
+
"num_tokens": 1763622.0,
|
| 1386 |
+
"step": 1720
|
| 1387 |
+
},
|
| 1388 |
+
{
|
| 1389 |
+
"epoch": 0.024538310969901563,
|
| 1390 |
+
"learning_rate": 0.00019509517460497574,
|
| 1391 |
+
"loss": 1.1175,
|
| 1392 |
+
"mean_token_accuracy": 0.7016505122184753,
|
| 1393 |
+
"num_tokens": 1774027.0,
|
| 1394 |
+
"step": 1730
|
| 1395 |
+
},
|
| 1396 |
+
{
|
| 1397 |
+
"epoch": 0.024680150917704462,
|
| 1398 |
+
"learning_rate": 0.00019506680661541517,
|
| 1399 |
+
"loss": 1.0774,
|
| 1400 |
+
"mean_token_accuracy": 0.7039576828479767,
|
| 1401 |
+
"num_tokens": 1784277.0,
|
| 1402 |
+
"step": 1740
|
| 1403 |
+
},
|
| 1404 |
+
{
|
| 1405 |
+
"epoch": 0.024821990865507362,
|
| 1406 |
+
"learning_rate": 0.0001950384386258546,
|
| 1407 |
+
"loss": 1.092,
|
| 1408 |
+
"mean_token_accuracy": 0.7060896992683411,
|
| 1409 |
+
"num_tokens": 1794436.0,
|
| 1410 |
+
"step": 1750
|
| 1411 |
+
},
|
| 1412 |
+
{
|
| 1413 |
+
"epoch": 0.024963830813310262,
|
| 1414 |
+
"learning_rate": 0.000195010070636294,
|
| 1415 |
+
"loss": 1.0478,
|
| 1416 |
+
"mean_token_accuracy": 0.708050674200058,
|
| 1417 |
+
"num_tokens": 1804575.0,
|
| 1418 |
+
"step": 1760
|
| 1419 |
+
},
|
| 1420 |
+
{
|
| 1421 |
+
"epoch": 0.02510567076111316,
|
| 1422 |
+
"learning_rate": 0.00019498170264673344,
|
| 1423 |
+
"loss": 1.0737,
|
| 1424 |
+
"mean_token_accuracy": 0.7079983413219452,
|
| 1425 |
+
"num_tokens": 1814603.0,
|
| 1426 |
+
"step": 1770
|
| 1427 |
+
},
|
| 1428 |
+
{
|
| 1429 |
+
"epoch": 0.02524751070891606,
|
| 1430 |
+
"learning_rate": 0.00019495333465717284,
|
| 1431 |
+
"loss": 1.0445,
|
| 1432 |
+
"mean_token_accuracy": 0.7188371956348419,
|
| 1433 |
+
"num_tokens": 1824990.0,
|
| 1434 |
+
"step": 1780
|
| 1435 |
+
},
|
| 1436 |
+
{
|
| 1437 |
+
"epoch": 0.02538935065671896,
|
| 1438 |
+
"learning_rate": 0.0001949249666676123,
|
| 1439 |
+
"loss": 1.0557,
|
| 1440 |
+
"mean_token_accuracy": 0.7108985543251037,
|
| 1441 |
+
"num_tokens": 1835477.0,
|
| 1442 |
+
"step": 1790
|
| 1443 |
+
},
|
| 1444 |
+
{
|
| 1445 |
+
"epoch": 0.02553119060452186,
|
| 1446 |
+
"learning_rate": 0.0001948965986780517,
|
| 1447 |
+
"loss": 1.054,
|
| 1448 |
+
"mean_token_accuracy": 0.7082186043262482,
|
| 1449 |
+
"num_tokens": 1845736.0,
|
| 1450 |
+
"step": 1800
|
| 1451 |
+
},
|
| 1452 |
+
{
|
| 1453 |
+
"epoch": 0.02567303055232476,
|
| 1454 |
+
"learning_rate": 0.00019486823068849113,
|
| 1455 |
+
"loss": 1.0705,
|
| 1456 |
+
"mean_token_accuracy": 0.7082441449165344,
|
| 1457 |
+
"num_tokens": 1855966.0,
|
| 1458 |
+
"step": 1810
|
| 1459 |
+
},
|
| 1460 |
+
{
|
| 1461 |
+
"epoch": 0.025814870500127655,
|
| 1462 |
+
"learning_rate": 0.00019483986269893053,
|
| 1463 |
+
"loss": 1.0473,
|
| 1464 |
+
"mean_token_accuracy": 0.7133744478225708,
|
| 1465 |
+
"num_tokens": 1866194.0,
|
| 1466 |
+
"step": 1820
|
| 1467 |
+
},
|
| 1468 |
+
{
|
| 1469 |
+
"epoch": 0.025956710447930555,
|
| 1470 |
+
"learning_rate": 0.00019481149470936993,
|
| 1471 |
+
"loss": 1.0913,
|
| 1472 |
+
"mean_token_accuracy": 0.700226366519928,
|
| 1473 |
+
"num_tokens": 1876551.0,
|
| 1474 |
+
"step": 1830
|
| 1475 |
+
},
|
| 1476 |
+
{
|
| 1477 |
+
"epoch": 0.026098550395733455,
|
| 1478 |
+
"learning_rate": 0.0001947831267198094,
|
| 1479 |
+
"loss": 1.0627,
|
| 1480 |
+
"mean_token_accuracy": 0.7126640200614929,
|
| 1481 |
+
"num_tokens": 1886799.0,
|
| 1482 |
+
"step": 1840
|
| 1483 |
+
},
|
| 1484 |
+
{
|
| 1485 |
+
"epoch": 0.026240390343536354,
|
| 1486 |
+
"learning_rate": 0.0001947547587302488,
|
| 1487 |
+
"loss": 1.1058,
|
| 1488 |
+
"mean_token_accuracy": 0.7105142951011658,
|
| 1489 |
+
"num_tokens": 1897204.0,
|
| 1490 |
+
"step": 1850
|
| 1491 |
+
},
|
| 1492 |
+
{
|
| 1493 |
+
"epoch": 0.026382230291339254,
|
| 1494 |
+
"learning_rate": 0.00019472639074068822,
|
| 1495 |
+
"loss": 1.1202,
|
| 1496 |
+
"mean_token_accuracy": 0.6887533903121948,
|
| 1497 |
+
"num_tokens": 1907365.0,
|
| 1498 |
+
"step": 1860
|
| 1499 |
+
},
|
| 1500 |
+
{
|
| 1501 |
+
"epoch": 0.02652407023914215,
|
| 1502 |
+
"learning_rate": 0.00019469802275112763,
|
| 1503 |
+
"loss": 1.0668,
|
| 1504 |
+
"mean_token_accuracy": 0.7144553422927856,
|
| 1505 |
+
"num_tokens": 1917702.0,
|
| 1506 |
+
"step": 1870
|
| 1507 |
+
},
|
| 1508 |
+
{
|
| 1509 |
+
"epoch": 0.02666591018694505,
|
| 1510 |
+
"learning_rate": 0.00019466965476156706,
|
| 1511 |
+
"loss": 1.1356,
|
| 1512 |
+
"mean_token_accuracy": 0.6965939939022064,
|
| 1513 |
+
"num_tokens": 1928031.0,
|
| 1514 |
+
"step": 1880
|
| 1515 |
+
},
|
| 1516 |
+
{
|
| 1517 |
+
"epoch": 0.02680775013474795,
|
| 1518 |
+
"learning_rate": 0.0001946412867720065,
|
| 1519 |
+
"loss": 1.0778,
|
| 1520 |
+
"mean_token_accuracy": 0.7089079439640045,
|
| 1521 |
+
"num_tokens": 1938084.0,
|
| 1522 |
+
"step": 1890
|
| 1523 |
+
},
|
| 1524 |
+
{
|
| 1525 |
+
"epoch": 0.02694959008255085,
|
| 1526 |
+
"learning_rate": 0.0001946129187824459,
|
| 1527 |
+
"loss": 1.0483,
|
| 1528 |
+
"mean_token_accuracy": 0.7225513160228729,
|
| 1529 |
+
"num_tokens": 1948205.0,
|
| 1530 |
+
"step": 1900
|
| 1531 |
+
},
|
| 1532 |
+
{
|
| 1533 |
+
"epoch": 0.02709143003035375,
|
| 1534 |
+
"learning_rate": 0.00019458455079288532,
|
| 1535 |
+
"loss": 1.0154,
|
| 1536 |
+
"mean_token_accuracy": 0.7130086362361908,
|
| 1537 |
+
"num_tokens": 1958368.0,
|
| 1538 |
+
"step": 1910
|
| 1539 |
+
},
|
| 1540 |
+
{
|
| 1541 |
+
"epoch": 0.027233269978156647,
|
| 1542 |
+
"learning_rate": 0.00019455618280332472,
|
| 1543 |
+
"loss": 1.1258,
|
| 1544 |
+
"mean_token_accuracy": 0.695936119556427,
|
| 1545 |
+
"num_tokens": 1968738.0,
|
| 1546 |
+
"step": 1920
|
| 1547 |
+
},
|
| 1548 |
+
{
|
| 1549 |
+
"epoch": 0.027375109925959547,
|
| 1550 |
+
"learning_rate": 0.00019452781481376415,
|
| 1551 |
+
"loss": 1.0626,
|
| 1552 |
+
"mean_token_accuracy": 0.7156413078308106,
|
| 1553 |
+
"num_tokens": 1979045.0,
|
| 1554 |
+
"step": 1930
|
| 1555 |
+
},
|
| 1556 |
+
{
|
| 1557 |
+
"epoch": 0.027516949873762447,
|
| 1558 |
+
"learning_rate": 0.00019449944682420358,
|
| 1559 |
+
"loss": 1.0519,
|
| 1560 |
+
"mean_token_accuracy": 0.7116637229919434,
|
| 1561 |
+
"num_tokens": 1989218.0,
|
| 1562 |
+
"step": 1940
|
| 1563 |
+
},
|
| 1564 |
+
{
|
| 1565 |
+
"epoch": 0.027658789821565347,
|
| 1566 |
+
"learning_rate": 0.00019447107883464299,
|
| 1567 |
+
"loss": 1.0712,
|
| 1568 |
+
"mean_token_accuracy": 0.711160945892334,
|
| 1569 |
+
"num_tokens": 1999563.0,
|
| 1570 |
+
"step": 1950
|
| 1571 |
+
},
|
| 1572 |
+
{
|
| 1573 |
+
"epoch": 0.027800629769368247,
|
| 1574 |
+
"learning_rate": 0.00019444271084508242,
|
| 1575 |
+
"loss": 1.1045,
|
| 1576 |
+
"mean_token_accuracy": 0.7045223116874695,
|
| 1577 |
+
"num_tokens": 2009911.0,
|
| 1578 |
+
"step": 1960
|
| 1579 |
+
},
|
| 1580 |
+
{
|
| 1581 |
+
"epoch": 0.027942469717171143,
|
| 1582 |
+
"learning_rate": 0.00019441434285552184,
|
| 1583 |
+
"loss": 1.0463,
|
| 1584 |
+
"mean_token_accuracy": 0.7148343741893768,
|
| 1585 |
+
"num_tokens": 2020120.0,
|
| 1586 |
+
"step": 1970
|
| 1587 |
+
},
|
| 1588 |
+
{
|
| 1589 |
+
"epoch": 0.028084309664974043,
|
| 1590 |
+
"learning_rate": 0.00019438597486596127,
|
| 1591 |
+
"loss": 1.0957,
|
| 1592 |
+
"mean_token_accuracy": 0.7087677419185638,
|
| 1593 |
+
"num_tokens": 2030445.0,
|
| 1594 |
+
"step": 1980
|
| 1595 |
+
},
|
| 1596 |
+
{
|
| 1597 |
+
"epoch": 0.028226149612776943,
|
| 1598 |
+
"learning_rate": 0.00019435760687640068,
|
| 1599 |
+
"loss": 1.0338,
|
| 1600 |
+
"mean_token_accuracy": 0.7160651028156281,
|
| 1601 |
+
"num_tokens": 2040755.0,
|
| 1602 |
+
"step": 1990
|
| 1603 |
+
},
|
| 1604 |
+
{
|
| 1605 |
+
"epoch": 0.028367989560579843,
|
| 1606 |
+
"learning_rate": 0.00019432923888684008,
|
| 1607 |
+
"loss": 1.0508,
|
| 1608 |
+
"mean_token_accuracy": 0.710547685623169,
|
| 1609 |
+
"num_tokens": 2050888.0,
|
| 1610 |
+
"step": 2000
|
| 1611 |
+
},
|
| 1612 |
+
{
|
| 1613 |
+
"epoch": 0.028509829508382743,
|
| 1614 |
+
"learning_rate": 0.00019430087089727954,
|
| 1615 |
+
"loss": 1.0991,
|
| 1616 |
+
"mean_token_accuracy": 0.6983346939086914,
|
| 1617 |
+
"num_tokens": 2061216.0,
|
| 1618 |
+
"step": 2010
|
| 1619 |
+
},
|
| 1620 |
+
{
|
| 1621 |
+
"epoch": 0.02865166945618564,
|
| 1622 |
+
"learning_rate": 0.00019427250290771894,
|
| 1623 |
+
"loss": 1.0335,
|
| 1624 |
+
"mean_token_accuracy": 0.7186195015907287,
|
| 1625 |
+
"num_tokens": 2071685.0,
|
| 1626 |
+
"step": 2020
|
| 1627 |
+
},
|
| 1628 |
+
{
|
| 1629 |
+
"epoch": 0.02879350940398854,
|
| 1630 |
+
"learning_rate": 0.00019424413491815837,
|
| 1631 |
+
"loss": 1.0356,
|
| 1632 |
+
"mean_token_accuracy": 0.707346785068512,
|
| 1633 |
+
"num_tokens": 2081856.0,
|
| 1634 |
+
"step": 2030
|
| 1635 |
+
},
|
| 1636 |
+
{
|
| 1637 |
+
"epoch": 0.02893534935179144,
|
| 1638 |
+
"learning_rate": 0.00019421576692859777,
|
| 1639 |
+
"loss": 1.0796,
|
| 1640 |
+
"mean_token_accuracy": 0.713982081413269,
|
| 1641 |
+
"num_tokens": 2092164.0,
|
| 1642 |
+
"step": 2040
|
| 1643 |
+
},
|
| 1644 |
+
{
|
| 1645 |
+
"epoch": 0.02907718929959434,
|
| 1646 |
+
"learning_rate": 0.0001941873989390372,
|
| 1647 |
+
"loss": 1.0606,
|
| 1648 |
+
"mean_token_accuracy": 0.7008832335472107,
|
| 1649 |
+
"num_tokens": 2102352.0,
|
| 1650 |
+
"step": 2050
|
| 1651 |
+
},
|
| 1652 |
+
{
|
| 1653 |
+
"epoch": 0.02921902924739724,
|
| 1654 |
+
"learning_rate": 0.00019415903094947663,
|
| 1655 |
+
"loss": 1.0889,
|
| 1656 |
+
"mean_token_accuracy": 0.7124337434768677,
|
| 1657 |
+
"num_tokens": 2112565.0,
|
| 1658 |
+
"step": 2060
|
| 1659 |
+
},
|
| 1660 |
+
{
|
| 1661 |
+
"epoch": 0.029360869195200135,
|
| 1662 |
+
"learning_rate": 0.00019413066295991604,
|
| 1663 |
+
"loss": 1.1011,
|
| 1664 |
+
"mean_token_accuracy": 0.7071171522140502,
|
| 1665 |
+
"num_tokens": 2122888.0,
|
| 1666 |
+
"step": 2070
|
| 1667 |
+
},
|
| 1668 |
+
{
|
| 1669 |
+
"epoch": 0.029502709143003035,
|
| 1670 |
+
"learning_rate": 0.00019410229497035547,
|
| 1671 |
+
"loss": 1.1143,
|
| 1672 |
+
"mean_token_accuracy": 0.706645280122757,
|
| 1673 |
+
"num_tokens": 2133066.0,
|
| 1674 |
+
"step": 2080
|
| 1675 |
+
},
|
| 1676 |
+
{
|
| 1677 |
+
"epoch": 0.029644549090805935,
|
| 1678 |
+
"learning_rate": 0.00019407392698079487,
|
| 1679 |
+
"loss": 1.0579,
|
| 1680 |
+
"mean_token_accuracy": 0.7105309844017029,
|
| 1681 |
+
"num_tokens": 2143320.0,
|
| 1682 |
+
"step": 2090
|
| 1683 |
+
},
|
| 1684 |
+
{
|
| 1685 |
+
"epoch": 0.029786389038608835,
|
| 1686 |
+
"learning_rate": 0.0001940455589912343,
|
| 1687 |
+
"loss": 1.0705,
|
| 1688 |
+
"mean_token_accuracy": 0.7104713022708893,
|
| 1689 |
+
"num_tokens": 2153668.0,
|
| 1690 |
+
"step": 2100
|
| 1691 |
+
},
|
| 1692 |
+
{
|
| 1693 |
+
"epoch": 0.029928228986411735,
|
| 1694 |
+
"learning_rate": 0.00019401719100167373,
|
| 1695 |
+
"loss": 1.0957,
|
| 1696 |
+
"mean_token_accuracy": 0.7042783737182617,
|
| 1697 |
+
"num_tokens": 2164139.0,
|
| 1698 |
+
"step": 2110
|
| 1699 |
+
},
|
| 1700 |
+
{
|
| 1701 |
+
"epoch": 0.03007006893421463,
|
| 1702 |
+
"learning_rate": 0.00019398882301211313,
|
| 1703 |
+
"loss": 1.0888,
|
| 1704 |
+
"mean_token_accuracy": 0.7090662837028503,
|
| 1705 |
+
"num_tokens": 2174254.0,
|
| 1706 |
+
"step": 2120
|
| 1707 |
+
},
|
| 1708 |
+
{
|
| 1709 |
+
"epoch": 0.03021190888201753,
|
| 1710 |
+
"learning_rate": 0.00019396045502255256,
|
| 1711 |
+
"loss": 1.085,
|
| 1712 |
+
"mean_token_accuracy": 0.7122257769107818,
|
| 1713 |
+
"num_tokens": 2184390.0,
|
| 1714 |
+
"step": 2130
|
| 1715 |
+
},
|
| 1716 |
+
{
|
| 1717 |
+
"epoch": 0.03035374882982043,
|
| 1718 |
+
"learning_rate": 0.00019393208703299196,
|
| 1719 |
+
"loss": 1.0785,
|
| 1720 |
+
"mean_token_accuracy": 0.7105566322803497,
|
| 1721 |
+
"num_tokens": 2194501.0,
|
| 1722 |
+
"step": 2140
|
| 1723 |
+
},
|
| 1724 |
+
{
|
| 1725 |
+
"epoch": 0.03049558877762333,
|
| 1726 |
+
"learning_rate": 0.00019390371904343142,
|
| 1727 |
+
"loss": 1.078,
|
| 1728 |
+
"mean_token_accuracy": 0.7077171504497528,
|
| 1729 |
+
"num_tokens": 2204777.0,
|
| 1730 |
+
"step": 2150
|
| 1731 |
+
},
|
| 1732 |
+
{
|
| 1733 |
+
"epoch": 0.03063742872542623,
|
| 1734 |
+
"learning_rate": 0.00019387535105387082,
|
| 1735 |
+
"loss": 1.0877,
|
| 1736 |
+
"mean_token_accuracy": 0.7124951481819153,
|
| 1737 |
+
"num_tokens": 2215021.0,
|
| 1738 |
+
"step": 2160
|
| 1739 |
+
},
|
| 1740 |
+
{
|
| 1741 |
+
"epoch": 0.030779268673229127,
|
| 1742 |
+
"learning_rate": 0.00019384698306431023,
|
| 1743 |
+
"loss": 1.114,
|
| 1744 |
+
"mean_token_accuracy": 0.6971140921115875,
|
| 1745 |
+
"num_tokens": 2225350.0,
|
| 1746 |
+
"step": 2170
|
| 1747 |
+
},
|
| 1748 |
+
{
|
| 1749 |
+
"epoch": 0.030921108621032027,
|
| 1750 |
+
"learning_rate": 0.00019381861507474966,
|
| 1751 |
+
"loss": 1.081,
|
| 1752 |
+
"mean_token_accuracy": 0.7068113803863525,
|
| 1753 |
+
"num_tokens": 2235521.0,
|
| 1754 |
+
"step": 2180
|
| 1755 |
+
},
|
| 1756 |
+
{
|
| 1757 |
+
"epoch": 0.031062948568834927,
|
| 1758 |
+
"learning_rate": 0.0001937902470851891,
|
| 1759 |
+
"loss": 1.0834,
|
| 1760 |
+
"mean_token_accuracy": 0.7073126614093781,
|
| 1761 |
+
"num_tokens": 2245925.0,
|
| 1762 |
+
"step": 2190
|
| 1763 |
+
},
|
| 1764 |
+
{
|
| 1765 |
+
"epoch": 0.031204788516637827,
|
| 1766 |
+
"learning_rate": 0.00019376187909562852,
|
| 1767 |
+
"loss": 1.0519,
|
| 1768 |
+
"mean_token_accuracy": 0.7111847221851348,
|
| 1769 |
+
"num_tokens": 2256364.0,
|
| 1770 |
+
"step": 2200
|
| 1771 |
+
},
|
| 1772 |
+
{
|
| 1773 |
+
"epoch": 0.03134662846444072,
|
| 1774 |
+
"learning_rate": 0.00019373351110606792,
|
| 1775 |
+
"loss": 1.0787,
|
| 1776 |
+
"mean_token_accuracy": 0.709247374534607,
|
| 1777 |
+
"num_tokens": 2266801.0,
|
| 1778 |
+
"step": 2210
|
| 1779 |
+
},
|
| 1780 |
+
{
|
| 1781 |
+
"epoch": 0.03148846841224363,
|
| 1782 |
+
"learning_rate": 0.00019370514311650735,
|
| 1783 |
+
"loss": 1.0879,
|
| 1784 |
+
"mean_token_accuracy": 0.7051020622253418,
|
| 1785 |
+
"num_tokens": 2276883.0,
|
| 1786 |
+
"step": 2220
|
| 1787 |
+
},
|
| 1788 |
+
{
|
| 1789 |
+
"epoch": 0.03163030836004652,
|
| 1790 |
+
"learning_rate": 0.00019367677512694675,
|
| 1791 |
+
"loss": 1.0577,
|
| 1792 |
+
"mean_token_accuracy": 0.7003655672073364,
|
| 1793 |
+
"num_tokens": 2287178.0,
|
| 1794 |
+
"step": 2230
|
| 1795 |
+
},
|
| 1796 |
+
{
|
| 1797 |
+
"epoch": 0.03177214830784942,
|
| 1798 |
+
"learning_rate": 0.00019364840713738618,
|
| 1799 |
+
"loss": 1.1319,
|
| 1800 |
+
"mean_token_accuracy": 0.6982253730297089,
|
| 1801 |
+
"num_tokens": 2297269.0,
|
| 1802 |
+
"step": 2240
|
| 1803 |
+
},
|
| 1804 |
+
{
|
| 1805 |
+
"epoch": 0.03191398825565232,
|
| 1806 |
+
"learning_rate": 0.0001936200391478256,
|
| 1807 |
+
"loss": 1.0966,
|
| 1808 |
+
"mean_token_accuracy": 0.7028747737407685,
|
| 1809 |
+
"num_tokens": 2307553.0,
|
| 1810 |
+
"step": 2250
|
| 1811 |
+
},
|
| 1812 |
+
{
|
| 1813 |
+
"epoch": 0.03205582820345522,
|
| 1814 |
+
"learning_rate": 0.00019359167115826502,
|
| 1815 |
+
"loss": 1.0234,
|
| 1816 |
+
"mean_token_accuracy": 0.7202287912368774,
|
| 1817 |
+
"num_tokens": 2317537.0,
|
| 1818 |
+
"step": 2260
|
| 1819 |
+
},
|
| 1820 |
+
{
|
| 1821 |
+
"epoch": 0.03219766815125812,
|
| 1822 |
+
"learning_rate": 0.00019356330316870445,
|
| 1823 |
+
"loss": 1.0733,
|
| 1824 |
+
"mean_token_accuracy": 0.7071494162082672,
|
| 1825 |
+
"num_tokens": 2327652.0,
|
| 1826 |
+
"step": 2270
|
| 1827 |
+
},
|
| 1828 |
+
{
|
| 1829 |
+
"epoch": 0.03233950809906102,
|
| 1830 |
+
"learning_rate": 0.00019353493517914388,
|
| 1831 |
+
"loss": 1.0692,
|
| 1832 |
+
"mean_token_accuracy": 0.704166728258133,
|
| 1833 |
+
"num_tokens": 2337948.0,
|
| 1834 |
+
"step": 2280
|
| 1835 |
+
},
|
| 1836 |
+
{
|
| 1837 |
+
"epoch": 0.032481348046863916,
|
| 1838 |
+
"learning_rate": 0.00019350656718958328,
|
| 1839 |
+
"loss": 1.0926,
|
| 1840 |
+
"mean_token_accuracy": 0.7022884428501129,
|
| 1841 |
+
"num_tokens": 2348363.0,
|
| 1842 |
+
"step": 2290
|
| 1843 |
+
},
|
| 1844 |
+
{
|
| 1845 |
+
"epoch": 0.03262318799466682,
|
| 1846 |
+
"learning_rate": 0.0001934781992000227,
|
| 1847 |
+
"loss": 1.1009,
|
| 1848 |
+
"mean_token_accuracy": 0.7048872351646424,
|
| 1849 |
+
"num_tokens": 2358709.0,
|
| 1850 |
+
"step": 2300
|
| 1851 |
+
},
|
| 1852 |
+
{
|
| 1853 |
+
"epoch": 0.032765027942469716,
|
| 1854 |
+
"learning_rate": 0.0001934498312104621,
|
| 1855 |
+
"loss": 1.0722,
|
| 1856 |
+
"mean_token_accuracy": 0.7133905410766601,
|
| 1857 |
+
"num_tokens": 2368896.0,
|
| 1858 |
+
"step": 2310
|
| 1859 |
+
},
|
| 1860 |
+
{
|
| 1861 |
+
"epoch": 0.03290686789027262,
|
| 1862 |
+
"learning_rate": 0.00019342146322090154,
|
| 1863 |
+
"loss": 1.0448,
|
| 1864 |
+
"mean_token_accuracy": 0.7166795194149017,
|
| 1865 |
+
"num_tokens": 2379133.0,
|
| 1866 |
+
"step": 2320
|
| 1867 |
+
},
|
| 1868 |
+
{
|
| 1869 |
+
"epoch": 0.033048707838075515,
|
| 1870 |
+
"learning_rate": 0.00019339309523134097,
|
| 1871 |
+
"loss": 1.07,
|
| 1872 |
+
"mean_token_accuracy": 0.7065820157527923,
|
| 1873 |
+
"num_tokens": 2389292.0,
|
| 1874 |
+
"step": 2330
|
| 1875 |
+
},
|
| 1876 |
+
{
|
| 1877 |
+
"epoch": 0.03319054778587841,
|
| 1878 |
+
"learning_rate": 0.00019336472724178037,
|
| 1879 |
+
"loss": 1.084,
|
| 1880 |
+
"mean_token_accuracy": 0.7133280396461487,
|
| 1881 |
+
"num_tokens": 2399640.0,
|
| 1882 |
+
"step": 2340
|
| 1883 |
+
},
|
| 1884 |
+
{
|
| 1885 |
+
"epoch": 0.033332387733681315,
|
| 1886 |
+
"learning_rate": 0.0001933363592522198,
|
| 1887 |
+
"loss": 1.0789,
|
| 1888 |
+
"mean_token_accuracy": 0.70665163397789,
|
| 1889 |
+
"num_tokens": 2410039.0,
|
| 1890 |
+
"step": 2350
|
| 1891 |
+
}
|
| 1892 |
+
],
|
| 1893 |
+
"logging_steps": 10,
|
| 1894 |
+
"max_steps": 70502,
|
| 1895 |
+
"num_input_tokens_seen": 0,
|
| 1896 |
+
"num_train_epochs": 9223372036854775807,
|
| 1897 |
+
"save_steps": 2350,
|
| 1898 |
+
"stateful_callbacks": {
|
| 1899 |
+
"TrainerControl": {
|
| 1900 |
+
"args": {
|
| 1901 |
+
"should_epoch_stop": false,
|
| 1902 |
+
"should_evaluate": false,
|
| 1903 |
+
"should_log": false,
|
| 1904 |
+
"should_save": true,
|
| 1905 |
+
"should_training_stop": false
|
| 1906 |
+
},
|
| 1907 |
+
"attributes": {}
|
| 1908 |
+
}
|
| 1909 |
+
},
|
| 1910 |
+
"total_flos": 1.2857716965310464e+17,
|
| 1911 |
+
"train_batch_size": 16,
|
| 1912 |
+
"trial_name": null,
|
| 1913 |
+
"trial_params": null
|
| 1914 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
|
| 3 |
+
library_name: peft
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Model Card for Model ID
|
| 7 |
+
|
| 8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## Model Details
|
| 13 |
+
|
| 14 |
+
### Model Description
|
| 15 |
+
|
| 16 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- **Developed by:** [More Information Needed]
|
| 21 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 22 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 23 |
+
- **Model type:** [More Information Needed]
|
| 24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 25 |
+
- **License:** [More Information Needed]
|
| 26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 27 |
+
|
| 28 |
+
### Model Sources [optional]
|
| 29 |
+
|
| 30 |
+
<!-- Provide the basic links for the model. -->
|
| 31 |
+
|
| 32 |
+
- **Repository:** [More Information Needed]
|
| 33 |
+
- **Paper [optional]:** [More Information Needed]
|
| 34 |
+
- **Demo [optional]:** [More Information Needed]
|
| 35 |
+
|
| 36 |
+
## Uses
|
| 37 |
+
|
| 38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 39 |
+
|
| 40 |
+
### Direct Use
|
| 41 |
+
|
| 42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 43 |
+
|
| 44 |
+
[More Information Needed]
|
| 45 |
+
|
| 46 |
+
### Downstream Use [optional]
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Out-of-Scope Use
|
| 53 |
+
|
| 54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
## Bias, Risks, and Limitations
|
| 59 |
+
|
| 60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
### Recommendations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 67 |
+
|
| 68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 69 |
+
|
| 70 |
+
## How to Get Started with the Model
|
| 71 |
+
|
| 72 |
+
Use the code below to get started with the model.
|
| 73 |
+
|
| 74 |
+
[More Information Needed]
|
| 75 |
+
|
| 76 |
+
## Training Details
|
| 77 |
+
|
| 78 |
+
### Training Data
|
| 79 |
+
|
| 80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 81 |
+
|
| 82 |
+
[More Information Needed]
|
| 83 |
+
|
| 84 |
+
### Training Procedure
|
| 85 |
+
|
| 86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 87 |
+
|
| 88 |
+
#### Preprocessing [optional]
|
| 89 |
+
|
| 90 |
+
[More Information Needed]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#### Training Hyperparameters
|
| 94 |
+
|
| 95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 96 |
+
|
| 97 |
+
#### Speeds, Sizes, Times [optional]
|
| 98 |
+
|
| 99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 100 |
+
|
| 101 |
+
[More Information Needed]
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 106 |
+
|
| 107 |
+
### Testing Data, Factors & Metrics
|
| 108 |
+
|
| 109 |
+
#### Testing Data
|
| 110 |
+
|
| 111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 112 |
+
|
| 113 |
+
[More Information Needed]
|
| 114 |
+
|
| 115 |
+
#### Factors
|
| 116 |
+
|
| 117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Metrics
|
| 122 |
+
|
| 123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
### Results
|
| 128 |
+
|
| 129 |
+
[More Information Needed]
|
| 130 |
+
|
| 131 |
+
#### Summary
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Model Examination [optional]
|
| 136 |
+
|
| 137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 138 |
+
|
| 139 |
+
[More Information Needed]
|
| 140 |
+
|
| 141 |
+
## Environmental Impact
|
| 142 |
+
|
| 143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 144 |
+
|
| 145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 146 |
+
|
| 147 |
+
- **Hardware Type:** [More Information Needed]
|
| 148 |
+
- **Hours used:** [More Information Needed]
|
| 149 |
+
- **Cloud Provider:** [More Information Needed]
|
| 150 |
+
- **Compute Region:** [More Information Needed]
|
| 151 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 152 |
+
|
| 153 |
+
## Technical Specifications [optional]
|
| 154 |
+
|
| 155 |
+
### Model Architecture and Objective
|
| 156 |
+
|
| 157 |
+
[More Information Needed]
|
| 158 |
+
|
| 159 |
+
### Compute Infrastructure
|
| 160 |
+
|
| 161 |
+
[More Information Needed]
|
| 162 |
+
|
| 163 |
+
#### Hardware
|
| 164 |
+
|
| 165 |
+
[More Information Needed]
|
| 166 |
+
|
| 167 |
+
#### Software
|
| 168 |
+
|
| 169 |
+
[More Information Needed]
|
| 170 |
+
|
| 171 |
+
## Citation [optional]
|
| 172 |
+
|
| 173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 174 |
+
|
| 175 |
+
**BibTeX:**
|
| 176 |
+
|
| 177 |
+
[More Information Needed]
|
| 178 |
+
|
| 179 |
+
**APA:**
|
| 180 |
+
|
| 181 |
+
[More Information Needed]
|
| 182 |
+
|
| 183 |
+
## Glossary [optional]
|
| 184 |
+
|
| 185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## More Information [optional]
|
| 190 |
+
|
| 191 |
+
[More Information Needed]
|
| 192 |
+
|
| 193 |
+
## Model Card Authors [optional]
|
| 194 |
+
|
| 195 |
+
[More Information Needed]
|
| 196 |
+
|
| 197 |
+
## Model Card Contact
|
| 198 |
+
|
| 199 |
+
[More Information Needed]
|
| 200 |
+
### Framework versions
|
| 201 |
+
|
| 202 |
+
- PEFT 0.15.0
|
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/adapter_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 16,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.0,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"r": 8,
|
| 24 |
+
"rank_pattern": {},
|
| 25 |
+
"revision": null,
|
| 26 |
+
"target_modules": [
|
| 27 |
+
"q_proj",
|
| 28 |
+
"v_proj"
|
| 29 |
+
],
|
| 30 |
+
"task_type": "CAUSAL_LM",
|
| 31 |
+
"trainable_token_indices": null,
|
| 32 |
+
"use_dora": false,
|
| 33 |
+
"use_rslora": false
|
| 34 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
| 33 |
+
"clean_up_tokenization_spaces": false,
|
| 34 |
+
"eos_token": "</s>",
|
| 35 |
+
"extra_special_tokens": {},
|
| 36 |
+
"legacy": false,
|
| 37 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 38 |
+
"pad_token": "</s>",
|
| 39 |
+
"padding_side": "right",
|
| 40 |
+
"sp_model_kwargs": {},
|
| 41 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 42 |
+
"unk_token": "<unk>",
|
| 43 |
+
"use_default_system_prompt": false
|
| 44 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/trainer_state.json
ADDED
|
@@ -0,0 +1,1378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.03332542225386654,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1685,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.0001977769866698311,
|
| 14 |
+
"learning_rate": 0.00019996440014239942,
|
| 15 |
+
"loss": 0.3352,
|
| 16 |
+
"mean_token_accuracy": 0.9023264050483704,
|
| 17 |
+
"num_tokens": 19163.0,
|
| 18 |
+
"step": 10
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"epoch": 0.0003955539733396622,
|
| 22 |
+
"learning_rate": 0.0001999248447450655,
|
| 23 |
+
"loss": 0.1524,
|
| 24 |
+
"mean_token_accuracy": 0.947095412015915,
|
| 25 |
+
"num_tokens": 38071.0,
|
| 26 |
+
"step": 20
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"epoch": 0.0005933309600094933,
|
| 30 |
+
"learning_rate": 0.0001998852893477315,
|
| 31 |
+
"loss": 0.1461,
|
| 32 |
+
"mean_token_accuracy": 0.9463379800319671,
|
| 33 |
+
"num_tokens": 57017.0,
|
| 34 |
+
"step": 30
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"epoch": 0.0007911079466793244,
|
| 38 |
+
"learning_rate": 0.00019984573395039754,
|
| 39 |
+
"loss": 0.1024,
|
| 40 |
+
"mean_token_accuracy": 0.9588611423969269,
|
| 41 |
+
"num_tokens": 76047.0,
|
| 42 |
+
"step": 40
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"epoch": 0.0009888849333491555,
|
| 46 |
+
"learning_rate": 0.00019980617855306357,
|
| 47 |
+
"loss": 0.117,
|
| 48 |
+
"mean_token_accuracy": 0.9553473949432373,
|
| 49 |
+
"num_tokens": 94824.0,
|
| 50 |
+
"step": 50
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"epoch": 0.0011866619200189867,
|
| 54 |
+
"learning_rate": 0.0001997666231557296,
|
| 55 |
+
"loss": 0.1281,
|
| 56 |
+
"mean_token_accuracy": 0.9538084208965302,
|
| 57 |
+
"num_tokens": 113467.0,
|
| 58 |
+
"step": 60
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.0013844389066888178,
|
| 62 |
+
"learning_rate": 0.00019972706775839565,
|
| 63 |
+
"loss": 0.1075,
|
| 64 |
+
"mean_token_accuracy": 0.961921775341034,
|
| 65 |
+
"num_tokens": 132229.0,
|
| 66 |
+
"step": 70
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.0015822158933586489,
|
| 70 |
+
"learning_rate": 0.00019968751236106166,
|
| 71 |
+
"loss": 0.1057,
|
| 72 |
+
"mean_token_accuracy": 0.9676864743232727,
|
| 73 |
+
"num_tokens": 150811.0,
|
| 74 |
+
"step": 80
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"epoch": 0.00177999288002848,
|
| 78 |
+
"learning_rate": 0.00019964795696372772,
|
| 79 |
+
"loss": 0.1048,
|
| 80 |
+
"mean_token_accuracy": 0.9597055971622467,
|
| 81 |
+
"num_tokens": 169804.0,
|
| 82 |
+
"step": 90
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"epoch": 0.001977769866698311,
|
| 86 |
+
"learning_rate": 0.00019960840156639376,
|
| 87 |
+
"loss": 0.1147,
|
| 88 |
+
"mean_token_accuracy": 0.9561040580272675,
|
| 89 |
+
"num_tokens": 188503.0,
|
| 90 |
+
"step": 100
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"epoch": 0.002175546853368142,
|
| 94 |
+
"learning_rate": 0.00019956884616905977,
|
| 95 |
+
"loss": 0.0904,
|
| 96 |
+
"mean_token_accuracy": 0.9663344562053681,
|
| 97 |
+
"num_tokens": 207370.0,
|
| 98 |
+
"step": 110
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"epoch": 0.0023733238400379733,
|
| 102 |
+
"learning_rate": 0.0001995292907717258,
|
| 103 |
+
"loss": 0.085,
|
| 104 |
+
"mean_token_accuracy": 0.9692767798900604,
|
| 105 |
+
"num_tokens": 226639.0,
|
| 106 |
+
"step": 120
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"epoch": 0.0025711008267078044,
|
| 110 |
+
"learning_rate": 0.00019948973537439185,
|
| 111 |
+
"loss": 0.1055,
|
| 112 |
+
"mean_token_accuracy": 0.962373024225235,
|
| 113 |
+
"num_tokens": 245318.0,
|
| 114 |
+
"step": 130
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.0027688778133776355,
|
| 118 |
+
"learning_rate": 0.00019945017997705788,
|
| 119 |
+
"loss": 0.1023,
|
| 120 |
+
"mean_token_accuracy": 0.9603676617145538,
|
| 121 |
+
"num_tokens": 264089.0,
|
| 122 |
+
"step": 140
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.0029666548000474666,
|
| 126 |
+
"learning_rate": 0.0001994106245797239,
|
| 127 |
+
"loss": 0.106,
|
| 128 |
+
"mean_token_accuracy": 0.9619979500770569,
|
| 129 |
+
"num_tokens": 282977.0,
|
| 130 |
+
"step": 150
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"epoch": 0.0031644317867172977,
|
| 134 |
+
"learning_rate": 0.00019937106918238996,
|
| 135 |
+
"loss": 0.1021,
|
| 136 |
+
"mean_token_accuracy": 0.9600433588027955,
|
| 137 |
+
"num_tokens": 301615.0,
|
| 138 |
+
"step": 160
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 0.003362208773387129,
|
| 142 |
+
"learning_rate": 0.000199331513785056,
|
| 143 |
+
"loss": 0.095,
|
| 144 |
+
"mean_token_accuracy": 0.9641285121440888,
|
| 145 |
+
"num_tokens": 320502.0,
|
| 146 |
+
"step": 170
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"epoch": 0.00355998576005696,
|
| 150 |
+
"learning_rate": 0.000199291958387722,
|
| 151 |
+
"loss": 0.0722,
|
| 152 |
+
"mean_token_accuracy": 0.9708887040615082,
|
| 153 |
+
"num_tokens": 339616.0,
|
| 154 |
+
"step": 180
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"epoch": 0.003757762746726791,
|
| 158 |
+
"learning_rate": 0.00019925240299038804,
|
| 159 |
+
"loss": 0.0951,
|
| 160 |
+
"mean_token_accuracy": 0.9712291181087493,
|
| 161 |
+
"num_tokens": 358474.0,
|
| 162 |
+
"step": 190
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"epoch": 0.003955539733396622,
|
| 166 |
+
"learning_rate": 0.00019921284759305408,
|
| 167 |
+
"loss": 0.1194,
|
| 168 |
+
"mean_token_accuracy": 0.9630812525749206,
|
| 169 |
+
"num_tokens": 377094.0,
|
| 170 |
+
"step": 200
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 0.004153316720066453,
|
| 174 |
+
"learning_rate": 0.00019917329219572012,
|
| 175 |
+
"loss": 0.1002,
|
| 176 |
+
"mean_token_accuracy": 0.9660979807376862,
|
| 177 |
+
"num_tokens": 396000.0,
|
| 178 |
+
"step": 210
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.004351093706736284,
|
| 182 |
+
"learning_rate": 0.00019913373679838613,
|
| 183 |
+
"loss": 0.0954,
|
| 184 |
+
"mean_token_accuracy": 0.9636943399906158,
|
| 185 |
+
"num_tokens": 415019.0,
|
| 186 |
+
"step": 220
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"epoch": 0.0045488706934061155,
|
| 190 |
+
"learning_rate": 0.0001990941814010522,
|
| 191 |
+
"loss": 0.1114,
|
| 192 |
+
"mean_token_accuracy": 0.9662698566913605,
|
| 193 |
+
"num_tokens": 433711.0,
|
| 194 |
+
"step": 230
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"epoch": 0.004746647680075947,
|
| 198 |
+
"learning_rate": 0.00019905462600371823,
|
| 199 |
+
"loss": 0.0915,
|
| 200 |
+
"mean_token_accuracy": 0.9679243505001068,
|
| 201 |
+
"num_tokens": 452483.0,
|
| 202 |
+
"step": 240
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"epoch": 0.004944424666745778,
|
| 206 |
+
"learning_rate": 0.00019901507060638424,
|
| 207 |
+
"loss": 0.095,
|
| 208 |
+
"mean_token_accuracy": 0.9688079237937928,
|
| 209 |
+
"num_tokens": 471395.0,
|
| 210 |
+
"step": 250
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"epoch": 0.005142201653415609,
|
| 214 |
+
"learning_rate": 0.00019897551520905028,
|
| 215 |
+
"loss": 0.1123,
|
| 216 |
+
"mean_token_accuracy": 0.962276142835617,
|
| 217 |
+
"num_tokens": 489983.0,
|
| 218 |
+
"step": 260
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"epoch": 0.00533997864008544,
|
| 222 |
+
"learning_rate": 0.00019893595981171632,
|
| 223 |
+
"loss": 0.0855,
|
| 224 |
+
"mean_token_accuracy": 0.9698696434497833,
|
| 225 |
+
"num_tokens": 509148.0,
|
| 226 |
+
"step": 270
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 0.005537755626755271,
|
| 230 |
+
"learning_rate": 0.00019889640441438235,
|
| 231 |
+
"loss": 0.0777,
|
| 232 |
+
"mean_token_accuracy": 0.9697826623916626,
|
| 233 |
+
"num_tokens": 528042.0,
|
| 234 |
+
"step": 280
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.005735532613425102,
|
| 238 |
+
"learning_rate": 0.0001988568490170484,
|
| 239 |
+
"loss": 0.0944,
|
| 240 |
+
"mean_token_accuracy": 0.9690817773342133,
|
| 241 |
+
"num_tokens": 546656.0,
|
| 242 |
+
"step": 290
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"epoch": 0.005933309600094933,
|
| 246 |
+
"learning_rate": 0.00019881729361971443,
|
| 247 |
+
"loss": 0.0872,
|
| 248 |
+
"mean_token_accuracy": 0.9661558032035827,
|
| 249 |
+
"num_tokens": 565279.0,
|
| 250 |
+
"step": 300
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"epoch": 0.006131086586764764,
|
| 254 |
+
"learning_rate": 0.00019877773822238047,
|
| 255 |
+
"loss": 0.09,
|
| 256 |
+
"mean_token_accuracy": 0.9669564247131348,
|
| 257 |
+
"num_tokens": 584196.0,
|
| 258 |
+
"step": 310
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"epoch": 0.0063288635734345955,
|
| 262 |
+
"learning_rate": 0.00019873818282504648,
|
| 263 |
+
"loss": 0.0702,
|
| 264 |
+
"mean_token_accuracy": 0.9722951114177704,
|
| 265 |
+
"num_tokens": 603050.0,
|
| 266 |
+
"step": 320
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"epoch": 0.006526640560104427,
|
| 270 |
+
"learning_rate": 0.00019869862742771251,
|
| 271 |
+
"loss": 0.0923,
|
| 272 |
+
"mean_token_accuracy": 0.9684451401233674,
|
| 273 |
+
"num_tokens": 621880.0,
|
| 274 |
+
"step": 330
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"epoch": 0.006724417546774258,
|
| 278 |
+
"learning_rate": 0.00019865907203037855,
|
| 279 |
+
"loss": 0.0976,
|
| 280 |
+
"mean_token_accuracy": 0.9660769879817963,
|
| 281 |
+
"num_tokens": 640657.0,
|
| 282 |
+
"step": 340
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 0.006922194533444089,
|
| 286 |
+
"learning_rate": 0.0001986195166330446,
|
| 287 |
+
"loss": 0.107,
|
| 288 |
+
"mean_token_accuracy": 0.9633386790752411,
|
| 289 |
+
"num_tokens": 659503.0,
|
| 290 |
+
"step": 350
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 0.00711997152011392,
|
| 294 |
+
"learning_rate": 0.00019857996123571063,
|
| 295 |
+
"loss": 0.1058,
|
| 296 |
+
"mean_token_accuracy": 0.9641251742839814,
|
| 297 |
+
"num_tokens": 678570.0,
|
| 298 |
+
"step": 360
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"epoch": 0.007317748506783751,
|
| 302 |
+
"learning_rate": 0.00019854040583837666,
|
| 303 |
+
"loss": 0.097,
|
| 304 |
+
"mean_token_accuracy": 0.9664280533790588,
|
| 305 |
+
"num_tokens": 697294.0,
|
| 306 |
+
"step": 370
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"epoch": 0.007515525493453582,
|
| 310 |
+
"learning_rate": 0.0001985008504410427,
|
| 311 |
+
"loss": 0.0677,
|
| 312 |
+
"mean_token_accuracy": 0.9754213869571686,
|
| 313 |
+
"num_tokens": 716458.0,
|
| 314 |
+
"step": 380
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"epoch": 0.007713302480123413,
|
| 318 |
+
"learning_rate": 0.0001984612950437087,
|
| 319 |
+
"loss": 0.0622,
|
| 320 |
+
"mean_token_accuracy": 0.9724574089050293,
|
| 321 |
+
"num_tokens": 735437.0,
|
| 322 |
+
"step": 390
|
| 323 |
+
},
|
| 324 |
+
{
|
| 325 |
+
"epoch": 0.007911079466793244,
|
| 326 |
+
"learning_rate": 0.00019842173964637475,
|
| 327 |
+
"loss": 0.1004,
|
| 328 |
+
"mean_token_accuracy": 0.9710333228111268,
|
| 329 |
+
"num_tokens": 754416.0,
|
| 330 |
+
"step": 400
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"epoch": 0.008108856453463075,
|
| 334 |
+
"learning_rate": 0.0001983821842490408,
|
| 335 |
+
"loss": 0.0922,
|
| 336 |
+
"mean_token_accuracy": 0.9718518137931824,
|
| 337 |
+
"num_tokens": 773091.0,
|
| 338 |
+
"step": 410
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 0.008306633440132907,
|
| 342 |
+
"learning_rate": 0.00019834262885170682,
|
| 343 |
+
"loss": 0.0835,
|
| 344 |
+
"mean_token_accuracy": 0.9694978713989257,
|
| 345 |
+
"num_tokens": 791900.0,
|
| 346 |
+
"step": 420
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 0.008504410426802738,
|
| 350 |
+
"learning_rate": 0.00019830307345437286,
|
| 351 |
+
"loss": 0.0822,
|
| 352 |
+
"mean_token_accuracy": 0.974584549665451,
|
| 353 |
+
"num_tokens": 810547.0,
|
| 354 |
+
"step": 430
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"epoch": 0.008702187413472569,
|
| 358 |
+
"learning_rate": 0.0001982635180570389,
|
| 359 |
+
"loss": 0.0733,
|
| 360 |
+
"mean_token_accuracy": 0.9770256340503692,
|
| 361 |
+
"num_tokens": 829753.0,
|
| 362 |
+
"step": 440
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"epoch": 0.0088999644001424,
|
| 366 |
+
"learning_rate": 0.00019822396265970494,
|
| 367 |
+
"loss": 0.0882,
|
| 368 |
+
"mean_token_accuracy": 0.9668483734130859,
|
| 369 |
+
"num_tokens": 848294.0,
|
| 370 |
+
"step": 450
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"epoch": 0.009097741386812231,
|
| 374 |
+
"learning_rate": 0.00019818440726237095,
|
| 375 |
+
"loss": 0.0921,
|
| 376 |
+
"mean_token_accuracy": 0.9715612173080445,
|
| 377 |
+
"num_tokens": 867635.0,
|
| 378 |
+
"step": 460
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
"epoch": 0.009295518373482062,
|
| 382 |
+
"learning_rate": 0.00019814485186503699,
|
| 383 |
+
"loss": 0.0668,
|
| 384 |
+
"mean_token_accuracy": 0.9765562832355499,
|
| 385 |
+
"num_tokens": 886596.0,
|
| 386 |
+
"step": 470
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"epoch": 0.009493295360151893,
|
| 390 |
+
"learning_rate": 0.00019810529646770302,
|
| 391 |
+
"loss": 0.0797,
|
| 392 |
+
"mean_token_accuracy": 0.972789865732193,
|
| 393 |
+
"num_tokens": 905312.0,
|
| 394 |
+
"step": 480
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 0.009691072346821724,
|
| 398 |
+
"learning_rate": 0.00019806574107036906,
|
| 399 |
+
"loss": 0.0792,
|
| 400 |
+
"mean_token_accuracy": 0.976115608215332,
|
| 401 |
+
"num_tokens": 924072.0,
|
| 402 |
+
"step": 490
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 0.009888849333491555,
|
| 406 |
+
"learning_rate": 0.0001980261856730351,
|
| 407 |
+
"loss": 0.0672,
|
| 408 |
+
"mean_token_accuracy": 0.9746761620044708,
|
| 409 |
+
"num_tokens": 942844.0,
|
| 410 |
+
"step": 500
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"epoch": 0.010086626320161387,
|
| 414 |
+
"learning_rate": 0.00019798663027570113,
|
| 415 |
+
"loss": 0.1031,
|
| 416 |
+
"mean_token_accuracy": 0.9693170130252838,
|
| 417 |
+
"num_tokens": 961524.0,
|
| 418 |
+
"step": 510
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"epoch": 0.010284403306831218,
|
| 422 |
+
"learning_rate": 0.00019794707487836717,
|
| 423 |
+
"loss": 0.0899,
|
| 424 |
+
"mean_token_accuracy": 0.9669535160064697,
|
| 425 |
+
"num_tokens": 980332.0,
|
| 426 |
+
"step": 520
|
| 427 |
+
},
|
| 428 |
+
{
|
| 429 |
+
"epoch": 0.010482180293501049,
|
| 430 |
+
"learning_rate": 0.00019790751948103318,
|
| 431 |
+
"loss": 0.0861,
|
| 432 |
+
"mean_token_accuracy": 0.9700904309749603,
|
| 433 |
+
"num_tokens": 999163.0,
|
| 434 |
+
"step": 530
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"epoch": 0.01067995728017088,
|
| 438 |
+
"learning_rate": 0.00019786796408369922,
|
| 439 |
+
"loss": 0.0997,
|
| 440 |
+
"mean_token_accuracy": 0.9640280067920685,
|
| 441 |
+
"num_tokens": 1018053.0,
|
| 442 |
+
"step": 540
|
| 443 |
+
},
|
| 444 |
+
{
|
| 445 |
+
"epoch": 0.010877734266840711,
|
| 446 |
+
"learning_rate": 0.00019782840868636528,
|
| 447 |
+
"loss": 0.0795,
|
| 448 |
+
"mean_token_accuracy": 0.9717990577220916,
|
| 449 |
+
"num_tokens": 1036828.0,
|
| 450 |
+
"step": 550
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"epoch": 0.011075511253510542,
|
| 454 |
+
"learning_rate": 0.0001977888532890313,
|
| 455 |
+
"loss": 0.0885,
|
| 456 |
+
"mean_token_accuracy": 0.9719638526439667,
|
| 457 |
+
"num_tokens": 1055843.0,
|
| 458 |
+
"step": 560
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 0.011273288240180373,
|
| 462 |
+
"learning_rate": 0.00019774929789169733,
|
| 463 |
+
"loss": 0.0823,
|
| 464 |
+
"mean_token_accuracy": 0.9698925375938415,
|
| 465 |
+
"num_tokens": 1074733.0,
|
| 466 |
+
"step": 570
|
| 467 |
+
},
|
| 468 |
+
{
|
| 469 |
+
"epoch": 0.011471065226850204,
|
| 470 |
+
"learning_rate": 0.00019770974249436337,
|
| 471 |
+
"loss": 0.0945,
|
| 472 |
+
"mean_token_accuracy": 0.9694046437740326,
|
| 473 |
+
"num_tokens": 1093519.0,
|
| 474 |
+
"step": 580
|
| 475 |
+
},
|
| 476 |
+
{
|
| 477 |
+
"epoch": 0.011668842213520035,
|
| 478 |
+
"learning_rate": 0.0001976701870970294,
|
| 479 |
+
"loss": 0.0797,
|
| 480 |
+
"mean_token_accuracy": 0.975245076417923,
|
| 481 |
+
"num_tokens": 1112394.0,
|
| 482 |
+
"step": 590
|
| 483 |
+
},
|
| 484 |
+
{
|
| 485 |
+
"epoch": 0.011866619200189867,
|
| 486 |
+
"learning_rate": 0.00019763063169969542,
|
| 487 |
+
"loss": 0.0816,
|
| 488 |
+
"mean_token_accuracy": 0.9766307890415191,
|
| 489 |
+
"num_tokens": 1131435.0,
|
| 490 |
+
"step": 600
|
| 491 |
+
},
|
| 492 |
+
{
|
| 493 |
+
"epoch": 0.012064396186859698,
|
| 494 |
+
"learning_rate": 0.00019759107630236146,
|
| 495 |
+
"loss": 0.089,
|
| 496 |
+
"mean_token_accuracy": 0.9677313148975373,
|
| 497 |
+
"num_tokens": 1150274.0,
|
| 498 |
+
"step": 610
|
| 499 |
+
},
|
| 500 |
+
{
|
| 501 |
+
"epoch": 0.012262173173529529,
|
| 502 |
+
"learning_rate": 0.00019755152090502752,
|
| 503 |
+
"loss": 0.096,
|
| 504 |
+
"mean_token_accuracy": 0.9663532435894012,
|
| 505 |
+
"num_tokens": 1169076.0,
|
| 506 |
+
"step": 620
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"epoch": 0.01245995016019936,
|
| 510 |
+
"learning_rate": 0.00019751196550769353,
|
| 511 |
+
"loss": 0.0686,
|
| 512 |
+
"mean_token_accuracy": 0.9754896223545074,
|
| 513 |
+
"num_tokens": 1187908.0,
|
| 514 |
+
"step": 630
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 0.012657727146869191,
|
| 518 |
+
"learning_rate": 0.00019747241011035957,
|
| 519 |
+
"loss": 0.0902,
|
| 520 |
+
"mean_token_accuracy": 0.9659872591495514,
|
| 521 |
+
"num_tokens": 1206872.0,
|
| 522 |
+
"step": 640
|
| 523 |
+
},
|
| 524 |
+
{
|
| 525 |
+
"epoch": 0.012855504133539022,
|
| 526 |
+
"learning_rate": 0.0001974328547130256,
|
| 527 |
+
"loss": 0.0757,
|
| 528 |
+
"mean_token_accuracy": 0.9724668145179749,
|
| 529 |
+
"num_tokens": 1225767.0,
|
| 530 |
+
"step": 650
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"epoch": 0.013053281120208853,
|
| 534 |
+
"learning_rate": 0.00019739329931569164,
|
| 535 |
+
"loss": 0.0798,
|
| 536 |
+
"mean_token_accuracy": 0.9697497248649597,
|
| 537 |
+
"num_tokens": 1244340.0,
|
| 538 |
+
"step": 660
|
| 539 |
+
},
|
| 540 |
+
{
|
| 541 |
+
"epoch": 0.013251058106878684,
|
| 542 |
+
"learning_rate": 0.00019735374391835765,
|
| 543 |
+
"loss": 0.0874,
|
| 544 |
+
"mean_token_accuracy": 0.9725019693374634,
|
| 545 |
+
"num_tokens": 1263304.0,
|
| 546 |
+
"step": 670
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"epoch": 0.013448835093548515,
|
| 550 |
+
"learning_rate": 0.0001973141885210237,
|
| 551 |
+
"loss": 0.0894,
|
| 552 |
+
"mean_token_accuracy": 0.9702820897102356,
|
| 553 |
+
"num_tokens": 1282028.0,
|
| 554 |
+
"step": 680
|
| 555 |
+
},
|
| 556 |
+
{
|
| 557 |
+
"epoch": 0.013646612080218347,
|
| 558 |
+
"learning_rate": 0.00019727463312368976,
|
| 559 |
+
"loss": 0.0958,
|
| 560 |
+
"mean_token_accuracy": 0.9666474103927613,
|
| 561 |
+
"num_tokens": 1300780.0,
|
| 562 |
+
"step": 690
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"epoch": 0.013844389066888178,
|
| 566 |
+
"learning_rate": 0.00019723507772635577,
|
| 567 |
+
"loss": 0.0808,
|
| 568 |
+
"mean_token_accuracy": 0.9751847207546234,
|
| 569 |
+
"num_tokens": 1319535.0,
|
| 570 |
+
"step": 700
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 0.014042166053558009,
|
| 574 |
+
"learning_rate": 0.0001971955223290218,
|
| 575 |
+
"loss": 0.1044,
|
| 576 |
+
"mean_token_accuracy": 0.9665757656097412,
|
| 577 |
+
"num_tokens": 1338391.0,
|
| 578 |
+
"step": 710
|
| 579 |
+
},
|
| 580 |
+
{
|
| 581 |
+
"epoch": 0.01423994304022784,
|
| 582 |
+
"learning_rate": 0.00019715596693168784,
|
| 583 |
+
"loss": 0.0798,
|
| 584 |
+
"mean_token_accuracy": 0.9741999268531799,
|
| 585 |
+
"num_tokens": 1357072.0,
|
| 586 |
+
"step": 720
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"epoch": 0.014437720026897671,
|
| 590 |
+
"learning_rate": 0.00019711641153435388,
|
| 591 |
+
"loss": 0.0661,
|
| 592 |
+
"mean_token_accuracy": 0.9795381426811218,
|
| 593 |
+
"num_tokens": 1375999.0,
|
| 594 |
+
"step": 730
|
| 595 |
+
},
|
| 596 |
+
{
|
| 597 |
+
"epoch": 0.014635497013567502,
|
| 598 |
+
"learning_rate": 0.00019707685613701992,
|
| 599 |
+
"loss": 0.082,
|
| 600 |
+
"mean_token_accuracy": 0.9710344612598419,
|
| 601 |
+
"num_tokens": 1394654.0,
|
| 602 |
+
"step": 740
|
| 603 |
+
},
|
| 604 |
+
{
|
| 605 |
+
"epoch": 0.014833274000237333,
|
| 606 |
+
"learning_rate": 0.00019703730073968593,
|
| 607 |
+
"loss": 0.0825,
|
| 608 |
+
"mean_token_accuracy": 0.9703740537166595,
|
| 609 |
+
"num_tokens": 1413323.0,
|
| 610 |
+
"step": 750
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"epoch": 0.015031050986907164,
|
| 614 |
+
"learning_rate": 0.000196997745342352,
|
| 615 |
+
"loss": 0.0797,
|
| 616 |
+
"mean_token_accuracy": 0.9717476069927216,
|
| 617 |
+
"num_tokens": 1432274.0,
|
| 618 |
+
"step": 760
|
| 619 |
+
},
|
| 620 |
+
{
|
| 621 |
+
"epoch": 0.015228827973576995,
|
| 622 |
+
"learning_rate": 0.000196958189945018,
|
| 623 |
+
"loss": 0.0811,
|
| 624 |
+
"mean_token_accuracy": 0.9687287509441376,
|
| 625 |
+
"num_tokens": 1451009.0,
|
| 626 |
+
"step": 770
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 0.015426604960246827,
|
| 630 |
+
"learning_rate": 0.00019691863454768404,
|
| 631 |
+
"loss": 0.0802,
|
| 632 |
+
"mean_token_accuracy": 0.9698404908180237,
|
| 633 |
+
"num_tokens": 1469741.0,
|
| 634 |
+
"step": 780
|
| 635 |
+
},
|
| 636 |
+
{
|
| 637 |
+
"epoch": 0.015624381946916658,
|
| 638 |
+
"learning_rate": 0.00019687907915035008,
|
| 639 |
+
"loss": 0.0698,
|
| 640 |
+
"mean_token_accuracy": 0.9755463302135468,
|
| 641 |
+
"num_tokens": 1488644.0,
|
| 642 |
+
"step": 790
|
| 643 |
+
},
|
| 644 |
+
{
|
| 645 |
+
"epoch": 0.01582215893358649,
|
| 646 |
+
"learning_rate": 0.0001968395237530161,
|
| 647 |
+
"loss": 0.0698,
|
| 648 |
+
"mean_token_accuracy": 0.9777807116508483,
|
| 649 |
+
"num_tokens": 1507384.0,
|
| 650 |
+
"step": 800
|
| 651 |
+
},
|
| 652 |
+
{
|
| 653 |
+
"epoch": 0.016019935920256318,
|
| 654 |
+
"learning_rate": 0.00019679996835568215,
|
| 655 |
+
"loss": 0.085,
|
| 656 |
+
"mean_token_accuracy": 0.9731873035430908,
|
| 657 |
+
"num_tokens": 1526208.0,
|
| 658 |
+
"step": 810
|
| 659 |
+
},
|
| 660 |
+
{
|
| 661 |
+
"epoch": 0.01621771290692615,
|
| 662 |
+
"learning_rate": 0.00019676041295834816,
|
| 663 |
+
"loss": 0.0682,
|
| 664 |
+
"mean_token_accuracy": 0.9786826431751251,
|
| 665 |
+
"num_tokens": 1545086.0,
|
| 666 |
+
"step": 820
|
| 667 |
+
},
|
| 668 |
+
{
|
| 669 |
+
"epoch": 0.01641548989359598,
|
| 670 |
+
"learning_rate": 0.00019672085756101423,
|
| 671 |
+
"loss": 0.0687,
|
| 672 |
+
"mean_token_accuracy": 0.9754264533519745,
|
| 673 |
+
"num_tokens": 1563972.0,
|
| 674 |
+
"step": 830
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"epoch": 0.016613266880265813,
|
| 678 |
+
"learning_rate": 0.00019668130216368024,
|
| 679 |
+
"loss": 0.0847,
|
| 680 |
+
"mean_token_accuracy": 0.9757691383361816,
|
| 681 |
+
"num_tokens": 1582893.0,
|
| 682 |
+
"step": 840
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"epoch": 0.016811043866935643,
|
| 686 |
+
"learning_rate": 0.00019664174676634627,
|
| 687 |
+
"loss": 0.0978,
|
| 688 |
+
"mean_token_accuracy": 0.9691124320030212,
|
| 689 |
+
"num_tokens": 1601719.0,
|
| 690 |
+
"step": 850
|
| 691 |
+
},
|
| 692 |
+
{
|
| 693 |
+
"epoch": 0.017008820853605475,
|
| 694 |
+
"learning_rate": 0.0001966021913690123,
|
| 695 |
+
"loss": 0.0834,
|
| 696 |
+
"mean_token_accuracy": 0.9732567369937897,
|
| 697 |
+
"num_tokens": 1620493.0,
|
| 698 |
+
"step": 860
|
| 699 |
+
},
|
| 700 |
+
{
|
| 701 |
+
"epoch": 0.017206597840275305,
|
| 702 |
+
"learning_rate": 0.00019656263597167835,
|
| 703 |
+
"loss": 0.0703,
|
| 704 |
+
"mean_token_accuracy": 0.9783392190933228,
|
| 705 |
+
"num_tokens": 1639336.0,
|
| 706 |
+
"step": 870
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"epoch": 0.017404374826945138,
|
| 710 |
+
"learning_rate": 0.00019652308057434439,
|
| 711 |
+
"loss": 0.0916,
|
| 712 |
+
"mean_token_accuracy": 0.9702894032001496,
|
| 713 |
+
"num_tokens": 1658213.0,
|
| 714 |
+
"step": 880
|
| 715 |
+
},
|
| 716 |
+
{
|
| 717 |
+
"epoch": 0.017602151813614967,
|
| 718 |
+
"learning_rate": 0.0001964835251770104,
|
| 719 |
+
"loss": 0.0631,
|
| 720 |
+
"mean_token_accuracy": 0.9804218530654907,
|
| 721 |
+
"num_tokens": 1677260.0,
|
| 722 |
+
"step": 890
|
| 723 |
+
},
|
| 724 |
+
{
|
| 725 |
+
"epoch": 0.0177999288002848,
|
| 726 |
+
"learning_rate": 0.00019644396977967646,
|
| 727 |
+
"loss": 0.0942,
|
| 728 |
+
"mean_token_accuracy": 0.9701037347316742,
|
| 729 |
+
"num_tokens": 1696174.0,
|
| 730 |
+
"step": 900
|
| 731 |
+
},
|
| 732 |
+
{
|
| 733 |
+
"epoch": 0.01799770578695463,
|
| 734 |
+
"learning_rate": 0.00019640441438234247,
|
| 735 |
+
"loss": 0.0827,
|
| 736 |
+
"mean_token_accuracy": 0.9717662394046783,
|
| 737 |
+
"num_tokens": 1714822.0,
|
| 738 |
+
"step": 910
|
| 739 |
+
},
|
| 740 |
+
{
|
| 741 |
+
"epoch": 0.018195482773624462,
|
| 742 |
+
"learning_rate": 0.0001963648589850085,
|
| 743 |
+
"loss": 0.0728,
|
| 744 |
+
"mean_token_accuracy": 0.9710807025432586,
|
| 745 |
+
"num_tokens": 1733526.0,
|
| 746 |
+
"step": 920
|
| 747 |
+
},
|
| 748 |
+
{
|
| 749 |
+
"epoch": 0.01839325976029429,
|
| 750 |
+
"learning_rate": 0.00019632530358767455,
|
| 751 |
+
"loss": 0.0689,
|
| 752 |
+
"mean_token_accuracy": 0.9746571719646454,
|
| 753 |
+
"num_tokens": 1752469.0,
|
| 754 |
+
"step": 930
|
| 755 |
+
},
|
| 756 |
+
{
|
| 757 |
+
"epoch": 0.018591036746964124,
|
| 758 |
+
"learning_rate": 0.00019628574819034058,
|
| 759 |
+
"loss": 0.0648,
|
| 760 |
+
"mean_token_accuracy": 0.9815335392951965,
|
| 761 |
+
"num_tokens": 1771439.0,
|
| 762 |
+
"step": 940
|
| 763 |
+
},
|
| 764 |
+
{
|
| 765 |
+
"epoch": 0.018788813733633954,
|
| 766 |
+
"learning_rate": 0.00019624619279300662,
|
| 767 |
+
"loss": 0.0773,
|
| 768 |
+
"mean_token_accuracy": 0.9698800563812255,
|
| 769 |
+
"num_tokens": 1790105.0,
|
| 770 |
+
"step": 950
|
| 771 |
+
},
|
| 772 |
+
{
|
| 773 |
+
"epoch": 0.018986590720303786,
|
| 774 |
+
"learning_rate": 0.00019620663739567263,
|
| 775 |
+
"loss": 0.067,
|
| 776 |
+
"mean_token_accuracy": 0.9776150286197662,
|
| 777 |
+
"num_tokens": 1809000.0,
|
| 778 |
+
"step": 960
|
| 779 |
+
},
|
| 780 |
+
{
|
| 781 |
+
"epoch": 0.019184367706973616,
|
| 782 |
+
"learning_rate": 0.0001961670819983387,
|
| 783 |
+
"loss": 0.0771,
|
| 784 |
+
"mean_token_accuracy": 0.9723577439785004,
|
| 785 |
+
"num_tokens": 1827743.0,
|
| 786 |
+
"step": 970
|
| 787 |
+
},
|
| 788 |
+
{
|
| 789 |
+
"epoch": 0.01938214469364345,
|
| 790 |
+
"learning_rate": 0.0001961275266010047,
|
| 791 |
+
"loss": 0.0829,
|
| 792 |
+
"mean_token_accuracy": 0.9695183992385864,
|
| 793 |
+
"num_tokens": 1846483.0,
|
| 794 |
+
"step": 980
|
| 795 |
+
},
|
| 796 |
+
{
|
| 797 |
+
"epoch": 0.019579921680313278,
|
| 798 |
+
"learning_rate": 0.00019608797120367074,
|
| 799 |
+
"loss": 0.0544,
|
| 800 |
+
"mean_token_accuracy": 0.9838368058204651,
|
| 801 |
+
"num_tokens": 1865230.0,
|
| 802 |
+
"step": 990
|
| 803 |
+
},
|
| 804 |
+
{
|
| 805 |
+
"epoch": 0.01977769866698311,
|
| 806 |
+
"learning_rate": 0.00019604841580633678,
|
| 807 |
+
"loss": 0.0913,
|
| 808 |
+
"mean_token_accuracy": 0.9702223718166352,
|
| 809 |
+
"num_tokens": 1884295.0,
|
| 810 |
+
"step": 1000
|
| 811 |
+
},
|
| 812 |
+
{
|
| 813 |
+
"epoch": 0.01997547565365294,
|
| 814 |
+
"learning_rate": 0.00019600886040900282,
|
| 815 |
+
"loss": 0.0794,
|
| 816 |
+
"mean_token_accuracy": 0.9764923632144928,
|
| 817 |
+
"num_tokens": 1903297.0,
|
| 818 |
+
"step": 1010
|
| 819 |
+
},
|
| 820 |
+
{
|
| 821 |
+
"epoch": 0.020173252640322773,
|
| 822 |
+
"learning_rate": 0.00019596930501166886,
|
| 823 |
+
"loss": 0.0673,
|
| 824 |
+
"mean_token_accuracy": 0.9780684530735015,
|
| 825 |
+
"num_tokens": 1922020.0,
|
| 826 |
+
"step": 1020
|
| 827 |
+
},
|
| 828 |
+
{
|
| 829 |
+
"epoch": 0.020371029626992603,
|
| 830 |
+
"learning_rate": 0.00019592974961433487,
|
| 831 |
+
"loss": 0.0626,
|
| 832 |
+
"mean_token_accuracy": 0.9800810873508453,
|
| 833 |
+
"num_tokens": 1940771.0,
|
| 834 |
+
"step": 1030
|
| 835 |
+
},
|
| 836 |
+
{
|
| 837 |
+
"epoch": 0.020568806613662435,
|
| 838 |
+
"learning_rate": 0.00019589019421700093,
|
| 839 |
+
"loss": 0.0765,
|
| 840 |
+
"mean_token_accuracy": 0.9712104678153992,
|
| 841 |
+
"num_tokens": 1959790.0,
|
| 842 |
+
"step": 1040
|
| 843 |
+
},
|
| 844 |
+
{
|
| 845 |
+
"epoch": 0.020766583600332265,
|
| 846 |
+
"learning_rate": 0.00019585063881966694,
|
| 847 |
+
"loss": 0.0648,
|
| 848 |
+
"mean_token_accuracy": 0.978104192018509,
|
| 849 |
+
"num_tokens": 1978766.0,
|
| 850 |
+
"step": 1050
|
| 851 |
+
},
|
| 852 |
+
{
|
| 853 |
+
"epoch": 0.020964360587002098,
|
| 854 |
+
"learning_rate": 0.00019581108342233298,
|
| 855 |
+
"loss": 0.0938,
|
| 856 |
+
"mean_token_accuracy": 0.9689741492271423,
|
| 857 |
+
"num_tokens": 1997785.0,
|
| 858 |
+
"step": 1060
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"epoch": 0.021162137573671927,
|
| 862 |
+
"learning_rate": 0.00019577152802499902,
|
| 863 |
+
"loss": 0.0891,
|
| 864 |
+
"mean_token_accuracy": 0.9727595269680023,
|
| 865 |
+
"num_tokens": 2016334.0,
|
| 866 |
+
"step": 1070
|
| 867 |
+
},
|
| 868 |
+
{
|
| 869 |
+
"epoch": 0.02135991456034176,
|
| 870 |
+
"learning_rate": 0.00019573197262766505,
|
| 871 |
+
"loss": 0.0818,
|
| 872 |
+
"mean_token_accuracy": 0.9725301325321197,
|
| 873 |
+
"num_tokens": 2035271.0,
|
| 874 |
+
"step": 1080
|
| 875 |
+
},
|
| 876 |
+
{
|
| 877 |
+
"epoch": 0.02155769154701159,
|
| 878 |
+
"learning_rate": 0.0001956924172303311,
|
| 879 |
+
"loss": 0.0721,
|
| 880 |
+
"mean_token_accuracy": 0.9739335179328918,
|
| 881 |
+
"num_tokens": 2054158.0,
|
| 882 |
+
"step": 1090
|
| 883 |
+
},
|
| 884 |
+
{
|
| 885 |
+
"epoch": 0.021755468533681422,
|
| 886 |
+
"learning_rate": 0.0001956528618329971,
|
| 887 |
+
"loss": 0.0785,
|
| 888 |
+
"mean_token_accuracy": 0.971097469329834,
|
| 889 |
+
"num_tokens": 2072794.0,
|
| 890 |
+
"step": 1100
|
| 891 |
+
},
|
| 892 |
+
{
|
| 893 |
+
"epoch": 0.02195324552035125,
|
| 894 |
+
"learning_rate": 0.00019561330643566317,
|
| 895 |
+
"loss": 0.0759,
|
| 896 |
+
"mean_token_accuracy": 0.9685551345348358,
|
| 897 |
+
"num_tokens": 2091583.0,
|
| 898 |
+
"step": 1110
|
| 899 |
+
},
|
| 900 |
+
{
|
| 901 |
+
"epoch": 0.022151022507021084,
|
| 902 |
+
"learning_rate": 0.0001955737510383292,
|
| 903 |
+
"loss": 0.0584,
|
| 904 |
+
"mean_token_accuracy": 0.9821391940116883,
|
| 905 |
+
"num_tokens": 2110207.0,
|
| 906 |
+
"step": 1120
|
| 907 |
+
},
|
| 908 |
+
{
|
| 909 |
+
"epoch": 0.022348799493690914,
|
| 910 |
+
"learning_rate": 0.00019553419564099521,
|
| 911 |
+
"loss": 0.0841,
|
| 912 |
+
"mean_token_accuracy": 0.9727806925773621,
|
| 913 |
+
"num_tokens": 2129006.0,
|
| 914 |
+
"step": 1130
|
| 915 |
+
},
|
| 916 |
+
{
|
| 917 |
+
"epoch": 0.022546576480360746,
|
| 918 |
+
"learning_rate": 0.00019549464024366125,
|
| 919 |
+
"loss": 0.0708,
|
| 920 |
+
"mean_token_accuracy": 0.9772637248039245,
|
| 921 |
+
"num_tokens": 2147866.0,
|
| 922 |
+
"step": 1140
|
| 923 |
+
},
|
| 924 |
+
{
|
| 925 |
+
"epoch": 0.022744353467030576,
|
| 926 |
+
"learning_rate": 0.0001954550848463273,
|
| 927 |
+
"loss": 0.0615,
|
| 928 |
+
"mean_token_accuracy": 0.9771132528781891,
|
| 929 |
+
"num_tokens": 2166678.0,
|
| 930 |
+
"step": 1150
|
| 931 |
+
},
|
| 932 |
+
{
|
| 933 |
+
"epoch": 0.02294213045370041,
|
| 934 |
+
"learning_rate": 0.00019541552944899333,
|
| 935 |
+
"loss": 0.0716,
|
| 936 |
+
"mean_token_accuracy": 0.97862588763237,
|
| 937 |
+
"num_tokens": 2185491.0,
|
| 938 |
+
"step": 1160
|
| 939 |
+
},
|
| 940 |
+
{
|
| 941 |
+
"epoch": 0.023139907440370238,
|
| 942 |
+
"learning_rate": 0.00019537597405165934,
|
| 943 |
+
"loss": 0.0835,
|
| 944 |
+
"mean_token_accuracy": 0.9729329645633698,
|
| 945 |
+
"num_tokens": 2204270.0,
|
| 946 |
+
"step": 1170
|
| 947 |
+
},
|
| 948 |
+
{
|
| 949 |
+
"epoch": 0.02333768442704007,
|
| 950 |
+
"learning_rate": 0.0001953364186543254,
|
| 951 |
+
"loss": 0.0825,
|
| 952 |
+
"mean_token_accuracy": 0.975412392616272,
|
| 953 |
+
"num_tokens": 2223262.0,
|
| 954 |
+
"step": 1180
|
| 955 |
+
},
|
| 956 |
+
{
|
| 957 |
+
"epoch": 0.0235354614137099,
|
| 958 |
+
"learning_rate": 0.00019529686325699144,
|
| 959 |
+
"loss": 0.0694,
|
| 960 |
+
"mean_token_accuracy": 0.9787419438362122,
|
| 961 |
+
"num_tokens": 2242082.0,
|
| 962 |
+
"step": 1190
|
| 963 |
+
},
|
| 964 |
+
{
|
| 965 |
+
"epoch": 0.023733238400379733,
|
| 966 |
+
"learning_rate": 0.00019525730785965745,
|
| 967 |
+
"loss": 0.0971,
|
| 968 |
+
"mean_token_accuracy": 0.9742420554161072,
|
| 969 |
+
"num_tokens": 2260874.0,
|
| 970 |
+
"step": 1200
|
| 971 |
+
},
|
| 972 |
+
{
|
| 973 |
+
"epoch": 0.023931015387049562,
|
| 974 |
+
"learning_rate": 0.00019521775246232349,
|
| 975 |
+
"loss": 0.0813,
|
| 976 |
+
"mean_token_accuracy": 0.973499870300293,
|
| 977 |
+
"num_tokens": 2279620.0,
|
| 978 |
+
"step": 1210
|
| 979 |
+
},
|
| 980 |
+
{
|
| 981 |
+
"epoch": 0.024128792373719395,
|
| 982 |
+
"learning_rate": 0.00019517819706498952,
|
| 983 |
+
"loss": 0.0591,
|
| 984 |
+
"mean_token_accuracy": 0.9806219100952148,
|
| 985 |
+
"num_tokens": 2298359.0,
|
| 986 |
+
"step": 1220
|
| 987 |
+
},
|
| 988 |
+
{
|
| 989 |
+
"epoch": 0.024326569360389225,
|
| 990 |
+
"learning_rate": 0.00019513864166765556,
|
| 991 |
+
"loss": 0.0784,
|
| 992 |
+
"mean_token_accuracy": 0.9767163157463074,
|
| 993 |
+
"num_tokens": 2317171.0,
|
| 994 |
+
"step": 1230
|
| 995 |
+
},
|
| 996 |
+
{
|
| 997 |
+
"epoch": 0.024524346347059058,
|
| 998 |
+
"learning_rate": 0.00019509908627032157,
|
| 999 |
+
"loss": 0.0867,
|
| 1000 |
+
"mean_token_accuracy": 0.9671182572841645,
|
| 1001 |
+
"num_tokens": 2335825.0,
|
| 1002 |
+
"step": 1240
|
| 1003 |
+
},
|
| 1004 |
+
{
|
| 1005 |
+
"epoch": 0.024722123333728887,
|
| 1006 |
+
"learning_rate": 0.00019505953087298764,
|
| 1007 |
+
"loss": 0.0727,
|
| 1008 |
+
"mean_token_accuracy": 0.9711700201034545,
|
| 1009 |
+
"num_tokens": 2354806.0,
|
| 1010 |
+
"step": 1250
|
| 1011 |
+
},
|
| 1012 |
+
{
|
| 1013 |
+
"epoch": 0.02491990032039872,
|
| 1014 |
+
"learning_rate": 0.00019501997547565367,
|
| 1015 |
+
"loss": 0.0901,
|
| 1016 |
+
"mean_token_accuracy": 0.9734555304050445,
|
| 1017 |
+
"num_tokens": 2373919.0,
|
| 1018 |
+
"step": 1260
|
| 1019 |
+
},
|
| 1020 |
+
{
|
| 1021 |
+
"epoch": 0.02511767730706855,
|
| 1022 |
+
"learning_rate": 0.00019498042007831968,
|
| 1023 |
+
"loss": 0.0579,
|
| 1024 |
+
"mean_token_accuracy": 0.9764434218406677,
|
| 1025 |
+
"num_tokens": 2393043.0,
|
| 1026 |
+
"step": 1270
|
| 1027 |
+
},
|
| 1028 |
+
{
|
| 1029 |
+
"epoch": 0.025315454293738382,
|
| 1030 |
+
"learning_rate": 0.00019494086468098575,
|
| 1031 |
+
"loss": 0.0623,
|
| 1032 |
+
"mean_token_accuracy": 0.9802065193653107,
|
| 1033 |
+
"num_tokens": 2411888.0,
|
| 1034 |
+
"step": 1280
|
| 1035 |
+
},
|
| 1036 |
+
{
|
| 1037 |
+
"epoch": 0.02551323128040821,
|
| 1038 |
+
"learning_rate": 0.00019490130928365176,
|
| 1039 |
+
"loss": 0.0739,
|
| 1040 |
+
"mean_token_accuracy": 0.9758767008781433,
|
| 1041 |
+
"num_tokens": 2430790.0,
|
| 1042 |
+
"step": 1290
|
| 1043 |
+
},
|
| 1044 |
+
{
|
| 1045 |
+
"epoch": 0.025711008267078044,
|
| 1046 |
+
"learning_rate": 0.0001948617538863178,
|
| 1047 |
+
"loss": 0.0676,
|
| 1048 |
+
"mean_token_accuracy": 0.9723714172840119,
|
| 1049 |
+
"num_tokens": 2449534.0,
|
| 1050 |
+
"step": 1300
|
| 1051 |
+
},
|
| 1052 |
+
{
|
| 1053 |
+
"epoch": 0.025908785253747874,
|
| 1054 |
+
"learning_rate": 0.0001948221984889838,
|
| 1055 |
+
"loss": 0.0842,
|
| 1056 |
+
"mean_token_accuracy": 0.9739417016506196,
|
| 1057 |
+
"num_tokens": 2468500.0,
|
| 1058 |
+
"step": 1310
|
| 1059 |
+
},
|
| 1060 |
+
{
|
| 1061 |
+
"epoch": 0.026106562240417706,
|
| 1062 |
+
"learning_rate": 0.00019478264309164987,
|
| 1063 |
+
"loss": 0.0812,
|
| 1064 |
+
"mean_token_accuracy": 0.9725883424282074,
|
| 1065 |
+
"num_tokens": 2487522.0,
|
| 1066 |
+
"step": 1320
|
| 1067 |
+
},
|
| 1068 |
+
{
|
| 1069 |
+
"epoch": 0.026304339227087536,
|
| 1070 |
+
"learning_rate": 0.0001947430876943159,
|
| 1071 |
+
"loss": 0.081,
|
| 1072 |
+
"mean_token_accuracy": 0.9718135535717011,
|
| 1073 |
+
"num_tokens": 2506228.0,
|
| 1074 |
+
"step": 1330
|
| 1075 |
+
},
|
| 1076 |
+
{
|
| 1077 |
+
"epoch": 0.02650211621375737,
|
| 1078 |
+
"learning_rate": 0.00019470353229698192,
|
| 1079 |
+
"loss": 0.0913,
|
| 1080 |
+
"mean_token_accuracy": 0.9727324843406677,
|
| 1081 |
+
"num_tokens": 2524993.0,
|
| 1082 |
+
"step": 1340
|
| 1083 |
+
},
|
| 1084 |
+
{
|
| 1085 |
+
"epoch": 0.026699893200427198,
|
| 1086 |
+
"learning_rate": 0.00019466397689964798,
|
| 1087 |
+
"loss": 0.0961,
|
| 1088 |
+
"mean_token_accuracy": 0.9686046600341797,
|
| 1089 |
+
"num_tokens": 2543814.0,
|
| 1090 |
+
"step": 1350
|
| 1091 |
+
},
|
| 1092 |
+
{
|
| 1093 |
+
"epoch": 0.02689767018709703,
|
| 1094 |
+
"learning_rate": 0.000194624421502314,
|
| 1095 |
+
"loss": 0.0961,
|
| 1096 |
+
"mean_token_accuracy": 0.9635930359363556,
|
| 1097 |
+
"num_tokens": 2562624.0,
|
| 1098 |
+
"step": 1360
|
| 1099 |
+
},
|
| 1100 |
+
{
|
| 1101 |
+
"epoch": 0.02709544717376686,
|
| 1102 |
+
"learning_rate": 0.00019458486610498003,
|
| 1103 |
+
"loss": 0.0882,
|
| 1104 |
+
"mean_token_accuracy": 0.971700656414032,
|
| 1105 |
+
"num_tokens": 2581229.0,
|
| 1106 |
+
"step": 1370
|
| 1107 |
+
},
|
| 1108 |
+
{
|
| 1109 |
+
"epoch": 0.027293224160436693,
|
| 1110 |
+
"learning_rate": 0.00019454531070764607,
|
| 1111 |
+
"loss": 0.0875,
|
| 1112 |
+
"mean_token_accuracy": 0.9696930944919586,
|
| 1113 |
+
"num_tokens": 2600073.0,
|
| 1114 |
+
"step": 1380
|
| 1115 |
+
},
|
| 1116 |
+
{
|
| 1117 |
+
"epoch": 0.027491001147106522,
|
| 1118 |
+
"learning_rate": 0.0001945057553103121,
|
| 1119 |
+
"loss": 0.0732,
|
| 1120 |
+
"mean_token_accuracy": 0.9732912659645081,
|
| 1121 |
+
"num_tokens": 2618837.0,
|
| 1122 |
+
"step": 1390
|
| 1123 |
+
},
|
| 1124 |
+
{
|
| 1125 |
+
"epoch": 0.027688778133776355,
|
| 1126 |
+
"learning_rate": 0.00019446619991297814,
|
| 1127 |
+
"loss": 0.0711,
|
| 1128 |
+
"mean_token_accuracy": 0.975910484790802,
|
| 1129 |
+
"num_tokens": 2637487.0,
|
| 1130 |
+
"step": 1400
|
| 1131 |
+
},
|
| 1132 |
+
{
|
| 1133 |
+
"epoch": 0.027886555120446185,
|
| 1134 |
+
"learning_rate": 0.00019442664451564415,
|
| 1135 |
+
"loss": 0.0944,
|
| 1136 |
+
"mean_token_accuracy": 0.96585413813591,
|
| 1137 |
+
"num_tokens": 2656004.0,
|
| 1138 |
+
"step": 1410
|
| 1139 |
+
},
|
| 1140 |
+
{
|
| 1141 |
+
"epoch": 0.028084332107116017,
|
| 1142 |
+
"learning_rate": 0.00019438708911831022,
|
| 1143 |
+
"loss": 0.0553,
|
| 1144 |
+
"mean_token_accuracy": 0.980397754907608,
|
| 1145 |
+
"num_tokens": 2675089.0,
|
| 1146 |
+
"step": 1420
|
| 1147 |
+
},
|
| 1148 |
+
{
|
| 1149 |
+
"epoch": 0.028282109093785847,
|
| 1150 |
+
"learning_rate": 0.00019434753372097623,
|
| 1151 |
+
"loss": 0.062,
|
| 1152 |
+
"mean_token_accuracy": 0.9789350926876068,
|
| 1153 |
+
"num_tokens": 2693904.0,
|
| 1154 |
+
"step": 1430
|
| 1155 |
+
},
|
| 1156 |
+
{
|
| 1157 |
+
"epoch": 0.02847988608045568,
|
| 1158 |
+
"learning_rate": 0.00019430797832364227,
|
| 1159 |
+
"loss": 0.08,
|
| 1160 |
+
"mean_token_accuracy": 0.9751649796962738,
|
| 1161 |
+
"num_tokens": 2712395.0,
|
| 1162 |
+
"step": 1440
|
| 1163 |
+
},
|
| 1164 |
+
{
|
| 1165 |
+
"epoch": 0.02867766306712551,
|
| 1166 |
+
"learning_rate": 0.0001942684229263083,
|
| 1167 |
+
"loss": 0.0662,
|
| 1168 |
+
"mean_token_accuracy": 0.9778707563877106,
|
| 1169 |
+
"num_tokens": 2731103.0,
|
| 1170 |
+
"step": 1450
|
| 1171 |
+
},
|
| 1172 |
+
{
|
| 1173 |
+
"epoch": 0.028875440053795342,
|
| 1174 |
+
"learning_rate": 0.00019422886752897434,
|
| 1175 |
+
"loss": 0.0834,
|
| 1176 |
+
"mean_token_accuracy": 0.9672891080379487,
|
| 1177 |
+
"num_tokens": 2749801.0,
|
| 1178 |
+
"step": 1460
|
| 1179 |
+
},
|
| 1180 |
+
{
|
| 1181 |
+
"epoch": 0.02907321704046517,
|
| 1182 |
+
"learning_rate": 0.00019418931213164038,
|
| 1183 |
+
"loss": 0.0719,
|
| 1184 |
+
"mean_token_accuracy": 0.9755631804466247,
|
| 1185 |
+
"num_tokens": 2768551.0,
|
| 1186 |
+
"step": 1470
|
| 1187 |
+
},
|
| 1188 |
+
{
|
| 1189 |
+
"epoch": 0.029270994027135004,
|
| 1190 |
+
"learning_rate": 0.0001941497567343064,
|
| 1191 |
+
"loss": 0.0499,
|
| 1192 |
+
"mean_token_accuracy": 0.9837284207344055,
|
| 1193 |
+
"num_tokens": 2787591.0,
|
| 1194 |
+
"step": 1480
|
| 1195 |
+
},
|
| 1196 |
+
{
|
| 1197 |
+
"epoch": 0.029468771013804834,
|
| 1198 |
+
"learning_rate": 0.00019411020133697245,
|
| 1199 |
+
"loss": 0.066,
|
| 1200 |
+
"mean_token_accuracy": 0.9761005103588104,
|
| 1201 |
+
"num_tokens": 2806455.0,
|
| 1202 |
+
"step": 1490
|
| 1203 |
+
},
|
| 1204 |
+
{
|
| 1205 |
+
"epoch": 0.029666548000474666,
|
| 1206 |
+
"learning_rate": 0.00019407064593963846,
|
| 1207 |
+
"loss": 0.0623,
|
| 1208 |
+
"mean_token_accuracy": 0.9766542613506317,
|
| 1209 |
+
"num_tokens": 2825280.0,
|
| 1210 |
+
"step": 1500
|
| 1211 |
+
},
|
| 1212 |
+
{
|
| 1213 |
+
"epoch": 0.029864324987144496,
|
| 1214 |
+
"learning_rate": 0.0001940310905423045,
|
| 1215 |
+
"loss": 0.0643,
|
| 1216 |
+
"mean_token_accuracy": 0.9814446032047272,
|
| 1217 |
+
"num_tokens": 2844178.0,
|
| 1218 |
+
"step": 1510
|
| 1219 |
+
},
|
| 1220 |
+
{
|
| 1221 |
+
"epoch": 0.03006210197381433,
|
| 1222 |
+
"learning_rate": 0.00019399153514497054,
|
| 1223 |
+
"loss": 0.0559,
|
| 1224 |
+
"mean_token_accuracy": 0.9753939032554626,
|
| 1225 |
+
"num_tokens": 2863373.0,
|
| 1226 |
+
"step": 1520
|
| 1227 |
+
},
|
| 1228 |
+
{
|
| 1229 |
+
"epoch": 0.030259878960484158,
|
| 1230 |
+
"learning_rate": 0.00019395197974763658,
|
| 1231 |
+
"loss": 0.0633,
|
| 1232 |
+
"mean_token_accuracy": 0.9776888847351074,
|
| 1233 |
+
"num_tokens": 2882166.0,
|
| 1234 |
+
"step": 1530
|
| 1235 |
+
},
|
| 1236 |
+
{
|
| 1237 |
+
"epoch": 0.03045765594715399,
|
| 1238 |
+
"learning_rate": 0.00019391242435030261,
|
| 1239 |
+
"loss": 0.0568,
|
| 1240 |
+
"mean_token_accuracy": 0.9781621396541595,
|
| 1241 |
+
"num_tokens": 2901050.0,
|
| 1242 |
+
"step": 1540
|
| 1243 |
+
},
|
| 1244 |
+
{
|
| 1245 |
+
"epoch": 0.03065543293382382,
|
| 1246 |
+
"learning_rate": 0.00019387286895296862,
|
| 1247 |
+
"loss": 0.0748,
|
| 1248 |
+
"mean_token_accuracy": 0.9726321280002594,
|
| 1249 |
+
"num_tokens": 2919953.0,
|
| 1250 |
+
"step": 1550
|
| 1251 |
+
},
|
| 1252 |
+
{
|
| 1253 |
+
"epoch": 0.030853209920493653,
|
| 1254 |
+
"learning_rate": 0.0001938333135556347,
|
| 1255 |
+
"loss": 0.0717,
|
| 1256 |
+
"mean_token_accuracy": 0.9703111469745636,
|
| 1257 |
+
"num_tokens": 2939003.0,
|
| 1258 |
+
"step": 1560
|
| 1259 |
+
},
|
| 1260 |
+
{
|
| 1261 |
+
"epoch": 0.031050986907163482,
|
| 1262 |
+
"learning_rate": 0.00019379375815830073,
|
| 1263 |
+
"loss": 0.065,
|
| 1264 |
+
"mean_token_accuracy": 0.97583766579628,
|
| 1265 |
+
"num_tokens": 2957853.0,
|
| 1266 |
+
"step": 1570
|
| 1267 |
+
},
|
| 1268 |
+
{
|
| 1269 |
+
"epoch": 0.031248763893833315,
|
| 1270 |
+
"learning_rate": 0.00019375420276096674,
|
| 1271 |
+
"loss": 0.0667,
|
| 1272 |
+
"mean_token_accuracy": 0.9808376967906952,
|
| 1273 |
+
"num_tokens": 2976557.0,
|
| 1274 |
+
"step": 1580
|
| 1275 |
+
},
|
| 1276 |
+
{
|
| 1277 |
+
"epoch": 0.031446540880503145,
|
| 1278 |
+
"learning_rate": 0.00019371464736363277,
|
| 1279 |
+
"loss": 0.0777,
|
| 1280 |
+
"mean_token_accuracy": 0.9767092704772949,
|
| 1281 |
+
"num_tokens": 2995058.0,
|
| 1282 |
+
"step": 1590
|
| 1283 |
+
},
|
| 1284 |
+
{
|
| 1285 |
+
"epoch": 0.03164431786717298,
|
| 1286 |
+
"learning_rate": 0.0001936750919662988,
|
| 1287 |
+
"loss": 0.0644,
|
| 1288 |
+
"mean_token_accuracy": 0.9776780724525451,
|
| 1289 |
+
"num_tokens": 3014165.0,
|
| 1290 |
+
"step": 1600
|
| 1291 |
+
},
|
| 1292 |
+
{
|
| 1293 |
+
"epoch": 0.03184209485384281,
|
| 1294 |
+
"learning_rate": 0.00019363553656896485,
|
| 1295 |
+
"loss": 0.0566,
|
| 1296 |
+
"mean_token_accuracy": 0.9828783690929412,
|
| 1297 |
+
"num_tokens": 3032924.0,
|
| 1298 |
+
"step": 1610
|
| 1299 |
+
},
|
| 1300 |
+
{
|
| 1301 |
+
"epoch": 0.032039871840512636,
|
| 1302 |
+
"learning_rate": 0.00019359598117163086,
|
| 1303 |
+
"loss": 0.0805,
|
| 1304 |
+
"mean_token_accuracy": 0.972636216878891,
|
| 1305 |
+
"num_tokens": 3051960.0,
|
| 1306 |
+
"step": 1620
|
| 1307 |
+
},
|
| 1308 |
+
{
|
| 1309 |
+
"epoch": 0.03223764882718247,
|
| 1310 |
+
"learning_rate": 0.00019355642577429692,
|
| 1311 |
+
"loss": 0.0831,
|
| 1312 |
+
"mean_token_accuracy": 0.9726630806922912,
|
| 1313 |
+
"num_tokens": 3070644.0,
|
| 1314 |
+
"step": 1630
|
| 1315 |
+
},
|
| 1316 |
+
{
|
| 1317 |
+
"epoch": 0.0324354258138523,
|
| 1318 |
+
"learning_rate": 0.00019351687037696296,
|
| 1319 |
+
"loss": 0.0766,
|
| 1320 |
+
"mean_token_accuracy": 0.9787626624107361,
|
| 1321 |
+
"num_tokens": 3089393.0,
|
| 1322 |
+
"step": 1640
|
| 1323 |
+
},
|
| 1324 |
+
{
|
| 1325 |
+
"epoch": 0.03263320280052213,
|
| 1326 |
+
"learning_rate": 0.00019347731497962897,
|
| 1327 |
+
"loss": 0.0665,
|
| 1328 |
+
"mean_token_accuracy": 0.9728036403656006,
|
| 1329 |
+
"num_tokens": 3108260.0,
|
| 1330 |
+
"step": 1650
|
| 1331 |
+
},
|
| 1332 |
+
{
|
| 1333 |
+
"epoch": 0.03283097978719196,
|
| 1334 |
+
"learning_rate": 0.000193437759582295,
|
| 1335 |
+
"loss": 0.0842,
|
| 1336 |
+
"mean_token_accuracy": 0.969732540845871,
|
| 1337 |
+
"num_tokens": 3127157.0,
|
| 1338 |
+
"step": 1660
|
| 1339 |
+
},
|
| 1340 |
+
{
|
| 1341 |
+
"epoch": 0.033028756773861793,
|
| 1342 |
+
"learning_rate": 0.00019339820418496105,
|
| 1343 |
+
"loss": 0.072,
|
| 1344 |
+
"mean_token_accuracy": 0.9739607870578766,
|
| 1345 |
+
"num_tokens": 3146126.0,
|
| 1346 |
+
"step": 1670
|
| 1347 |
+
},
|
| 1348 |
+
{
|
| 1349 |
+
"epoch": 0.033226533760531626,
|
| 1350 |
+
"learning_rate": 0.00019335864878762708,
|
| 1351 |
+
"loss": 0.0835,
|
| 1352 |
+
"mean_token_accuracy": 0.9699565410614014,
|
| 1353 |
+
"num_tokens": 3165091.0,
|
| 1354 |
+
"step": 1680
|
| 1355 |
+
}
|
| 1356 |
+
],
|
| 1357 |
+
"logging_steps": 10,
|
| 1358 |
+
"max_steps": 50562,
|
| 1359 |
+
"num_input_tokens_seen": 0,
|
| 1360 |
+
"num_train_epochs": 9223372036854775807,
|
| 1361 |
+
"save_steps": 1685,
|
| 1362 |
+
"stateful_callbacks": {
|
| 1363 |
+
"TrainerControl": {
|
| 1364 |
+
"args": {
|
| 1365 |
+
"should_epoch_stop": false,
|
| 1366 |
+
"should_evaluate": false,
|
| 1367 |
+
"should_log": false,
|
| 1368 |
+
"should_save": true,
|
| 1369 |
+
"should_training_stop": false
|
| 1370 |
+
},
|
| 1371 |
+
"attributes": {}
|
| 1372 |
+
}
|
| 1373 |
+
},
|
| 1374 |
+
"total_flos": 1.5391152511647744e+17,
|
| 1375 |
+
"train_batch_size": 16,
|
| 1376 |
+
"trial_name": null,
|
| 1377 |
+
"trial_params": null
|
| 1378 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
|
| 3 |
+
library_name: peft
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Model Card for Model ID
|
| 7 |
+
|
| 8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## Model Details
|
| 13 |
+
|
| 14 |
+
### Model Description
|
| 15 |
+
|
| 16 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- **Developed by:** [More Information Needed]
|
| 21 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 22 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 23 |
+
- **Model type:** [More Information Needed]
|
| 24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 25 |
+
- **License:** [More Information Needed]
|
| 26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 27 |
+
|
| 28 |
+
### Model Sources [optional]
|
| 29 |
+
|
| 30 |
+
<!-- Provide the basic links for the model. -->
|
| 31 |
+
|
| 32 |
+
- **Repository:** [More Information Needed]
|
| 33 |
+
- **Paper [optional]:** [More Information Needed]
|
| 34 |
+
- **Demo [optional]:** [More Information Needed]
|
| 35 |
+
|
| 36 |
+
## Uses
|
| 37 |
+
|
| 38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 39 |
+
|
| 40 |
+
### Direct Use
|
| 41 |
+
|
| 42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 43 |
+
|
| 44 |
+
[More Information Needed]
|
| 45 |
+
|
| 46 |
+
### Downstream Use [optional]
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Out-of-Scope Use
|
| 53 |
+
|
| 54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
## Bias, Risks, and Limitations
|
| 59 |
+
|
| 60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
### Recommendations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 67 |
+
|
| 68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 69 |
+
|
| 70 |
+
## How to Get Started with the Model
|
| 71 |
+
|
| 72 |
+
Use the code below to get started with the model.
|
| 73 |
+
|
| 74 |
+
[More Information Needed]
|
| 75 |
+
|
| 76 |
+
## Training Details
|
| 77 |
+
|
| 78 |
+
### Training Data
|
| 79 |
+
|
| 80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 81 |
+
|
| 82 |
+
[More Information Needed]
|
| 83 |
+
|
| 84 |
+
### Training Procedure
|
| 85 |
+
|
| 86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 87 |
+
|
| 88 |
+
#### Preprocessing [optional]
|
| 89 |
+
|
| 90 |
+
[More Information Needed]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#### Training Hyperparameters
|
| 94 |
+
|
| 95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 96 |
+
|
| 97 |
+
#### Speeds, Sizes, Times [optional]
|
| 98 |
+
|
| 99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 100 |
+
|
| 101 |
+
[More Information Needed]
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 106 |
+
|
| 107 |
+
### Testing Data, Factors & Metrics
|
| 108 |
+
|
| 109 |
+
#### Testing Data
|
| 110 |
+
|
| 111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 112 |
+
|
| 113 |
+
[More Information Needed]
|
| 114 |
+
|
| 115 |
+
#### Factors
|
| 116 |
+
|
| 117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Metrics
|
| 122 |
+
|
| 123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
### Results
|
| 128 |
+
|
| 129 |
+
[More Information Needed]
|
| 130 |
+
|
| 131 |
+
#### Summary
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Model Examination [optional]
|
| 136 |
+
|
| 137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 138 |
+
|
| 139 |
+
[More Information Needed]
|
| 140 |
+
|
| 141 |
+
## Environmental Impact
|
| 142 |
+
|
| 143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 144 |
+
|
| 145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 146 |
+
|
| 147 |
+
- **Hardware Type:** [More Information Needed]
|
| 148 |
+
- **Hours used:** [More Information Needed]
|
| 149 |
+
- **Cloud Provider:** [More Information Needed]
|
| 150 |
+
- **Compute Region:** [More Information Needed]
|
| 151 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 152 |
+
|
| 153 |
+
## Technical Specifications [optional]
|
| 154 |
+
|
| 155 |
+
### Model Architecture and Objective
|
| 156 |
+
|
| 157 |
+
[More Information Needed]
|
| 158 |
+
|
| 159 |
+
### Compute Infrastructure
|
| 160 |
+
|
| 161 |
+
[More Information Needed]
|
| 162 |
+
|
| 163 |
+
#### Hardware
|
| 164 |
+
|
| 165 |
+
[More Information Needed]
|
| 166 |
+
|
| 167 |
+
#### Software
|
| 168 |
+
|
| 169 |
+
[More Information Needed]
|
| 170 |
+
|
| 171 |
+
## Citation [optional]
|
| 172 |
+
|
| 173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 174 |
+
|
| 175 |
+
**BibTeX:**
|
| 176 |
+
|
| 177 |
+
[More Information Needed]
|
| 178 |
+
|
| 179 |
+
**APA:**
|
| 180 |
+
|
| 181 |
+
[More Information Needed]
|
| 182 |
+
|
| 183 |
+
## Glossary [optional]
|
| 184 |
+
|
| 185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## More Information [optional]
|
| 190 |
+
|
| 191 |
+
[More Information Needed]
|
| 192 |
+
|
| 193 |
+
## Model Card Authors [optional]
|
| 194 |
+
|
| 195 |
+
[More Information Needed]
|
| 196 |
+
|
| 197 |
+
## Model Card Contact
|
| 198 |
+
|
| 199 |
+
[More Information Needed]
|
| 200 |
+
### Framework versions
|
| 201 |
+
|
| 202 |
+
- PEFT 0.15.0
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/adapter_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 16,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.0,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"r": 8,
|
| 24 |
+
"rank_pattern": {},
|
| 25 |
+
"revision": null,
|
| 26 |
+
"target_modules": [
|
| 27 |
+
"q_proj",
|
| 28 |
+
"v_proj"
|
| 29 |
+
],
|
| 30 |
+
"task_type": "CAUSAL_LM",
|
| 31 |
+
"trainable_token_indices": null,
|
| 32 |
+
"use_dora": false,
|
| 33 |
+
"use_rslora": false
|
| 34 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
| 33 |
+
"clean_up_tokenization_spaces": false,
|
| 34 |
+
"eos_token": "</s>",
|
| 35 |
+
"extra_special_tokens": {},
|
| 36 |
+
"legacy": false,
|
| 37 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 38 |
+
"pad_token": "</s>",
|
| 39 |
+
"padding_side": "right",
|
| 40 |
+
"sp_model_kwargs": {},
|
| 41 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 42 |
+
"unk_token": "<unk>",
|
| 43 |
+
"use_default_system_prompt": false
|
| 44 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/trainer_state.json
ADDED
|
@@ -0,0 +1,1378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.03332542225386654,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1685,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.0001977769866698311,
|
| 14 |
+
"learning_rate": 0.00019996440014239942,
|
| 15 |
+
"loss": 0.3352,
|
| 16 |
+
"mean_token_accuracy": 0.9023264050483704,
|
| 17 |
+
"num_tokens": 19163.0,
|
| 18 |
+
"step": 10
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"epoch": 0.0003955539733396622,
|
| 22 |
+
"learning_rate": 0.0001999248447450655,
|
| 23 |
+
"loss": 0.1524,
|
| 24 |
+
"mean_token_accuracy": 0.947095412015915,
|
| 25 |
+
"num_tokens": 38071.0,
|
| 26 |
+
"step": 20
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"epoch": 0.0005933309600094933,
|
| 30 |
+
"learning_rate": 0.0001998852893477315,
|
| 31 |
+
"loss": 0.1461,
|
| 32 |
+
"mean_token_accuracy": 0.9463379800319671,
|
| 33 |
+
"num_tokens": 57017.0,
|
| 34 |
+
"step": 30
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"epoch": 0.0007911079466793244,
|
| 38 |
+
"learning_rate": 0.00019984573395039754,
|
| 39 |
+
"loss": 0.1024,
|
| 40 |
+
"mean_token_accuracy": 0.9588611423969269,
|
| 41 |
+
"num_tokens": 76047.0,
|
| 42 |
+
"step": 40
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"epoch": 0.0009888849333491555,
|
| 46 |
+
"learning_rate": 0.00019980617855306357,
|
| 47 |
+
"loss": 0.117,
|
| 48 |
+
"mean_token_accuracy": 0.9553473949432373,
|
| 49 |
+
"num_tokens": 94824.0,
|
| 50 |
+
"step": 50
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"epoch": 0.0011866619200189867,
|
| 54 |
+
"learning_rate": 0.0001997666231557296,
|
| 55 |
+
"loss": 0.1281,
|
| 56 |
+
"mean_token_accuracy": 0.9538084208965302,
|
| 57 |
+
"num_tokens": 113467.0,
|
| 58 |
+
"step": 60
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.0013844389066888178,
|
| 62 |
+
"learning_rate": 0.00019972706775839565,
|
| 63 |
+
"loss": 0.1075,
|
| 64 |
+
"mean_token_accuracy": 0.961921775341034,
|
| 65 |
+
"num_tokens": 132229.0,
|
| 66 |
+
"step": 70
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.0015822158933586489,
|
| 70 |
+
"learning_rate": 0.00019968751236106166,
|
| 71 |
+
"loss": 0.1057,
|
| 72 |
+
"mean_token_accuracy": 0.9676864743232727,
|
| 73 |
+
"num_tokens": 150811.0,
|
| 74 |
+
"step": 80
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"epoch": 0.00177999288002848,
|
| 78 |
+
"learning_rate": 0.00019964795696372772,
|
| 79 |
+
"loss": 0.1048,
|
| 80 |
+
"mean_token_accuracy": 0.9597055971622467,
|
| 81 |
+
"num_tokens": 169804.0,
|
| 82 |
+
"step": 90
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"epoch": 0.001977769866698311,
|
| 86 |
+
"learning_rate": 0.00019960840156639376,
|
| 87 |
+
"loss": 0.1147,
|
| 88 |
+
"mean_token_accuracy": 0.9561040580272675,
|
| 89 |
+
"num_tokens": 188503.0,
|
| 90 |
+
"step": 100
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"epoch": 0.002175546853368142,
|
| 94 |
+
"learning_rate": 0.00019956884616905977,
|
| 95 |
+
"loss": 0.0904,
|
| 96 |
+
"mean_token_accuracy": 0.9663344562053681,
|
| 97 |
+
"num_tokens": 207370.0,
|
| 98 |
+
"step": 110
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"epoch": 0.0023733238400379733,
|
| 102 |
+
"learning_rate": 0.0001995292907717258,
|
| 103 |
+
"loss": 0.085,
|
| 104 |
+
"mean_token_accuracy": 0.9692767798900604,
|
| 105 |
+
"num_tokens": 226639.0,
|
| 106 |
+
"step": 120
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"epoch": 0.0025711008267078044,
|
| 110 |
+
"learning_rate": 0.00019948973537439185,
|
| 111 |
+
"loss": 0.1055,
|
| 112 |
+
"mean_token_accuracy": 0.962373024225235,
|
| 113 |
+
"num_tokens": 245318.0,
|
| 114 |
+
"step": 130
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.0027688778133776355,
|
| 118 |
+
"learning_rate": 0.00019945017997705788,
|
| 119 |
+
"loss": 0.1023,
|
| 120 |
+
"mean_token_accuracy": 0.9603676617145538,
|
| 121 |
+
"num_tokens": 264089.0,
|
| 122 |
+
"step": 140
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.0029666548000474666,
|
| 126 |
+
"learning_rate": 0.0001994106245797239,
|
| 127 |
+
"loss": 0.1061,
|
| 128 |
+
"mean_token_accuracy": 0.9619979500770569,
|
| 129 |
+
"num_tokens": 282977.0,
|
| 130 |
+
"step": 150
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"epoch": 0.0031644317867172977,
|
| 134 |
+
"learning_rate": 0.00019937106918238996,
|
| 135 |
+
"loss": 0.1021,
|
| 136 |
+
"mean_token_accuracy": 0.9600433588027955,
|
| 137 |
+
"num_tokens": 301615.0,
|
| 138 |
+
"step": 160
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 0.003362208773387129,
|
| 142 |
+
"learning_rate": 0.000199331513785056,
|
| 143 |
+
"loss": 0.095,
|
| 144 |
+
"mean_token_accuracy": 0.9641285121440888,
|
| 145 |
+
"num_tokens": 320502.0,
|
| 146 |
+
"step": 170
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"epoch": 0.00355998576005696,
|
| 150 |
+
"learning_rate": 0.000199291958387722,
|
| 151 |
+
"loss": 0.0722,
|
| 152 |
+
"mean_token_accuracy": 0.9708887040615082,
|
| 153 |
+
"num_tokens": 339616.0,
|
| 154 |
+
"step": 180
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"epoch": 0.003757762746726791,
|
| 158 |
+
"learning_rate": 0.00019925240299038804,
|
| 159 |
+
"loss": 0.0951,
|
| 160 |
+
"mean_token_accuracy": 0.9712291181087493,
|
| 161 |
+
"num_tokens": 358474.0,
|
| 162 |
+
"step": 190
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"epoch": 0.003955539733396622,
|
| 166 |
+
"learning_rate": 0.00019921284759305408,
|
| 167 |
+
"loss": 0.1194,
|
| 168 |
+
"mean_token_accuracy": 0.9630812525749206,
|
| 169 |
+
"num_tokens": 377094.0,
|
| 170 |
+
"step": 200
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 0.004153316720066453,
|
| 174 |
+
"learning_rate": 0.00019917329219572012,
|
| 175 |
+
"loss": 0.1002,
|
| 176 |
+
"mean_token_accuracy": 0.9660979807376862,
|
| 177 |
+
"num_tokens": 396000.0,
|
| 178 |
+
"step": 210
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.004351093706736284,
|
| 182 |
+
"learning_rate": 0.00019913373679838613,
|
| 183 |
+
"loss": 0.0954,
|
| 184 |
+
"mean_token_accuracy": 0.9636943399906158,
|
| 185 |
+
"num_tokens": 415019.0,
|
| 186 |
+
"step": 220
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"epoch": 0.0045488706934061155,
|
| 190 |
+
"learning_rate": 0.0001990941814010522,
|
| 191 |
+
"loss": 0.1114,
|
| 192 |
+
"mean_token_accuracy": 0.9662698566913605,
|
| 193 |
+
"num_tokens": 433711.0,
|
| 194 |
+
"step": 230
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"epoch": 0.004746647680075947,
|
| 198 |
+
"learning_rate": 0.00019905462600371823,
|
| 199 |
+
"loss": 0.0915,
|
| 200 |
+
"mean_token_accuracy": 0.9679243505001068,
|
| 201 |
+
"num_tokens": 452483.0,
|
| 202 |
+
"step": 240
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"epoch": 0.004944424666745778,
|
| 206 |
+
"learning_rate": 0.00019901507060638424,
|
| 207 |
+
"loss": 0.0951,
|
| 208 |
+
"mean_token_accuracy": 0.9688079237937928,
|
| 209 |
+
"num_tokens": 471395.0,
|
| 210 |
+
"step": 250
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"epoch": 0.005142201653415609,
|
| 214 |
+
"learning_rate": 0.00019897551520905028,
|
| 215 |
+
"loss": 0.1123,
|
| 216 |
+
"mean_token_accuracy": 0.962276142835617,
|
| 217 |
+
"num_tokens": 489983.0,
|
| 218 |
+
"step": 260
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"epoch": 0.00533997864008544,
|
| 222 |
+
"learning_rate": 0.00019893595981171632,
|
| 223 |
+
"loss": 0.0855,
|
| 224 |
+
"mean_token_accuracy": 0.9698696434497833,
|
| 225 |
+
"num_tokens": 509148.0,
|
| 226 |
+
"step": 270
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 0.005537755626755271,
|
| 230 |
+
"learning_rate": 0.00019889640441438235,
|
| 231 |
+
"loss": 0.0777,
|
| 232 |
+
"mean_token_accuracy": 0.9697826623916626,
|
| 233 |
+
"num_tokens": 528042.0,
|
| 234 |
+
"step": 280
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.005735532613425102,
|
| 238 |
+
"learning_rate": 0.0001988568490170484,
|
| 239 |
+
"loss": 0.0944,
|
| 240 |
+
"mean_token_accuracy": 0.9690817773342133,
|
| 241 |
+
"num_tokens": 546656.0,
|
| 242 |
+
"step": 290
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"epoch": 0.005933309600094933,
|
| 246 |
+
"learning_rate": 0.00019881729361971443,
|
| 247 |
+
"loss": 0.0872,
|
| 248 |
+
"mean_token_accuracy": 0.9661558032035827,
|
| 249 |
+
"num_tokens": 565279.0,
|
| 250 |
+
"step": 300
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"epoch": 0.006131086586764764,
|
| 254 |
+
"learning_rate": 0.00019877773822238047,
|
| 255 |
+
"loss": 0.09,
|
| 256 |
+
"mean_token_accuracy": 0.9669564247131348,
|
| 257 |
+
"num_tokens": 584196.0,
|
| 258 |
+
"step": 310
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"epoch": 0.0063288635734345955,
|
| 262 |
+
"learning_rate": 0.00019873818282504648,
|
| 263 |
+
"loss": 0.0701,
|
| 264 |
+
"mean_token_accuracy": 0.9722951114177704,
|
| 265 |
+
"num_tokens": 603050.0,
|
| 266 |
+
"step": 320
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"epoch": 0.006526640560104427,
|
| 270 |
+
"learning_rate": 0.00019869862742771251,
|
| 271 |
+
"loss": 0.0922,
|
| 272 |
+
"mean_token_accuracy": 0.9692854762077332,
|
| 273 |
+
"num_tokens": 621880.0,
|
| 274 |
+
"step": 330
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"epoch": 0.006724417546774258,
|
| 278 |
+
"learning_rate": 0.00019865907203037855,
|
| 279 |
+
"loss": 0.0976,
|
| 280 |
+
"mean_token_accuracy": 0.9660769879817963,
|
| 281 |
+
"num_tokens": 640657.0,
|
| 282 |
+
"step": 340
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 0.006922194533444089,
|
| 286 |
+
"learning_rate": 0.0001986195166330446,
|
| 287 |
+
"loss": 0.1071,
|
| 288 |
+
"mean_token_accuracy": 0.9633386790752411,
|
| 289 |
+
"num_tokens": 659503.0,
|
| 290 |
+
"step": 350
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 0.00711997152011392,
|
| 294 |
+
"learning_rate": 0.00019857996123571063,
|
| 295 |
+
"loss": 0.1058,
|
| 296 |
+
"mean_token_accuracy": 0.9641251742839814,
|
| 297 |
+
"num_tokens": 678570.0,
|
| 298 |
+
"step": 360
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"epoch": 0.007317748506783751,
|
| 302 |
+
"learning_rate": 0.00019854040583837666,
|
| 303 |
+
"loss": 0.097,
|
| 304 |
+
"mean_token_accuracy": 0.9664280533790588,
|
| 305 |
+
"num_tokens": 697294.0,
|
| 306 |
+
"step": 370
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"epoch": 0.007515525493453582,
|
| 310 |
+
"learning_rate": 0.0001985008504410427,
|
| 311 |
+
"loss": 0.0677,
|
| 312 |
+
"mean_token_accuracy": 0.9754213869571686,
|
| 313 |
+
"num_tokens": 716458.0,
|
| 314 |
+
"step": 380
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"epoch": 0.007713302480123413,
|
| 318 |
+
"learning_rate": 0.0001984612950437087,
|
| 319 |
+
"loss": 0.0622,
|
| 320 |
+
"mean_token_accuracy": 0.9724574089050293,
|
| 321 |
+
"num_tokens": 735437.0,
|
| 322 |
+
"step": 390
|
| 323 |
+
},
|
| 324 |
+
{
|
| 325 |
+
"epoch": 0.007911079466793244,
|
| 326 |
+
"learning_rate": 0.00019842173964637475,
|
| 327 |
+
"loss": 0.1003,
|
| 328 |
+
"mean_token_accuracy": 0.9710333228111268,
|
| 329 |
+
"num_tokens": 754416.0,
|
| 330 |
+
"step": 400
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"epoch": 0.008108856453463075,
|
| 334 |
+
"learning_rate": 0.0001983821842490408,
|
| 335 |
+
"loss": 0.0921,
|
| 336 |
+
"mean_token_accuracy": 0.9718518137931824,
|
| 337 |
+
"num_tokens": 773091.0,
|
| 338 |
+
"step": 410
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 0.008306633440132907,
|
| 342 |
+
"learning_rate": 0.00019834262885170682,
|
| 343 |
+
"loss": 0.0836,
|
| 344 |
+
"mean_token_accuracy": 0.9694978713989257,
|
| 345 |
+
"num_tokens": 791900.0,
|
| 346 |
+
"step": 420
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 0.008504410426802738,
|
| 350 |
+
"learning_rate": 0.00019830307345437286,
|
| 351 |
+
"loss": 0.0822,
|
| 352 |
+
"mean_token_accuracy": 0.974584549665451,
|
| 353 |
+
"num_tokens": 810547.0,
|
| 354 |
+
"step": 430
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"epoch": 0.008702187413472569,
|
| 358 |
+
"learning_rate": 0.0001982635180570389,
|
| 359 |
+
"loss": 0.0735,
|
| 360 |
+
"mean_token_accuracy": 0.9770256340503692,
|
| 361 |
+
"num_tokens": 829753.0,
|
| 362 |
+
"step": 440
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"epoch": 0.0088999644001424,
|
| 366 |
+
"learning_rate": 0.00019822396265970494,
|
| 367 |
+
"loss": 0.0882,
|
| 368 |
+
"mean_token_accuracy": 0.9668483734130859,
|
| 369 |
+
"num_tokens": 848294.0,
|
| 370 |
+
"step": 450
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"epoch": 0.009097741386812231,
|
| 374 |
+
"learning_rate": 0.00019818440726237095,
|
| 375 |
+
"loss": 0.0923,
|
| 376 |
+
"mean_token_accuracy": 0.9715612173080445,
|
| 377 |
+
"num_tokens": 867635.0,
|
| 378 |
+
"step": 460
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
"epoch": 0.009295518373482062,
|
| 382 |
+
"learning_rate": 0.00019814485186503699,
|
| 383 |
+
"loss": 0.0665,
|
| 384 |
+
"mean_token_accuracy": 0.9765240132808686,
|
| 385 |
+
"num_tokens": 886596.0,
|
| 386 |
+
"step": 470
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"epoch": 0.009493295360151893,
|
| 390 |
+
"learning_rate": 0.00019810529646770302,
|
| 391 |
+
"loss": 0.08,
|
| 392 |
+
"mean_token_accuracy": 0.972789865732193,
|
| 393 |
+
"num_tokens": 905312.0,
|
| 394 |
+
"step": 480
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 0.009691072346821724,
|
| 398 |
+
"learning_rate": 0.00019806574107036906,
|
| 399 |
+
"loss": 0.0794,
|
| 400 |
+
"mean_token_accuracy": 0.9753748655319214,
|
| 401 |
+
"num_tokens": 924072.0,
|
| 402 |
+
"step": 490
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 0.009888849333491555,
|
| 406 |
+
"learning_rate": 0.0001980261856730351,
|
| 407 |
+
"loss": 0.0673,
|
| 408 |
+
"mean_token_accuracy": 0.9732649922370911,
|
| 409 |
+
"num_tokens": 942844.0,
|
| 410 |
+
"step": 500
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"epoch": 0.010086626320161387,
|
| 414 |
+
"learning_rate": 0.00019798663027570113,
|
| 415 |
+
"loss": 0.1029,
|
| 416 |
+
"mean_token_accuracy": 0.9693170130252838,
|
| 417 |
+
"num_tokens": 961524.0,
|
| 418 |
+
"step": 510
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"epoch": 0.010284403306831218,
|
| 422 |
+
"learning_rate": 0.00019794707487836717,
|
| 423 |
+
"loss": 0.0901,
|
| 424 |
+
"mean_token_accuracy": 0.9669535160064697,
|
| 425 |
+
"num_tokens": 980332.0,
|
| 426 |
+
"step": 520
|
| 427 |
+
},
|
| 428 |
+
{
|
| 429 |
+
"epoch": 0.010482180293501049,
|
| 430 |
+
"learning_rate": 0.00019790751948103318,
|
| 431 |
+
"loss": 0.0858,
|
| 432 |
+
"mean_token_accuracy": 0.9700904309749603,
|
| 433 |
+
"num_tokens": 999163.0,
|
| 434 |
+
"step": 530
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"epoch": 0.01067995728017088,
|
| 438 |
+
"learning_rate": 0.00019786796408369922,
|
| 439 |
+
"loss": 0.1001,
|
| 440 |
+
"mean_token_accuracy": 0.9630756258964539,
|
| 441 |
+
"num_tokens": 1018053.0,
|
| 442 |
+
"step": 540
|
| 443 |
+
},
|
| 444 |
+
{
|
| 445 |
+
"epoch": 0.010877734266840711,
|
| 446 |
+
"learning_rate": 0.00019782840868636528,
|
| 447 |
+
"loss": 0.0798,
|
| 448 |
+
"mean_token_accuracy": 0.9710527896881104,
|
| 449 |
+
"num_tokens": 1036828.0,
|
| 450 |
+
"step": 550
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"epoch": 0.011075511253510542,
|
| 454 |
+
"learning_rate": 0.0001977888532890313,
|
| 455 |
+
"loss": 0.0883,
|
| 456 |
+
"mean_token_accuracy": 0.9719638526439667,
|
| 457 |
+
"num_tokens": 1055843.0,
|
| 458 |
+
"step": 560
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 0.011273288240180373,
|
| 462 |
+
"learning_rate": 0.00019774929789169733,
|
| 463 |
+
"loss": 0.0817,
|
| 464 |
+
"mean_token_accuracy": 0.9706017553806305,
|
| 465 |
+
"num_tokens": 1074733.0,
|
| 466 |
+
"step": 570
|
| 467 |
+
},
|
| 468 |
+
{
|
| 469 |
+
"epoch": 0.011471065226850204,
|
| 470 |
+
"learning_rate": 0.00019770974249436337,
|
| 471 |
+
"loss": 0.0949,
|
| 472 |
+
"mean_token_accuracy": 0.9684431076049804,
|
| 473 |
+
"num_tokens": 1093519.0,
|
| 474 |
+
"step": 580
|
| 475 |
+
},
|
| 476 |
+
{
|
| 477 |
+
"epoch": 0.011668842213520035,
|
| 478 |
+
"learning_rate": 0.0001976701870970294,
|
| 479 |
+
"loss": 0.0797,
|
| 480 |
+
"mean_token_accuracy": 0.975245076417923,
|
| 481 |
+
"num_tokens": 1112394.0,
|
| 482 |
+
"step": 590
|
| 483 |
+
},
|
| 484 |
+
{
|
| 485 |
+
"epoch": 0.011866619200189867,
|
| 486 |
+
"learning_rate": 0.00019763063169969542,
|
| 487 |
+
"loss": 0.0814,
|
| 488 |
+
"mean_token_accuracy": 0.9766307890415191,
|
| 489 |
+
"num_tokens": 1131435.0,
|
| 490 |
+
"step": 600
|
| 491 |
+
},
|
| 492 |
+
{
|
| 493 |
+
"epoch": 0.012064396186859698,
|
| 494 |
+
"learning_rate": 0.00019759107630236146,
|
| 495 |
+
"loss": 0.0893,
|
| 496 |
+
"mean_token_accuracy": 0.9671396017074585,
|
| 497 |
+
"num_tokens": 1150274.0,
|
| 498 |
+
"step": 610
|
| 499 |
+
},
|
| 500 |
+
{
|
| 501 |
+
"epoch": 0.012262173173529529,
|
| 502 |
+
"learning_rate": 0.00019755152090502752,
|
| 503 |
+
"loss": 0.0965,
|
| 504 |
+
"mean_token_accuracy": 0.9663532435894012,
|
| 505 |
+
"num_tokens": 1169076.0,
|
| 506 |
+
"step": 620
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"epoch": 0.01245995016019936,
|
| 510 |
+
"learning_rate": 0.00019751196550769353,
|
| 511 |
+
"loss": 0.0692,
|
| 512 |
+
"mean_token_accuracy": 0.9754896223545074,
|
| 513 |
+
"num_tokens": 1187908.0,
|
| 514 |
+
"step": 630
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 0.012657727146869191,
|
| 518 |
+
"learning_rate": 0.00019747241011035957,
|
| 519 |
+
"loss": 0.0893,
|
| 520 |
+
"mean_token_accuracy": 0.9667335331439972,
|
| 521 |
+
"num_tokens": 1206872.0,
|
| 522 |
+
"step": 640
|
| 523 |
+
},
|
| 524 |
+
{
|
| 525 |
+
"epoch": 0.012855504133539022,
|
| 526 |
+
"learning_rate": 0.0001974328547130256,
|
| 527 |
+
"loss": 0.0762,
|
| 528 |
+
"mean_token_accuracy": 0.9724668145179749,
|
| 529 |
+
"num_tokens": 1225767.0,
|
| 530 |
+
"step": 650
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"epoch": 0.013053281120208853,
|
| 534 |
+
"learning_rate": 0.00019739329931569164,
|
| 535 |
+
"loss": 0.08,
|
| 536 |
+
"mean_token_accuracy": 0.9690144300460816,
|
| 537 |
+
"num_tokens": 1244340.0,
|
| 538 |
+
"step": 660
|
| 539 |
+
},
|
| 540 |
+
{
|
| 541 |
+
"epoch": 0.013251058106878684,
|
| 542 |
+
"learning_rate": 0.00019735374391835765,
|
| 543 |
+
"loss": 0.0872,
|
| 544 |
+
"mean_token_accuracy": 0.9725019693374634,
|
| 545 |
+
"num_tokens": 1263304.0,
|
| 546 |
+
"step": 670
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"epoch": 0.013448835093548515,
|
| 550 |
+
"learning_rate": 0.0001973141885210237,
|
| 551 |
+
"loss": 0.0894,
|
| 552 |
+
"mean_token_accuracy": 0.9694939434528351,
|
| 553 |
+
"num_tokens": 1282028.0,
|
| 554 |
+
"step": 680
|
| 555 |
+
},
|
| 556 |
+
{
|
| 557 |
+
"epoch": 0.013646612080218347,
|
| 558 |
+
"learning_rate": 0.00019727463312368976,
|
| 559 |
+
"loss": 0.096,
|
| 560 |
+
"mean_token_accuracy": 0.9666180431842804,
|
| 561 |
+
"num_tokens": 1300780.0,
|
| 562 |
+
"step": 690
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"epoch": 0.013844389066888178,
|
| 566 |
+
"learning_rate": 0.00019723507772635577,
|
| 567 |
+
"loss": 0.0804,
|
| 568 |
+
"mean_token_accuracy": 0.9751847207546234,
|
| 569 |
+
"num_tokens": 1319535.0,
|
| 570 |
+
"step": 700
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 0.014042166053558009,
|
| 574 |
+
"learning_rate": 0.0001971955223290218,
|
| 575 |
+
"loss": 0.1046,
|
| 576 |
+
"mean_token_accuracy": 0.9681148648262023,
|
| 577 |
+
"num_tokens": 1338391.0,
|
| 578 |
+
"step": 710
|
| 579 |
+
},
|
| 580 |
+
{
|
| 581 |
+
"epoch": 0.01423994304022784,
|
| 582 |
+
"learning_rate": 0.00019715596693168784,
|
| 583 |
+
"loss": 0.0804,
|
| 584 |
+
"mean_token_accuracy": 0.9726056635379792,
|
| 585 |
+
"num_tokens": 1357072.0,
|
| 586 |
+
"step": 720
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"epoch": 0.014437720026897671,
|
| 590 |
+
"learning_rate": 0.00019711641153435388,
|
| 591 |
+
"loss": 0.0664,
|
| 592 |
+
"mean_token_accuracy": 0.9787862658500671,
|
| 593 |
+
"num_tokens": 1375999.0,
|
| 594 |
+
"step": 730
|
| 595 |
+
},
|
| 596 |
+
{
|
| 597 |
+
"epoch": 0.014635497013567502,
|
| 598 |
+
"learning_rate": 0.00019707685613701992,
|
| 599 |
+
"loss": 0.0823,
|
| 600 |
+
"mean_token_accuracy": 0.9705499827861785,
|
| 601 |
+
"num_tokens": 1394654.0,
|
| 602 |
+
"step": 740
|
| 603 |
+
},
|
| 604 |
+
{
|
| 605 |
+
"epoch": 0.014833274000237333,
|
| 606 |
+
"learning_rate": 0.00019703730073968593,
|
| 607 |
+
"loss": 0.0828,
|
| 608 |
+
"mean_token_accuracy": 0.9703374147415161,
|
| 609 |
+
"num_tokens": 1413323.0,
|
| 610 |
+
"step": 750
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"epoch": 0.015031050986907164,
|
| 614 |
+
"learning_rate": 0.000196997745342352,
|
| 615 |
+
"loss": 0.0797,
|
| 616 |
+
"mean_token_accuracy": 0.9717476069927216,
|
| 617 |
+
"num_tokens": 1432274.0,
|
| 618 |
+
"step": 760
|
| 619 |
+
},
|
| 620 |
+
{
|
| 621 |
+
"epoch": 0.015228827973576995,
|
| 622 |
+
"learning_rate": 0.000196958189945018,
|
| 623 |
+
"loss": 0.0811,
|
| 624 |
+
"mean_token_accuracy": 0.9687287509441376,
|
| 625 |
+
"num_tokens": 1451009.0,
|
| 626 |
+
"step": 770
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 0.015426604960246827,
|
| 630 |
+
"learning_rate": 0.00019691863454768404,
|
| 631 |
+
"loss": 0.0799,
|
| 632 |
+
"mean_token_accuracy": 0.9712490200996399,
|
| 633 |
+
"num_tokens": 1469741.0,
|
| 634 |
+
"step": 780
|
| 635 |
+
},
|
| 636 |
+
{
|
| 637 |
+
"epoch": 0.015624381946916658,
|
| 638 |
+
"learning_rate": 0.00019687907915035008,
|
| 639 |
+
"loss": 0.0698,
|
| 640 |
+
"mean_token_accuracy": 0.9755463302135468,
|
| 641 |
+
"num_tokens": 1488644.0,
|
| 642 |
+
"step": 790
|
| 643 |
+
},
|
| 644 |
+
{
|
| 645 |
+
"epoch": 0.01582215893358649,
|
| 646 |
+
"learning_rate": 0.0001968395237530161,
|
| 647 |
+
"loss": 0.07,
|
| 648 |
+
"mean_token_accuracy": 0.9786295354366302,
|
| 649 |
+
"num_tokens": 1507384.0,
|
| 650 |
+
"step": 800
|
| 651 |
+
},
|
| 652 |
+
{
|
| 653 |
+
"epoch": 0.016019935920256318,
|
| 654 |
+
"learning_rate": 0.00019679996835568215,
|
| 655 |
+
"loss": 0.0842,
|
| 656 |
+
"mean_token_accuracy": 0.9716646075248718,
|
| 657 |
+
"num_tokens": 1526208.0,
|
| 658 |
+
"step": 810
|
| 659 |
+
},
|
| 660 |
+
{
|
| 661 |
+
"epoch": 0.01621771290692615,
|
| 662 |
+
"learning_rate": 0.00019676041295834816,
|
| 663 |
+
"loss": 0.0681,
|
| 664 |
+
"mean_token_accuracy": 0.9786826431751251,
|
| 665 |
+
"num_tokens": 1545086.0,
|
| 666 |
+
"step": 820
|
| 667 |
+
},
|
| 668 |
+
{
|
| 669 |
+
"epoch": 0.01641548989359598,
|
| 670 |
+
"learning_rate": 0.00019672085756101423,
|
| 671 |
+
"loss": 0.068,
|
| 672 |
+
"mean_token_accuracy": 0.9776684403419494,
|
| 673 |
+
"num_tokens": 1563972.0,
|
| 674 |
+
"step": 830
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"epoch": 0.016613266880265813,
|
| 678 |
+
"learning_rate": 0.00019668130216368024,
|
| 679 |
+
"loss": 0.0855,
|
| 680 |
+
"mean_token_accuracy": 0.9758803486824036,
|
| 681 |
+
"num_tokens": 1582893.0,
|
| 682 |
+
"step": 840
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"epoch": 0.016811043866935643,
|
| 686 |
+
"learning_rate": 0.00019664174676634627,
|
| 687 |
+
"loss": 0.0969,
|
| 688 |
+
"mean_token_accuracy": 0.9676210284233093,
|
| 689 |
+
"num_tokens": 1601719.0,
|
| 690 |
+
"step": 850
|
| 691 |
+
},
|
| 692 |
+
{
|
| 693 |
+
"epoch": 0.017008820853605475,
|
| 694 |
+
"learning_rate": 0.0001966021913690123,
|
| 695 |
+
"loss": 0.0836,
|
| 696 |
+
"mean_token_accuracy": 0.973316353559494,
|
| 697 |
+
"num_tokens": 1620493.0,
|
| 698 |
+
"step": 860
|
| 699 |
+
},
|
| 700 |
+
{
|
| 701 |
+
"epoch": 0.017206597840275305,
|
| 702 |
+
"learning_rate": 0.00019656263597167835,
|
| 703 |
+
"loss": 0.0707,
|
| 704 |
+
"mean_token_accuracy": 0.9776813209056854,
|
| 705 |
+
"num_tokens": 1639336.0,
|
| 706 |
+
"step": 870
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"epoch": 0.017404374826945138,
|
| 710 |
+
"learning_rate": 0.00019652308057434439,
|
| 711 |
+
"loss": 0.0921,
|
| 712 |
+
"mean_token_accuracy": 0.9702894032001496,
|
| 713 |
+
"num_tokens": 1658213.0,
|
| 714 |
+
"step": 880
|
| 715 |
+
},
|
| 716 |
+
{
|
| 717 |
+
"epoch": 0.017602151813614967,
|
| 718 |
+
"learning_rate": 0.0001964835251770104,
|
| 719 |
+
"loss": 0.0629,
|
| 720 |
+
"mean_token_accuracy": 0.9804218530654907,
|
| 721 |
+
"num_tokens": 1677260.0,
|
| 722 |
+
"step": 890
|
| 723 |
+
},
|
| 724 |
+
{
|
| 725 |
+
"epoch": 0.0177999288002848,
|
| 726 |
+
"learning_rate": 0.00019644396977967646,
|
| 727 |
+
"loss": 0.0946,
|
| 728 |
+
"mean_token_accuracy": 0.9700180232524872,
|
| 729 |
+
"num_tokens": 1696174.0,
|
| 730 |
+
"step": 900
|
| 731 |
+
},
|
| 732 |
+
{
|
| 733 |
+
"epoch": 0.01799770578695463,
|
| 734 |
+
"learning_rate": 0.00019640441438234247,
|
| 735 |
+
"loss": 0.0822,
|
| 736 |
+
"mean_token_accuracy": 0.9717112898826599,
|
| 737 |
+
"num_tokens": 1714822.0,
|
| 738 |
+
"step": 910
|
| 739 |
+
},
|
| 740 |
+
{
|
| 741 |
+
"epoch": 0.018195482773624462,
|
| 742 |
+
"learning_rate": 0.0001963648589850085,
|
| 743 |
+
"loss": 0.0732,
|
| 744 |
+
"mean_token_accuracy": 0.9710357069969178,
|
| 745 |
+
"num_tokens": 1733526.0,
|
| 746 |
+
"step": 920
|
| 747 |
+
},
|
| 748 |
+
{
|
| 749 |
+
"epoch": 0.01839325976029429,
|
| 750 |
+
"learning_rate": 0.00019632530358767455,
|
| 751 |
+
"loss": 0.0697,
|
| 752 |
+
"mean_token_accuracy": 0.9731245815753937,
|
| 753 |
+
"num_tokens": 1752469.0,
|
| 754 |
+
"step": 930
|
| 755 |
+
},
|
| 756 |
+
{
|
| 757 |
+
"epoch": 0.018591036746964124,
|
| 758 |
+
"learning_rate": 0.00019628574819034058,
|
| 759 |
+
"loss": 0.065,
|
| 760 |
+
"mean_token_accuracy": 0.9799518942832947,
|
| 761 |
+
"num_tokens": 1771439.0,
|
| 762 |
+
"step": 940
|
| 763 |
+
},
|
| 764 |
+
{
|
| 765 |
+
"epoch": 0.018788813733633954,
|
| 766 |
+
"learning_rate": 0.00019624619279300662,
|
| 767 |
+
"loss": 0.0769,
|
| 768 |
+
"mean_token_accuracy": 0.9691688776016235,
|
| 769 |
+
"num_tokens": 1790105.0,
|
| 770 |
+
"step": 950
|
| 771 |
+
},
|
| 772 |
+
{
|
| 773 |
+
"epoch": 0.018986590720303786,
|
| 774 |
+
"learning_rate": 0.00019620663739567263,
|
| 775 |
+
"loss": 0.0675,
|
| 776 |
+
"mean_token_accuracy": 0.9775746822357178,
|
| 777 |
+
"num_tokens": 1809000.0,
|
| 778 |
+
"step": 960
|
| 779 |
+
},
|
| 780 |
+
{
|
| 781 |
+
"epoch": 0.019184367706973616,
|
| 782 |
+
"learning_rate": 0.0001961670819983387,
|
| 783 |
+
"loss": 0.0761,
|
| 784 |
+
"mean_token_accuracy": 0.9722946584224701,
|
| 785 |
+
"num_tokens": 1827743.0,
|
| 786 |
+
"step": 970
|
| 787 |
+
},
|
| 788 |
+
{
|
| 789 |
+
"epoch": 0.01938214469364345,
|
| 790 |
+
"learning_rate": 0.0001961275266010047,
|
| 791 |
+
"loss": 0.0827,
|
| 792 |
+
"mean_token_accuracy": 0.9710648536682129,
|
| 793 |
+
"num_tokens": 1846483.0,
|
| 794 |
+
"step": 980
|
| 795 |
+
},
|
| 796 |
+
{
|
| 797 |
+
"epoch": 0.019579921680313278,
|
| 798 |
+
"learning_rate": 0.00019608797120367074,
|
| 799 |
+
"loss": 0.0548,
|
| 800 |
+
"mean_token_accuracy": 0.9855950713157654,
|
| 801 |
+
"num_tokens": 1865230.0,
|
| 802 |
+
"step": 990
|
| 803 |
+
},
|
| 804 |
+
{
|
| 805 |
+
"epoch": 0.01977769866698311,
|
| 806 |
+
"learning_rate": 0.00019604841580633678,
|
| 807 |
+
"loss": 0.0934,
|
| 808 |
+
"mean_token_accuracy": 0.9695265829563141,
|
| 809 |
+
"num_tokens": 1884295.0,
|
| 810 |
+
"step": 1000
|
| 811 |
+
},
|
| 812 |
+
{
|
| 813 |
+
"epoch": 0.01997547565365294,
|
| 814 |
+
"learning_rate": 0.00019600886040900282,
|
| 815 |
+
"loss": 0.0806,
|
| 816 |
+
"mean_token_accuracy": 0.9750065445899964,
|
| 817 |
+
"num_tokens": 1903297.0,
|
| 818 |
+
"step": 1010
|
| 819 |
+
},
|
| 820 |
+
{
|
| 821 |
+
"epoch": 0.020173252640322773,
|
| 822 |
+
"learning_rate": 0.00019596930501166886,
|
| 823 |
+
"loss": 0.0683,
|
| 824 |
+
"mean_token_accuracy": 0.9779970288276673,
|
| 825 |
+
"num_tokens": 1922020.0,
|
| 826 |
+
"step": 1020
|
| 827 |
+
},
|
| 828 |
+
{
|
| 829 |
+
"epoch": 0.020371029626992603,
|
| 830 |
+
"learning_rate": 0.00019592974961433487,
|
| 831 |
+
"loss": 0.0636,
|
| 832 |
+
"mean_token_accuracy": 0.9801307320594788,
|
| 833 |
+
"num_tokens": 1940771.0,
|
| 834 |
+
"step": 1030
|
| 835 |
+
},
|
| 836 |
+
{
|
| 837 |
+
"epoch": 0.020568806613662435,
|
| 838 |
+
"learning_rate": 0.00019589019421700093,
|
| 839 |
+
"loss": 0.078,
|
| 840 |
+
"mean_token_accuracy": 0.9704040169715882,
|
| 841 |
+
"num_tokens": 1959790.0,
|
| 842 |
+
"step": 1040
|
| 843 |
+
},
|
| 844 |
+
{
|
| 845 |
+
"epoch": 0.020766583600332265,
|
| 846 |
+
"learning_rate": 0.00019585063881966694,
|
| 847 |
+
"loss": 0.0641,
|
| 848 |
+
"mean_token_accuracy": 0.978104192018509,
|
| 849 |
+
"num_tokens": 1978766.0,
|
| 850 |
+
"step": 1050
|
| 851 |
+
},
|
| 852 |
+
{
|
| 853 |
+
"epoch": 0.020964360587002098,
|
| 854 |
+
"learning_rate": 0.00019581108342233298,
|
| 855 |
+
"loss": 0.0931,
|
| 856 |
+
"mean_token_accuracy": 0.967569786310196,
|
| 857 |
+
"num_tokens": 1997785.0,
|
| 858 |
+
"step": 1060
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"epoch": 0.021162137573671927,
|
| 862 |
+
"learning_rate": 0.00019577152802499902,
|
| 863 |
+
"loss": 0.0877,
|
| 864 |
+
"mean_token_accuracy": 0.9735791981220245,
|
| 865 |
+
"num_tokens": 2016334.0,
|
| 866 |
+
"step": 1070
|
| 867 |
+
},
|
| 868 |
+
{
|
| 869 |
+
"epoch": 0.02135991456034176,
|
| 870 |
+
"learning_rate": 0.00019573197262766505,
|
| 871 |
+
"loss": 0.0833,
|
| 872 |
+
"mean_token_accuracy": 0.9717927515506745,
|
| 873 |
+
"num_tokens": 2035271.0,
|
| 874 |
+
"step": 1080
|
| 875 |
+
},
|
| 876 |
+
{
|
| 877 |
+
"epoch": 0.02155769154701159,
|
| 878 |
+
"learning_rate": 0.0001956924172303311,
|
| 879 |
+
"loss": 0.0732,
|
| 880 |
+
"mean_token_accuracy": 0.97304065823555,
|
| 881 |
+
"num_tokens": 2054158.0,
|
| 882 |
+
"step": 1090
|
| 883 |
+
},
|
| 884 |
+
{
|
| 885 |
+
"epoch": 0.021755468533681422,
|
| 886 |
+
"learning_rate": 0.0001956528618329971,
|
| 887 |
+
"loss": 0.0782,
|
| 888 |
+
"mean_token_accuracy": 0.971097469329834,
|
| 889 |
+
"num_tokens": 2072794.0,
|
| 890 |
+
"step": 1100
|
| 891 |
+
},
|
| 892 |
+
{
|
| 893 |
+
"epoch": 0.02195324552035125,
|
| 894 |
+
"learning_rate": 0.00019561330643566317,
|
| 895 |
+
"loss": 0.0745,
|
| 896 |
+
"mean_token_accuracy": 0.9693615853786468,
|
| 897 |
+
"num_tokens": 2091583.0,
|
| 898 |
+
"step": 1110
|
| 899 |
+
},
|
| 900 |
+
{
|
| 901 |
+
"epoch": 0.022151022507021084,
|
| 902 |
+
"learning_rate": 0.0001955737510383292,
|
| 903 |
+
"loss": 0.0575,
|
| 904 |
+
"mean_token_accuracy": 0.9811938166618347,
|
| 905 |
+
"num_tokens": 2110207.0,
|
| 906 |
+
"step": 1120
|
| 907 |
+
},
|
| 908 |
+
{
|
| 909 |
+
"epoch": 0.022348799493690914,
|
| 910 |
+
"learning_rate": 0.00019553419564099521,
|
| 911 |
+
"loss": 0.0838,
|
| 912 |
+
"mean_token_accuracy": 0.9719610214233398,
|
| 913 |
+
"num_tokens": 2129006.0,
|
| 914 |
+
"step": 1130
|
| 915 |
+
},
|
| 916 |
+
{
|
| 917 |
+
"epoch": 0.022546576480360746,
|
| 918 |
+
"learning_rate": 0.00019549464024366125,
|
| 919 |
+
"loss": 0.0704,
|
| 920 |
+
"mean_token_accuracy": 0.9780212998390198,
|
| 921 |
+
"num_tokens": 2147866.0,
|
| 922 |
+
"step": 1140
|
| 923 |
+
},
|
| 924 |
+
{
|
| 925 |
+
"epoch": 0.022744353467030576,
|
| 926 |
+
"learning_rate": 0.0001954550848463273,
|
| 927 |
+
"loss": 0.0626,
|
| 928 |
+
"mean_token_accuracy": 0.9765089929103852,
|
| 929 |
+
"num_tokens": 2166678.0,
|
| 930 |
+
"step": 1150
|
| 931 |
+
},
|
| 932 |
+
{
|
| 933 |
+
"epoch": 0.02294213045370041,
|
| 934 |
+
"learning_rate": 0.00019541552944899333,
|
| 935 |
+
"loss": 0.071,
|
| 936 |
+
"mean_token_accuracy": 0.9786867916584014,
|
| 937 |
+
"num_tokens": 2185491.0,
|
| 938 |
+
"step": 1160
|
| 939 |
+
},
|
| 940 |
+
{
|
| 941 |
+
"epoch": 0.023139907440370238,
|
| 942 |
+
"learning_rate": 0.00019537597405165934,
|
| 943 |
+
"loss": 0.0855,
|
| 944 |
+
"mean_token_accuracy": 0.9737950384616851,
|
| 945 |
+
"num_tokens": 2204270.0,
|
| 946 |
+
"step": 1170
|
| 947 |
+
},
|
| 948 |
+
{
|
| 949 |
+
"epoch": 0.02333768442704007,
|
| 950 |
+
"learning_rate": 0.0001953364186543254,
|
| 951 |
+
"loss": 0.0817,
|
| 952 |
+
"mean_token_accuracy": 0.9746371984481812,
|
| 953 |
+
"num_tokens": 2223262.0,
|
| 954 |
+
"step": 1180
|
| 955 |
+
},
|
| 956 |
+
{
|
| 957 |
+
"epoch": 0.0235354614137099,
|
| 958 |
+
"learning_rate": 0.00019529686325699144,
|
| 959 |
+
"loss": 0.069,
|
| 960 |
+
"mean_token_accuracy": 0.9796195566654206,
|
| 961 |
+
"num_tokens": 2242082.0,
|
| 962 |
+
"step": 1190
|
| 963 |
+
},
|
| 964 |
+
{
|
| 965 |
+
"epoch": 0.023733238400379733,
|
| 966 |
+
"learning_rate": 0.00019525730785965745,
|
| 967 |
+
"loss": 0.0976,
|
| 968 |
+
"mean_token_accuracy": 0.9742420554161072,
|
| 969 |
+
"num_tokens": 2260874.0,
|
| 970 |
+
"step": 1200
|
| 971 |
+
},
|
| 972 |
+
{
|
| 973 |
+
"epoch": 0.023931015387049562,
|
| 974 |
+
"learning_rate": 0.00019521775246232349,
|
| 975 |
+
"loss": 0.0816,
|
| 976 |
+
"mean_token_accuracy": 0.972790652513504,
|
| 977 |
+
"num_tokens": 2279620.0,
|
| 978 |
+
"step": 1210
|
| 979 |
+
},
|
| 980 |
+
{
|
| 981 |
+
"epoch": 0.024128792373719395,
|
| 982 |
+
"learning_rate": 0.00019517819706498952,
|
| 983 |
+
"loss": 0.0602,
|
| 984 |
+
"mean_token_accuracy": 0.9790996849536896,
|
| 985 |
+
"num_tokens": 2298359.0,
|
| 986 |
+
"step": 1220
|
| 987 |
+
},
|
| 988 |
+
{
|
| 989 |
+
"epoch": 0.024326569360389225,
|
| 990 |
+
"learning_rate": 0.00019513864166765556,
|
| 991 |
+
"loss": 0.0773,
|
| 992 |
+
"mean_token_accuracy": 0.9750834167003631,
|
| 993 |
+
"num_tokens": 2317171.0,
|
| 994 |
+
"step": 1230
|
| 995 |
+
},
|
| 996 |
+
{
|
| 997 |
+
"epoch": 0.024524346347059058,
|
| 998 |
+
"learning_rate": 0.00019509908627032157,
|
| 999 |
+
"loss": 0.0863,
|
| 1000 |
+
"mean_token_accuracy": 0.9679585933685303,
|
| 1001 |
+
"num_tokens": 2335825.0,
|
| 1002 |
+
"step": 1240
|
| 1003 |
+
},
|
| 1004 |
+
{
|
| 1005 |
+
"epoch": 0.024722123333728887,
|
| 1006 |
+
"learning_rate": 0.00019505953087298764,
|
| 1007 |
+
"loss": 0.0712,
|
| 1008 |
+
"mean_token_accuracy": 0.9711700201034545,
|
| 1009 |
+
"num_tokens": 2354806.0,
|
| 1010 |
+
"step": 1250
|
| 1011 |
+
},
|
| 1012 |
+
{
|
| 1013 |
+
"epoch": 0.02491990032039872,
|
| 1014 |
+
"learning_rate": 0.00019501997547565367,
|
| 1015 |
+
"loss": 0.0905,
|
| 1016 |
+
"mean_token_accuracy": 0.9734555304050445,
|
| 1017 |
+
"num_tokens": 2373919.0,
|
| 1018 |
+
"step": 1260
|
| 1019 |
+
},
|
| 1020 |
+
{
|
| 1021 |
+
"epoch": 0.02511767730706855,
|
| 1022 |
+
"learning_rate": 0.00019498042007831968,
|
| 1023 |
+
"loss": 0.0582,
|
| 1024 |
+
"mean_token_accuracy": 0.9755668938159943,
|
| 1025 |
+
"num_tokens": 2393043.0,
|
| 1026 |
+
"step": 1270
|
| 1027 |
+
},
|
| 1028 |
+
{
|
| 1029 |
+
"epoch": 0.025315454293738382,
|
| 1030 |
+
"learning_rate": 0.00019494086468098575,
|
| 1031 |
+
"loss": 0.0627,
|
| 1032 |
+
"mean_token_accuracy": 0.9795541882514953,
|
| 1033 |
+
"num_tokens": 2411888.0,
|
| 1034 |
+
"step": 1280
|
| 1035 |
+
},
|
| 1036 |
+
{
|
| 1037 |
+
"epoch": 0.02551323128040821,
|
| 1038 |
+
"learning_rate": 0.00019490130928365176,
|
| 1039 |
+
"loss": 0.0729,
|
| 1040 |
+
"mean_token_accuracy": 0.976466304063797,
|
| 1041 |
+
"num_tokens": 2430790.0,
|
| 1042 |
+
"step": 1290
|
| 1043 |
+
},
|
| 1044 |
+
{
|
| 1045 |
+
"epoch": 0.025711008267078044,
|
| 1046 |
+
"learning_rate": 0.0001948617538863178,
|
| 1047 |
+
"loss": 0.0676,
|
| 1048 |
+
"mean_token_accuracy": 0.9730806350708008,
|
| 1049 |
+
"num_tokens": 2449534.0,
|
| 1050 |
+
"step": 1300
|
| 1051 |
+
},
|
| 1052 |
+
{
|
| 1053 |
+
"epoch": 0.025908785253747874,
|
| 1054 |
+
"learning_rate": 0.0001948221984889838,
|
| 1055 |
+
"loss": 0.0849,
|
| 1056 |
+
"mean_token_accuracy": 0.9746219754219055,
|
| 1057 |
+
"num_tokens": 2468500.0,
|
| 1058 |
+
"step": 1310
|
| 1059 |
+
},
|
| 1060 |
+
{
|
| 1061 |
+
"epoch": 0.026106562240417706,
|
| 1062 |
+
"learning_rate": 0.00019478264309164987,
|
| 1063 |
+
"loss": 0.082,
|
| 1064 |
+
"mean_token_accuracy": 0.9718350946903229,
|
| 1065 |
+
"num_tokens": 2487522.0,
|
| 1066 |
+
"step": 1320
|
| 1067 |
+
},
|
| 1068 |
+
{
|
| 1069 |
+
"epoch": 0.026304339227087536,
|
| 1070 |
+
"learning_rate": 0.0001947430876943159,
|
| 1071 |
+
"loss": 0.0814,
|
| 1072 |
+
"mean_token_accuracy": 0.9727078378200531,
|
| 1073 |
+
"num_tokens": 2506228.0,
|
| 1074 |
+
"step": 1330
|
| 1075 |
+
},
|
| 1076 |
+
{
|
| 1077 |
+
"epoch": 0.02650211621375737,
|
| 1078 |
+
"learning_rate": 0.00019470353229698192,
|
| 1079 |
+
"loss": 0.0908,
|
| 1080 |
+
"mean_token_accuracy": 0.972163724899292,
|
| 1081 |
+
"num_tokens": 2524993.0,
|
| 1082 |
+
"step": 1340
|
| 1083 |
+
},
|
| 1084 |
+
{
|
| 1085 |
+
"epoch": 0.026699893200427198,
|
| 1086 |
+
"learning_rate": 0.00019466397689964798,
|
| 1087 |
+
"loss": 0.0967,
|
| 1088 |
+
"mean_token_accuracy": 0.9677857100963593,
|
| 1089 |
+
"num_tokens": 2543814.0,
|
| 1090 |
+
"step": 1350
|
| 1091 |
+
},
|
| 1092 |
+
{
|
| 1093 |
+
"epoch": 0.02689767018709703,
|
| 1094 |
+
"learning_rate": 0.000194624421502314,
|
| 1095 |
+
"loss": 0.0965,
|
| 1096 |
+
"mean_token_accuracy": 0.9627455770969391,
|
| 1097 |
+
"num_tokens": 2562624.0,
|
| 1098 |
+
"step": 1360
|
| 1099 |
+
},
|
| 1100 |
+
{
|
| 1101 |
+
"epoch": 0.02709544717376686,
|
| 1102 |
+
"learning_rate": 0.00019458486610498003,
|
| 1103 |
+
"loss": 0.0866,
|
| 1104 |
+
"mean_token_accuracy": 0.9717130541801453,
|
| 1105 |
+
"num_tokens": 2581229.0,
|
| 1106 |
+
"step": 1370
|
| 1107 |
+
},
|
| 1108 |
+
{
|
| 1109 |
+
"epoch": 0.027293224160436693,
|
| 1110 |
+
"learning_rate": 0.00019454531070764607,
|
| 1111 |
+
"loss": 0.0864,
|
| 1112 |
+
"mean_token_accuracy": 0.9688866436481476,
|
| 1113 |
+
"num_tokens": 2600073.0,
|
| 1114 |
+
"step": 1380
|
| 1115 |
+
},
|
| 1116 |
+
{
|
| 1117 |
+
"epoch": 0.027491001147106522,
|
| 1118 |
+
"learning_rate": 0.0001945057553103121,
|
| 1119 |
+
"loss": 0.0743,
|
| 1120 |
+
"mean_token_accuracy": 0.9723983883857727,
|
| 1121 |
+
"num_tokens": 2618837.0,
|
| 1122 |
+
"step": 1390
|
| 1123 |
+
},
|
| 1124 |
+
{
|
| 1125 |
+
"epoch": 0.027688778133776355,
|
| 1126 |
+
"learning_rate": 0.00019446619991297814,
|
| 1127 |
+
"loss": 0.0719,
|
| 1128 |
+
"mean_token_accuracy": 0.9758836448192596,
|
| 1129 |
+
"num_tokens": 2637487.0,
|
| 1130 |
+
"step": 1400
|
| 1131 |
+
},
|
| 1132 |
+
{
|
| 1133 |
+
"epoch": 0.027886555120446185,
|
| 1134 |
+
"learning_rate": 0.00019442664451564415,
|
| 1135 |
+
"loss": 0.0925,
|
| 1136 |
+
"mean_token_accuracy": 0.969123649597168,
|
| 1137 |
+
"num_tokens": 2656004.0,
|
| 1138 |
+
"step": 1410
|
| 1139 |
+
},
|
| 1140 |
+
{
|
| 1141 |
+
"epoch": 0.028084332107116017,
|
| 1142 |
+
"learning_rate": 0.00019438708911831022,
|
| 1143 |
+
"loss": 0.0566,
|
| 1144 |
+
"mean_token_accuracy": 0.9795430541038513,
|
| 1145 |
+
"num_tokens": 2675089.0,
|
| 1146 |
+
"step": 1420
|
| 1147 |
+
},
|
| 1148 |
+
{
|
| 1149 |
+
"epoch": 0.028282109093785847,
|
| 1150 |
+
"learning_rate": 0.00019434753372097623,
|
| 1151 |
+
"loss": 0.0618,
|
| 1152 |
+
"mean_token_accuracy": 0.9796153604984283,
|
| 1153 |
+
"num_tokens": 2693904.0,
|
| 1154 |
+
"step": 1430
|
| 1155 |
+
},
|
| 1156 |
+
{
|
| 1157 |
+
"epoch": 0.02847988608045568,
|
| 1158 |
+
"learning_rate": 0.00019430797832364227,
|
| 1159 |
+
"loss": 0.0808,
|
| 1160 |
+
"mean_token_accuracy": 0.9735188663005829,
|
| 1161 |
+
"num_tokens": 2712395.0,
|
| 1162 |
+
"step": 1440
|
| 1163 |
+
},
|
| 1164 |
+
{
|
| 1165 |
+
"epoch": 0.02867766306712551,
|
| 1166 |
+
"learning_rate": 0.0001942684229263083,
|
| 1167 |
+
"loss": 0.0657,
|
| 1168 |
+
"mean_token_accuracy": 0.9771307945251465,
|
| 1169 |
+
"num_tokens": 2731103.0,
|
| 1170 |
+
"step": 1450
|
| 1171 |
+
},
|
| 1172 |
+
{
|
| 1173 |
+
"epoch": 0.028875440053795342,
|
| 1174 |
+
"learning_rate": 0.00019422886752897434,
|
| 1175 |
+
"loss": 0.083,
|
| 1176 |
+
"mean_token_accuracy": 0.9664826571941376,
|
| 1177 |
+
"num_tokens": 2749801.0,
|
| 1178 |
+
"step": 1460
|
| 1179 |
+
},
|
| 1180 |
+
{
|
| 1181 |
+
"epoch": 0.02907321704046517,
|
| 1182 |
+
"learning_rate": 0.00019418931213164038,
|
| 1183 |
+
"loss": 0.0721,
|
| 1184 |
+
"mean_token_accuracy": 0.9747998178005218,
|
| 1185 |
+
"num_tokens": 2768551.0,
|
| 1186 |
+
"step": 1470
|
| 1187 |
+
},
|
| 1188 |
+
{
|
| 1189 |
+
"epoch": 0.029270994027135004,
|
| 1190 |
+
"learning_rate": 0.0001941497567343064,
|
| 1191 |
+
"loss": 0.0499,
|
| 1192 |
+
"mean_token_accuracy": 0.9845096707344055,
|
| 1193 |
+
"num_tokens": 2787591.0,
|
| 1194 |
+
"step": 1480
|
| 1195 |
+
},
|
| 1196 |
+
{
|
| 1197 |
+
"epoch": 0.029468771013804834,
|
| 1198 |
+
"learning_rate": 0.00019411020133697245,
|
| 1199 |
+
"loss": 0.0665,
|
| 1200 |
+
"mean_token_accuracy": 0.9782804071903228,
|
| 1201 |
+
"num_tokens": 2806455.0,
|
| 1202 |
+
"step": 1490
|
| 1203 |
+
},
|
| 1204 |
+
{
|
| 1205 |
+
"epoch": 0.029666548000474666,
|
| 1206 |
+
"learning_rate": 0.00019407064593963846,
|
| 1207 |
+
"loss": 0.0633,
|
| 1208 |
+
"mean_token_accuracy": 0.9759399771690369,
|
| 1209 |
+
"num_tokens": 2825280.0,
|
| 1210 |
+
"step": 1500
|
| 1211 |
+
},
|
| 1212 |
+
{
|
| 1213 |
+
"epoch": 0.029864324987144496,
|
| 1214 |
+
"learning_rate": 0.0001940310905423045,
|
| 1215 |
+
"loss": 0.0641,
|
| 1216 |
+
"mean_token_accuracy": 0.980692720413208,
|
| 1217 |
+
"num_tokens": 2844178.0,
|
| 1218 |
+
"step": 1510
|
| 1219 |
+
},
|
| 1220 |
+
{
|
| 1221 |
+
"epoch": 0.03006210197381433,
|
| 1222 |
+
"learning_rate": 0.00019399153514497054,
|
| 1223 |
+
"loss": 0.0568,
|
| 1224 |
+
"mean_token_accuracy": 0.9760835587978363,
|
| 1225 |
+
"num_tokens": 2863373.0,
|
| 1226 |
+
"step": 1520
|
| 1227 |
+
},
|
| 1228 |
+
{
|
| 1229 |
+
"epoch": 0.030259878960484158,
|
| 1230 |
+
"learning_rate": 0.00019395197974763658,
|
| 1231 |
+
"loss": 0.0611,
|
| 1232 |
+
"mean_token_accuracy": 0.9767693936824798,
|
| 1233 |
+
"num_tokens": 2882166.0,
|
| 1234 |
+
"step": 1530
|
| 1235 |
+
},
|
| 1236 |
+
{
|
| 1237 |
+
"epoch": 0.03045765594715399,
|
| 1238 |
+
"learning_rate": 0.00019391242435030261,
|
| 1239 |
+
"loss": 0.0557,
|
| 1240 |
+
"mean_token_accuracy": 0.9800930917263031,
|
| 1241 |
+
"num_tokens": 2901050.0,
|
| 1242 |
+
"step": 1540
|
| 1243 |
+
},
|
| 1244 |
+
{
|
| 1245 |
+
"epoch": 0.03065543293382382,
|
| 1246 |
+
"learning_rate": 0.00019387286895296862,
|
| 1247 |
+
"loss": 0.0739,
|
| 1248 |
+
"mean_token_accuracy": 0.9733968496322631,
|
| 1249 |
+
"num_tokens": 2919953.0,
|
| 1250 |
+
"step": 1550
|
| 1251 |
+
},
|
| 1252 |
+
{
|
| 1253 |
+
"epoch": 0.030853209920493653,
|
| 1254 |
+
"learning_rate": 0.0001938333135556347,
|
| 1255 |
+
"loss": 0.0706,
|
| 1256 |
+
"mean_token_accuracy": 0.9725773215293885,
|
| 1257 |
+
"num_tokens": 2939003.0,
|
| 1258 |
+
"step": 1560
|
| 1259 |
+
},
|
| 1260 |
+
{
|
| 1261 |
+
"epoch": 0.031050986907163482,
|
| 1262 |
+
"learning_rate": 0.00019379375815830073,
|
| 1263 |
+
"loss": 0.0655,
|
| 1264 |
+
"mean_token_accuracy": 0.9753368616104126,
|
| 1265 |
+
"num_tokens": 2957853.0,
|
| 1266 |
+
"step": 1570
|
| 1267 |
+
},
|
| 1268 |
+
{
|
| 1269 |
+
"epoch": 0.031248763893833315,
|
| 1270 |
+
"learning_rate": 0.00019375420276096674,
|
| 1271 |
+
"loss": 0.0667,
|
| 1272 |
+
"mean_token_accuracy": 0.9791980743408203,
|
| 1273 |
+
"num_tokens": 2976557.0,
|
| 1274 |
+
"step": 1580
|
| 1275 |
+
},
|
| 1276 |
+
{
|
| 1277 |
+
"epoch": 0.031446540880503145,
|
| 1278 |
+
"learning_rate": 0.00019371464736363277,
|
| 1279 |
+
"loss": 0.0748,
|
| 1280 |
+
"mean_token_accuracy": 0.9767062723636627,
|
| 1281 |
+
"num_tokens": 2995058.0,
|
| 1282 |
+
"step": 1590
|
| 1283 |
+
},
|
| 1284 |
+
{
|
| 1285 |
+
"epoch": 0.03164431786717298,
|
| 1286 |
+
"learning_rate": 0.0001936750919662988,
|
| 1287 |
+
"loss": 0.0628,
|
| 1288 |
+
"mean_token_accuracy": 0.9783797085285186,
|
| 1289 |
+
"num_tokens": 3014165.0,
|
| 1290 |
+
"step": 1600
|
| 1291 |
+
},
|
| 1292 |
+
{
|
| 1293 |
+
"epoch": 0.03184209485384281,
|
| 1294 |
+
"learning_rate": 0.00019363553656896485,
|
| 1295 |
+
"loss": 0.0564,
|
| 1296 |
+
"mean_token_accuracy": 0.9830330014228821,
|
| 1297 |
+
"num_tokens": 3032924.0,
|
| 1298 |
+
"step": 1610
|
| 1299 |
+
},
|
| 1300 |
+
{
|
| 1301 |
+
"epoch": 0.032039871840512636,
|
| 1302 |
+
"learning_rate": 0.00019359598117163086,
|
| 1303 |
+
"loss": 0.0783,
|
| 1304 |
+
"mean_token_accuracy": 0.9734647035598755,
|
| 1305 |
+
"num_tokens": 3051960.0,
|
| 1306 |
+
"step": 1620
|
| 1307 |
+
},
|
| 1308 |
+
{
|
| 1309 |
+
"epoch": 0.03223764882718247,
|
| 1310 |
+
"learning_rate": 0.00019355642577429692,
|
| 1311 |
+
"loss": 0.0827,
|
| 1312 |
+
"mean_token_accuracy": 0.9718622207641602,
|
| 1313 |
+
"num_tokens": 3070644.0,
|
| 1314 |
+
"step": 1630
|
| 1315 |
+
},
|
| 1316 |
+
{
|
| 1317 |
+
"epoch": 0.0324354258138523,
|
| 1318 |
+
"learning_rate": 0.00019351687037696296,
|
| 1319 |
+
"loss": 0.071,
|
| 1320 |
+
"mean_token_accuracy": 0.980438482761383,
|
| 1321 |
+
"num_tokens": 3089393.0,
|
| 1322 |
+
"step": 1640
|
| 1323 |
+
},
|
| 1324 |
+
{
|
| 1325 |
+
"epoch": 0.03263320280052213,
|
| 1326 |
+
"learning_rate": 0.00019347731497962897,
|
| 1327 |
+
"loss": 0.0656,
|
| 1328 |
+
"mean_token_accuracy": 0.9727302372455597,
|
| 1329 |
+
"num_tokens": 3108260.0,
|
| 1330 |
+
"step": 1650
|
| 1331 |
+
},
|
| 1332 |
+
{
|
| 1333 |
+
"epoch": 0.03283097978719196,
|
| 1334 |
+
"learning_rate": 0.000193437759582295,
|
| 1335 |
+
"loss": 0.0847,
|
| 1336 |
+
"mean_token_accuracy": 0.9706348955631257,
|
| 1337 |
+
"num_tokens": 3127157.0,
|
| 1338 |
+
"step": 1660
|
| 1339 |
+
},
|
| 1340 |
+
{
|
| 1341 |
+
"epoch": 0.033028756773861793,
|
| 1342 |
+
"learning_rate": 0.00019339820418496105,
|
| 1343 |
+
"loss": 0.0721,
|
| 1344 |
+
"mean_token_accuracy": 0.9732413589954376,
|
| 1345 |
+
"num_tokens": 3146126.0,
|
| 1346 |
+
"step": 1670
|
| 1347 |
+
},
|
| 1348 |
+
{
|
| 1349 |
+
"epoch": 0.033226533760531626,
|
| 1350 |
+
"learning_rate": 0.00019335864878762708,
|
| 1351 |
+
"loss": 0.0836,
|
| 1352 |
+
"mean_token_accuracy": 0.9706507205963135,
|
| 1353 |
+
"num_tokens": 3165091.0,
|
| 1354 |
+
"step": 1680
|
| 1355 |
+
}
|
| 1356 |
+
],
|
| 1357 |
+
"logging_steps": 10,
|
| 1358 |
+
"max_steps": 50562,
|
| 1359 |
+
"num_input_tokens_seen": 0,
|
| 1360 |
+
"num_train_epochs": 9223372036854775807,
|
| 1361 |
+
"save_steps": 1685,
|
| 1362 |
+
"stateful_callbacks": {
|
| 1363 |
+
"TrainerControl": {
|
| 1364 |
+
"args": {
|
| 1365 |
+
"should_epoch_stop": false,
|
| 1366 |
+
"should_evaluate": false,
|
| 1367 |
+
"should_log": false,
|
| 1368 |
+
"should_save": true,
|
| 1369 |
+
"should_training_stop": false
|
| 1370 |
+
},
|
| 1371 |
+
"attributes": {}
|
| 1372 |
+
}
|
| 1373 |
+
},
|
| 1374 |
+
"total_flos": 1.5391152511647744e+17,
|
| 1375 |
+
"train_batch_size": 16,
|
| 1376 |
+
"trial_name": null,
|
| 1377 |
+
"trial_params": null
|
| 1378 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
|
| 3 |
+
library_name: peft
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Model Card for Model ID
|
| 7 |
+
|
| 8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## Model Details
|
| 13 |
+
|
| 14 |
+
### Model Description
|
| 15 |
+
|
| 16 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- **Developed by:** [More Information Needed]
|
| 21 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 22 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 23 |
+
- **Model type:** [More Information Needed]
|
| 24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 25 |
+
- **License:** [More Information Needed]
|
| 26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 27 |
+
|
| 28 |
+
### Model Sources [optional]
|
| 29 |
+
|
| 30 |
+
<!-- Provide the basic links for the model. -->
|
| 31 |
+
|
| 32 |
+
- **Repository:** [More Information Needed]
|
| 33 |
+
- **Paper [optional]:** [More Information Needed]
|
| 34 |
+
- **Demo [optional]:** [More Information Needed]
|
| 35 |
+
|
| 36 |
+
## Uses
|
| 37 |
+
|
| 38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 39 |
+
|
| 40 |
+
### Direct Use
|
| 41 |
+
|
| 42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 43 |
+
|
| 44 |
+
[More Information Needed]
|
| 45 |
+
|
| 46 |
+
### Downstream Use [optional]
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Out-of-Scope Use
|
| 53 |
+
|
| 54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
## Bias, Risks, and Limitations
|
| 59 |
+
|
| 60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
### Recommendations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 67 |
+
|
| 68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 69 |
+
|
| 70 |
+
## How to Get Started with the Model
|
| 71 |
+
|
| 72 |
+
Use the code below to get started with the model.
|
| 73 |
+
|
| 74 |
+
[More Information Needed]
|
| 75 |
+
|
| 76 |
+
## Training Details
|
| 77 |
+
|
| 78 |
+
### Training Data
|
| 79 |
+
|
| 80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 81 |
+
|
| 82 |
+
[More Information Needed]
|
| 83 |
+
|
| 84 |
+
### Training Procedure
|
| 85 |
+
|
| 86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 87 |
+
|
| 88 |
+
#### Preprocessing [optional]
|
| 89 |
+
|
| 90 |
+
[More Information Needed]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#### Training Hyperparameters
|
| 94 |
+
|
| 95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 96 |
+
|
| 97 |
+
#### Speeds, Sizes, Times [optional]
|
| 98 |
+
|
| 99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 100 |
+
|
| 101 |
+
[More Information Needed]
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 106 |
+
|
| 107 |
+
### Testing Data, Factors & Metrics
|
| 108 |
+
|
| 109 |
+
#### Testing Data
|
| 110 |
+
|
| 111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 112 |
+
|
| 113 |
+
[More Information Needed]
|
| 114 |
+
|
| 115 |
+
#### Factors
|
| 116 |
+
|
| 117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Metrics
|
| 122 |
+
|
| 123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
### Results
|
| 128 |
+
|
| 129 |
+
[More Information Needed]
|
| 130 |
+
|
| 131 |
+
#### Summary
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Model Examination [optional]
|
| 136 |
+
|
| 137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 138 |
+
|
| 139 |
+
[More Information Needed]
|
| 140 |
+
|
| 141 |
+
## Environmental Impact
|
| 142 |
+
|
| 143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 144 |
+
|
| 145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 146 |
+
|
| 147 |
+
- **Hardware Type:** [More Information Needed]
|
| 148 |
+
- **Hours used:** [More Information Needed]
|
| 149 |
+
- **Cloud Provider:** [More Information Needed]
|
| 150 |
+
- **Compute Region:** [More Information Needed]
|
| 151 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 152 |
+
|
| 153 |
+
## Technical Specifications [optional]
|
| 154 |
+
|
| 155 |
+
### Model Architecture and Objective
|
| 156 |
+
|
| 157 |
+
[More Information Needed]
|
| 158 |
+
|
| 159 |
+
### Compute Infrastructure
|
| 160 |
+
|
| 161 |
+
[More Information Needed]
|
| 162 |
+
|
| 163 |
+
#### Hardware
|
| 164 |
+
|
| 165 |
+
[More Information Needed]
|
| 166 |
+
|
| 167 |
+
#### Software
|
| 168 |
+
|
| 169 |
+
[More Information Needed]
|
| 170 |
+
|
| 171 |
+
## Citation [optional]
|
| 172 |
+
|
| 173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 174 |
+
|
| 175 |
+
**BibTeX:**
|
| 176 |
+
|
| 177 |
+
[More Information Needed]
|
| 178 |
+
|
| 179 |
+
**APA:**
|
| 180 |
+
|
| 181 |
+
[More Information Needed]
|
| 182 |
+
|
| 183 |
+
## Glossary [optional]
|
| 184 |
+
|
| 185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## More Information [optional]
|
| 190 |
+
|
| 191 |
+
[More Information Needed]
|
| 192 |
+
|
| 193 |
+
## Model Card Authors [optional]
|
| 194 |
+
|
| 195 |
+
[More Information Needed]
|
| 196 |
+
|
| 197 |
+
## Model Card Contact
|
| 198 |
+
|
| 199 |
+
[More Information Needed]
|
| 200 |
+
### Framework versions
|
| 201 |
+
|
| 202 |
+
- PEFT 0.15.0
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/adapter_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 16,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.0,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"r": 8,
|
| 24 |
+
"rank_pattern": {},
|
| 25 |
+
"revision": null,
|
| 26 |
+
"target_modules": [
|
| 27 |
+
"v_proj",
|
| 28 |
+
"q_proj"
|
| 29 |
+
],
|
| 30 |
+
"task_type": "CAUSAL_LM",
|
| 31 |
+
"trainable_token_indices": null,
|
| 32 |
+
"use_dora": false,
|
| 33 |
+
"use_rslora": false
|
| 34 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
| 33 |
+
"clean_up_tokenization_spaces": false,
|
| 34 |
+
"eos_token": "</s>",
|
| 35 |
+
"extra_special_tokens": {},
|
| 36 |
+
"legacy": false,
|
| 37 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 38 |
+
"pad_token": "</s>",
|
| 39 |
+
"padding_side": "right",
|
| 40 |
+
"sp_model_kwargs": {},
|
| 41 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 42 |
+
"unk_token": "<unk>",
|
| 43 |
+
"use_default_system_prompt": false
|
| 44 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/trainer_state.json
ADDED
|
@@ -0,0 +1,1826 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.033329872287405256,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2247,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.00014833053977483424,
|
| 14 |
+
"learning_rate": 0.00019997330050284054,
|
| 15 |
+
"loss": 0.3338,
|
| 16 |
+
"mean_token_accuracy": 0.8992965638637542,
|
| 17 |
+
"num_tokens": 14338.0,
|
| 18 |
+
"step": 10
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"epoch": 0.0002966610795496685,
|
| 22 |
+
"learning_rate": 0.00019994363439488556,
|
| 23 |
+
"loss": 0.1339,
|
| 24 |
+
"mean_token_accuracy": 0.9485014617443085,
|
| 25 |
+
"num_tokens": 28640.0,
|
| 26 |
+
"step": 20
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"epoch": 0.0004449916193245027,
|
| 30 |
+
"learning_rate": 0.0001999139682869306,
|
| 31 |
+
"loss": 0.1615,
|
| 32 |
+
"mean_token_accuracy": 0.9449628531932831,
|
| 33 |
+
"num_tokens": 42898.0,
|
| 34 |
+
"step": 30
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"epoch": 0.000593322159099337,
|
| 38 |
+
"learning_rate": 0.00019988430217897563,
|
| 39 |
+
"loss": 0.1449,
|
| 40 |
+
"mean_token_accuracy": 0.9448729813098907,
|
| 41 |
+
"num_tokens": 57046.0,
|
| 42 |
+
"step": 40
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"epoch": 0.0007416526988741712,
|
| 46 |
+
"learning_rate": 0.00019985463607102066,
|
| 47 |
+
"loss": 0.1058,
|
| 48 |
+
"mean_token_accuracy": 0.95491161942482,
|
| 49 |
+
"num_tokens": 71338.0,
|
| 50 |
+
"step": 50
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"epoch": 0.0008899832386490054,
|
| 54 |
+
"learning_rate": 0.0001998249699630657,
|
| 55 |
+
"loss": 0.0956,
|
| 56 |
+
"mean_token_accuracy": 0.9604991674423218,
|
| 57 |
+
"num_tokens": 85472.0,
|
| 58 |
+
"step": 60
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.0010383137784238396,
|
| 62 |
+
"learning_rate": 0.00019979530385511073,
|
| 63 |
+
"loss": 0.121,
|
| 64 |
+
"mean_token_accuracy": 0.9559885621070862,
|
| 65 |
+
"num_tokens": 99522.0,
|
| 66 |
+
"step": 70
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.001186644318198674,
|
| 70 |
+
"learning_rate": 0.00019976563774715578,
|
| 71 |
+
"loss": 0.136,
|
| 72 |
+
"mean_token_accuracy": 0.9489055037498474,
|
| 73 |
+
"num_tokens": 113467.0,
|
| 74 |
+
"step": 80
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"epoch": 0.0013349748579735083,
|
| 78 |
+
"learning_rate": 0.0001997359716392008,
|
| 79 |
+
"loss": 0.1069,
|
| 80 |
+
"mean_token_accuracy": 0.9645210564136505,
|
| 81 |
+
"num_tokens": 127613.0,
|
| 82 |
+
"step": 90
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"epoch": 0.0014833053977483424,
|
| 86 |
+
"learning_rate": 0.00019970630553124583,
|
| 87 |
+
"loss": 0.102,
|
| 88 |
+
"mean_token_accuracy": 0.9665270745754242,
|
| 89 |
+
"num_tokens": 141419.0,
|
| 90 |
+
"step": 100
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"epoch": 0.0016316359375231767,
|
| 94 |
+
"learning_rate": 0.00019967663942329088,
|
| 95 |
+
"loss": 0.1031,
|
| 96 |
+
"mean_token_accuracy": 0.9620752274990082,
|
| 97 |
+
"num_tokens": 155506.0,
|
| 98 |
+
"step": 110
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"epoch": 0.0017799664772980108,
|
| 102 |
+
"learning_rate": 0.0001996469733153359,
|
| 103 |
+
"loss": 0.1227,
|
| 104 |
+
"mean_token_accuracy": 0.9612519204616546,
|
| 105 |
+
"num_tokens": 169783.0,
|
| 106 |
+
"step": 120
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"epoch": 0.0019282970170728451,
|
| 110 |
+
"learning_rate": 0.00019961730720738095,
|
| 111 |
+
"loss": 0.1132,
|
| 112 |
+
"mean_token_accuracy": 0.9575821399688721,
|
| 113 |
+
"num_tokens": 183864.0,
|
| 114 |
+
"step": 130
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.0020766275568476792,
|
| 118 |
+
"learning_rate": 0.00019958764109942597,
|
| 119 |
+
"loss": 0.099,
|
| 120 |
+
"mean_token_accuracy": 0.9662568092346191,
|
| 121 |
+
"num_tokens": 197941.0,
|
| 122 |
+
"step": 140
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.0022249580966225138,
|
| 126 |
+
"learning_rate": 0.000199557974991471,
|
| 127 |
+
"loss": 0.0861,
|
| 128 |
+
"mean_token_accuracy": 0.9733373045921325,
|
| 129 |
+
"num_tokens": 212305.0,
|
| 130 |
+
"step": 150
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"epoch": 0.002373288636397348,
|
| 134 |
+
"learning_rate": 0.00019952830888351605,
|
| 135 |
+
"loss": 0.0786,
|
| 136 |
+
"mean_token_accuracy": 0.9724234759807586,
|
| 137 |
+
"num_tokens": 226639.0,
|
| 138 |
+
"step": 160
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 0.002521619176172182,
|
| 142 |
+
"learning_rate": 0.00019949864277556107,
|
| 143 |
+
"loss": 0.1153,
|
| 144 |
+
"mean_token_accuracy": 0.9579457581043244,
|
| 145 |
+
"num_tokens": 240608.0,
|
| 146 |
+
"step": 170
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"epoch": 0.0026699497159470165,
|
| 150 |
+
"learning_rate": 0.00019946897666760612,
|
| 151 |
+
"loss": 0.099,
|
| 152 |
+
"mean_token_accuracy": 0.9629672944545746,
|
| 153 |
+
"num_tokens": 254679.0,
|
| 154 |
+
"step": 180
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"epoch": 0.0028182802557218506,
|
| 158 |
+
"learning_rate": 0.00019943931055965111,
|
| 159 |
+
"loss": 0.1052,
|
| 160 |
+
"mean_token_accuracy": 0.958859795331955,
|
| 161 |
+
"num_tokens": 268884.0,
|
| 162 |
+
"step": 190
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"epoch": 0.0029666107954966848,
|
| 166 |
+
"learning_rate": 0.00019940964445169616,
|
| 167 |
+
"loss": 0.103,
|
| 168 |
+
"mean_token_accuracy": 0.9612604022026062,
|
| 169 |
+
"num_tokens": 283062.0,
|
| 170 |
+
"step": 200
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 0.003114941335271519,
|
| 174 |
+
"learning_rate": 0.00019937997834374121,
|
| 175 |
+
"loss": 0.0959,
|
| 176 |
+
"mean_token_accuracy": 0.9604475021362304,
|
| 177 |
+
"num_tokens": 297014.0,
|
| 178 |
+
"step": 210
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.0032632718750463534,
|
| 182 |
+
"learning_rate": 0.00019935031223578624,
|
| 183 |
+
"loss": 0.101,
|
| 184 |
+
"mean_token_accuracy": 0.9652803599834442,
|
| 185 |
+
"num_tokens": 311066.0,
|
| 186 |
+
"step": 220
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"epoch": 0.0034116024148211875,
|
| 190 |
+
"learning_rate": 0.0001993206461278313,
|
| 191 |
+
"loss": 0.0996,
|
| 192 |
+
"mean_token_accuracy": 0.957928591966629,
|
| 193 |
+
"num_tokens": 325289.0,
|
| 194 |
+
"step": 230
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"epoch": 0.0035599329545960216,
|
| 198 |
+
"learning_rate": 0.00019929098001987628,
|
| 199 |
+
"loss": 0.0692,
|
| 200 |
+
"mean_token_accuracy": 0.9736397683620452,
|
| 201 |
+
"num_tokens": 339616.0,
|
| 202 |
+
"step": 240
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"epoch": 0.003708263494370856,
|
| 206 |
+
"learning_rate": 0.00019926131391192133,
|
| 207 |
+
"loss": 0.095,
|
| 208 |
+
"mean_token_accuracy": 0.9670307815074921,
|
| 209 |
+
"num_tokens": 353777.0,
|
| 210 |
+
"step": 250
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"epoch": 0.0038565940341456903,
|
| 214 |
+
"learning_rate": 0.00019923164780396636,
|
| 215 |
+
"loss": 0.0968,
|
| 216 |
+
"mean_token_accuracy": 0.9648948311805725,
|
| 217 |
+
"num_tokens": 367871.0,
|
| 218 |
+
"step": 260
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"epoch": 0.004004924573920525,
|
| 222 |
+
"learning_rate": 0.0001992019816960114,
|
| 223 |
+
"loss": 0.1252,
|
| 224 |
+
"mean_token_accuracy": 0.9623154461383819,
|
| 225 |
+
"num_tokens": 381768.0,
|
| 226 |
+
"step": 270
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 0.0041532551136953585,
|
| 230 |
+
"learning_rate": 0.00019917231558805643,
|
| 231 |
+
"loss": 0.0933,
|
| 232 |
+
"mean_token_accuracy": 0.9669747233390809,
|
| 233 |
+
"num_tokens": 396018.0,
|
| 234 |
+
"step": 280
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.004301585653470193,
|
| 238 |
+
"learning_rate": 0.00019914264948010145,
|
| 239 |
+
"loss": 0.0903,
|
| 240 |
+
"mean_token_accuracy": 0.9654496192932129,
|
| 241 |
+
"num_tokens": 410375.0,
|
| 242 |
+
"step": 290
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"epoch": 0.0044499161932450276,
|
| 246 |
+
"learning_rate": 0.0001991129833721465,
|
| 247 |
+
"loss": 0.1308,
|
| 248 |
+
"mean_token_accuracy": 0.9515012204647064,
|
| 249 |
+
"num_tokens": 424339.0,
|
| 250 |
+
"step": 300
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"epoch": 0.004598246733019861,
|
| 254 |
+
"learning_rate": 0.00019908331726419153,
|
| 255 |
+
"loss": 0.0989,
|
| 256 |
+
"mean_token_accuracy": 0.9694083333015442,
|
| 257 |
+
"num_tokens": 438444.0,
|
| 258 |
+
"step": 310
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"epoch": 0.004746577272794696,
|
| 262 |
+
"learning_rate": 0.00019905365115623658,
|
| 263 |
+
"loss": 0.093,
|
| 264 |
+
"mean_token_accuracy": 0.971742856502533,
|
| 265 |
+
"num_tokens": 452483.0,
|
| 266 |
+
"step": 320
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"epoch": 0.00489490781256953,
|
| 270 |
+
"learning_rate": 0.0001990239850482816,
|
| 271 |
+
"loss": 0.102,
|
| 272 |
+
"mean_token_accuracy": 0.9643003046512604,
|
| 273 |
+
"num_tokens": 466532.0,
|
| 274 |
+
"step": 330
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"epoch": 0.005043238352344364,
|
| 278 |
+
"learning_rate": 0.00019899431894032662,
|
| 279 |
+
"loss": 0.0964,
|
| 280 |
+
"mean_token_accuracy": 0.9672096133232116,
|
| 281 |
+
"num_tokens": 480771.0,
|
| 282 |
+
"step": 340
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 0.0051915688921191985,
|
| 286 |
+
"learning_rate": 0.00019896465283237167,
|
| 287 |
+
"loss": 0.1035,
|
| 288 |
+
"mean_token_accuracy": 0.9658852636814117,
|
| 289 |
+
"num_tokens": 494954.0,
|
| 290 |
+
"step": 350
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 0.005339899431894033,
|
| 294 |
+
"learning_rate": 0.0001989349867244167,
|
| 295 |
+
"loss": 0.1085,
|
| 296 |
+
"mean_token_accuracy": 0.9633121192455292,
|
| 297 |
+
"num_tokens": 509115.0,
|
| 298 |
+
"step": 360
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"epoch": 0.005488229971668867,
|
| 302 |
+
"learning_rate": 0.00019890532061646175,
|
| 303 |
+
"loss": 0.067,
|
| 304 |
+
"mean_token_accuracy": 0.9755404174327851,
|
| 305 |
+
"num_tokens": 523364.0,
|
| 306 |
+
"step": 370
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"epoch": 0.005636560511443701,
|
| 310 |
+
"learning_rate": 0.00019887565450850677,
|
| 311 |
+
"loss": 0.0846,
|
| 312 |
+
"mean_token_accuracy": 0.9748851418495178,
|
| 313 |
+
"num_tokens": 537483.0,
|
| 314 |
+
"step": 380
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"epoch": 0.005784891051218535,
|
| 318 |
+
"learning_rate": 0.0001988459884005518,
|
| 319 |
+
"loss": 0.09,
|
| 320 |
+
"mean_token_accuracy": 0.9707007527351379,
|
| 321 |
+
"num_tokens": 551329.0,
|
| 322 |
+
"step": 390
|
| 323 |
+
},
|
| 324 |
+
{
|
| 325 |
+
"epoch": 0.0059332215909933695,
|
| 326 |
+
"learning_rate": 0.00019881632229259684,
|
| 327 |
+
"loss": 0.0923,
|
| 328 |
+
"mean_token_accuracy": 0.9627965211868286,
|
| 329 |
+
"num_tokens": 565279.0,
|
| 330 |
+
"step": 400
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"epoch": 0.006081552130768204,
|
| 334 |
+
"learning_rate": 0.00019878665618464186,
|
| 335 |
+
"loss": 0.1026,
|
| 336 |
+
"mean_token_accuracy": 0.9648779332637787,
|
| 337 |
+
"num_tokens": 579408.0,
|
| 338 |
+
"step": 410
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 0.006229882670543038,
|
| 342 |
+
"learning_rate": 0.0001987569900766869,
|
| 343 |
+
"loss": 0.0626,
|
| 344 |
+
"mean_token_accuracy": 0.9769876301288605,
|
| 345 |
+
"num_tokens": 593666.0,
|
| 346 |
+
"step": 420
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 0.006378213210317872,
|
| 350 |
+
"learning_rate": 0.00019872732396873194,
|
| 351 |
+
"loss": 0.0824,
|
| 352 |
+
"mean_token_accuracy": 0.9722626864910126,
|
| 353 |
+
"num_tokens": 607862.0,
|
| 354 |
+
"step": 430
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"epoch": 0.006526543750092707,
|
| 358 |
+
"learning_rate": 0.00019869765786077696,
|
| 359 |
+
"loss": 0.0815,
|
| 360 |
+
"mean_token_accuracy": 0.9756675064563751,
|
| 361 |
+
"num_tokens": 621860.0,
|
| 362 |
+
"step": 440
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"epoch": 0.0066748742898675405,
|
| 366 |
+
"learning_rate": 0.000198667991752822,
|
| 367 |
+
"loss": 0.0864,
|
| 368 |
+
"mean_token_accuracy": 0.9753140985965729,
|
| 369 |
+
"num_tokens": 635967.0,
|
| 370 |
+
"step": 450
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"epoch": 0.006823204829642375,
|
| 374 |
+
"learning_rate": 0.00019863832564486703,
|
| 375 |
+
"loss": 0.1159,
|
| 376 |
+
"mean_token_accuracy": 0.9547139942646027,
|
| 377 |
+
"num_tokens": 650037.0,
|
| 378 |
+
"step": 460
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
"epoch": 0.0069715353694172096,
|
| 382 |
+
"learning_rate": 0.00019860865953691206,
|
| 383 |
+
"loss": 0.1035,
|
| 384 |
+
"mean_token_accuracy": 0.9647024512290955,
|
| 385 |
+
"num_tokens": 664415.0,
|
| 386 |
+
"step": 470
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"epoch": 0.007119865909192043,
|
| 390 |
+
"learning_rate": 0.0001985789934289571,
|
| 391 |
+
"loss": 0.1115,
|
| 392 |
+
"mean_token_accuracy": 0.9622772753238678,
|
| 393 |
+
"num_tokens": 678570.0,
|
| 394 |
+
"step": 480
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 0.007268196448966878,
|
| 398 |
+
"learning_rate": 0.00019854932732100213,
|
| 399 |
+
"loss": 0.1018,
|
| 400 |
+
"mean_token_accuracy": 0.9635290026664733,
|
| 401 |
+
"num_tokens": 692639.0,
|
| 402 |
+
"step": 490
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 0.007416526988741712,
|
| 406 |
+
"learning_rate": 0.00019851966121304718,
|
| 407 |
+
"loss": 0.073,
|
| 408 |
+
"mean_token_accuracy": 0.9765343546867371,
|
| 409 |
+
"num_tokens": 706812.0,
|
| 410 |
+
"step": 500
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"epoch": 0.007564857528516546,
|
| 414 |
+
"learning_rate": 0.0001984899951050922,
|
| 415 |
+
"loss": 0.0673,
|
| 416 |
+
"mean_token_accuracy": 0.9734417915344238,
|
| 417 |
+
"num_tokens": 721155.0,
|
| 418 |
+
"step": 510
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"epoch": 0.0077131880682913805,
|
| 422 |
+
"learning_rate": 0.00019846032899713723,
|
| 423 |
+
"loss": 0.0523,
|
| 424 |
+
"mean_token_accuracy": 0.9798437833786011,
|
| 425 |
+
"num_tokens": 735500.0,
|
| 426 |
+
"step": 520
|
| 427 |
+
},
|
| 428 |
+
{
|
| 429 |
+
"epoch": 0.007861518608066215,
|
| 430 |
+
"learning_rate": 0.00019843066288918225,
|
| 431 |
+
"loss": 0.0972,
|
| 432 |
+
"mean_token_accuracy": 0.9699061930179596,
|
| 433 |
+
"num_tokens": 749718.0,
|
| 434 |
+
"step": 530
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"epoch": 0.00800984914784105,
|
| 438 |
+
"learning_rate": 0.0001984009967812273,
|
| 439 |
+
"loss": 0.0948,
|
| 440 |
+
"mean_token_accuracy": 0.9745772778987885,
|
| 441 |
+
"num_tokens": 763762.0,
|
| 442 |
+
"step": 540
|
| 443 |
+
},
|
| 444 |
+
{
|
| 445 |
+
"epoch": 0.008158179687615882,
|
| 446 |
+
"learning_rate": 0.00019837133067327235,
|
| 447 |
+
"loss": 0.0879,
|
| 448 |
+
"mean_token_accuracy": 0.9725308179855346,
|
| 449 |
+
"num_tokens": 777808.0,
|
| 450 |
+
"step": 550
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"epoch": 0.008306510227390717,
|
| 454 |
+
"learning_rate": 0.00019834166456531734,
|
| 455 |
+
"loss": 0.083,
|
| 456 |
+
"mean_token_accuracy": 0.9699824392795563,
|
| 457 |
+
"num_tokens": 791900.0,
|
| 458 |
+
"step": 560
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 0.008454840767165551,
|
| 462 |
+
"learning_rate": 0.0001983119984573624,
|
| 463 |
+
"loss": 0.0724,
|
| 464 |
+
"mean_token_accuracy": 0.9774831712245942,
|
| 465 |
+
"num_tokens": 805873.0,
|
| 466 |
+
"step": 570
|
| 467 |
+
},
|
| 468 |
+
{
|
| 469 |
+
"epoch": 0.008603171306940386,
|
| 470 |
+
"learning_rate": 0.00019828233234940742,
|
| 471 |
+
"loss": 0.0867,
|
| 472 |
+
"mean_token_accuracy": 0.9701934456825256,
|
| 473 |
+
"num_tokens": 820203.0,
|
| 474 |
+
"step": 580
|
| 475 |
+
},
|
| 476 |
+
{
|
| 477 |
+
"epoch": 0.00875150184671522,
|
| 478 |
+
"learning_rate": 0.00019825266624145247,
|
| 479 |
+
"loss": 0.0868,
|
| 480 |
+
"mean_token_accuracy": 0.9678167402744293,
|
| 481 |
+
"num_tokens": 834478.0,
|
| 482 |
+
"step": 590
|
| 483 |
+
},
|
| 484 |
+
{
|
| 485 |
+
"epoch": 0.008899832386490055,
|
| 486 |
+
"learning_rate": 0.00019822300013349752,
|
| 487 |
+
"loss": 0.087,
|
| 488 |
+
"mean_token_accuracy": 0.9668359816074371,
|
| 489 |
+
"num_tokens": 848321.0,
|
| 490 |
+
"step": 600
|
| 491 |
+
},
|
| 492 |
+
{
|
| 493 |
+
"epoch": 0.009048162926264888,
|
| 494 |
+
"learning_rate": 0.00019819333402554251,
|
| 495 |
+
"loss": 0.0579,
|
| 496 |
+
"mean_token_accuracy": 0.980604612827301,
|
| 497 |
+
"num_tokens": 862770.0,
|
| 498 |
+
"step": 610
|
| 499 |
+
},
|
| 500 |
+
{
|
| 501 |
+
"epoch": 0.009196493466039722,
|
| 502 |
+
"learning_rate": 0.00019816366791758756,
|
| 503 |
+
"loss": 0.1058,
|
| 504 |
+
"mean_token_accuracy": 0.9675537347793579,
|
| 505 |
+
"num_tokens": 877137.0,
|
| 506 |
+
"step": 620
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"epoch": 0.009344824005814557,
|
| 510 |
+
"learning_rate": 0.0001981340018096326,
|
| 511 |
+
"loss": 0.0641,
|
| 512 |
+
"mean_token_accuracy": 0.9790356934070588,
|
| 513 |
+
"num_tokens": 891203.0,
|
| 514 |
+
"step": 630
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 0.009493154545589392,
|
| 518 |
+
"learning_rate": 0.00019810433570167764,
|
| 519 |
+
"loss": 0.0783,
|
| 520 |
+
"mean_token_accuracy": 0.9747758269309997,
|
| 521 |
+
"num_tokens": 905312.0,
|
| 522 |
+
"step": 640
|
| 523 |
+
},
|
| 524 |
+
{
|
| 525 |
+
"epoch": 0.009641485085364226,
|
| 526 |
+
"learning_rate": 0.00019807466959372266,
|
| 527 |
+
"loss": 0.0792,
|
| 528 |
+
"mean_token_accuracy": 0.9732525169849395,
|
| 529 |
+
"num_tokens": 919371.0,
|
| 530 |
+
"step": 650
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"epoch": 0.00978981562513906,
|
| 534 |
+
"learning_rate": 0.00019804500348576768,
|
| 535 |
+
"loss": 0.065,
|
| 536 |
+
"mean_token_accuracy": 0.9778533697128295,
|
| 537 |
+
"num_tokens": 933687.0,
|
| 538 |
+
"step": 660
|
| 539 |
+
},
|
| 540 |
+
{
|
| 541 |
+
"epoch": 0.009938146164913893,
|
| 542 |
+
"learning_rate": 0.00019801533737781273,
|
| 543 |
+
"loss": 0.0754,
|
| 544 |
+
"mean_token_accuracy": 0.9709767520427703,
|
| 545 |
+
"num_tokens": 947526.0,
|
| 546 |
+
"step": 670
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"epoch": 0.010086476704688728,
|
| 550 |
+
"learning_rate": 0.00019798567126985776,
|
| 551 |
+
"loss": 0.1087,
|
| 552 |
+
"mean_token_accuracy": 0.9693429231643677,
|
| 553 |
+
"num_tokens": 961548.0,
|
| 554 |
+
"step": 680
|
| 555 |
+
},
|
| 556 |
+
{
|
| 557 |
+
"epoch": 0.010234807244463563,
|
| 558 |
+
"learning_rate": 0.0001979560051619028,
|
| 559 |
+
"loss": 0.0875,
|
| 560 |
+
"mean_token_accuracy": 0.9671817898750306,
|
| 561 |
+
"num_tokens": 975556.0,
|
| 562 |
+
"step": 690
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"epoch": 0.010383137784238397,
|
| 566 |
+
"learning_rate": 0.00019792633905394783,
|
| 567 |
+
"loss": 0.0931,
|
| 568 |
+
"mean_token_accuracy": 0.9703514873981476,
|
| 569 |
+
"num_tokens": 989813.0,
|
| 570 |
+
"step": 700
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 0.010531468324013232,
|
| 574 |
+
"learning_rate": 0.00019789667294599285,
|
| 575 |
+
"loss": 0.0768,
|
| 576 |
+
"mean_token_accuracy": 0.973369836807251,
|
| 577 |
+
"num_tokens": 1003889.0,
|
| 578 |
+
"step": 710
|
| 579 |
+
},
|
| 580 |
+
{
|
| 581 |
+
"epoch": 0.010679798863788066,
|
| 582 |
+
"learning_rate": 0.0001978670068380379,
|
| 583 |
+
"loss": 0.1042,
|
| 584 |
+
"mean_token_accuracy": 0.9632522821426391,
|
| 585 |
+
"num_tokens": 1018053.0,
|
| 586 |
+
"step": 720
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"epoch": 0.010828129403562899,
|
| 590 |
+
"learning_rate": 0.00019783734073008293,
|
| 591 |
+
"loss": 0.0748,
|
| 592 |
+
"mean_token_accuracy": 0.9709040760993958,
|
| 593 |
+
"num_tokens": 1032016.0,
|
| 594 |
+
"step": 730
|
| 595 |
+
},
|
| 596 |
+
{
|
| 597 |
+
"epoch": 0.010976459943337733,
|
| 598 |
+
"learning_rate": 0.00019780767462212798,
|
| 599 |
+
"loss": 0.0944,
|
| 600 |
+
"mean_token_accuracy": 0.9654510498046875,
|
| 601 |
+
"num_tokens": 1046250.0,
|
| 602 |
+
"step": 740
|
| 603 |
+
},
|
| 604 |
+
{
|
| 605 |
+
"epoch": 0.011124790483112568,
|
| 606 |
+
"learning_rate": 0.000197778008514173,
|
| 607 |
+
"loss": 0.0982,
|
| 608 |
+
"mean_token_accuracy": 0.966586810350418,
|
| 609 |
+
"num_tokens": 1060619.0,
|
| 610 |
+
"step": 750
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"epoch": 0.011273121022887403,
|
| 614 |
+
"learning_rate": 0.00019774834240621802,
|
| 615 |
+
"loss": 0.0694,
|
| 616 |
+
"mean_token_accuracy": 0.9705922186374665,
|
| 617 |
+
"num_tokens": 1074706.0,
|
| 618 |
+
"step": 760
|
| 619 |
+
},
|
| 620 |
+
{
|
| 621 |
+
"epoch": 0.011421451562662237,
|
| 622 |
+
"learning_rate": 0.00019771867629826307,
|
| 623 |
+
"loss": 0.0875,
|
| 624 |
+
"mean_token_accuracy": 0.9699962615966797,
|
| 625 |
+
"num_tokens": 1088843.0,
|
| 626 |
+
"step": 770
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 0.01156978210243707,
|
| 630 |
+
"learning_rate": 0.0001976890101903081,
|
| 631 |
+
"loss": 0.0962,
|
| 632 |
+
"mean_token_accuracy": 0.9701560854911804,
|
| 633 |
+
"num_tokens": 1102921.0,
|
| 634 |
+
"step": 780
|
| 635 |
+
},
|
| 636 |
+
{
|
| 637 |
+
"epoch": 0.011718112642211904,
|
| 638 |
+
"learning_rate": 0.00019765934408235312,
|
| 639 |
+
"loss": 0.0641,
|
| 640 |
+
"mean_token_accuracy": 0.9813205659389496,
|
| 641 |
+
"num_tokens": 1117155.0,
|
| 642 |
+
"step": 790
|
| 643 |
+
},
|
| 644 |
+
{
|
| 645 |
+
"epoch": 0.011866443181986739,
|
| 646 |
+
"learning_rate": 0.00019762967797439814,
|
| 647 |
+
"loss": 0.0923,
|
| 648 |
+
"mean_token_accuracy": 0.9718149483203888,
|
| 649 |
+
"num_tokens": 1131435.0,
|
| 650 |
+
"step": 800
|
| 651 |
+
},
|
| 652 |
+
{
|
| 653 |
+
"epoch": 0.012014773721761574,
|
| 654 |
+
"learning_rate": 0.0001976000118664432,
|
| 655 |
+
"loss": 0.0798,
|
| 656 |
+
"mean_token_accuracy": 0.9723540186882019,
|
| 657 |
+
"num_tokens": 1145444.0,
|
| 658 |
+
"step": 810
|
| 659 |
+
},
|
| 660 |
+
{
|
| 661 |
+
"epoch": 0.012163104261536408,
|
| 662 |
+
"learning_rate": 0.00019757034575848824,
|
| 663 |
+
"loss": 0.1028,
|
| 664 |
+
"mean_token_accuracy": 0.9645315170288086,
|
| 665 |
+
"num_tokens": 1159625.0,
|
| 666 |
+
"step": 820
|
| 667 |
+
},
|
| 668 |
+
{
|
| 669 |
+
"epoch": 0.012311434801311243,
|
| 670 |
+
"learning_rate": 0.00019754067965053326,
|
| 671 |
+
"loss": 0.0893,
|
| 672 |
+
"mean_token_accuracy": 0.9674223959445953,
|
| 673 |
+
"num_tokens": 1173790.0,
|
| 674 |
+
"step": 830
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"epoch": 0.012459765341086075,
|
| 678 |
+
"learning_rate": 0.0001975110135425783,
|
| 679 |
+
"loss": 0.0703,
|
| 680 |
+
"mean_token_accuracy": 0.9750796139240265,
|
| 681 |
+
"num_tokens": 1187787.0,
|
| 682 |
+
"step": 840
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"epoch": 0.01260809588086091,
|
| 686 |
+
"learning_rate": 0.0001974813474346233,
|
| 687 |
+
"loss": 0.0885,
|
| 688 |
+
"mean_token_accuracy": 0.9665023148059845,
|
| 689 |
+
"num_tokens": 1202159.0,
|
| 690 |
+
"step": 850
|
| 691 |
+
},
|
| 692 |
+
{
|
| 693 |
+
"epoch": 0.012756426420635745,
|
| 694 |
+
"learning_rate": 0.00019745168132666836,
|
| 695 |
+
"loss": 0.0793,
|
| 696 |
+
"mean_token_accuracy": 0.9742439985275269,
|
| 697 |
+
"num_tokens": 1216318.0,
|
| 698 |
+
"step": 860
|
| 699 |
+
},
|
| 700 |
+
{
|
| 701 |
+
"epoch": 0.012904756960410579,
|
| 702 |
+
"learning_rate": 0.0001974220152187134,
|
| 703 |
+
"loss": 0.0654,
|
| 704 |
+
"mean_token_accuracy": 0.9788648605346679,
|
| 705 |
+
"num_tokens": 1230383.0,
|
| 706 |
+
"step": 870
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"epoch": 0.013053087500185414,
|
| 710 |
+
"learning_rate": 0.00019739234911075843,
|
| 711 |
+
"loss": 0.0862,
|
| 712 |
+
"mean_token_accuracy": 0.9693156242370605,
|
| 713 |
+
"num_tokens": 1244340.0,
|
| 714 |
+
"step": 880
|
| 715 |
+
},
|
| 716 |
+
{
|
| 717 |
+
"epoch": 0.013201418039960248,
|
| 718 |
+
"learning_rate": 0.00019736268300280346,
|
| 719 |
+
"loss": 0.091,
|
| 720 |
+
"mean_token_accuracy": 0.969611394405365,
|
| 721 |
+
"num_tokens": 1258576.0,
|
| 722 |
+
"step": 890
|
| 723 |
+
},
|
| 724 |
+
{
|
| 725 |
+
"epoch": 0.013349748579735081,
|
| 726 |
+
"learning_rate": 0.00019733301689484848,
|
| 727 |
+
"loss": 0.0907,
|
| 728 |
+
"mean_token_accuracy": 0.9679802298545838,
|
| 729 |
+
"num_tokens": 1272705.0,
|
| 730 |
+
"step": 900
|
| 731 |
+
},
|
| 732 |
+
{
|
| 733 |
+
"epoch": 0.013498079119509915,
|
| 734 |
+
"learning_rate": 0.00019730335078689353,
|
| 735 |
+
"loss": 0.098,
|
| 736 |
+
"mean_token_accuracy": 0.9632154762744903,
|
| 737 |
+
"num_tokens": 1286700.0,
|
| 738 |
+
"step": 910
|
| 739 |
+
},
|
| 740 |
+
{
|
| 741 |
+
"epoch": 0.01364640965928475,
|
| 742 |
+
"learning_rate": 0.00019727368467893855,
|
| 743 |
+
"loss": 0.0713,
|
| 744 |
+
"mean_token_accuracy": 0.973181027173996,
|
| 745 |
+
"num_tokens": 1300730.0,
|
| 746 |
+
"step": 920
|
| 747 |
+
},
|
| 748 |
+
{
|
| 749 |
+
"epoch": 0.013794740199059585,
|
| 750 |
+
"learning_rate": 0.00019724401857098357,
|
| 751 |
+
"loss": 0.0926,
|
| 752 |
+
"mean_token_accuracy": 0.9724487364292145,
|
| 753 |
+
"num_tokens": 1314843.0,
|
| 754 |
+
"step": 930
|
| 755 |
+
},
|
| 756 |
+
{
|
| 757 |
+
"epoch": 0.013943070738834419,
|
| 758 |
+
"learning_rate": 0.00019721435246302862,
|
| 759 |
+
"loss": 0.078,
|
| 760 |
+
"mean_token_accuracy": 0.9706206858158112,
|
| 761 |
+
"num_tokens": 1328967.0,
|
| 762 |
+
"step": 940
|
| 763 |
+
},
|
| 764 |
+
{
|
| 765 |
+
"epoch": 0.014091401278609254,
|
| 766 |
+
"learning_rate": 0.00019718468635507365,
|
| 767 |
+
"loss": 0.11,
|
| 768 |
+
"mean_token_accuracy": 0.962195897102356,
|
| 769 |
+
"num_tokens": 1343046.0,
|
| 770 |
+
"step": 950
|
| 771 |
+
},
|
| 772 |
+
{
|
| 773 |
+
"epoch": 0.014239731818384086,
|
| 774 |
+
"learning_rate": 0.0001971550202471187,
|
| 775 |
+
"loss": 0.0722,
|
| 776 |
+
"mean_token_accuracy": 0.9760941386222839,
|
| 777 |
+
"num_tokens": 1357072.0,
|
| 778 |
+
"step": 960
|
| 779 |
+
},
|
| 780 |
+
{
|
| 781 |
+
"epoch": 0.014388062358158921,
|
| 782 |
+
"learning_rate": 0.00019712535413916372,
|
| 783 |
+
"loss": 0.0626,
|
| 784 |
+
"mean_token_accuracy": 0.9810939252376556,
|
| 785 |
+
"num_tokens": 1371301.0,
|
| 786 |
+
"step": 970
|
| 787 |
+
},
|
| 788 |
+
{
|
| 789 |
+
"epoch": 0.014536392897933756,
|
| 790 |
+
"learning_rate": 0.00019709568803120874,
|
| 791 |
+
"loss": 0.0972,
|
| 792 |
+
"mean_token_accuracy": 0.9752714991569519,
|
| 793 |
+
"num_tokens": 1385224.0,
|
| 794 |
+
"step": 980
|
| 795 |
+
},
|
| 796 |
+
{
|
| 797 |
+
"epoch": 0.01468472343770859,
|
| 798 |
+
"learning_rate": 0.0001970660219232538,
|
| 799 |
+
"loss": 0.065,
|
| 800 |
+
"mean_token_accuracy": 0.9749573111534119,
|
| 801 |
+
"num_tokens": 1399236.0,
|
| 802 |
+
"step": 990
|
| 803 |
+
},
|
| 804 |
+
{
|
| 805 |
+
"epoch": 0.014833053977483425,
|
| 806 |
+
"learning_rate": 0.00019703635581529882,
|
| 807 |
+
"loss": 0.0733,
|
| 808 |
+
"mean_token_accuracy": 0.971584141254425,
|
| 809 |
+
"num_tokens": 1413329.0,
|
| 810 |
+
"step": 1000
|
| 811 |
+
},
|
| 812 |
+
{
|
| 813 |
+
"epoch": 0.01498138451725826,
|
| 814 |
+
"learning_rate": 0.00019700668970734387,
|
| 815 |
+
"loss": 0.0984,
|
| 816 |
+
"mean_token_accuracy": 0.9667350590229035,
|
| 817 |
+
"num_tokens": 1427449.0,
|
| 818 |
+
"step": 1010
|
| 819 |
+
},
|
| 820 |
+
{
|
| 821 |
+
"epoch": 0.015129715057033092,
|
| 822 |
+
"learning_rate": 0.0001969770235993889,
|
| 823 |
+
"loss": 0.0758,
|
| 824 |
+
"mean_token_accuracy": 0.9693971753120423,
|
| 825 |
+
"num_tokens": 1441671.0,
|
| 826 |
+
"step": 1020
|
| 827 |
+
},
|
| 828 |
+
{
|
| 829 |
+
"epoch": 0.015278045596807927,
|
| 830 |
+
"learning_rate": 0.0001969473574914339,
|
| 831 |
+
"loss": 0.0877,
|
| 832 |
+
"mean_token_accuracy": 0.9640409171581268,
|
| 833 |
+
"num_tokens": 1455599.0,
|
| 834 |
+
"step": 1030
|
| 835 |
+
},
|
| 836 |
+
{
|
| 837 |
+
"epoch": 0.015426376136582761,
|
| 838 |
+
"learning_rate": 0.00019691769138347896,
|
| 839 |
+
"loss": 0.078,
|
| 840 |
+
"mean_token_accuracy": 0.973029488325119,
|
| 841 |
+
"num_tokens": 1469741.0,
|
| 842 |
+
"step": 1040
|
| 843 |
+
},
|
| 844 |
+
{
|
| 845 |
+
"epoch": 0.015574706676357596,
|
| 846 |
+
"learning_rate": 0.00019688802527552399,
|
| 847 |
+
"loss": 0.0709,
|
| 848 |
+
"mean_token_accuracy": 0.97868133187294,
|
| 849 |
+
"num_tokens": 1483981.0,
|
| 850 |
+
"step": 1050
|
| 851 |
+
},
|
| 852 |
+
{
|
| 853 |
+
"epoch": 0.01572303721613243,
|
| 854 |
+
"learning_rate": 0.00019685835916756904,
|
| 855 |
+
"loss": 0.0718,
|
| 856 |
+
"mean_token_accuracy": 0.9745873928070068,
|
| 857 |
+
"num_tokens": 1497968.0,
|
| 858 |
+
"step": 1060
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"epoch": 0.015871367755907265,
|
| 862 |
+
"learning_rate": 0.00019682869305961403,
|
| 863 |
+
"loss": 0.073,
|
| 864 |
+
"mean_token_accuracy": 0.9762903869152069,
|
| 865 |
+
"num_tokens": 1511999.0,
|
| 866 |
+
"step": 1070
|
| 867 |
+
},
|
| 868 |
+
{
|
| 869 |
+
"epoch": 0.0160196982956821,
|
| 870 |
+
"learning_rate": 0.00019679902695165908,
|
| 871 |
+
"loss": 0.0854,
|
| 872 |
+
"mean_token_accuracy": 0.9715777993202209,
|
| 873 |
+
"num_tokens": 1526208.0,
|
| 874 |
+
"step": 1080
|
| 875 |
+
},
|
| 876 |
+
{
|
| 877 |
+
"epoch": 0.016168028835456934,
|
| 878 |
+
"learning_rate": 0.00019676936084370413,
|
| 879 |
+
"loss": 0.0589,
|
| 880 |
+
"mean_token_accuracy": 0.9842264592647553,
|
| 881 |
+
"num_tokens": 1540482.0,
|
| 882 |
+
"step": 1090
|
| 883 |
+
},
|
| 884 |
+
{
|
| 885 |
+
"epoch": 0.016316359375231765,
|
| 886 |
+
"learning_rate": 0.00019673969473574916,
|
| 887 |
+
"loss": 0.0702,
|
| 888 |
+
"mean_token_accuracy": 0.9759678483009339,
|
| 889 |
+
"num_tokens": 1554508.0,
|
| 890 |
+
"step": 1100
|
| 891 |
+
},
|
| 892 |
+
{
|
| 893 |
+
"epoch": 0.0164646899150066,
|
| 894 |
+
"learning_rate": 0.0001967100286277942,
|
| 895 |
+
"loss": 0.0768,
|
| 896 |
+
"mean_token_accuracy": 0.9776164293289185,
|
| 897 |
+
"num_tokens": 1568694.0,
|
| 898 |
+
"step": 1110
|
| 899 |
+
},
|
| 900 |
+
{
|
| 901 |
+
"epoch": 0.016613020454781434,
|
| 902 |
+
"learning_rate": 0.0001966803625198392,
|
| 903 |
+
"loss": 0.0879,
|
| 904 |
+
"mean_token_accuracy": 0.973237669467926,
|
| 905 |
+
"num_tokens": 1582893.0,
|
| 906 |
+
"step": 1120
|
| 907 |
+
},
|
| 908 |
+
{
|
| 909 |
+
"epoch": 0.01676135099455627,
|
| 910 |
+
"learning_rate": 0.00019665069641188425,
|
| 911 |
+
"loss": 0.0974,
|
| 912 |
+
"mean_token_accuracy": 0.974377167224884,
|
| 913 |
+
"num_tokens": 1597044.0,
|
| 914 |
+
"step": 1130
|
| 915 |
+
},
|
| 916 |
+
{
|
| 917 |
+
"epoch": 0.016909681534331103,
|
| 918 |
+
"learning_rate": 0.00019662103030392927,
|
| 919 |
+
"loss": 0.0856,
|
| 920 |
+
"mean_token_accuracy": 0.9713896453380585,
|
| 921 |
+
"num_tokens": 1611013.0,
|
| 922 |
+
"step": 1140
|
| 923 |
+
},
|
| 924 |
+
{
|
| 925 |
+
"epoch": 0.017058012074105938,
|
| 926 |
+
"learning_rate": 0.00019659136419597432,
|
| 927 |
+
"loss": 0.079,
|
| 928 |
+
"mean_token_accuracy": 0.9759809911251068,
|
| 929 |
+
"num_tokens": 1625267.0,
|
| 930 |
+
"step": 1150
|
| 931 |
+
},
|
| 932 |
+
{
|
| 933 |
+
"epoch": 0.017206342613880772,
|
| 934 |
+
"learning_rate": 0.00019656169808801935,
|
| 935 |
+
"loss": 0.0837,
|
| 936 |
+
"mean_token_accuracy": 0.9738470792770386,
|
| 937 |
+
"num_tokens": 1639399.0,
|
| 938 |
+
"step": 1160
|
| 939 |
+
},
|
| 940 |
+
{
|
| 941 |
+
"epoch": 0.017354673153655607,
|
| 942 |
+
"learning_rate": 0.00019653203198006437,
|
| 943 |
+
"loss": 0.0851,
|
| 944 |
+
"mean_token_accuracy": 0.9707063674926758,
|
| 945 |
+
"num_tokens": 1653578.0,
|
| 946 |
+
"step": 1170
|
| 947 |
+
},
|
| 948 |
+
{
|
| 949 |
+
"epoch": 0.01750300369343044,
|
| 950 |
+
"learning_rate": 0.00019650236587210942,
|
| 951 |
+
"loss": 0.0848,
|
| 952 |
+
"mean_token_accuracy": 0.9718270719051361,
|
| 953 |
+
"num_tokens": 1667589.0,
|
| 954 |
+
"step": 1180
|
| 955 |
+
},
|
| 956 |
+
{
|
| 957 |
+
"epoch": 0.017651334233205276,
|
| 958 |
+
"learning_rate": 0.00019647269976415444,
|
| 959 |
+
"loss": 0.0594,
|
| 960 |
+
"mean_token_accuracy": 0.9799826085567475,
|
| 961 |
+
"num_tokens": 1682134.0,
|
| 962 |
+
"step": 1190
|
| 963 |
+
},
|
| 964 |
+
{
|
| 965 |
+
"epoch": 0.01779966477298011,
|
| 966 |
+
"learning_rate": 0.0001964430336561995,
|
| 967 |
+
"loss": 0.0945,
|
| 968 |
+
"mean_token_accuracy": 0.9730036854743958,
|
| 969 |
+
"num_tokens": 1696174.0,
|
| 970 |
+
"step": 1200
|
| 971 |
+
},
|
| 972 |
+
{
|
| 973 |
+
"epoch": 0.017947995312754945,
|
| 974 |
+
"learning_rate": 0.00019641336754824452,
|
| 975 |
+
"loss": 0.077,
|
| 976 |
+
"mean_token_accuracy": 0.9751695334911347,
|
| 977 |
+
"num_tokens": 1710134.0,
|
| 978 |
+
"step": 1210
|
| 979 |
+
},
|
| 980 |
+
{
|
| 981 |
+
"epoch": 0.018096325852529776,
|
| 982 |
+
"learning_rate": 0.00019638370144028954,
|
| 983 |
+
"loss": 0.0812,
|
| 984 |
+
"mean_token_accuracy": 0.9705005526542664,
|
| 985 |
+
"num_tokens": 1724066.0,
|
| 986 |
+
"step": 1220
|
| 987 |
+
},
|
| 988 |
+
{
|
| 989 |
+
"epoch": 0.01824465639230461,
|
| 990 |
+
"learning_rate": 0.0001963540353323346,
|
| 991 |
+
"loss": 0.0704,
|
| 992 |
+
"mean_token_accuracy": 0.9745881497859955,
|
| 993 |
+
"num_tokens": 1738123.0,
|
| 994 |
+
"step": 1230
|
| 995 |
+
},
|
| 996 |
+
{
|
| 997 |
+
"epoch": 0.018392986932079445,
|
| 998 |
+
"learning_rate": 0.0001963243692243796,
|
| 999 |
+
"loss": 0.0702,
|
| 1000 |
+
"mean_token_accuracy": 0.975337028503418,
|
| 1001 |
+
"num_tokens": 1752512.0,
|
| 1002 |
+
"step": 1240
|
| 1003 |
+
},
|
| 1004 |
+
{
|
| 1005 |
+
"epoch": 0.01854131747185428,
|
| 1006 |
+
"learning_rate": 0.00019629470311642466,
|
| 1007 |
+
"loss": 0.0658,
|
| 1008 |
+
"mean_token_accuracy": 0.9756161510944367,
|
| 1009 |
+
"num_tokens": 1766708.0,
|
| 1010 |
+
"step": 1250
|
| 1011 |
+
},
|
| 1012 |
+
{
|
| 1013 |
+
"epoch": 0.018689648011629114,
|
| 1014 |
+
"learning_rate": 0.00019626503700846969,
|
| 1015 |
+
"loss": 0.0774,
|
| 1016 |
+
"mean_token_accuracy": 0.9716881215572357,
|
| 1017 |
+
"num_tokens": 1780735.0,
|
| 1018 |
+
"step": 1260
|
| 1019 |
+
},
|
| 1020 |
+
{
|
| 1021 |
+
"epoch": 0.01883797855140395,
|
| 1022 |
+
"learning_rate": 0.0001962353709005147,
|
| 1023 |
+
"loss": 0.0714,
|
| 1024 |
+
"mean_token_accuracy": 0.9714816689491272,
|
| 1025 |
+
"num_tokens": 1794777.0,
|
| 1026 |
+
"step": 1270
|
| 1027 |
+
},
|
| 1028 |
+
{
|
| 1029 |
+
"epoch": 0.018986309091178783,
|
| 1030 |
+
"learning_rate": 0.00019620570479255976,
|
| 1031 |
+
"loss": 0.0668,
|
| 1032 |
+
"mean_token_accuracy": 0.9810837268829345,
|
| 1033 |
+
"num_tokens": 1809000.0,
|
| 1034 |
+
"step": 1280
|
| 1035 |
+
},
|
| 1036 |
+
{
|
| 1037 |
+
"epoch": 0.019134639630953618,
|
| 1038 |
+
"learning_rate": 0.00019617603868460478,
|
| 1039 |
+
"loss": 0.0793,
|
| 1040 |
+
"mean_token_accuracy": 0.9687060952186585,
|
| 1041 |
+
"num_tokens": 1822994.0,
|
| 1042 |
+
"step": 1290
|
| 1043 |
+
},
|
| 1044 |
+
{
|
| 1045 |
+
"epoch": 0.019282970170728452,
|
| 1046 |
+
"learning_rate": 0.0001961463725766498,
|
| 1047 |
+
"loss": 0.1032,
|
| 1048 |
+
"mean_token_accuracy": 0.9625421404838562,
|
| 1049 |
+
"num_tokens": 1836944.0,
|
| 1050 |
+
"step": 1300
|
| 1051 |
+
},
|
| 1052 |
+
{
|
| 1053 |
+
"epoch": 0.019431300710503287,
|
| 1054 |
+
"learning_rate": 0.00019611670646869485,
|
| 1055 |
+
"loss": 0.0492,
|
| 1056 |
+
"mean_token_accuracy": 0.9850762605667114,
|
| 1057 |
+
"num_tokens": 1851184.0,
|
| 1058 |
+
"step": 1310
|
| 1059 |
+
},
|
| 1060 |
+
{
|
| 1061 |
+
"epoch": 0.01957963125027812,
|
| 1062 |
+
"learning_rate": 0.00019608704036073988,
|
| 1063 |
+
"loss": 0.057,
|
| 1064 |
+
"mean_token_accuracy": 0.9841500043869018,
|
| 1065 |
+
"num_tokens": 1865381.0,
|
| 1066 |
+
"step": 1320
|
| 1067 |
+
},
|
| 1068 |
+
{
|
| 1069 |
+
"epoch": 0.019727961790052952,
|
| 1070 |
+
"learning_rate": 0.00019605737425278493,
|
| 1071 |
+
"loss": 0.0896,
|
| 1072 |
+
"mean_token_accuracy": 0.9729780375957489,
|
| 1073 |
+
"num_tokens": 1879523.0,
|
| 1074 |
+
"step": 1330
|
| 1075 |
+
},
|
| 1076 |
+
{
|
| 1077 |
+
"epoch": 0.019876292329827787,
|
| 1078 |
+
"learning_rate": 0.00019602770814482995,
|
| 1079 |
+
"loss": 0.0853,
|
| 1080 |
+
"mean_token_accuracy": 0.9706455945968628,
|
| 1081 |
+
"num_tokens": 1893719.0,
|
| 1082 |
+
"step": 1340
|
| 1083 |
+
},
|
| 1084 |
+
{
|
| 1085 |
+
"epoch": 0.02002462286960262,
|
| 1086 |
+
"learning_rate": 0.00019599804203687497,
|
| 1087 |
+
"loss": 0.0721,
|
| 1088 |
+
"mean_token_accuracy": 0.9733758211135864,
|
| 1089 |
+
"num_tokens": 1908020.0,
|
| 1090 |
+
"step": 1350
|
| 1091 |
+
},
|
| 1092 |
+
{
|
| 1093 |
+
"epoch": 0.020172953409377456,
|
| 1094 |
+
"learning_rate": 0.00019596837592892002,
|
| 1095 |
+
"loss": 0.0645,
|
| 1096 |
+
"mean_token_accuracy": 0.9787311971187591,
|
| 1097 |
+
"num_tokens": 1922020.0,
|
| 1098 |
+
"step": 1360
|
| 1099 |
+
},
|
| 1100 |
+
{
|
| 1101 |
+
"epoch": 0.02032128394915229,
|
| 1102 |
+
"learning_rate": 0.00019593870982096505,
|
| 1103 |
+
"loss": 0.0669,
|
| 1104 |
+
"mean_token_accuracy": 0.974337100982666,
|
| 1105 |
+
"num_tokens": 1936105.0,
|
| 1106 |
+
"step": 1370
|
| 1107 |
+
},
|
| 1108 |
+
{
|
| 1109 |
+
"epoch": 0.020469614488927125,
|
| 1110 |
+
"learning_rate": 0.0001959090437130101,
|
| 1111 |
+
"loss": 0.076,
|
| 1112 |
+
"mean_token_accuracy": 0.9723476529121399,
|
| 1113 |
+
"num_tokens": 1950298.0,
|
| 1114 |
+
"step": 1380
|
| 1115 |
+
},
|
| 1116 |
+
{
|
| 1117 |
+
"epoch": 0.02061794502870196,
|
| 1118 |
+
"learning_rate": 0.0001958793776050551,
|
| 1119 |
+
"loss": 0.0691,
|
| 1120 |
+
"mean_token_accuracy": 0.9764381349086761,
|
| 1121 |
+
"num_tokens": 1964529.0,
|
| 1122 |
+
"step": 1390
|
| 1123 |
+
},
|
| 1124 |
+
{
|
| 1125 |
+
"epoch": 0.020766275568476794,
|
| 1126 |
+
"learning_rate": 0.00019584971149710014,
|
| 1127 |
+
"loss": 0.0593,
|
| 1128 |
+
"mean_token_accuracy": 0.9796685576438904,
|
| 1129 |
+
"num_tokens": 1978773.0,
|
| 1130 |
+
"step": 1400
|
| 1131 |
+
},
|
| 1132 |
+
{
|
| 1133 |
+
"epoch": 0.02091460610825163,
|
| 1134 |
+
"learning_rate": 0.00019582004538914517,
|
| 1135 |
+
"loss": 0.0995,
|
| 1136 |
+
"mean_token_accuracy": 0.9613340020179748,
|
| 1137 |
+
"num_tokens": 1993002.0,
|
| 1138 |
+
"step": 1410
|
| 1139 |
+
},
|
| 1140 |
+
{
|
| 1141 |
+
"epoch": 0.021062936648026463,
|
| 1142 |
+
"learning_rate": 0.00019579037928119022,
|
| 1143 |
+
"loss": 0.1051,
|
| 1144 |
+
"mean_token_accuracy": 0.9684881687164306,
|
| 1145 |
+
"num_tokens": 2007048.0,
|
| 1146 |
+
"step": 1420
|
| 1147 |
+
},
|
| 1148 |
+
{
|
| 1149 |
+
"epoch": 0.021211267187801298,
|
| 1150 |
+
"learning_rate": 0.00019576071317323527,
|
| 1151 |
+
"loss": 0.0869,
|
| 1152 |
+
"mean_token_accuracy": 0.9721337735652924,
|
| 1153 |
+
"num_tokens": 2021156.0,
|
| 1154 |
+
"step": 1430
|
| 1155 |
+
},
|
| 1156 |
+
{
|
| 1157 |
+
"epoch": 0.021359597727576132,
|
| 1158 |
+
"learning_rate": 0.00019573104706528026,
|
| 1159 |
+
"loss": 0.0897,
|
| 1160 |
+
"mean_token_accuracy": 0.9715201795101166,
|
| 1161 |
+
"num_tokens": 2035271.0,
|
| 1162 |
+
"step": 1440
|
| 1163 |
+
},
|
| 1164 |
+
{
|
| 1165 |
+
"epoch": 0.021507928267350963,
|
| 1166 |
+
"learning_rate": 0.0001957013809573253,
|
| 1167 |
+
"loss": 0.0706,
|
| 1168 |
+
"mean_token_accuracy": 0.9721573770046235,
|
| 1169 |
+
"num_tokens": 2049388.0,
|
| 1170 |
+
"step": 1450
|
| 1171 |
+
},
|
| 1172 |
+
{
|
| 1173 |
+
"epoch": 0.021656258807125798,
|
| 1174 |
+
"learning_rate": 0.00019567171484937034,
|
| 1175 |
+
"loss": 0.0846,
|
| 1176 |
+
"mean_token_accuracy": 0.9714311838150025,
|
| 1177 |
+
"num_tokens": 2063480.0,
|
| 1178 |
+
"step": 1460
|
| 1179 |
+
},
|
| 1180 |
+
{
|
| 1181 |
+
"epoch": 0.021804589346900632,
|
| 1182 |
+
"learning_rate": 0.00019564204874141539,
|
| 1183 |
+
"loss": 0.0765,
|
| 1184 |
+
"mean_token_accuracy": 0.9730508744716644,
|
| 1185 |
+
"num_tokens": 2077370.0,
|
| 1186 |
+
"step": 1470
|
| 1187 |
+
},
|
| 1188 |
+
{
|
| 1189 |
+
"epoch": 0.021952919886675467,
|
| 1190 |
+
"learning_rate": 0.00019561238263346044,
|
| 1191 |
+
"loss": 0.0791,
|
| 1192 |
+
"mean_token_accuracy": 0.9674266993999481,
|
| 1193 |
+
"num_tokens": 2091582.0,
|
| 1194 |
+
"step": 1480
|
| 1195 |
+
},
|
| 1196 |
+
{
|
| 1197 |
+
"epoch": 0.0221012504264503,
|
| 1198 |
+
"learning_rate": 0.00019558271652550543,
|
| 1199 |
+
"loss": 0.0528,
|
| 1200 |
+
"mean_token_accuracy": 0.9833854794502258,
|
| 1201 |
+
"num_tokens": 2105530.0,
|
| 1202 |
+
"step": 1490
|
| 1203 |
+
},
|
| 1204 |
+
{
|
| 1205 |
+
"epoch": 0.022249580966225136,
|
| 1206 |
+
"learning_rate": 0.00019555305041755048,
|
| 1207 |
+
"loss": 0.0687,
|
| 1208 |
+
"mean_token_accuracy": 0.9794368088245392,
|
| 1209 |
+
"num_tokens": 2119632.0,
|
| 1210 |
+
"step": 1500
|
| 1211 |
+
},
|
| 1212 |
+
{
|
| 1213 |
+
"epoch": 0.02239791150599997,
|
| 1214 |
+
"learning_rate": 0.0001955233843095955,
|
| 1215 |
+
"loss": 0.0636,
|
| 1216 |
+
"mean_token_accuracy": 0.9763643145561218,
|
| 1217 |
+
"num_tokens": 2133725.0,
|
| 1218 |
+
"step": 1510
|
| 1219 |
+
},
|
| 1220 |
+
{
|
| 1221 |
+
"epoch": 0.022546242045774805,
|
| 1222 |
+
"learning_rate": 0.00019549371820164055,
|
| 1223 |
+
"loss": 0.0827,
|
| 1224 |
+
"mean_token_accuracy": 0.9748819410800934,
|
| 1225 |
+
"num_tokens": 2147866.0,
|
| 1226 |
+
"step": 1520
|
| 1227 |
+
},
|
| 1228 |
+
{
|
| 1229 |
+
"epoch": 0.02269457258554964,
|
| 1230 |
+
"learning_rate": 0.00019546405209368558,
|
| 1231 |
+
"loss": 0.0582,
|
| 1232 |
+
"mean_token_accuracy": 0.9838931798934937,
|
| 1233 |
+
"num_tokens": 2161994.0,
|
| 1234 |
+
"step": 1530
|
| 1235 |
+
},
|
| 1236 |
+
{
|
| 1237 |
+
"epoch": 0.022842903125324474,
|
| 1238 |
+
"learning_rate": 0.0001954343859857306,
|
| 1239 |
+
"loss": 0.059,
|
| 1240 |
+
"mean_token_accuracy": 0.9762311697006225,
|
| 1241 |
+
"num_tokens": 2176227.0,
|
| 1242 |
+
"step": 1540
|
| 1243 |
+
},
|
| 1244 |
+
{
|
| 1245 |
+
"epoch": 0.02299123366509931,
|
| 1246 |
+
"learning_rate": 0.00019540471987777565,
|
| 1247 |
+
"loss": 0.0773,
|
| 1248 |
+
"mean_token_accuracy": 0.977224487066269,
|
| 1249 |
+
"num_tokens": 2190154.0,
|
| 1250 |
+
"step": 1550
|
| 1251 |
+
},
|
| 1252 |
+
{
|
| 1253 |
+
"epoch": 0.02313956420487414,
|
| 1254 |
+
"learning_rate": 0.00019537505376982067,
|
| 1255 |
+
"loss": 0.0893,
|
| 1256 |
+
"mean_token_accuracy": 0.9703286468982697,
|
| 1257 |
+
"num_tokens": 2204261.0,
|
| 1258 |
+
"step": 1560
|
| 1259 |
+
},
|
| 1260 |
+
{
|
| 1261 |
+
"epoch": 0.023287894744648974,
|
| 1262 |
+
"learning_rate": 0.00019534538766186572,
|
| 1263 |
+
"loss": 0.0752,
|
| 1264 |
+
"mean_token_accuracy": 0.974709951877594,
|
| 1265 |
+
"num_tokens": 2218636.0,
|
| 1266 |
+
"step": 1570
|
| 1267 |
+
},
|
| 1268 |
+
{
|
| 1269 |
+
"epoch": 0.02343622528442381,
|
| 1270 |
+
"learning_rate": 0.00019531572155391075,
|
| 1271 |
+
"loss": 0.0841,
|
| 1272 |
+
"mean_token_accuracy": 0.9769960045814514,
|
| 1273 |
+
"num_tokens": 2232764.0,
|
| 1274 |
+
"step": 1580
|
| 1275 |
+
},
|
| 1276 |
+
{
|
| 1277 |
+
"epoch": 0.023584555824198643,
|
| 1278 |
+
"learning_rate": 0.00019528605544595577,
|
| 1279 |
+
"loss": 0.0642,
|
| 1280 |
+
"mean_token_accuracy": 0.977448046207428,
|
| 1281 |
+
"num_tokens": 2246804.0,
|
| 1282 |
+
"step": 1590
|
| 1283 |
+
},
|
| 1284 |
+
{
|
| 1285 |
+
"epoch": 0.023732886363973478,
|
| 1286 |
+
"learning_rate": 0.00019525638933800082,
|
| 1287 |
+
"loss": 0.1069,
|
| 1288 |
+
"mean_token_accuracy": 0.9677329897880554,
|
| 1289 |
+
"num_tokens": 2260874.0,
|
| 1290 |
+
"step": 1600
|
| 1291 |
+
},
|
| 1292 |
+
{
|
| 1293 |
+
"epoch": 0.023881216903748313,
|
| 1294 |
+
"learning_rate": 0.00019522672323004584,
|
| 1295 |
+
"loss": 0.0833,
|
| 1296 |
+
"mean_token_accuracy": 0.9727708697319031,
|
| 1297 |
+
"num_tokens": 2275017.0,
|
| 1298 |
+
"step": 1610
|
| 1299 |
+
},
|
| 1300 |
+
{
|
| 1301 |
+
"epoch": 0.024029547443523147,
|
| 1302 |
+
"learning_rate": 0.0001951970571220909,
|
| 1303 |
+
"loss": 0.0785,
|
| 1304 |
+
"mean_token_accuracy": 0.9746884107589722,
|
| 1305 |
+
"num_tokens": 2289036.0,
|
| 1306 |
+
"step": 1620
|
| 1307 |
+
},
|
| 1308 |
+
{
|
| 1309 |
+
"epoch": 0.02417787798329798,
|
| 1310 |
+
"learning_rate": 0.00019516739101413592,
|
| 1311 |
+
"loss": 0.0609,
|
| 1312 |
+
"mean_token_accuracy": 0.9793126463890076,
|
| 1313 |
+
"num_tokens": 2303029.0,
|
| 1314 |
+
"step": 1630
|
| 1315 |
+
},
|
| 1316 |
+
{
|
| 1317 |
+
"epoch": 0.024326208523072816,
|
| 1318 |
+
"learning_rate": 0.00019513772490618094,
|
| 1319 |
+
"loss": 0.0772,
|
| 1320 |
+
"mean_token_accuracy": 0.9758535027503967,
|
| 1321 |
+
"num_tokens": 2317190.0,
|
| 1322 |
+
"step": 1640
|
| 1323 |
+
},
|
| 1324 |
+
{
|
| 1325 |
+
"epoch": 0.02447453906284765,
|
| 1326 |
+
"learning_rate": 0.000195108058798226,
|
| 1327 |
+
"loss": 0.08,
|
| 1328 |
+
"mean_token_accuracy": 0.9721879363059998,
|
| 1329 |
+
"num_tokens": 2331217.0,
|
| 1330 |
+
"step": 1650
|
| 1331 |
+
},
|
| 1332 |
+
{
|
| 1333 |
+
"epoch": 0.024622869602622485,
|
| 1334 |
+
"learning_rate": 0.000195078392690271,
|
| 1335 |
+
"loss": 0.0801,
|
| 1336 |
+
"mean_token_accuracy": 0.9673756003379822,
|
| 1337 |
+
"num_tokens": 2345300.0,
|
| 1338 |
+
"step": 1660
|
| 1339 |
+
},
|
| 1340 |
+
{
|
| 1341 |
+
"epoch": 0.02477120014239732,
|
| 1342 |
+
"learning_rate": 0.00019504872658231603,
|
| 1343 |
+
"loss": 0.0749,
|
| 1344 |
+
"mean_token_accuracy": 0.9750915884971618,
|
| 1345 |
+
"num_tokens": 2359603.0,
|
| 1346 |
+
"step": 1670
|
| 1347 |
+
},
|
| 1348 |
+
{
|
| 1349 |
+
"epoch": 0.02491953068217215,
|
| 1350 |
+
"learning_rate": 0.00019501906047436106,
|
| 1351 |
+
"loss": 0.0913,
|
| 1352 |
+
"mean_token_accuracy": 0.975084537267685,
|
| 1353 |
+
"num_tokens": 2373919.0,
|
| 1354 |
+
"step": 1680
|
| 1355 |
+
},
|
| 1356 |
+
{
|
| 1357 |
+
"epoch": 0.025067861221946985,
|
| 1358 |
+
"learning_rate": 0.0001949893943664061,
|
| 1359 |
+
"loss": 0.057,
|
| 1360 |
+
"mean_token_accuracy": 0.9772068738937378,
|
| 1361 |
+
"num_tokens": 2388316.0,
|
| 1362 |
+
"step": 1690
|
| 1363 |
+
},
|
| 1364 |
+
{
|
| 1365 |
+
"epoch": 0.02521619176172182,
|
| 1366 |
+
"learning_rate": 0.00019495972825845116,
|
| 1367 |
+
"loss": 0.0667,
|
| 1368 |
+
"mean_token_accuracy": 0.9791467607021331,
|
| 1369 |
+
"num_tokens": 2402418.0,
|
| 1370 |
+
"step": 1700
|
| 1371 |
+
},
|
| 1372 |
+
{
|
| 1373 |
+
"epoch": 0.025364522301496654,
|
| 1374 |
+
"learning_rate": 0.00019493006215049618,
|
| 1375 |
+
"loss": 0.0688,
|
| 1376 |
+
"mean_token_accuracy": 0.9734372615814209,
|
| 1377 |
+
"num_tokens": 2416607.0,
|
| 1378 |
+
"step": 1710
|
| 1379 |
+
},
|
| 1380 |
+
{
|
| 1381 |
+
"epoch": 0.02551285284127149,
|
| 1382 |
+
"learning_rate": 0.0001949003960425412,
|
| 1383 |
+
"loss": 0.0738,
|
| 1384 |
+
"mean_token_accuracy": 0.9727434098720551,
|
| 1385 |
+
"num_tokens": 2430760.0,
|
| 1386 |
+
"step": 1720
|
| 1387 |
+
},
|
| 1388 |
+
{
|
| 1389 |
+
"epoch": 0.025661183381046324,
|
| 1390 |
+
"learning_rate": 0.00019487072993458623,
|
| 1391 |
+
"loss": 0.0631,
|
| 1392 |
+
"mean_token_accuracy": 0.9753291308879852,
|
| 1393 |
+
"num_tokens": 2444839.0,
|
| 1394 |
+
"step": 1730
|
| 1395 |
+
},
|
| 1396 |
+
{
|
| 1397 |
+
"epoch": 0.025809513920821158,
|
| 1398 |
+
"learning_rate": 0.00019484106382663128,
|
| 1399 |
+
"loss": 0.0688,
|
| 1400 |
+
"mean_token_accuracy": 0.9749854207038879,
|
| 1401 |
+
"num_tokens": 2459179.0,
|
| 1402 |
+
"step": 1740
|
| 1403 |
+
},
|
| 1404 |
+
{
|
| 1405 |
+
"epoch": 0.025957844460595993,
|
| 1406 |
+
"learning_rate": 0.00019481139771867633,
|
| 1407 |
+
"loss": 0.0798,
|
| 1408 |
+
"mean_token_accuracy": 0.9702626287937164,
|
| 1409 |
+
"num_tokens": 2473205.0,
|
| 1410 |
+
"step": 1750
|
| 1411 |
+
},
|
| 1412 |
+
{
|
| 1413 |
+
"epoch": 0.026106175000370827,
|
| 1414 |
+
"learning_rate": 0.00019478173161072132,
|
| 1415 |
+
"loss": 0.09,
|
| 1416 |
+
"mean_token_accuracy": 0.9717526614665986,
|
| 1417 |
+
"num_tokens": 2487522.0,
|
| 1418 |
+
"step": 1760
|
| 1419 |
+
},
|
| 1420 |
+
{
|
| 1421 |
+
"epoch": 0.026254505540145662,
|
| 1422 |
+
"learning_rate": 0.00019475206550276637,
|
| 1423 |
+
"loss": 0.0785,
|
| 1424 |
+
"mean_token_accuracy": 0.9719234526157379,
|
| 1425 |
+
"num_tokens": 2501550.0,
|
| 1426 |
+
"step": 1770
|
| 1427 |
+
},
|
| 1428 |
+
{
|
| 1429 |
+
"epoch": 0.026402836079920496,
|
| 1430 |
+
"learning_rate": 0.0001947223993948114,
|
| 1431 |
+
"loss": 0.0924,
|
| 1432 |
+
"mean_token_accuracy": 0.9727768480777741,
|
| 1433 |
+
"num_tokens": 2515563.0,
|
| 1434 |
+
"step": 1780
|
| 1435 |
+
},
|
| 1436 |
+
{
|
| 1437 |
+
"epoch": 0.026551166619695327,
|
| 1438 |
+
"learning_rate": 0.00019469273328685645,
|
| 1439 |
+
"loss": 0.0961,
|
| 1440 |
+
"mean_token_accuracy": 0.9694974541664123,
|
| 1441 |
+
"num_tokens": 2529739.0,
|
| 1442 |
+
"step": 1790
|
| 1443 |
+
},
|
| 1444 |
+
{
|
| 1445 |
+
"epoch": 0.026699497159470162,
|
| 1446 |
+
"learning_rate": 0.00019466306717890147,
|
| 1447 |
+
"loss": 0.0959,
|
| 1448 |
+
"mean_token_accuracy": 0.9663477540016174,
|
| 1449 |
+
"num_tokens": 2543804.0,
|
| 1450 |
+
"step": 1800
|
| 1451 |
+
},
|
| 1452 |
+
{
|
| 1453 |
+
"epoch": 0.026847827699244996,
|
| 1454 |
+
"learning_rate": 0.0001946334010709465,
|
| 1455 |
+
"loss": 0.0963,
|
| 1456 |
+
"mean_token_accuracy": 0.9670729875564575,
|
| 1457 |
+
"num_tokens": 2557916.0,
|
| 1458 |
+
"step": 1810
|
| 1459 |
+
},
|
| 1460 |
+
{
|
| 1461 |
+
"epoch": 0.02699615823901983,
|
| 1462 |
+
"learning_rate": 0.00019460373496299154,
|
| 1463 |
+
"loss": 0.0897,
|
| 1464 |
+
"mean_token_accuracy": 0.9735462188720703,
|
| 1465 |
+
"num_tokens": 2571905.0,
|
| 1466 |
+
"step": 1820
|
| 1467 |
+
},
|
| 1468 |
+
{
|
| 1469 |
+
"epoch": 0.027144488778794666,
|
| 1470 |
+
"learning_rate": 0.00019457406885503657,
|
| 1471 |
+
"loss": 0.0811,
|
| 1472 |
+
"mean_token_accuracy": 0.9717726945877075,
|
| 1473 |
+
"num_tokens": 2585903.0,
|
| 1474 |
+
"step": 1830
|
| 1475 |
+
},
|
| 1476 |
+
{
|
| 1477 |
+
"epoch": 0.0272928193185695,
|
| 1478 |
+
"learning_rate": 0.00019454440274708162,
|
| 1479 |
+
"loss": 0.0845,
|
| 1480 |
+
"mean_token_accuracy": 0.9706952691078186,
|
| 1481 |
+
"num_tokens": 2600073.0,
|
| 1482 |
+
"step": 1840
|
| 1483 |
+
},
|
| 1484 |
+
{
|
| 1485 |
+
"epoch": 0.027441149858344335,
|
| 1486 |
+
"learning_rate": 0.00019451473663912664,
|
| 1487 |
+
"loss": 0.0768,
|
| 1488 |
+
"mean_token_accuracy": 0.9728710412979126,
|
| 1489 |
+
"num_tokens": 2614150.0,
|
| 1490 |
+
"step": 1850
|
| 1491 |
+
},
|
| 1492 |
+
{
|
| 1493 |
+
"epoch": 0.02758948039811917,
|
| 1494 |
+
"learning_rate": 0.00019448507053117166,
|
| 1495 |
+
"loss": 0.0758,
|
| 1496 |
+
"mean_token_accuracy": 0.9708646655082702,
|
| 1497 |
+
"num_tokens": 2628224.0,
|
| 1498 |
+
"step": 1860
|
| 1499 |
+
},
|
| 1500 |
+
{
|
| 1501 |
+
"epoch": 0.027737810937894004,
|
| 1502 |
+
"learning_rate": 0.0001944554044232167,
|
| 1503 |
+
"loss": 0.0773,
|
| 1504 |
+
"mean_token_accuracy": 0.9729286253452301,
|
| 1505 |
+
"num_tokens": 2642113.0,
|
| 1506 |
+
"step": 1870
|
| 1507 |
+
},
|
| 1508 |
+
{
|
| 1509 |
+
"epoch": 0.027886141477668838,
|
| 1510 |
+
"learning_rate": 0.00019442573831526173,
|
| 1511 |
+
"loss": 0.0927,
|
| 1512 |
+
"mean_token_accuracy": 0.9710256516933441,
|
| 1513 |
+
"num_tokens": 2656078.0,
|
| 1514 |
+
"step": 1880
|
| 1515 |
+
},
|
| 1516 |
+
{
|
| 1517 |
+
"epoch": 0.028034472017443673,
|
| 1518 |
+
"learning_rate": 0.00019439607220730678,
|
| 1519 |
+
"loss": 0.0632,
|
| 1520 |
+
"mean_token_accuracy": 0.978791344165802,
|
| 1521 |
+
"num_tokens": 2670128.0,
|
| 1522 |
+
"step": 1890
|
| 1523 |
+
},
|
| 1524 |
+
{
|
| 1525 |
+
"epoch": 0.028182802557218507,
|
| 1526 |
+
"learning_rate": 0.0001943664060993518,
|
| 1527 |
+
"loss": 0.0543,
|
| 1528 |
+
"mean_token_accuracy": 0.9814699769020081,
|
| 1529 |
+
"num_tokens": 2684474.0,
|
| 1530 |
+
"step": 1900
|
| 1531 |
+
},
|
| 1532 |
+
{
|
| 1533 |
+
"epoch": 0.02833113309699334,
|
| 1534 |
+
"learning_rate": 0.00019433673999139683,
|
| 1535 |
+
"loss": 0.063,
|
| 1536 |
+
"mean_token_accuracy": 0.9756844103336334,
|
| 1537 |
+
"num_tokens": 2698436.0,
|
| 1538 |
+
"step": 1910
|
| 1539 |
+
},
|
| 1540 |
+
{
|
| 1541 |
+
"epoch": 0.028479463636768173,
|
| 1542 |
+
"learning_rate": 0.00019430707388344188,
|
| 1543 |
+
"loss": 0.0945,
|
| 1544 |
+
"mean_token_accuracy": 0.9691503286361695,
|
| 1545 |
+
"num_tokens": 2712395.0,
|
| 1546 |
+
"step": 1920
|
| 1547 |
+
},
|
| 1548 |
+
{
|
| 1549 |
+
"epoch": 0.028627794176543007,
|
| 1550 |
+
"learning_rate": 0.0001942774077754869,
|
| 1551 |
+
"loss": 0.0601,
|
| 1552 |
+
"mean_token_accuracy": 0.9762123942375183,
|
| 1553 |
+
"num_tokens": 2726506.0,
|
| 1554 |
+
"step": 1930
|
| 1555 |
+
},
|
| 1556 |
+
{
|
| 1557 |
+
"epoch": 0.028776124716317842,
|
| 1558 |
+
"learning_rate": 0.00019424774166753195,
|
| 1559 |
+
"loss": 0.0912,
|
| 1560 |
+
"mean_token_accuracy": 0.9622864723205566,
|
| 1561 |
+
"num_tokens": 2740293.0,
|
| 1562 |
+
"step": 1940
|
| 1563 |
+
},
|
| 1564 |
+
{
|
| 1565 |
+
"epoch": 0.028924455256092677,
|
| 1566 |
+
"learning_rate": 0.00019421807555957695,
|
| 1567 |
+
"loss": 0.0758,
|
| 1568 |
+
"mean_token_accuracy": 0.9752536177635193,
|
| 1569 |
+
"num_tokens": 2754509.0,
|
| 1570 |
+
"step": 1950
|
| 1571 |
+
},
|
| 1572 |
+
{
|
| 1573 |
+
"epoch": 0.02907278579586751,
|
| 1574 |
+
"learning_rate": 0.000194188409451622,
|
| 1575 |
+
"loss": 0.0666,
|
| 1576 |
+
"mean_token_accuracy": 0.9746933698654174,
|
| 1577 |
+
"num_tokens": 2768540.0,
|
| 1578 |
+
"step": 1960
|
| 1579 |
+
},
|
| 1580 |
+
{
|
| 1581 |
+
"epoch": 0.029221116335642346,
|
| 1582 |
+
"learning_rate": 0.00019415874334366705,
|
| 1583 |
+
"loss": 0.0546,
|
| 1584 |
+
"mean_token_accuracy": 0.9834832668304443,
|
| 1585 |
+
"num_tokens": 2782780.0,
|
| 1586 |
+
"step": 1970
|
| 1587 |
+
},
|
| 1588 |
+
{
|
| 1589 |
+
"epoch": 0.02936944687541718,
|
| 1590 |
+
"learning_rate": 0.00019412907723571207,
|
| 1591 |
+
"loss": 0.0671,
|
| 1592 |
+
"mean_token_accuracy": 0.9755673408508301,
|
| 1593 |
+
"num_tokens": 2796973.0,
|
| 1594 |
+
"step": 1980
|
| 1595 |
+
},
|
| 1596 |
+
{
|
| 1597 |
+
"epoch": 0.029517777415192015,
|
| 1598 |
+
"learning_rate": 0.00019409941112775712,
|
| 1599 |
+
"loss": 0.0639,
|
| 1600 |
+
"mean_token_accuracy": 0.9805159747600556,
|
| 1601 |
+
"num_tokens": 2811149.0,
|
| 1602 |
+
"step": 1990
|
| 1603 |
+
},
|
| 1604 |
+
{
|
| 1605 |
+
"epoch": 0.02966610795496685,
|
| 1606 |
+
"learning_rate": 0.00019406974501980212,
|
| 1607 |
+
"loss": 0.0627,
|
| 1608 |
+
"mean_token_accuracy": 0.9782679080963135,
|
| 1609 |
+
"num_tokens": 2825280.0,
|
| 1610 |
+
"step": 2000
|
| 1611 |
+
},
|
| 1612 |
+
{
|
| 1613 |
+
"epoch": 0.029814438494741684,
|
| 1614 |
+
"learning_rate": 0.00019404007891184717,
|
| 1615 |
+
"loss": 0.0661,
|
| 1616 |
+
"mean_token_accuracy": 0.9821899354457855,
|
| 1617 |
+
"num_tokens": 2839485.0,
|
| 1618 |
+
"step": 2010
|
| 1619 |
+
},
|
| 1620 |
+
{
|
| 1621 |
+
"epoch": 0.02996276903451652,
|
| 1622 |
+
"learning_rate": 0.00019401041280389222,
|
| 1623 |
+
"loss": 0.0564,
|
| 1624 |
+
"mean_token_accuracy": 0.9785571038722992,
|
| 1625 |
+
"num_tokens": 2853745.0,
|
| 1626 |
+
"step": 2020
|
| 1627 |
+
},
|
| 1628 |
+
{
|
| 1629 |
+
"epoch": 0.03011109957429135,
|
| 1630 |
+
"learning_rate": 0.00019398074669593724,
|
| 1631 |
+
"loss": 0.0562,
|
| 1632 |
+
"mean_token_accuracy": 0.9799038827419281,
|
| 1633 |
+
"num_tokens": 2867981.0,
|
| 1634 |
+
"step": 2030
|
| 1635 |
+
},
|
| 1636 |
+
{
|
| 1637 |
+
"epoch": 0.030259430114066184,
|
| 1638 |
+
"learning_rate": 0.00019395108058798226,
|
| 1639 |
+
"loss": 0.0535,
|
| 1640 |
+
"mean_token_accuracy": 0.9810381293296814,
|
| 1641 |
+
"num_tokens": 2882309.0,
|
| 1642 |
+
"step": 2040
|
| 1643 |
+
},
|
| 1644 |
+
{
|
| 1645 |
+
"epoch": 0.03040776065384102,
|
| 1646 |
+
"learning_rate": 0.0001939214144800273,
|
| 1647 |
+
"loss": 0.0713,
|
| 1648 |
+
"mean_token_accuracy": 0.9679700911045075,
|
| 1649 |
+
"num_tokens": 2896302.0,
|
| 1650 |
+
"step": 2050
|
| 1651 |
+
},
|
| 1652 |
+
{
|
| 1653 |
+
"epoch": 0.030556091193615853,
|
| 1654 |
+
"learning_rate": 0.00019389174837207234,
|
| 1655 |
+
"loss": 0.064,
|
| 1656 |
+
"mean_token_accuracy": 0.9801303565502166,
|
| 1657 |
+
"num_tokens": 2910539.0,
|
| 1658 |
+
"step": 2060
|
| 1659 |
+
},
|
| 1660 |
+
{
|
| 1661 |
+
"epoch": 0.030704421733390688,
|
| 1662 |
+
"learning_rate": 0.00019386208226411736,
|
| 1663 |
+
"loss": 0.0705,
|
| 1664 |
+
"mean_token_accuracy": 0.9750793814659119,
|
| 1665 |
+
"num_tokens": 2924693.0,
|
| 1666 |
+
"step": 2070
|
| 1667 |
+
},
|
| 1668 |
+
{
|
| 1669 |
+
"epoch": 0.030852752273165522,
|
| 1670 |
+
"learning_rate": 0.0001938324161561624,
|
| 1671 |
+
"loss": 0.0782,
|
| 1672 |
+
"mean_token_accuracy": 0.9702070772647857,
|
| 1673 |
+
"num_tokens": 2939003.0,
|
| 1674 |
+
"step": 2080
|
| 1675 |
+
},
|
| 1676 |
+
{
|
| 1677 |
+
"epoch": 0.031001082812940357,
|
| 1678 |
+
"learning_rate": 0.00019380275004820743,
|
| 1679 |
+
"loss": 0.0673,
|
| 1680 |
+
"mean_token_accuracy": 0.9760224461555481,
|
| 1681 |
+
"num_tokens": 2953090.0,
|
| 1682 |
+
"step": 2090
|
| 1683 |
+
},
|
| 1684 |
+
{
|
| 1685 |
+
"epoch": 0.03114941335271519,
|
| 1686 |
+
"learning_rate": 0.00019377308394025246,
|
| 1687 |
+
"loss": 0.0589,
|
| 1688 |
+
"mean_token_accuracy": 0.9770624697208404,
|
| 1689 |
+
"num_tokens": 2967200.0,
|
| 1690 |
+
"step": 2100
|
| 1691 |
+
},
|
| 1692 |
+
{
|
| 1693 |
+
"epoch": 0.03129774389249002,
|
| 1694 |
+
"learning_rate": 0.0001937434178322975,
|
| 1695 |
+
"loss": 0.0711,
|
| 1696 |
+
"mean_token_accuracy": 0.9783848404884339,
|
| 1697 |
+
"num_tokens": 2981214.0,
|
| 1698 |
+
"step": 2110
|
| 1699 |
+
},
|
| 1700 |
+
{
|
| 1701 |
+
"epoch": 0.03144607443226486,
|
| 1702 |
+
"learning_rate": 0.00019371375172434253,
|
| 1703 |
+
"loss": 0.074,
|
| 1704 |
+
"mean_token_accuracy": 0.9754434108734131,
|
| 1705 |
+
"num_tokens": 2995111.0,
|
| 1706 |
+
"step": 2120
|
| 1707 |
+
},
|
| 1708 |
+
{
|
| 1709 |
+
"epoch": 0.03159440497203969,
|
| 1710 |
+
"learning_rate": 0.00019368408561638758,
|
| 1711 |
+
"loss": 0.0717,
|
| 1712 |
+
"mean_token_accuracy": 0.9739742994308471,
|
| 1713 |
+
"num_tokens": 3009288.0,
|
| 1714 |
+
"step": 2130
|
| 1715 |
+
},
|
| 1716 |
+
{
|
| 1717 |
+
"epoch": 0.03174273551181453,
|
| 1718 |
+
"learning_rate": 0.0001936544195084326,
|
| 1719 |
+
"loss": 0.0722,
|
| 1720 |
+
"mean_token_accuracy": 0.9753423690795898,
|
| 1721 |
+
"num_tokens": 3023460.0,
|
| 1722 |
+
"step": 2140
|
| 1723 |
+
},
|
| 1724 |
+
{
|
| 1725 |
+
"epoch": 0.03189106605158936,
|
| 1726 |
+
"learning_rate": 0.00019362475340047763,
|
| 1727 |
+
"loss": 0.0463,
|
| 1728 |
+
"mean_token_accuracy": 0.982510793209076,
|
| 1729 |
+
"num_tokens": 3037765.0,
|
| 1730 |
+
"step": 2150
|
| 1731 |
+
},
|
| 1732 |
+
{
|
| 1733 |
+
"epoch": 0.0320393965913642,
|
| 1734 |
+
"learning_rate": 0.00019359508729252268,
|
| 1735 |
+
"loss": 0.0858,
|
| 1736 |
+
"mean_token_accuracy": 0.9690262913703919,
|
| 1737 |
+
"num_tokens": 3051960.0,
|
| 1738 |
+
"step": 2160
|
| 1739 |
+
},
|
| 1740 |
+
{
|
| 1741 |
+
"epoch": 0.03218772713113903,
|
| 1742 |
+
"learning_rate": 0.0001935654211845677,
|
| 1743 |
+
"loss": 0.0836,
|
| 1744 |
+
"mean_token_accuracy": 0.9711355090141296,
|
| 1745 |
+
"num_tokens": 3065890.0,
|
| 1746 |
+
"step": 2170
|
| 1747 |
+
},
|
| 1748 |
+
{
|
| 1749 |
+
"epoch": 0.03233605767091387,
|
| 1750 |
+
"learning_rate": 0.00019353575507661272,
|
| 1751 |
+
"loss": 0.0631,
|
| 1752 |
+
"mean_token_accuracy": 0.9817444443702698,
|
| 1753 |
+
"num_tokens": 3080088.0,
|
| 1754 |
+
"step": 2180
|
| 1755 |
+
},
|
| 1756 |
+
{
|
| 1757 |
+
"epoch": 0.0324843882106887,
|
| 1758 |
+
"learning_rate": 0.00019350608896865777,
|
| 1759 |
+
"loss": 0.076,
|
| 1760 |
+
"mean_token_accuracy": 0.974403315782547,
|
| 1761 |
+
"num_tokens": 3094137.0,
|
| 1762 |
+
"step": 2190
|
| 1763 |
+
},
|
| 1764 |
+
{
|
| 1765 |
+
"epoch": 0.03263271875046353,
|
| 1766 |
+
"learning_rate": 0.0001934764228607028,
|
| 1767 |
+
"loss": 0.0728,
|
| 1768 |
+
"mean_token_accuracy": 0.9692744731903076,
|
| 1769 |
+
"num_tokens": 3108250.0,
|
| 1770 |
+
"step": 2200
|
| 1771 |
+
},
|
| 1772 |
+
{
|
| 1773 |
+
"epoch": 0.03278104929023837,
|
| 1774 |
+
"learning_rate": 0.00019344675675274785,
|
| 1775 |
+
"loss": 0.0605,
|
| 1776 |
+
"mean_token_accuracy": 0.9794303894042968,
|
| 1777 |
+
"num_tokens": 3122535.0,
|
| 1778 |
+
"step": 2210
|
| 1779 |
+
},
|
| 1780 |
+
{
|
| 1781 |
+
"epoch": 0.0329293798300132,
|
| 1782 |
+
"learning_rate": 0.00019341709064479287,
|
| 1783 |
+
"loss": 0.0932,
|
| 1784 |
+
"mean_token_accuracy": 0.9737550437450408,
|
| 1785 |
+
"num_tokens": 3136693.0,
|
| 1786 |
+
"step": 2220
|
| 1787 |
+
},
|
| 1788 |
+
{
|
| 1789 |
+
"epoch": 0.03307771036978804,
|
| 1790 |
+
"learning_rate": 0.0001933874245368379,
|
| 1791 |
+
"loss": 0.0726,
|
| 1792 |
+
"mean_token_accuracy": 0.972658348083496,
|
| 1793 |
+
"num_tokens": 3150833.0,
|
| 1794 |
+
"step": 2230
|
| 1795 |
+
},
|
| 1796 |
+
{
|
| 1797 |
+
"epoch": 0.03322604090956287,
|
| 1798 |
+
"learning_rate": 0.00019335775842888294,
|
| 1799 |
+
"loss": 0.0894,
|
| 1800 |
+
"mean_token_accuracy": 0.9667915642261505,
|
| 1801 |
+
"num_tokens": 3165091.0,
|
| 1802 |
+
"step": 2240
|
| 1803 |
+
}
|
| 1804 |
+
],
|
| 1805 |
+
"logging_steps": 10,
|
| 1806 |
+
"max_steps": 67417,
|
| 1807 |
+
"num_input_tokens_seen": 0,
|
| 1808 |
+
"num_train_epochs": 9223372036854775807,
|
| 1809 |
+
"save_steps": 2247,
|
| 1810 |
+
"stateful_callbacks": {
|
| 1811 |
+
"TrainerControl": {
|
| 1812 |
+
"args": {
|
| 1813 |
+
"should_epoch_stop": false,
|
| 1814 |
+
"should_evaluate": false,
|
| 1815 |
+
"should_log": false,
|
| 1816 |
+
"should_save": true,
|
| 1817 |
+
"should_training_stop": false
|
| 1818 |
+
},
|
| 1819 |
+
"attributes": {}
|
| 1820 |
+
}
|
| 1821 |
+
},
|
| 1822 |
+
"total_flos": 1.5068943178039296e+17,
|
| 1823 |
+
"train_batch_size": 12,
|
| 1824 |
+
"trial_name": null,
|
| 1825 |
+
"trial_params": null
|
| 1826 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
|
| 3 |
+
library_name: peft
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Model Card for Model ID
|
| 7 |
+
|
| 8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## Model Details
|
| 13 |
+
|
| 14 |
+
### Model Description
|
| 15 |
+
|
| 16 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- **Developed by:** [More Information Needed]
|
| 21 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 22 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 23 |
+
- **Model type:** [More Information Needed]
|
| 24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 25 |
+
- **License:** [More Information Needed]
|
| 26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 27 |
+
|
| 28 |
+
### Model Sources [optional]
|
| 29 |
+
|
| 30 |
+
<!-- Provide the basic links for the model. -->
|
| 31 |
+
|
| 32 |
+
- **Repository:** [More Information Needed]
|
| 33 |
+
- **Paper [optional]:** [More Information Needed]
|
| 34 |
+
- **Demo [optional]:** [More Information Needed]
|
| 35 |
+
|
| 36 |
+
## Uses
|
| 37 |
+
|
| 38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 39 |
+
|
| 40 |
+
### Direct Use
|
| 41 |
+
|
| 42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 43 |
+
|
| 44 |
+
[More Information Needed]
|
| 45 |
+
|
| 46 |
+
### Downstream Use [optional]
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Out-of-Scope Use
|
| 53 |
+
|
| 54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
## Bias, Risks, and Limitations
|
| 59 |
+
|
| 60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
### Recommendations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 67 |
+
|
| 68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 69 |
+
|
| 70 |
+
## How to Get Started with the Model
|
| 71 |
+
|
| 72 |
+
Use the code below to get started with the model.
|
| 73 |
+
|
| 74 |
+
[More Information Needed]
|
| 75 |
+
|
| 76 |
+
## Training Details
|
| 77 |
+
|
| 78 |
+
### Training Data
|
| 79 |
+
|
| 80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 81 |
+
|
| 82 |
+
[More Information Needed]
|
| 83 |
+
|
| 84 |
+
### Training Procedure
|
| 85 |
+
|
| 86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 87 |
+
|
| 88 |
+
#### Preprocessing [optional]
|
| 89 |
+
|
| 90 |
+
[More Information Needed]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#### Training Hyperparameters
|
| 94 |
+
|
| 95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 96 |
+
|
| 97 |
+
#### Speeds, Sizes, Times [optional]
|
| 98 |
+
|
| 99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 100 |
+
|
| 101 |
+
[More Information Needed]
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 106 |
+
|
| 107 |
+
### Testing Data, Factors & Metrics
|
| 108 |
+
|
| 109 |
+
#### Testing Data
|
| 110 |
+
|
| 111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 112 |
+
|
| 113 |
+
[More Information Needed]
|
| 114 |
+
|
| 115 |
+
#### Factors
|
| 116 |
+
|
| 117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Metrics
|
| 122 |
+
|
| 123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
### Results
|
| 128 |
+
|
| 129 |
+
[More Information Needed]
|
| 130 |
+
|
| 131 |
+
#### Summary
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Model Examination [optional]
|
| 136 |
+
|
| 137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 138 |
+
|
| 139 |
+
[More Information Needed]
|
| 140 |
+
|
| 141 |
+
## Environmental Impact
|
| 142 |
+
|
| 143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 144 |
+
|
| 145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 146 |
+
|
| 147 |
+
- **Hardware Type:** [More Information Needed]
|
| 148 |
+
- **Hours used:** [More Information Needed]
|
| 149 |
+
- **Cloud Provider:** [More Information Needed]
|
| 150 |
+
- **Compute Region:** [More Information Needed]
|
| 151 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 152 |
+
|
| 153 |
+
## Technical Specifications [optional]
|
| 154 |
+
|
| 155 |
+
### Model Architecture and Objective
|
| 156 |
+
|
| 157 |
+
[More Information Needed]
|
| 158 |
+
|
| 159 |
+
### Compute Infrastructure
|
| 160 |
+
|
| 161 |
+
[More Information Needed]
|
| 162 |
+
|
| 163 |
+
#### Hardware
|
| 164 |
+
|
| 165 |
+
[More Information Needed]
|
| 166 |
+
|
| 167 |
+
#### Software
|
| 168 |
+
|
| 169 |
+
[More Information Needed]
|
| 170 |
+
|
| 171 |
+
## Citation [optional]
|
| 172 |
+
|
| 173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 174 |
+
|
| 175 |
+
**BibTeX:**
|
| 176 |
+
|
| 177 |
+
[More Information Needed]
|
| 178 |
+
|
| 179 |
+
**APA:**
|
| 180 |
+
|
| 181 |
+
[More Information Needed]
|
| 182 |
+
|
| 183 |
+
## Glossary [optional]
|
| 184 |
+
|
| 185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## More Information [optional]
|
| 190 |
+
|
| 191 |
+
[More Information Needed]
|
| 192 |
+
|
| 193 |
+
## Model Card Authors [optional]
|
| 194 |
+
|
| 195 |
+
[More Information Needed]
|
| 196 |
+
|
| 197 |
+
## Model Card Contact
|
| 198 |
+
|
| 199 |
+
[More Information Needed]
|
| 200 |
+
### Framework versions
|
| 201 |
+
|
| 202 |
+
- PEFT 0.15.0
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/adapter_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 16,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.0,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"r": 8,
|
| 24 |
+
"rank_pattern": {},
|
| 25 |
+
"revision": null,
|
| 26 |
+
"target_modules": [
|
| 27 |
+
"v_proj",
|
| 28 |
+
"q_proj"
|
| 29 |
+
],
|
| 30 |
+
"task_type": "CAUSAL_LM",
|
| 31 |
+
"trainable_token_indices": null,
|
| 32 |
+
"use_dora": false,
|
| 33 |
+
"use_rslora": false
|
| 34 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
| 33 |
+
"clean_up_tokenization_spaces": false,
|
| 34 |
+
"eos_token": "</s>",
|
| 35 |
+
"extra_special_tokens": {},
|
| 36 |
+
"legacy": false,
|
| 37 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 38 |
+
"pad_token": "</s>",
|
| 39 |
+
"padding_side": "right",
|
| 40 |
+
"sp_model_kwargs": {},
|
| 41 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 42 |
+
"unk_token": "<unk>",
|
| 43 |
+
"use_default_system_prompt": false
|
| 44 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
|
| 3 |
+
library_name: peft
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Model Card for Model ID
|
| 7 |
+
|
| 8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## Model Details
|
| 13 |
+
|
| 14 |
+
### Model Description
|
| 15 |
+
|
| 16 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- **Developed by:** [More Information Needed]
|
| 21 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 22 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 23 |
+
- **Model type:** [More Information Needed]
|
| 24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 25 |
+
- **License:** [More Information Needed]
|
| 26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 27 |
+
|
| 28 |
+
### Model Sources [optional]
|
| 29 |
+
|
| 30 |
+
<!-- Provide the basic links for the model. -->
|
| 31 |
+
|
| 32 |
+
- **Repository:** [More Information Needed]
|
| 33 |
+
- **Paper [optional]:** [More Information Needed]
|
| 34 |
+
- **Demo [optional]:** [More Information Needed]
|
| 35 |
+
|
| 36 |
+
## Uses
|
| 37 |
+
|
| 38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 39 |
+
|
| 40 |
+
### Direct Use
|
| 41 |
+
|
| 42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 43 |
+
|
| 44 |
+
[More Information Needed]
|
| 45 |
+
|
| 46 |
+
### Downstream Use [optional]
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Out-of-Scope Use
|
| 53 |
+
|
| 54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
## Bias, Risks, and Limitations
|
| 59 |
+
|
| 60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
### Recommendations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 67 |
+
|
| 68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 69 |
+
|
| 70 |
+
## How to Get Started with the Model
|
| 71 |
+
|
| 72 |
+
Use the code below to get started with the model.
|
| 73 |
+
|
| 74 |
+
[More Information Needed]
|
| 75 |
+
|
| 76 |
+
## Training Details
|
| 77 |
+
|
| 78 |
+
### Training Data
|
| 79 |
+
|
| 80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 81 |
+
|
| 82 |
+
[More Information Needed]
|
| 83 |
+
|
| 84 |
+
### Training Procedure
|
| 85 |
+
|
| 86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 87 |
+
|
| 88 |
+
#### Preprocessing [optional]
|
| 89 |
+
|
| 90 |
+
[More Information Needed]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#### Training Hyperparameters
|
| 94 |
+
|
| 95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 96 |
+
|
| 97 |
+
#### Speeds, Sizes, Times [optional]
|
| 98 |
+
|
| 99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 100 |
+
|
| 101 |
+
[More Information Needed]
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 106 |
+
|
| 107 |
+
### Testing Data, Factors & Metrics
|
| 108 |
+
|
| 109 |
+
#### Testing Data
|
| 110 |
+
|
| 111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 112 |
+
|
| 113 |
+
[More Information Needed]
|
| 114 |
+
|
| 115 |
+
#### Factors
|
| 116 |
+
|
| 117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Metrics
|
| 122 |
+
|
| 123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
### Results
|
| 128 |
+
|
| 129 |
+
[More Information Needed]
|
| 130 |
+
|
| 131 |
+
#### Summary
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Model Examination [optional]
|
| 136 |
+
|
| 137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 138 |
+
|
| 139 |
+
[More Information Needed]
|
| 140 |
+
|
| 141 |
+
## Environmental Impact
|
| 142 |
+
|
| 143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 144 |
+
|
| 145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 146 |
+
|
| 147 |
+
- **Hardware Type:** [More Information Needed]
|
| 148 |
+
- **Hours used:** [More Information Needed]
|
| 149 |
+
- **Cloud Provider:** [More Information Needed]
|
| 150 |
+
- **Compute Region:** [More Information Needed]
|
| 151 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 152 |
+
|
| 153 |
+
## Technical Specifications [optional]
|
| 154 |
+
|
| 155 |
+
### Model Architecture and Objective
|
| 156 |
+
|
| 157 |
+
[More Information Needed]
|
| 158 |
+
|
| 159 |
+
### Compute Infrastructure
|
| 160 |
+
|
| 161 |
+
[More Information Needed]
|
| 162 |
+
|
| 163 |
+
#### Hardware
|
| 164 |
+
|
| 165 |
+
[More Information Needed]
|
| 166 |
+
|
| 167 |
+
#### Software
|
| 168 |
+
|
| 169 |
+
[More Information Needed]
|
| 170 |
+
|
| 171 |
+
## Citation [optional]
|
| 172 |
+
|
| 173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 174 |
+
|
| 175 |
+
**BibTeX:**
|
| 176 |
+
|
| 177 |
+
[More Information Needed]
|
| 178 |
+
|
| 179 |
+
**APA:**
|
| 180 |
+
|
| 181 |
+
[More Information Needed]
|
| 182 |
+
|
| 183 |
+
## Glossary [optional]
|
| 184 |
+
|
| 185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## More Information [optional]
|
| 190 |
+
|
| 191 |
+
[More Information Needed]
|
| 192 |
+
|
| 193 |
+
## Model Card Authors [optional]
|
| 194 |
+
|
| 195 |
+
[More Information Needed]
|
| 196 |
+
|
| 197 |
+
## Model Card Contact
|
| 198 |
+
|
| 199 |
+
[More Information Needed]
|
| 200 |
+
### Framework versions
|
| 201 |
+
|
| 202 |
+
- PEFT 0.15.0
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/adapter_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 16,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.0,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"r": 8,
|
| 24 |
+
"rank_pattern": {},
|
| 25 |
+
"revision": null,
|
| 26 |
+
"target_modules": [
|
| 27 |
+
"v_proj",
|
| 28 |
+
"q_proj"
|
| 29 |
+
],
|
| 30 |
+
"task_type": "CAUSAL_LM",
|
| 31 |
+
"trainable_token_indices": null,
|
| 32 |
+
"use_dora": false,
|
| 33 |
+
"use_rslora": false
|
| 34 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
|
| 3 |
+
size 14244
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb8112171b5385c5b37366ef9bade4f4b9781d2d1470892eab36e63919d55a16
|
| 3 |
+
size 1064
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
| 33 |
+
"clean_up_tokenization_spaces": false,
|
| 34 |
+
"eos_token": "</s>",
|
| 35 |
+
"extra_special_tokens": {},
|
| 36 |
+
"legacy": false,
|
| 37 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 38 |
+
"pad_token": "</s>",
|
| 39 |
+
"padding_side": "right",
|
| 40 |
+
"sp_model_kwargs": {},
|
| 41 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 42 |
+
"unk_token": "<unk>",
|
| 43 |
+
"use_default_system_prompt": false
|
| 44 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
|
| 3 |
+
library_name: peft
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Model Card for Model ID
|
| 7 |
+
|
| 8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## Model Details
|
| 13 |
+
|
| 14 |
+
### Model Description
|
| 15 |
+
|
| 16 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- **Developed by:** [More Information Needed]
|
| 21 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 22 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 23 |
+
- **Model type:** [More Information Needed]
|
| 24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 25 |
+
- **License:** [More Information Needed]
|
| 26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 27 |
+
|
| 28 |
+
### Model Sources [optional]
|
| 29 |
+
|
| 30 |
+
<!-- Provide the basic links for the model. -->
|
| 31 |
+
|
| 32 |
+
- **Repository:** [More Information Needed]
|
| 33 |
+
- **Paper [optional]:** [More Information Needed]
|
| 34 |
+
- **Demo [optional]:** [More Information Needed]
|
| 35 |
+
|
| 36 |
+
## Uses
|
| 37 |
+
|
| 38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 39 |
+
|
| 40 |
+
### Direct Use
|
| 41 |
+
|
| 42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 43 |
+
|
| 44 |
+
[More Information Needed]
|
| 45 |
+
|
| 46 |
+
### Downstream Use [optional]
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Out-of-Scope Use
|
| 53 |
+
|
| 54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
## Bias, Risks, and Limitations
|
| 59 |
+
|
| 60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
### Recommendations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 67 |
+
|
| 68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 69 |
+
|
| 70 |
+
## How to Get Started with the Model
|
| 71 |
+
|
| 72 |
+
Use the code below to get started with the model.
|
| 73 |
+
|
| 74 |
+
[More Information Needed]
|
| 75 |
+
|
| 76 |
+
## Training Details
|
| 77 |
+
|
| 78 |
+
### Training Data
|
| 79 |
+
|
| 80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 81 |
+
|
| 82 |
+
[More Information Needed]
|
| 83 |
+
|
| 84 |
+
### Training Procedure
|
| 85 |
+
|
| 86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 87 |
+
|
| 88 |
+
#### Preprocessing [optional]
|
| 89 |
+
|
| 90 |
+
[More Information Needed]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#### Training Hyperparameters
|
| 94 |
+
|
| 95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 96 |
+
|
| 97 |
+
#### Speeds, Sizes, Times [optional]
|
| 98 |
+
|
| 99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 100 |
+
|
| 101 |
+
[More Information Needed]
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 106 |
+
|
| 107 |
+
### Testing Data, Factors & Metrics
|
| 108 |
+
|
| 109 |
+
#### Testing Data
|
| 110 |
+
|
| 111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 112 |
+
|
| 113 |
+
[More Information Needed]
|
| 114 |
+
|
| 115 |
+
#### Factors
|
| 116 |
+
|
| 117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Metrics
|
| 122 |
+
|
| 123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
### Results
|
| 128 |
+
|
| 129 |
+
[More Information Needed]
|
| 130 |
+
|
| 131 |
+
#### Summary
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Model Examination [optional]
|
| 136 |
+
|
| 137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 138 |
+
|
| 139 |
+
[More Information Needed]
|
| 140 |
+
|
| 141 |
+
## Environmental Impact
|
| 142 |
+
|
| 143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 144 |
+
|
| 145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 146 |
+
|
| 147 |
+
- **Hardware Type:** [More Information Needed]
|
| 148 |
+
- **Hours used:** [More Information Needed]
|
| 149 |
+
- **Cloud Provider:** [More Information Needed]
|
| 150 |
+
- **Compute Region:** [More Information Needed]
|
| 151 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 152 |
+
|
| 153 |
+
## Technical Specifications [optional]
|
| 154 |
+
|
| 155 |
+
### Model Architecture and Objective
|
| 156 |
+
|
| 157 |
+
[More Information Needed]
|
| 158 |
+
|
| 159 |
+
### Compute Infrastructure
|
| 160 |
+
|
| 161 |
+
[More Information Needed]
|
| 162 |
+
|
| 163 |
+
#### Hardware
|
| 164 |
+
|
| 165 |
+
[More Information Needed]
|
| 166 |
+
|
| 167 |
+
#### Software
|
| 168 |
+
|
| 169 |
+
[More Information Needed]
|
| 170 |
+
|
| 171 |
+
## Citation [optional]
|
| 172 |
+
|
| 173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 174 |
+
|
| 175 |
+
**BibTeX:**
|
| 176 |
+
|
| 177 |
+
[More Information Needed]
|
| 178 |
+
|
| 179 |
+
**APA:**
|
| 180 |
+
|
| 181 |
+
[More Information Needed]
|
| 182 |
+
|
| 183 |
+
## Glossary [optional]
|
| 184 |
+
|
| 185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## More Information [optional]
|
| 190 |
+
|
| 191 |
+
[More Information Needed]
|
| 192 |
+
|
| 193 |
+
## Model Card Authors [optional]
|
| 194 |
+
|
| 195 |
+
[More Information Needed]
|
| 196 |
+
|
| 197 |
+
## Model Card Contact
|
| 198 |
+
|
| 199 |
+
[More Information Needed]
|
| 200 |
+
### Framework versions
|
| 201 |
+
|
| 202 |
+
- PEFT 0.15.0
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/adapter_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 16,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.0,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"r": 8,
|
| 24 |
+
"rank_pattern": {},
|
| 25 |
+
"revision": null,
|
| 26 |
+
"target_modules": [
|
| 27 |
+
"v_proj",
|
| 28 |
+
"q_proj"
|
| 29 |
+
],
|
| 30 |
+
"task_type": "CAUSAL_LM",
|
| 31 |
+
"trainable_token_indices": null,
|
| 32 |
+
"use_dora": false,
|
| 33 |
+
"use_rslora": false
|
| 34 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
| 33 |
+
"clean_up_tokenization_spaces": false,
|
| 34 |
+
"eos_token": "</s>",
|
| 35 |
+
"extra_special_tokens": {},
|
| 36 |
+
"legacy": false,
|
| 37 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 38 |
+
"pad_token": "</s>",
|
| 39 |
+
"padding_side": "right",
|
| 40 |
+
"sp_model_kwargs": {},
|
| 41 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 42 |
+
"unk_token": "<unk>",
|
| 43 |
+
"use_default_system_prompt": false
|
| 44 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/trainer_state.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
|
| 3 |
+
library_name: peft
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Model Card for Model ID
|
| 7 |
+
|
| 8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## Model Details
|
| 13 |
+
|
| 14 |
+
### Model Description
|
| 15 |
+
|
| 16 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- **Developed by:** [More Information Needed]
|
| 21 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 22 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 23 |
+
- **Model type:** [More Information Needed]
|
| 24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 25 |
+
- **License:** [More Information Needed]
|
| 26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 27 |
+
|
| 28 |
+
### Model Sources [optional]
|
| 29 |
+
|
| 30 |
+
<!-- Provide the basic links for the model. -->
|
| 31 |
+
|
| 32 |
+
- **Repository:** [More Information Needed]
|
| 33 |
+
- **Paper [optional]:** [More Information Needed]
|
| 34 |
+
- **Demo [optional]:** [More Information Needed]
|
| 35 |
+
|
| 36 |
+
## Uses
|
| 37 |
+
|
| 38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 39 |
+
|
| 40 |
+
### Direct Use
|
| 41 |
+
|
| 42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 43 |
+
|
| 44 |
+
[More Information Needed]
|
| 45 |
+
|
| 46 |
+
### Downstream Use [optional]
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Out-of-Scope Use
|
| 53 |
+
|
| 54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
## Bias, Risks, and Limitations
|
| 59 |
+
|
| 60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
### Recommendations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 67 |
+
|
| 68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 69 |
+
|
| 70 |
+
## How to Get Started with the Model
|
| 71 |
+
|
| 72 |
+
Use the code below to get started with the model.
|
| 73 |
+
|
| 74 |
+
[More Information Needed]
|
| 75 |
+
|
| 76 |
+
## Training Details
|
| 77 |
+
|
| 78 |
+
### Training Data
|
| 79 |
+
|
| 80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 81 |
+
|
| 82 |
+
[More Information Needed]
|
| 83 |
+
|
| 84 |
+
### Training Procedure
|
| 85 |
+
|
| 86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 87 |
+
|
| 88 |
+
#### Preprocessing [optional]
|
| 89 |
+
|
| 90 |
+
[More Information Needed]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#### Training Hyperparameters
|
| 94 |
+
|
| 95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 96 |
+
|
| 97 |
+
#### Speeds, Sizes, Times [optional]
|
| 98 |
+
|
| 99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 100 |
+
|
| 101 |
+
[More Information Needed]
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 106 |
+
|
| 107 |
+
### Testing Data, Factors & Metrics
|
| 108 |
+
|
| 109 |
+
#### Testing Data
|
| 110 |
+
|
| 111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 112 |
+
|
| 113 |
+
[More Information Needed]
|
| 114 |
+
|
| 115 |
+
#### Factors
|
| 116 |
+
|
| 117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Metrics
|
| 122 |
+
|
| 123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
### Results
|
| 128 |
+
|
| 129 |
+
[More Information Needed]
|
| 130 |
+
|
| 131 |
+
#### Summary
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Model Examination [optional]
|
| 136 |
+
|
| 137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 138 |
+
|
| 139 |
+
[More Information Needed]
|
| 140 |
+
|
| 141 |
+
## Environmental Impact
|
| 142 |
+
|
| 143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 144 |
+
|
| 145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 146 |
+
|
| 147 |
+
- **Hardware Type:** [More Information Needed]
|
| 148 |
+
- **Hours used:** [More Information Needed]
|
| 149 |
+
- **Cloud Provider:** [More Information Needed]
|
| 150 |
+
- **Compute Region:** [More Information Needed]
|
| 151 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 152 |
+
|
| 153 |
+
## Technical Specifications [optional]
|
| 154 |
+
|
| 155 |
+
### Model Architecture and Objective
|
| 156 |
+
|
| 157 |
+
[More Information Needed]
|
| 158 |
+
|
| 159 |
+
### Compute Infrastructure
|
| 160 |
+
|
| 161 |
+
[More Information Needed]
|
| 162 |
+
|
| 163 |
+
#### Hardware
|
| 164 |
+
|
| 165 |
+
[More Information Needed]
|
| 166 |
+
|
| 167 |
+
#### Software
|
| 168 |
+
|
| 169 |
+
[More Information Needed]
|
| 170 |
+
|
| 171 |
+
## Citation [optional]
|
| 172 |
+
|
| 173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 174 |
+
|
| 175 |
+
**BibTeX:**
|
| 176 |
+
|
| 177 |
+
[More Information Needed]
|
| 178 |
+
|
| 179 |
+
**APA:**
|
| 180 |
+
|
| 181 |
+
[More Information Needed]
|
| 182 |
+
|
| 183 |
+
## Glossary [optional]
|
| 184 |
+
|
| 185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## More Information [optional]
|
| 190 |
+
|
| 191 |
+
[More Information Needed]
|
| 192 |
+
|
| 193 |
+
## Model Card Authors [optional]
|
| 194 |
+
|
| 195 |
+
[More Information Needed]
|
| 196 |
+
|
| 197 |
+
## Model Card Contact
|
| 198 |
+
|
| 199 |
+
[More Information Needed]
|
| 200 |
+
### Framework versions
|
| 201 |
+
|
| 202 |
+
- PEFT 0.15.0
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/adapter_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 16,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.0,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"r": 8,
|
| 24 |
+
"rank_pattern": {},
|
| 25 |
+
"revision": null,
|
| 26 |
+
"target_modules": [
|
| 27 |
+
"v_proj",
|
| 28 |
+
"q_proj"
|
| 29 |
+
],
|
| 30 |
+
"task_type": "CAUSAL_LM",
|
| 31 |
+
"trainable_token_indices": null,
|
| 32 |
+
"use_dora": false,
|
| 33 |
+
"use_rslora": false
|
| 34 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer_config.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
|
| 33 |
+
"clean_up_tokenization_spaces": false,
|
| 34 |
+
"eos_token": "</s>",
|
| 35 |
+
"extra_special_tokens": {},
|
| 36 |
+
"legacy": false,
|
| 37 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 38 |
+
"pad_token": "</s>",
|
| 39 |
+
"padding_side": "right",
|
| 40 |
+
"sp_model_kwargs": {},
|
| 41 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 42 |
+
"unk_token": "<unk>",
|
| 43 |
+
"use_default_system_prompt": false
|
| 44 |
+
}
|