gijs committed on
Commit 203d3ff · verified · 1 Parent(s): 8de5419

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +17 -0
  2. gama/gama-20250422_171856/checkpoint-2350/README.md +202 -0
  3. gama/gama-20250422_171856/checkpoint-2350/adapter_config.json +34 -0
  4. gama/gama-20250422_171856/checkpoint-2350/special_tokens_map.json +24 -0
  5. gama/gama-20250422_171856/checkpoint-2350/tokenizer.json +0 -0
  6. gama/gama-20250422_171856/checkpoint-2350/tokenizer_config.json +44 -0
  7. gama/gama-20250422_171856/checkpoint-2350/trainer_state.json +1914 -0
  8. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/README.md +202 -0
  9. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/adapter_config.json +34 -0
  10. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/special_tokens_map.json +24 -0
  11. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer.json +0 -0
  12. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer_config.json +44 -0
  13. gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/trainer_state.json +1378 -0
  14. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/README.md +202 -0
  15. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/adapter_config.json +34 -0
  16. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/special_tokens_map.json +24 -0
  17. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer.json +0 -0
  18. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer_config.json +44 -0
  19. gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/trainer_state.json +1378 -0
  20. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/README.md +202 -0
  21. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/adapter_config.json +34 -0
  22. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/special_tokens_map.json +24 -0
  23. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer.json +0 -0
  24. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer_config.json +44 -0
  25. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/trainer_state.json +1826 -0
  26. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/README.md +202 -0
  27. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/adapter_config.json +34 -0
  28. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/special_tokens_map.json +24 -0
  29. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer.json +0 -0
  30. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer_config.json +44 -0
  31. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/trainer_state.json +0 -0
  32. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/README.md +202 -0
  33. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/adapter_config.json +34 -0
  34. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/rng_state.pth +3 -0
  35. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/scheduler.pt +3 -0
  36. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/special_tokens_map.json +24 -0
  37. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer.json +0 -0
  38. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer_config.json +44 -0
  39. gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/trainer_state.json +0 -0
  40. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/README.md +202 -0
  41. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/adapter_config.json +34 -0
  42. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/special_tokens_map.json +24 -0
  43. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer.json +0 -0
  44. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer_config.json +44 -0
  45. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/trainer_state.json +0 -0
  46. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/README.md +202 -0
  47. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/adapter_config.json +34 -0
  48. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/special_tokens_map.json +24 -0
  49. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer.json +0 -0
  50. gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer_config.json +44 -0
.gitattributes CHANGED
@@ -668,3 +668,20 @@ grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-2700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-2100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-1416/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-14514/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-12036/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-6372/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-11328/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-354/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-4602/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-12744/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-9204/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-3894/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-4956/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-2124/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-13452/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ grpo/grpo-20250411_032518_semantic_mc_qa-lora-e1-bs4-lr1e-06-20250425_181059/checkpoint-12390/tokenizer.json filter=lfs diff=lfs merge=lfs -text
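These entries route each checkpoint's tokenizer.json through Git LFS, so the regular Git history stores only a pointer file. As a minimal sketch (not part of this commit), the entries being exact paths makes a coverage check trivial; real gitattributes matching also handles glob patterns and negations:

```python
# Minimal sketch: check whether a path is LFS-tracked by exact-path
# .gitattributes entries like the ones added in this commit.
# Real gitattributes matching also supports globs and negations.
def lfs_tracked(path: str, gitattributes_text: str) -> bool:
    for line in gitattributes_text.splitlines():
        parts = line.split()
        if parts and parts[0] == path and "filter=lfs" in parts[1:]:
            return True
    return False

with open(".gitattributes") as f:
    attrs = f.read()

print(lfs_tracked(
    "grpo/gpro-clotho-audiocaps-20250404_192953/checkpoint-2700/tokenizer.json",
    attrs,
))  # True once this commit's entries are present
```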
gama/gama-20250422_171856/checkpoint-2350/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.15.0
gama/gama-20250422_171856/checkpoint-2350/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+ }
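This adapter_config.json describes a small LoRA adapter (r=8, lora_alpha=16) on the q_proj and v_proj attention projections, saved in inference mode. A minimal loading sketch with PEFT follows; it assumes the base checkpoint loads through AutoModelForCausalLM, which may not hold for GAMA's Q-Former variant, and the paths are simply the ones recorded in this commit:

```python
# Minimal sketch: attach this checkpoint's LoRA adapter to its base model.
# Assumes the base model is loadable with AutoModelForCausalLM; the GAMA
# Q-Former variant may instead require the project's own loading code.
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_path = "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/"
adapter_path = "gama/gama-20250422_171856/checkpoint-2350"

base = AutoModelForCausalLM.from_pretrained(base_path)
model = PeftModel.from_pretrained(base, adapter_path)  # config sets inference_mode: true
model.eval()
```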
gama/gama-20250422_171856/checkpoint-2350/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
gama/gama-20250422_171856/checkpoint-2350/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-20250422_171856/checkpoint-2350/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
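The chat_template above is the standard Llama-2 [INST]/&lt;&lt;SYS&gt;&gt; format. A short sketch of how it renders a conversation (the local checkpoint path is illustrative):

```python
# Sketch: render a conversation through the Llama-2 chat template stored
# in this tokenizer_config.json. The path is illustrative.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gama/gama-20250422_171856/checkpoint-2350")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe the sound in this clip."},
]
print(tok.apply_chat_template(messages, tokenize=False))
# -> "<s>[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n\nDescribe the sound in this clip. [/INST]"
```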
gama/gama-20250422_171856/checkpoint-2350/trainer_state.json ADDED
@@ -0,0 +1,1914 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.033332387733681315,
+ "eval_steps": 500,
+ "global_step": 2350,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0001418399478028992,
+ "learning_rate": 0.0001999744688093955,
+ "loss": 1.4644,
+ "mean_token_accuracy": 0.6479970395565033,
+ "num_tokens": 10375.0,
+ "step": 10
+ },
+ {
+ "epoch": 0.0002836798956057984,
+ "learning_rate": 0.0001999461008198349,
+ "loss": 1.3344,
+ "mean_token_accuracy": 0.6637877106666565,
+ "num_tokens": 20535.0,
+ "step": 20
+ },
+ {
+ "epoch": 0.0004255198434086976,
+ "learning_rate": 0.00019991773283027433,
+ "loss": 1.3386,
+ "mean_token_accuracy": 0.6674083709716797,
+ "num_tokens": 30845.0,
+ "step": 30
+ },
+ {
+ "epoch": 0.0005673597912115968,
+ "learning_rate": 0.00019988936484071373,
+ "loss": 1.2811,
+ "mean_token_accuracy": 0.6720114409923553,
+ "num_tokens": 41159.0,
+ "step": 40
+ },
+ {
+ "epoch": 0.0007091997390144961,
+ "learning_rate": 0.00019986099685115316,
+ "loss": 1.3092,
+ "mean_token_accuracy": 0.6668342292308808,
+ "num_tokens": 51351.0,
+ "step": 50
+ },
+ {
+ "epoch": 0.0008510396868173952,
+ "learning_rate": 0.0001998326288615926,
+ "loss": 1.2572,
+ "mean_token_accuracy": 0.678832185268402,
+ "num_tokens": 61567.0,
+ "step": 60
+ },
+ {
+ "epoch": 0.0009928796346202944,
+ "learning_rate": 0.00019980426087203202,
+ "loss": 1.3006,
+ "mean_token_accuracy": 0.6674412608146667,
+ "num_tokens": 71722.0,
+ "step": 70
+ },
+ {
+ "epoch": 0.0011347195824231936,
+ "learning_rate": 0.00019977589288247143,
+ "loss": 1.248,
+ "mean_token_accuracy": 0.6744147539138794,
+ "num_tokens": 81984.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.0012765595302260929,
+ "learning_rate": 0.00019974752489291083,
+ "loss": 1.2643,
+ "mean_token_accuracy": 0.6711925864219666,
+ "num_tokens": 92376.0,
+ "step": 90
+ },
+ {
+ "epoch": 0.0014183994780289921,
+ "learning_rate": 0.0001997191569033503,
+ "loss": 1.28,
+ "mean_token_accuracy": 0.67249955534935,
+ "num_tokens": 102699.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.0015602394258318914,
+ "learning_rate": 0.0001996907889137897,
+ "loss": 1.2525,
+ "mean_token_accuracy": 0.6758616745471955,
+ "num_tokens": 112760.0,
+ "step": 110
+ },
+ {
+ "epoch": 0.0017020793736347904,
+ "learning_rate": 0.00019966242092422912,
+ "loss": 1.2454,
+ "mean_token_accuracy": 0.6850893795490265,
+ "num_tokens": 122946.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.0018439193214376897,
+ "learning_rate": 0.00019963405293466852,
+ "loss": 1.2673,
+ "mean_token_accuracy": 0.6743516206741333,
+ "num_tokens": 133225.0,
+ "step": 130
+ },
+ {
+ "epoch": 0.0019857592692405887,
+ "learning_rate": 0.00019960568494510795,
+ "loss": 1.2961,
+ "mean_token_accuracy": 0.672141146659851,
+ "num_tokens": 143602.0,
+ "step": 140
+ },
+ {
+ "epoch": 0.002127599217043488,
+ "learning_rate": 0.00019957731695554738,
+ "loss": 1.2268,
+ "mean_token_accuracy": 0.6834299504756928,
+ "num_tokens": 154040.0,
+ "step": 150
+ },
+ {
+ "epoch": 0.0022694391648463872,
+ "learning_rate": 0.00019954894896598679,
+ "loss": 1.2483,
+ "mean_token_accuracy": 0.6738203048706055,
+ "num_tokens": 164237.0,
+ "step": 160
+ },
+ {
+ "epoch": 0.0024112791126492867,
+ "learning_rate": 0.00019952058097642622,
+ "loss": 1.2644,
+ "mean_token_accuracy": 0.6698677897453308,
+ "num_tokens": 174418.0,
+ "step": 170
+ },
+ {
+ "epoch": 0.0025531190604521858,
+ "learning_rate": 0.00019949221298686562,
+ "loss": 1.2569,
+ "mean_token_accuracy": 0.6776521384716034,
+ "num_tokens": 184818.0,
+ "step": 180
+ },
+ {
+ "epoch": 0.002694959008255085,
+ "learning_rate": 0.00019946384499730505,
+ "loss": 1.218,
+ "mean_token_accuracy": 0.6825460493564606,
+ "num_tokens": 194989.0,
+ "step": 190
+ },
+ {
+ "epoch": 0.0028367989560579843,
+ "learning_rate": 0.00019943547700774448,
+ "loss": 1.1975,
+ "mean_token_accuracy": 0.6864433467388154,
+ "num_tokens": 205418.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.0029786389038608833,
+ "learning_rate": 0.00019940710901818388,
+ "loss": 1.1862,
+ "mean_token_accuracy": 0.6890658676624298,
+ "num_tokens": 215645.0,
+ "step": 210
+ },
+ {
+ "epoch": 0.0031204788516637828,
+ "learning_rate": 0.0001993787410286233,
+ "loss": 1.2141,
+ "mean_token_accuracy": 0.684105110168457,
+ "num_tokens": 225986.0,
+ "step": 220
+ },
+ {
+ "epoch": 0.003262318799466682,
+ "learning_rate": 0.00019935037303906271,
+ "loss": 1.2246,
+ "mean_token_accuracy": 0.6754761219024659,
+ "num_tokens": 236317.0,
+ "step": 230
+ },
+ {
+ "epoch": 0.003404158747269581,
+ "learning_rate": 0.00019932200504950217,
+ "loss": 1.1578,
+ "mean_token_accuracy": 0.6994408905506134,
+ "num_tokens": 246467.0,
+ "step": 240
+ },
+ {
+ "epoch": 0.0035459986950724803,
+ "learning_rate": 0.00019929363705994157,
+ "loss": 1.1978,
+ "mean_token_accuracy": 0.6848730027675629,
+ "num_tokens": 256900.0,
+ "step": 250
+ },
+ {
+ "epoch": 0.0036878386428753794,
+ "learning_rate": 0.00019926526907038098,
+ "loss": 1.1915,
+ "mean_token_accuracy": 0.6893563270568848,
+ "num_tokens": 267130.0,
+ "step": 260
+ },
+ {
+ "epoch": 0.003829678590678279,
+ "learning_rate": 0.0001992369010808204,
+ "loss": 1.1796,
+ "mean_token_accuracy": 0.6892049252986908,
+ "num_tokens": 277453.0,
+ "step": 270
+ },
+ {
+ "epoch": 0.0039715185384811775,
+ "learning_rate": 0.00019920853309125984,
+ "loss": 1.1481,
+ "mean_token_accuracy": 0.696202689409256,
+ "num_tokens": 287556.0,
+ "step": 280
+ },
+ {
+ "epoch": 0.004113358486284077,
+ "learning_rate": 0.00019918016510169927,
+ "loss": 1.1762,
+ "mean_token_accuracy": 0.6873701572418213,
+ "num_tokens": 297903.0,
+ "step": 290
+ },
+ {
+ "epoch": 0.004255198434086976,
+ "learning_rate": 0.00019915179711213867,
+ "loss": 1.1251,
+ "mean_token_accuracy": 0.6990953743457794,
+ "num_tokens": 307835.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.0043970383818898754,
+ "learning_rate": 0.0001991234291225781,
+ "loss": 1.1808,
+ "mean_token_accuracy": 0.6895361363887786,
+ "num_tokens": 318327.0,
+ "step": 310
+ },
+ {
+ "epoch": 0.0045388783296927745,
+ "learning_rate": 0.0001990950611330175,
+ "loss": 1.1756,
+ "mean_token_accuracy": 0.6966897130012513,
+ "num_tokens": 328720.0,
+ "step": 320
+ },
+ {
+ "epoch": 0.0046807182774956735,
+ "learning_rate": 0.00019906669314345693,
+ "loss": 1.1887,
+ "mean_token_accuracy": 0.682871812582016,
+ "num_tokens": 338831.0,
+ "step": 330
+ },
+ {
+ "epoch": 0.004822558225298573,
+ "learning_rate": 0.00019903832515389636,
+ "loss": 1.1953,
+ "mean_token_accuracy": 0.6855618298053742,
+ "num_tokens": 349098.0,
+ "step": 340
+ },
+ {
+ "epoch": 0.0049643981731014725,
+ "learning_rate": 0.00019900995716433577,
+ "loss": 1.1808,
+ "mean_token_accuracy": 0.6865513443946838,
+ "num_tokens": 359364.0,
+ "step": 350
+ },
+ {
+ "epoch": 0.0051062381209043715,
+ "learning_rate": 0.0001989815891747752,
+ "loss": 1.1186,
+ "mean_token_accuracy": 0.7019730627536773,
+ "num_tokens": 369582.0,
+ "step": 360
+ },
+ {
+ "epoch": 0.0052480780687072705,
+ "learning_rate": 0.00019895322118521463,
+ "loss": 1.1432,
+ "mean_token_accuracy": 0.6962596535682678,
+ "num_tokens": 379501.0,
+ "step": 370
+ },
+ {
+ "epoch": 0.00538991801651017,
+ "learning_rate": 0.00019892485319565403,
+ "loss": 1.1904,
+ "mean_token_accuracy": 0.6854041993618012,
+ "num_tokens": 389789.0,
+ "step": 380
+ },
+ {
+ "epoch": 0.0055317579643130695,
+ "learning_rate": 0.00019889648520609346,
+ "loss": 1.1778,
+ "mean_token_accuracy": 0.6948013961315155,
+ "num_tokens": 400015.0,
+ "step": 390
+ },
+ {
+ "epoch": 0.0056735979121159685,
+ "learning_rate": 0.00019886811721653286,
+ "loss": 1.2114,
+ "mean_token_accuracy": 0.6789448976516723,
+ "num_tokens": 410407.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.005815437859918868,
+ "learning_rate": 0.00019883974922697232,
+ "loss": 1.1815,
+ "mean_token_accuracy": 0.6859633207321167,
+ "num_tokens": 420563.0,
+ "step": 410
+ },
+ {
+ "epoch": 0.005957277807721767,
+ "learning_rate": 0.00019881138123741172,
+ "loss": 1.1739,
+ "mean_token_accuracy": 0.6891894340515137,
+ "num_tokens": 430802.0,
+ "step": 420
+ },
+ {
+ "epoch": 0.006099117755524666,
+ "learning_rate": 0.00019878301324785112,
+ "loss": 1.132,
+ "mean_token_accuracy": 0.7001429855823517,
+ "num_tokens": 440998.0,
+ "step": 430
+ },
+ {
+ "epoch": 0.0062409577033275656,
+ "learning_rate": 0.00019875464525829055,
+ "loss": 1.1474,
+ "mean_token_accuracy": 0.693991506099701,
+ "num_tokens": 451216.0,
+ "step": 440
+ },
+ {
+ "epoch": 0.006382797651130465,
+ "learning_rate": 0.00019872627726872996,
+ "loss": 1.199,
+ "mean_token_accuracy": 0.6871366202831268,
+ "num_tokens": 461276.0,
+ "step": 450
+ },
+ {
+ "epoch": 0.006524637598933364,
+ "learning_rate": 0.0001986979092791694,
+ "loss": 1.1204,
+ "mean_token_accuracy": 0.700880628824234,
+ "num_tokens": 471433.0,
+ "step": 460
+ },
+ {
+ "epoch": 0.006666477546736263,
+ "learning_rate": 0.00019866954128960882,
+ "loss": 1.2102,
+ "mean_token_accuracy": 0.6879798650741578,
+ "num_tokens": 481531.0,
+ "step": 470
+ },
+ {
+ "epoch": 0.006808317494539162,
+ "learning_rate": 0.00019864117330004825,
+ "loss": 1.1195,
+ "mean_token_accuracy": 0.6937784433364869,
+ "num_tokens": 491954.0,
+ "step": 480
+ },
+ {
+ "epoch": 0.006950157442342062,
+ "learning_rate": 0.00019861280531048765,
+ "loss": 1.1455,
+ "mean_token_accuracy": 0.7021094024181366,
+ "num_tokens": 502274.0,
+ "step": 490
+ },
+ {
+ "epoch": 0.007091997390144961,
+ "learning_rate": 0.00019858443732092708,
+ "loss": 1.1608,
+ "mean_token_accuracy": 0.6934002816677094,
+ "num_tokens": 512604.0,
+ "step": 500
+ },
+ {
+ "epoch": 0.00723383733794786,
+ "learning_rate": 0.0001985560693313665,
+ "loss": 1.1705,
+ "mean_token_accuracy": 0.695448386669159,
+ "num_tokens": 522792.0,
+ "step": 510
+ },
+ {
+ "epoch": 0.007375677285750759,
+ "learning_rate": 0.0001985277013418059,
+ "loss": 1.1575,
+ "mean_token_accuracy": 0.693002599477768,
+ "num_tokens": 532886.0,
+ "step": 520
+ },
+ {
+ "epoch": 0.007517517233553658,
+ "learning_rate": 0.00019849933335224534,
+ "loss": 1.1261,
+ "mean_token_accuracy": 0.6963518977165222,
+ "num_tokens": 542956.0,
+ "step": 530
+ },
+ {
+ "epoch": 0.007659357181356558,
+ "learning_rate": 0.00019847096536268474,
+ "loss": 1.1214,
+ "mean_token_accuracy": 0.700639396905899,
+ "num_tokens": 553042.0,
+ "step": 540
+ },
+ {
+ "epoch": 0.007801197129159457,
+ "learning_rate": 0.00019844259737312417,
+ "loss": 1.1725,
+ "mean_token_accuracy": 0.6955362856388092,
+ "num_tokens": 563291.0,
+ "step": 550
+ },
+ {
+ "epoch": 0.007943037076962355,
+ "learning_rate": 0.0001984142293835636,
+ "loss": 1.1567,
+ "mean_token_accuracy": 0.6907780110836029,
+ "num_tokens": 573513.0,
+ "step": 560
+ },
+ {
+ "epoch": 0.008084877024765255,
+ "learning_rate": 0.000198385861394003,
+ "loss": 1.1467,
+ "mean_token_accuracy": 0.6990721464157105,
+ "num_tokens": 583892.0,
+ "step": 570
+ },
+ {
+ "epoch": 0.008226716972568155,
+ "learning_rate": 0.00019835749340444244,
+ "loss": 1.1326,
+ "mean_token_accuracy": 0.6969013214111328,
+ "num_tokens": 593838.0,
+ "step": 580
+ },
+ {
+ "epoch": 0.008368556920371053,
+ "learning_rate": 0.00019832912541488187,
+ "loss": 1.1711,
+ "mean_token_accuracy": 0.6954678654670715,
+ "num_tokens": 603881.0,
+ "step": 590
+ },
+ {
+ "epoch": 0.008510396868173953,
+ "learning_rate": 0.00019830075742532127,
+ "loss": 1.0689,
+ "mean_token_accuracy": 0.7051353633403779,
+ "num_tokens": 614246.0,
+ "step": 600
+ },
+ {
+ "epoch": 0.008652236815976851,
+ "learning_rate": 0.0001982723894357607,
+ "loss": 1.1453,
+ "mean_token_accuracy": 0.6947243511676788,
+ "num_tokens": 624741.0,
+ "step": 610
+ },
+ {
+ "epoch": 0.008794076763779751,
+ "learning_rate": 0.0001982440214462001,
+ "loss": 1.1411,
+ "mean_token_accuracy": 0.6965227723121643,
+ "num_tokens": 634891.0,
+ "step": 620
+ },
+ {
+ "epoch": 0.00893591671158265,
+ "learning_rate": 0.00019821565345663953,
+ "loss": 1.1215,
+ "mean_token_accuracy": 0.703746247291565,
+ "num_tokens": 645040.0,
+ "step": 630
+ },
+ {
+ "epoch": 0.009077756659385549,
+ "learning_rate": 0.00019818728546707896,
+ "loss": 1.1233,
+ "mean_token_accuracy": 0.6995523154735566,
+ "num_tokens": 655495.0,
+ "step": 640
+ },
+ {
+ "epoch": 0.009219596607188449,
+ "learning_rate": 0.0001981589174775184,
+ "loss": 1.1817,
+ "mean_token_accuracy": 0.6966328859329224,
+ "num_tokens": 665731.0,
+ "step": 650
+ },
+ {
+ "epoch": 0.009361436554991347,
+ "learning_rate": 0.0001981305494879578,
+ "loss": 1.1228,
+ "mean_token_accuracy": 0.6941804051399231,
+ "num_tokens": 676297.0,
+ "step": 660
+ },
+ {
+ "epoch": 0.009503276502794247,
+ "learning_rate": 0.0001981021814983972,
+ "loss": 1.1338,
+ "mean_token_accuracy": 0.6983364522457123,
+ "num_tokens": 686502.0,
+ "step": 670
+ },
+ {
+ "epoch": 0.009645116450597147,
+ "learning_rate": 0.00019807381350883666,
+ "loss": 1.1052,
+ "mean_token_accuracy": 0.7062141060829162,
+ "num_tokens": 696824.0,
+ "step": 680
+ },
+ {
+ "epoch": 0.009786956398400045,
+ "learning_rate": 0.00019804544551927606,
+ "loss": 1.0801,
+ "mean_token_accuracy": 0.7161077737808228,
+ "num_tokens": 707051.0,
+ "step": 690
+ },
+ {
+ "epoch": 0.009928796346202945,
+ "learning_rate": 0.0001980170775297155,
+ "loss": 1.1189,
+ "mean_token_accuracy": 0.7090901613235474,
+ "num_tokens": 717106.0,
+ "step": 700
+ },
+ {
+ "epoch": 0.010070636294005843,
+ "learning_rate": 0.0001979887095401549,
+ "loss": 1.0914,
+ "mean_token_accuracy": 0.709138709306717,
+ "num_tokens": 727375.0,
+ "step": 710
+ },
+ {
+ "epoch": 0.010212476241808743,
+ "learning_rate": 0.00019796034155059432,
+ "loss": 1.1572,
+ "mean_token_accuracy": 0.6868231952190399,
+ "num_tokens": 737794.0,
+ "step": 720
+ },
+ {
+ "epoch": 0.010354316189611643,
+ "learning_rate": 0.00019793197356103375,
+ "loss": 1.1172,
+ "mean_token_accuracy": 0.6967993915081024,
+ "num_tokens": 748005.0,
+ "step": 730
+ },
+ {
+ "epoch": 0.010496156137414541,
+ "learning_rate": 0.00019790360557147315,
+ "loss": 1.1412,
+ "mean_token_accuracy": 0.7041096150875091,
+ "num_tokens": 758273.0,
+ "step": 740
+ },
+ {
+ "epoch": 0.010637996085217441,
+ "learning_rate": 0.00019787523758191258,
+ "loss": 1.1151,
+ "mean_token_accuracy": 0.701738464832306,
+ "num_tokens": 768532.0,
+ "step": 750
+ },
+ {
+ "epoch": 0.01077983603302034,
+ "learning_rate": 0.000197846869592352,
+ "loss": 1.0879,
+ "mean_token_accuracy": 0.7017017006874084,
+ "num_tokens": 778726.0,
+ "step": 760
+ },
+ {
+ "epoch": 0.010921675980823239,
+ "learning_rate": 0.00019781850160279142,
+ "loss": 1.1212,
+ "mean_token_accuracy": 0.6997404515743255,
+ "num_tokens": 788976.0,
+ "step": 770
+ },
+ {
+ "epoch": 0.011063515928626139,
+ "learning_rate": 0.00019779013361323085,
+ "loss": 1.11,
+ "mean_token_accuracy": 0.6939224183559418,
+ "num_tokens": 799236.0,
+ "step": 780
+ },
+ {
+ "epoch": 0.011205355876429037,
+ "learning_rate": 0.00019776176562367025,
+ "loss": 1.1234,
+ "mean_token_accuracy": 0.7009412169456481,
+ "num_tokens": 809407.0,
+ "step": 790
+ },
+ {
+ "epoch": 0.011347195824231937,
+ "learning_rate": 0.00019773339763410968,
+ "loss": 1.1241,
+ "mean_token_accuracy": 0.700713324546814,
+ "num_tokens": 819778.0,
+ "step": 800
+ },
+ {
+ "epoch": 0.011489035772034835,
+ "learning_rate": 0.0001977050296445491,
+ "loss": 1.1238,
+ "mean_token_accuracy": 0.7003967940807343,
+ "num_tokens": 829656.0,
+ "step": 810
+ },
+ {
+ "epoch": 0.011630875719837735,
+ "learning_rate": 0.00019767666165498854,
+ "loss": 1.121,
+ "mean_token_accuracy": 0.703648030757904,
+ "num_tokens": 839892.0,
+ "step": 820
+ },
+ {
+ "epoch": 0.011772715667640635,
+ "learning_rate": 0.00019764829366542794,
+ "loss": 1.1255,
+ "mean_token_accuracy": 0.7010339736938477,
+ "num_tokens": 849962.0,
+ "step": 830
+ },
+ {
+ "epoch": 0.011914555615443533,
+ "learning_rate": 0.00019761992567586734,
+ "loss": 1.1608,
+ "mean_token_accuracy": 0.691221284866333,
+ "num_tokens": 860382.0,
+ "step": 840
+ },
+ {
+ "epoch": 0.012056395563246433,
+ "learning_rate": 0.00019759155768630677,
+ "loss": 1.1149,
+ "mean_token_accuracy": 0.702646654844284,
+ "num_tokens": 870754.0,
+ "step": 850
+ },
+ {
+ "epoch": 0.012198235511049331,
+ "learning_rate": 0.0001975631896967462,
+ "loss": 1.1507,
+ "mean_token_accuracy": 0.6957122564315796,
+ "num_tokens": 880951.0,
+ "step": 860
+ },
+ {
+ "epoch": 0.012340075458852231,
+ "learning_rate": 0.00019753482170718563,
+ "loss": 1.1001,
+ "mean_token_accuracy": 0.7044150590896606,
+ "num_tokens": 891008.0,
+ "step": 870
+ },
+ {
+ "epoch": 0.012481915406655131,
+ "learning_rate": 0.00019750645371762504,
+ "loss": 1.1419,
+ "mean_token_accuracy": 0.7001836776733399,
+ "num_tokens": 901300.0,
+ "step": 880
+ },
+ {
+ "epoch": 0.01262375535445803,
+ "learning_rate": 0.00019747808572806447,
+ "loss": 1.1985,
+ "mean_token_accuracy": 0.6821943819522858,
+ "num_tokens": 911677.0,
+ "step": 890
+ },
+ {
+ "epoch": 0.01276559530226093,
+ "learning_rate": 0.0001974497177385039,
+ "loss": 1.1275,
+ "mean_token_accuracy": 0.6965928733348846,
+ "num_tokens": 921937.0,
+ "step": 900
+ },
+ {
+ "epoch": 0.012907435250063827,
+ "learning_rate": 0.0001974213497489433,
+ "loss": 1.1085,
+ "mean_token_accuracy": 0.7022442996501923,
+ "num_tokens": 932391.0,
+ "step": 910
+ },
+ {
+ "epoch": 0.013049275197866727,
+ "learning_rate": 0.00019739298175938273,
+ "loss": 1.1387,
+ "mean_token_accuracy": 0.7010680258274078,
+ "num_tokens": 942696.0,
+ "step": 920
+ },
+ {
+ "epoch": 0.013191115145669627,
+ "learning_rate": 0.00019736461376982213,
+ "loss": 1.1503,
+ "mean_token_accuracy": 0.693393486738205,
+ "num_tokens": 953051.0,
+ "step": 930
+ },
+ {
+ "epoch": 0.013332955093472525,
+ "learning_rate": 0.00019733624578026156,
+ "loss": 1.1153,
+ "mean_token_accuracy": 0.6921448647975922,
+ "num_tokens": 963299.0,
+ "step": 940
+ },
+ {
+ "epoch": 0.013474795041275425,
+ "learning_rate": 0.000197307877790701,
+ "loss": 1.1478,
+ "mean_token_accuracy": 0.6896324157714844,
+ "num_tokens": 973501.0,
+ "step": 950
+ },
+ {
+ "epoch": 0.013616634989078323,
+ "learning_rate": 0.0001972795098011404,
+ "loss": 1.1163,
+ "mean_token_accuracy": 0.7062079787254334,
+ "num_tokens": 983819.0,
+ "step": 960
+ },
+ {
+ "epoch": 0.013758474936881223,
+ "learning_rate": 0.00019725114181157983,
+ "loss": 1.134,
+ "mean_token_accuracy": 0.6980367541313172,
+ "num_tokens": 994251.0,
+ "step": 970
+ },
+ {
+ "epoch": 0.013900314884684123,
+ "learning_rate": 0.00019722277382201923,
+ "loss": 1.1761,
+ "mean_token_accuracy": 0.6891869902610779,
+ "num_tokens": 1004567.0,
+ "step": 980
+ },
+ {
+ "epoch": 0.014042154832487021,
+ "learning_rate": 0.00019719440583245869,
+ "loss": 1.1048,
+ "mean_token_accuracy": 0.7076132833957672,
+ "num_tokens": 1014866.0,
+ "step": 990
+ },
+ {
+ "epoch": 0.014183994780289921,
+ "learning_rate": 0.0001971660378428981,
+ "loss": 1.1435,
+ "mean_token_accuracy": 0.6922993123531341,
+ "num_tokens": 1025085.0,
+ "step": 1000
+ },
+ {
+ "epoch": 0.01432583472809282,
+ "learning_rate": 0.0001971376698533375,
+ "loss": 1.0938,
+ "mean_token_accuracy": 0.7052319467067718,
+ "num_tokens": 1035538.0,
+ "step": 1010
+ },
+ {
+ "epoch": 0.01446767467589572,
+ "learning_rate": 0.00019710930186377692,
+ "loss": 1.1406,
+ "mean_token_accuracy": 0.6931207239627838,
+ "num_tokens": 1045628.0,
+ "step": 1020
+ },
+ {
+ "epoch": 0.01460951462369862,
+ "learning_rate": 0.00019708093387421632,
+ "loss": 1.1303,
+ "mean_token_accuracy": 0.698544704914093,
+ "num_tokens": 1055971.0,
+ "step": 1030
+ },
+ {
+ "epoch": 0.014751354571501517,
+ "learning_rate": 0.00019705256588465578,
+ "loss": 1.1573,
+ "mean_token_accuracy": 0.689105898141861,
+ "num_tokens": 1066042.0,
+ "step": 1040
+ },
+ {
+ "epoch": 0.014893194519304417,
+ "learning_rate": 0.00019702419789509518,
+ "loss": 1.0628,
+ "mean_token_accuracy": 0.7112344741821289,
+ "num_tokens": 1076048.0,
+ "step": 1050
+ },
+ {
+ "epoch": 0.015035034467107316,
+ "learning_rate": 0.00019699582990553461,
+ "loss": 1.1377,
+ "mean_token_accuracy": 0.6986138761043549,
+ "num_tokens": 1086480.0,
+ "step": 1060
+ },
+ {
+ "epoch": 0.015176874414910215,
+ "learning_rate": 0.00019696746191597402,
+ "loss": 1.1203,
+ "mean_token_accuracy": 0.7030752837657929,
+ "num_tokens": 1096900.0,
+ "step": 1070
+ },
+ {
+ "epoch": 0.015318714362713115,
+ "learning_rate": 0.00019693909392641345,
+ "loss": 1.1438,
+ "mean_token_accuracy": 0.6927322566509246,
+ "num_tokens": 1107480.0,
+ "step": 1080
+ },
+ {
+ "epoch": 0.015460554310516014,
+ "learning_rate": 0.00019691072593685288,
+ "loss": 1.0853,
+ "mean_token_accuracy": 0.7137106359004974,
+ "num_tokens": 1117902.0,
+ "step": 1090
+ },
+ {
+ "epoch": 0.015602394258318913,
+ "learning_rate": 0.00019688235794729228,
+ "loss": 1.1405,
+ "mean_token_accuracy": 0.6944672584533691,
+ "num_tokens": 1128045.0,
+ "step": 1100
+ },
+ {
+ "epoch": 0.015744234206121813,
+ "learning_rate": 0.0001968539899577317,
+ "loss": 1.1166,
+ "mean_token_accuracy": 0.6966266989707947,
+ "num_tokens": 1138338.0,
+ "step": 1110
+ },
+ {
+ "epoch": 0.01588607415392471,
+ "learning_rate": 0.0001968256219681711,
+ "loss": 1.1176,
+ "mean_token_accuracy": 0.7043495059013367,
+ "num_tokens": 1148428.0,
+ "step": 1120
+ },
+ {
+ "epoch": 0.01602791410172761,
+ "learning_rate": 0.00019679725397861054,
+ "loss": 1.0583,
+ "mean_token_accuracy": 0.7091330707073211,
+ "num_tokens": 1158359.0,
+ "step": 1130
+ },
+ {
+ "epoch": 0.01616975404953051,
+ "learning_rate": 0.00019676888598904997,
+ "loss": 1.2325,
+ "mean_token_accuracy": 0.6778323352336884,
+ "num_tokens": 1168601.0,
+ "step": 1140
+ },
+ {
+ "epoch": 0.01631159399733341,
+ "learning_rate": 0.00019674051799948938,
+ "loss": 1.0737,
+ "mean_token_accuracy": 0.7033764302730561,
+ "num_tokens": 1178900.0,
+ "step": 1150
+ },
+ {
+ "epoch": 0.01645343394513631,
+ "learning_rate": 0.0001967121500099288,
+ "loss": 1.1099,
+ "mean_token_accuracy": 0.7059059202671051,
+ "num_tokens": 1189152.0,
+ "step": 1160
+ },
+ {
+ "epoch": 0.016595273892939206,
+ "learning_rate": 0.00019668378202036824,
+ "loss": 1.1593,
+ "mean_token_accuracy": 0.6904339075088501,
+ "num_tokens": 1199505.0,
+ "step": 1170
+ },
+ {
+ "epoch": 0.016737113840742106,
+ "learning_rate": 0.00019665541403080764,
+ "loss": 1.0748,
+ "mean_token_accuracy": 0.708430927991867,
+ "num_tokens": 1209675.0,
+ "step": 1180
+ },
+ {
+ "epoch": 0.016878953788545006,
+ "learning_rate": 0.00019662704604124707,
+ "loss": 1.1252,
+ "mean_token_accuracy": 0.7042509257793427,
+ "num_tokens": 1219595.0,
+ "step": 1190
+ },
+ {
+ "epoch": 0.017020793736347906,
+ "learning_rate": 0.00019659867805168647,
+ "loss": 1.0726,
+ "mean_token_accuracy": 0.6974501132965087,
+ "num_tokens": 1229844.0,
+ "step": 1200
+ },
+ {
+ "epoch": 0.017162633684150806,
+ "learning_rate": 0.00019657031006212593,
+ "loss": 1.0663,
+ "mean_token_accuracy": 0.7197851002216339,
+ "num_tokens": 1239909.0,
+ "step": 1210
+ },
+ {
+ "epoch": 0.017304473631953702,
+ "learning_rate": 0.00019654194207256533,
+ "loss": 1.0802,
+ "mean_token_accuracy": 0.7073646426200867,
+ "num_tokens": 1250098.0,
+ "step": 1220
+ },
+ {
+ "epoch": 0.017446313579756602,
+ "learning_rate": 0.00019651357408300476,
+ "loss": 1.1082,
+ "mean_token_accuracy": 0.7057863056659699,
+ "num_tokens": 1260400.0,
+ "step": 1230
+ },
+ {
+ "epoch": 0.017588153527559502,
+ "learning_rate": 0.00019648520609344416,
+ "loss": 1.1259,
+ "mean_token_accuracy": 0.699841320514679,
+ "num_tokens": 1270621.0,
+ "step": 1240
+ },
+ {
+ "epoch": 0.0177299934753624,
+ "learning_rate": 0.00019645683810388357,
+ "loss": 1.1123,
+ "mean_token_accuracy": 0.6950526118278504,
+ "num_tokens": 1280810.0,
+ "step": 1250
+ },
+ {
+ "epoch": 0.0178718334231653,
+ "learning_rate": 0.00019642847011432302,
+ "loss": 1.132,
+ "mean_token_accuracy": 0.692725783586502,
+ "num_tokens": 1291192.0,
+ "step": 1260
+ },
+ {
+ "epoch": 0.018013673370968198,
+ "learning_rate": 0.00019640010212476243,
+ "loss": 1.1279,
+ "mean_token_accuracy": 0.703784042596817,
+ "num_tokens": 1301253.0,
+ "step": 1270
+ },
+ {
+ "epoch": 0.018155513318771098,
+ "learning_rate": 0.00019637173413520186,
+ "loss": 1.064,
+ "mean_token_accuracy": 0.7076753437519073,
+ "num_tokens": 1311455.0,
+ "step": 1280
+ },
+ {
+ "epoch": 0.018297353266573998,
+ "learning_rate": 0.00019634336614564126,
+ "loss": 1.1056,
+ "mean_token_accuracy": 0.7009041368961334,
+ "num_tokens": 1322049.0,
+ "step": 1290
+ },
+ {
+ "epoch": 0.018439193214376898,
+ "learning_rate": 0.0001963149981560807,
+ "loss": 1.1307,
+ "mean_token_accuracy": 0.6981959402561188,
+ "num_tokens": 1332529.0,
+ "step": 1300
+ },
+ {
+ "epoch": 0.018581033162179798,
+ "learning_rate": 0.00019628663016652012,
+ "loss": 1.1149,
+ "mean_token_accuracy": 0.6912563383579254,
+ "num_tokens": 1342800.0,
+ "step": 1310
+ },
+ {
+ "epoch": 0.018722873109982694,
+ "learning_rate": 0.00019625826217695952,
+ "loss": 1.0897,
+ "mean_token_accuracy": 0.7025132238864898,
+ "num_tokens": 1353110.0,
+ "step": 1320
+ },
+ {
+ "epoch": 0.018864713057785594,
+ "learning_rate": 0.00019622989418739895,
+ "loss": 1.1089,
+ "mean_token_accuracy": 0.7050705254077911,
+ "num_tokens": 1363266.0,
+ "step": 1330
+ },
+ {
+ "epoch": 0.019006553005588494,
+ "learning_rate": 0.00019620152619783835,
+ "loss": 1.1038,
+ "mean_token_accuracy": 0.7059103548526764,
+ "num_tokens": 1373500.0,
+ "step": 1340
+ },
+ {
+ "epoch": 0.019148392953391394,
+ "learning_rate": 0.00019617315820827778,
+ "loss": 1.1222,
+ "mean_token_accuracy": 0.7083336532115936,
+ "num_tokens": 1383687.0,
+ "step": 1350
+ },
+ {
+ "epoch": 0.019290232901194294,
+ "learning_rate": 0.00019614479021871721,
+ "loss": 1.1291,
+ "mean_token_accuracy": 0.6995004296302796,
+ "num_tokens": 1394120.0,
+ "step": 1360
+ },
+ {
+ "epoch": 0.01943207284899719,
+ "learning_rate": 0.00019611642222915662,
+ "loss": 1.0957,
+ "mean_token_accuracy": 0.7055183351039886,
+ "num_tokens": 1404491.0,
+ "step": 1370
+ },
+ {
+ "epoch": 0.01957391279680009,
+ "learning_rate": 0.00019608805423959605,
+ "loss": 1.1187,
+ "mean_token_accuracy": 0.6980840861797333,
+ "num_tokens": 1414654.0,
+ "step": 1380
+ },
+ {
+ "epoch": 0.01971575274460299,
+ "learning_rate": 0.00019605968625003548,
+ "loss": 1.1484,
+ "mean_token_accuracy": 0.6943079948425293,
+ "num_tokens": 1424822.0,
+ "step": 1390
+ },
+ {
+ "epoch": 0.01985759269240589,
+ "learning_rate": 0.0001960313182604749,
+ "loss": 1.1231,
+ "mean_token_accuracy": 0.7027773916721344,
+ "num_tokens": 1435210.0,
+ "step": 1400
+ },
+ {
+ "epoch": 0.01999943264020879,
+ "learning_rate": 0.0001960029502709143,
+ "loss": 1.0682,
+ "mean_token_accuracy": 0.7105901122093201,
+ "num_tokens": 1445483.0,
+ "step": 1410
+ },
+ {
+ "epoch": 0.020141272588011686,
+ "learning_rate": 0.0001959745822813537,
+ "loss": 1.0946,
+ "mean_token_accuracy": 0.7032223284244538,
+ "num_tokens": 1455942.0,
+ "step": 1420
+ },
+ {
+ "epoch": 0.020283112535814586,
+ "learning_rate": 0.00019594621429179314,
+ "loss": 1.0769,
+ "mean_token_accuracy": 0.7099736094474792,
+ "num_tokens": 1465992.0,
+ "step": 1430
+ },
+ {
+ "epoch": 0.020424952483617486,
+ "learning_rate": 0.00019591784630223257,
+ "loss": 1.044,
+ "mean_token_accuracy": 0.7143494069576264,
+ "num_tokens": 1476070.0,
+ "step": 1440
+ },
+ {
+ "epoch": 0.020566792431420386,
+ "learning_rate": 0.000195889478312672,
+ "loss": 1.0988,
+ "mean_token_accuracy": 0.6990650355815887,
+ "num_tokens": 1486388.0,
+ "step": 1450
+ },
+ {
+ "epoch": 0.020708632379223286,
+ "learning_rate": 0.0001958611103231114,
+ "loss": 1.0812,
+ "mean_token_accuracy": 0.7099774420261383,
+ "num_tokens": 1496689.0,
+ "step": 1460
+ },
+ {
+ "epoch": 0.020850472327026182,
+ "learning_rate": 0.00019583274233355084,
+ "loss": 1.0747,
+ "mean_token_accuracy": 0.7068913519382477,
+ "num_tokens": 1506676.0,
+ "step": 1470
+ },
+ {
+ "epoch": 0.020992312274829082,
+ "learning_rate": 0.00019580437434399027,
+ "loss": 1.1216,
+ "mean_token_accuracy": 0.6996153056621551,
+ "num_tokens": 1516996.0,
+ "step": 1480
+ },
+ {
+ "epoch": 0.021134152222631982,
+ "learning_rate": 0.00019577600635442967,
+ "loss": 1.0814,
+ "mean_token_accuracy": 0.7133045315742492,
+ "num_tokens": 1526981.0,
+ "step": 1490
+ },
+ {
+ "epoch": 0.021275992170434882,
+ "learning_rate": 0.0001957476383648691,
+ "loss": 1.042,
+ "mean_token_accuracy": 0.7156777441501617,
+ "num_tokens": 1536967.0,
+ "step": 1500
+ },
+ {
+ "epoch": 0.021417832118237782,
+ "learning_rate": 0.0001957192703753085,
+ "loss": 1.0735,
+ "mean_token_accuracy": 0.7105422735214233,
+ "num_tokens": 1547354.0,
+ "step": 1510
+ },
+ {
+ "epoch": 0.02155967206604068,
+ "learning_rate": 0.00019569090238574793,
+ "loss": 1.1221,
+ "mean_token_accuracy": 0.6900959551334381,
+ "num_tokens": 1557285.0,
+ "step": 1520
+ },
+ {
+ "epoch": 0.021701512013843578,
+ "learning_rate": 0.00019566253439618736,
+ "loss": 1.1432,
+ "mean_token_accuracy": 0.7033149361610412,
+ "num_tokens": 1567450.0,
+ "step": 1530
+ },
+ {
+ "epoch": 0.021843351961646478,
+ "learning_rate": 0.00019563416640662676,
+ "loss": 1.0863,
+ "mean_token_accuracy": 0.7135837018489838,
+ "num_tokens": 1577684.0,
+ "step": 1540
+ },
+ {
+ "epoch": 0.021985191909449378,
+ "learning_rate": 0.0001956057984170662,
+ "loss": 1.0564,
+ "mean_token_accuracy": 0.7110729515552521,
+ "num_tokens": 1588410.0,
+ "step": 1550
+ },
+ {
+ "epoch": 0.022127031857252278,
+ "learning_rate": 0.0001955774304275056,
+ "loss": 1.1056,
+ "mean_token_accuracy": 0.7048493981361389,
+ "num_tokens": 1598595.0,
+ "step": 1560
+ },
+ {
+ "epoch": 0.022268871805055174,
+ "learning_rate": 0.00019554906243794505,
+ "loss": 1.0904,
+ "mean_token_accuracy": 0.707161259651184,
+ "num_tokens": 1608782.0,
+ "step": 1570
+ },
+ {
+ "epoch": 0.022410711752858074,
+ "learning_rate": 0.00019552069444838446,
+ "loss": 1.0611,
+ "mean_token_accuracy": 0.7107515692710876,
+ "num_tokens": 1618985.0,
+ "step": 1580
+ },
+ {
+ "epoch": 0.022552551700660974,
+ "learning_rate": 0.00019549232645882386,
+ "loss": 1.0864,
+ "mean_token_accuracy": 0.7008066534996032,
+ "num_tokens": 1629402.0,
+ "step": 1590
+ },
+ {
+ "epoch": 0.022694391648463874,
+ "learning_rate": 0.0001954639584692633,
+ "loss": 1.1076,
+ "mean_token_accuracy": 0.6996842324733734,
+ "num_tokens": 1639522.0,
+ "step": 1600
+ },
+ {
+ "epoch": 0.022836231596266774,
+ "learning_rate": 0.00019543559047970272,
+ "loss": 1.1011,
+ "mean_token_accuracy": 0.7056106328964233,
+ "num_tokens": 1649778.0,
+ "step": 1610
+ },
+ {
+ "epoch": 0.02297807154406967,
+ "learning_rate": 0.00019540722249014215,
+ "loss": 1.118,
+ "mean_token_accuracy": 0.696357262134552,
+ "num_tokens": 1660207.0,
+ "step": 1620
+ },
+ {
+ "epoch": 0.02311991149187257,
+ "learning_rate": 0.00019537885450058155,
+ "loss": 1.0676,
+ "mean_token_accuracy": 0.7104279041290283,
+ "num_tokens": 1670377.0,
+ "step": 1630
+ },
+ {
+ "epoch": 0.02326175143967547,
+ "learning_rate": 0.00019535048651102098,
+ "loss": 1.0716,
+ "mean_token_accuracy": 0.7117774069309235,
+ "num_tokens": 1680672.0,
+ "step": 1640
+ },
+ {
+ "epoch": 0.02340359138747837,
+ "learning_rate": 0.00019532211852146038,
+ "loss": 1.088,
+ "mean_token_accuracy": 0.7042657971382141,
+ "num_tokens": 1691141.0,
+ "step": 1650
+ },
+ {
+ "epoch": 0.02354543133528127,
+ "learning_rate": 0.00019529375053189981,
+ "loss": 1.1076,
+ "mean_token_accuracy": 0.7016879081726074,
+ "num_tokens": 1701472.0,
+ "step": 1660
+ },
+ {
+ "epoch": 0.023687271283084167,
+ "learning_rate": 0.00019526538254233924,
+ "loss": 1.0842,
+ "mean_token_accuracy": 0.7067211866378784,
+ "num_tokens": 1711678.0,
+ "step": 1670
+ },
+ {
+ "epoch": 0.023829111230887066,
+ "learning_rate": 0.00019523701455277865,
1351
+ "loss": 1.0875,
1352
+ "mean_token_accuracy": 0.7065619647502899,
1353
+ "num_tokens": 1722146.0,
1354
+ "step": 1680
1355
+ },
1356
+ {
1357
+ "epoch": 0.023970951178689966,
1358
+ "learning_rate": 0.00019520864656321808,
1359
+ "loss": 1.0687,
1360
+ "mean_token_accuracy": 0.7062947809696197,
1361
+ "num_tokens": 1732546.0,
1362
+ "step": 1690
1363
+ },
1364
+ {
1365
+ "epoch": 0.024112791126492866,
1366
+ "learning_rate": 0.0001951802785736575,
1367
+ "loss": 1.0985,
1368
+ "mean_token_accuracy": 0.7017097353935242,
1369
+ "num_tokens": 1742873.0,
1370
+ "step": 1700
1371
+ },
1372
+ {
1373
+ "epoch": 0.024254631074295766,
1374
+ "learning_rate": 0.0001951519105840969,
1375
+ "loss": 1.0768,
1376
+ "mean_token_accuracy": 0.7073511421680451,
1377
+ "num_tokens": 1753232.0,
1378
+ "step": 1710
1379
+ },
1380
+ {
1381
+ "epoch": 0.024396471022098663,
1382
+ "learning_rate": 0.00019512354259453634,
1383
+ "loss": 1.0708,
1384
+ "mean_token_accuracy": 0.7018490791320801,
1385
+ "num_tokens": 1763622.0,
1386
+ "step": 1720
1387
+ },
1388
+ {
1389
+ "epoch": 0.024538310969901563,
1390
+ "learning_rate": 0.00019509517460497574,
1391
+ "loss": 1.1175,
1392
+ "mean_token_accuracy": 0.7016505122184753,
1393
+ "num_tokens": 1774027.0,
1394
+ "step": 1730
1395
+ },
1396
+ {
1397
+ "epoch": 0.024680150917704462,
1398
+ "learning_rate": 0.00019506680661541517,
1399
+ "loss": 1.0774,
1400
+ "mean_token_accuracy": 0.7039576828479767,
1401
+ "num_tokens": 1784277.0,
1402
+ "step": 1740
1403
+ },
1404
+ {
1405
+ "epoch": 0.024821990865507362,
1406
+ "learning_rate": 0.0001950384386258546,
1407
+ "loss": 1.092,
1408
+ "mean_token_accuracy": 0.7060896992683411,
1409
+ "num_tokens": 1794436.0,
1410
+ "step": 1750
1411
+ },
1412
+ {
1413
+ "epoch": 0.024963830813310262,
1414
+ "learning_rate": 0.000195010070636294,
1415
+ "loss": 1.0478,
1416
+ "mean_token_accuracy": 0.708050674200058,
1417
+ "num_tokens": 1804575.0,
1418
+ "step": 1760
1419
+ },
1420
+ {
1421
+ "epoch": 0.02510567076111316,
1422
+ "learning_rate": 0.00019498170264673344,
1423
+ "loss": 1.0737,
1424
+ "mean_token_accuracy": 0.7079983413219452,
1425
+ "num_tokens": 1814603.0,
1426
+ "step": 1770
1427
+ },
1428
+ {
1429
+ "epoch": 0.02524751070891606,
1430
+ "learning_rate": 0.00019495333465717284,
1431
+ "loss": 1.0445,
1432
+ "mean_token_accuracy": 0.7188371956348419,
1433
+ "num_tokens": 1824990.0,
1434
+ "step": 1780
1435
+ },
1436
+ {
1437
+ "epoch": 0.02538935065671896,
1438
+ "learning_rate": 0.0001949249666676123,
1439
+ "loss": 1.0557,
1440
+ "mean_token_accuracy": 0.7108985543251037,
1441
+ "num_tokens": 1835477.0,
1442
+ "step": 1790
1443
+ },
1444
+ {
1445
+ "epoch": 0.02553119060452186,
1446
+ "learning_rate": 0.0001948965986780517,
1447
+ "loss": 1.054,
1448
+ "mean_token_accuracy": 0.7082186043262482,
1449
+ "num_tokens": 1845736.0,
1450
+ "step": 1800
1451
+ },
1452
+ {
1453
+ "epoch": 0.02567303055232476,
1454
+ "learning_rate": 0.00019486823068849113,
1455
+ "loss": 1.0705,
1456
+ "mean_token_accuracy": 0.7082441449165344,
1457
+ "num_tokens": 1855966.0,
1458
+ "step": 1810
1459
+ },
1460
+ {
1461
+ "epoch": 0.025814870500127655,
1462
+ "learning_rate": 0.00019483986269893053,
1463
+ "loss": 1.0473,
1464
+ "mean_token_accuracy": 0.7133744478225708,
1465
+ "num_tokens": 1866194.0,
1466
+ "step": 1820
1467
+ },
1468
+ {
1469
+ "epoch": 0.025956710447930555,
1470
+ "learning_rate": 0.00019481149470936993,
1471
+ "loss": 1.0913,
1472
+ "mean_token_accuracy": 0.700226366519928,
1473
+ "num_tokens": 1876551.0,
1474
+ "step": 1830
1475
+ },
1476
+ {
1477
+ "epoch": 0.026098550395733455,
1478
+ "learning_rate": 0.0001947831267198094,
1479
+ "loss": 1.0627,
1480
+ "mean_token_accuracy": 0.7126640200614929,
1481
+ "num_tokens": 1886799.0,
1482
+ "step": 1840
1483
+ },
1484
+ {
1485
+ "epoch": 0.026240390343536354,
1486
+ "learning_rate": 0.0001947547587302488,
1487
+ "loss": 1.1058,
1488
+ "mean_token_accuracy": 0.7105142951011658,
1489
+ "num_tokens": 1897204.0,
1490
+ "step": 1850
1491
+ },
1492
+ {
1493
+ "epoch": 0.026382230291339254,
1494
+ "learning_rate": 0.00019472639074068822,
1495
+ "loss": 1.1202,
1496
+ "mean_token_accuracy": 0.6887533903121948,
1497
+ "num_tokens": 1907365.0,
1498
+ "step": 1860
1499
+ },
1500
+ {
1501
+ "epoch": 0.02652407023914215,
1502
+ "learning_rate": 0.00019469802275112763,
1503
+ "loss": 1.0668,
1504
+ "mean_token_accuracy": 0.7144553422927856,
1505
+ "num_tokens": 1917702.0,
1506
+ "step": 1870
1507
+ },
1508
+ {
1509
+ "epoch": 0.02666591018694505,
1510
+ "learning_rate": 0.00019466965476156706,
1511
+ "loss": 1.1356,
1512
+ "mean_token_accuracy": 0.6965939939022064,
1513
+ "num_tokens": 1928031.0,
1514
+ "step": 1880
1515
+ },
1516
+ {
1517
+ "epoch": 0.02680775013474795,
1518
+ "learning_rate": 0.0001946412867720065,
1519
+ "loss": 1.0778,
1520
+ "mean_token_accuracy": 0.7089079439640045,
1521
+ "num_tokens": 1938084.0,
1522
+ "step": 1890
1523
+ },
1524
+ {
1525
+ "epoch": 0.02694959008255085,
1526
+ "learning_rate": 0.0001946129187824459,
1527
+ "loss": 1.0483,
1528
+ "mean_token_accuracy": 0.7225513160228729,
1529
+ "num_tokens": 1948205.0,
1530
+ "step": 1900
1531
+ },
1532
+ {
1533
+ "epoch": 0.02709143003035375,
1534
+ "learning_rate": 0.00019458455079288532,
1535
+ "loss": 1.0154,
1536
+ "mean_token_accuracy": 0.7130086362361908,
1537
+ "num_tokens": 1958368.0,
1538
+ "step": 1910
1539
+ },
1540
+ {
1541
+ "epoch": 0.027233269978156647,
1542
+ "learning_rate": 0.00019455618280332472,
1543
+ "loss": 1.1258,
1544
+ "mean_token_accuracy": 0.695936119556427,
1545
+ "num_tokens": 1968738.0,
1546
+ "step": 1920
1547
+ },
1548
+ {
1549
+ "epoch": 0.027375109925959547,
1550
+ "learning_rate": 0.00019452781481376415,
1551
+ "loss": 1.0626,
1552
+ "mean_token_accuracy": 0.7156413078308106,
1553
+ "num_tokens": 1979045.0,
1554
+ "step": 1930
1555
+ },
1556
+ {
1557
+ "epoch": 0.027516949873762447,
1558
+ "learning_rate": 0.00019449944682420358,
1559
+ "loss": 1.0519,
1560
+ "mean_token_accuracy": 0.7116637229919434,
1561
+ "num_tokens": 1989218.0,
1562
+ "step": 1940
1563
+ },
1564
+ {
1565
+ "epoch": 0.027658789821565347,
1566
+ "learning_rate": 0.00019447107883464299,
1567
+ "loss": 1.0712,
1568
+ "mean_token_accuracy": 0.711160945892334,
1569
+ "num_tokens": 1999563.0,
1570
+ "step": 1950
1571
+ },
1572
+ {
1573
+ "epoch": 0.027800629769368247,
1574
+ "learning_rate": 0.00019444271084508242,
1575
+ "loss": 1.1045,
1576
+ "mean_token_accuracy": 0.7045223116874695,
1577
+ "num_tokens": 2009911.0,
1578
+ "step": 1960
1579
+ },
1580
+ {
1581
+ "epoch": 0.027942469717171143,
1582
+ "learning_rate": 0.00019441434285552184,
1583
+ "loss": 1.0463,
1584
+ "mean_token_accuracy": 0.7148343741893768,
1585
+ "num_tokens": 2020120.0,
1586
+ "step": 1970
1587
+ },
1588
+ {
1589
+ "epoch": 0.028084309664974043,
1590
+ "learning_rate": 0.00019438597486596127,
1591
+ "loss": 1.0957,
1592
+ "mean_token_accuracy": 0.7087677419185638,
1593
+ "num_tokens": 2030445.0,
1594
+ "step": 1980
1595
+ },
1596
+ {
1597
+ "epoch": 0.028226149612776943,
1598
+ "learning_rate": 0.00019435760687640068,
1599
+ "loss": 1.0338,
1600
+ "mean_token_accuracy": 0.7160651028156281,
1601
+ "num_tokens": 2040755.0,
1602
+ "step": 1990
1603
+ },
1604
+ {
1605
+ "epoch": 0.028367989560579843,
1606
+ "learning_rate": 0.00019432923888684008,
1607
+ "loss": 1.0508,
1608
+ "mean_token_accuracy": 0.710547685623169,
1609
+ "num_tokens": 2050888.0,
1610
+ "step": 2000
1611
+ },
1612
+ {
1613
+ "epoch": 0.028509829508382743,
1614
+ "learning_rate": 0.00019430087089727954,
1615
+ "loss": 1.0991,
1616
+ "mean_token_accuracy": 0.6983346939086914,
1617
+ "num_tokens": 2061216.0,
1618
+ "step": 2010
1619
+ },
1620
+ {
1621
+ "epoch": 0.02865166945618564,
1622
+ "learning_rate": 0.00019427250290771894,
1623
+ "loss": 1.0335,
1624
+ "mean_token_accuracy": 0.7186195015907287,
1625
+ "num_tokens": 2071685.0,
1626
+ "step": 2020
1627
+ },
1628
+ {
1629
+ "epoch": 0.02879350940398854,
1630
+ "learning_rate": 0.00019424413491815837,
1631
+ "loss": 1.0356,
1632
+ "mean_token_accuracy": 0.707346785068512,
1633
+ "num_tokens": 2081856.0,
1634
+ "step": 2030
1635
+ },
1636
+ {
1637
+ "epoch": 0.02893534935179144,
1638
+ "learning_rate": 0.00019421576692859777,
1639
+ "loss": 1.0796,
1640
+ "mean_token_accuracy": 0.713982081413269,
1641
+ "num_tokens": 2092164.0,
1642
+ "step": 2040
1643
+ },
1644
+ {
1645
+ "epoch": 0.02907718929959434,
1646
+ "learning_rate": 0.0001941873989390372,
1647
+ "loss": 1.0606,
1648
+ "mean_token_accuracy": 0.7008832335472107,
1649
+ "num_tokens": 2102352.0,
1650
+ "step": 2050
1651
+ },
1652
+ {
1653
+ "epoch": 0.02921902924739724,
1654
+ "learning_rate": 0.00019415903094947663,
1655
+ "loss": 1.0889,
1656
+ "mean_token_accuracy": 0.7124337434768677,
1657
+ "num_tokens": 2112565.0,
1658
+ "step": 2060
1659
+ },
1660
+ {
1661
+ "epoch": 0.029360869195200135,
1662
+ "learning_rate": 0.00019413066295991604,
1663
+ "loss": 1.1011,
1664
+ "mean_token_accuracy": 0.7071171522140502,
1665
+ "num_tokens": 2122888.0,
1666
+ "step": 2070
1667
+ },
1668
+ {
1669
+ "epoch": 0.029502709143003035,
1670
+ "learning_rate": 0.00019410229497035547,
1671
+ "loss": 1.1143,
1672
+ "mean_token_accuracy": 0.706645280122757,
1673
+ "num_tokens": 2133066.0,
1674
+ "step": 2080
1675
+ },
1676
+ {
1677
+ "epoch": 0.029644549090805935,
1678
+ "learning_rate": 0.00019407392698079487,
1679
+ "loss": 1.0579,
1680
+ "mean_token_accuracy": 0.7105309844017029,
1681
+ "num_tokens": 2143320.0,
1682
+ "step": 2090
1683
+ },
1684
+ {
1685
+ "epoch": 0.029786389038608835,
1686
+ "learning_rate": 0.0001940455589912343,
1687
+ "loss": 1.0705,
1688
+ "mean_token_accuracy": 0.7104713022708893,
1689
+ "num_tokens": 2153668.0,
1690
+ "step": 2100
1691
+ },
1692
+ {
1693
+ "epoch": 0.029928228986411735,
1694
+ "learning_rate": 0.00019401719100167373,
1695
+ "loss": 1.0957,
1696
+ "mean_token_accuracy": 0.7042783737182617,
1697
+ "num_tokens": 2164139.0,
1698
+ "step": 2110
1699
+ },
1700
+ {
1701
+ "epoch": 0.03007006893421463,
1702
+ "learning_rate": 0.00019398882301211313,
1703
+ "loss": 1.0888,
1704
+ "mean_token_accuracy": 0.7090662837028503,
1705
+ "num_tokens": 2174254.0,
1706
+ "step": 2120
1707
+ },
1708
+ {
1709
+ "epoch": 0.03021190888201753,
1710
+ "learning_rate": 0.00019396045502255256,
1711
+ "loss": 1.085,
1712
+ "mean_token_accuracy": 0.7122257769107818,
1713
+ "num_tokens": 2184390.0,
1714
+ "step": 2130
1715
+ },
1716
+ {
1717
+ "epoch": 0.03035374882982043,
1718
+ "learning_rate": 0.00019393208703299196,
1719
+ "loss": 1.0785,
1720
+ "mean_token_accuracy": 0.7105566322803497,
1721
+ "num_tokens": 2194501.0,
1722
+ "step": 2140
1723
+ },
1724
+ {
1725
+ "epoch": 0.03049558877762333,
1726
+ "learning_rate": 0.00019390371904343142,
1727
+ "loss": 1.078,
1728
+ "mean_token_accuracy": 0.7077171504497528,
1729
+ "num_tokens": 2204777.0,
1730
+ "step": 2150
1731
+ },
1732
+ {
1733
+ "epoch": 0.03063742872542623,
1734
+ "learning_rate": 0.00019387535105387082,
1735
+ "loss": 1.0877,
1736
+ "mean_token_accuracy": 0.7124951481819153,
1737
+ "num_tokens": 2215021.0,
1738
+ "step": 2160
1739
+ },
1740
+ {
1741
+ "epoch": 0.030779268673229127,
1742
+ "learning_rate": 0.00019384698306431023,
1743
+ "loss": 1.114,
1744
+ "mean_token_accuracy": 0.6971140921115875,
1745
+ "num_tokens": 2225350.0,
1746
+ "step": 2170
1747
+ },
1748
+ {
1749
+ "epoch": 0.030921108621032027,
1750
+ "learning_rate": 0.00019381861507474966,
1751
+ "loss": 1.081,
1752
+ "mean_token_accuracy": 0.7068113803863525,
1753
+ "num_tokens": 2235521.0,
1754
+ "step": 2180
1755
+ },
1756
+ {
1757
+ "epoch": 0.031062948568834927,
1758
+ "learning_rate": 0.0001937902470851891,
1759
+ "loss": 1.0834,
1760
+ "mean_token_accuracy": 0.7073126614093781,
1761
+ "num_tokens": 2245925.0,
1762
+ "step": 2190
1763
+ },
1764
+ {
1765
+ "epoch": 0.031204788516637827,
1766
+ "learning_rate": 0.00019376187909562852,
1767
+ "loss": 1.0519,
1768
+ "mean_token_accuracy": 0.7111847221851348,
1769
+ "num_tokens": 2256364.0,
1770
+ "step": 2200
1771
+ },
1772
+ {
1773
+ "epoch": 0.03134662846444072,
1774
+ "learning_rate": 0.00019373351110606792,
1775
+ "loss": 1.0787,
1776
+ "mean_token_accuracy": 0.709247374534607,
1777
+ "num_tokens": 2266801.0,
1778
+ "step": 2210
1779
+ },
1780
+ {
1781
+ "epoch": 0.03148846841224363,
1782
+ "learning_rate": 0.00019370514311650735,
1783
+ "loss": 1.0879,
1784
+ "mean_token_accuracy": 0.7051020622253418,
1785
+ "num_tokens": 2276883.0,
1786
+ "step": 2220
1787
+ },
1788
+ {
1789
+ "epoch": 0.03163030836004652,
1790
+ "learning_rate": 0.00019367677512694675,
1791
+ "loss": 1.0577,
1792
+ "mean_token_accuracy": 0.7003655672073364,
1793
+ "num_tokens": 2287178.0,
1794
+ "step": 2230
1795
+ },
1796
+ {
1797
+ "epoch": 0.03177214830784942,
1798
+ "learning_rate": 0.00019364840713738618,
1799
+ "loss": 1.1319,
1800
+ "mean_token_accuracy": 0.6982253730297089,
1801
+ "num_tokens": 2297269.0,
1802
+ "step": 2240
1803
+ },
1804
+ {
1805
+ "epoch": 0.03191398825565232,
1806
+ "learning_rate": 0.0001936200391478256,
1807
+ "loss": 1.0966,
1808
+ "mean_token_accuracy": 0.7028747737407685,
1809
+ "num_tokens": 2307553.0,
1810
+ "step": 2250
1811
+ },
1812
+ {
1813
+ "epoch": 0.03205582820345522,
1814
+ "learning_rate": 0.00019359167115826502,
1815
+ "loss": 1.0234,
1816
+ "mean_token_accuracy": 0.7202287912368774,
1817
+ "num_tokens": 2317537.0,
1818
+ "step": 2260
1819
+ },
1820
+ {
1821
+ "epoch": 0.03219766815125812,
1822
+ "learning_rate": 0.00019356330316870445,
1823
+ "loss": 1.0733,
1824
+ "mean_token_accuracy": 0.7071494162082672,
1825
+ "num_tokens": 2327652.0,
1826
+ "step": 2270
1827
+ },
1828
+ {
1829
+ "epoch": 0.03233950809906102,
1830
+ "learning_rate": 0.00019353493517914388,
1831
+ "loss": 1.0692,
1832
+ "mean_token_accuracy": 0.704166728258133,
1833
+ "num_tokens": 2337948.0,
1834
+ "step": 2280
1835
+ },
1836
+ {
1837
+ "epoch": 0.032481348046863916,
1838
+ "learning_rate": 0.00019350656718958328,
1839
+ "loss": 1.0926,
1840
+ "mean_token_accuracy": 0.7022884428501129,
1841
+ "num_tokens": 2348363.0,
1842
+ "step": 2290
1843
+ },
1844
+ {
1845
+ "epoch": 0.03262318799466682,
1846
+ "learning_rate": 0.0001934781992000227,
1847
+ "loss": 1.1009,
1848
+ "mean_token_accuracy": 0.7048872351646424,
1849
+ "num_tokens": 2358709.0,
1850
+ "step": 2300
1851
+ },
1852
+ {
1853
+ "epoch": 0.032765027942469716,
1854
+ "learning_rate": 0.0001934498312104621,
1855
+ "loss": 1.0722,
1856
+ "mean_token_accuracy": 0.7133905410766601,
1857
+ "num_tokens": 2368896.0,
1858
+ "step": 2310
1859
+ },
1860
+ {
1861
+ "epoch": 0.03290686789027262,
1862
+ "learning_rate": 0.00019342146322090154,
1863
+ "loss": 1.0448,
1864
+ "mean_token_accuracy": 0.7166795194149017,
1865
+ "num_tokens": 2379133.0,
1866
+ "step": 2320
1867
+ },
1868
+ {
1869
+ "epoch": 0.033048707838075515,
1870
+ "learning_rate": 0.00019339309523134097,
1871
+ "loss": 1.07,
1872
+ "mean_token_accuracy": 0.7065820157527923,
1873
+ "num_tokens": 2389292.0,
1874
+ "step": 2330
1875
+ },
1876
+ {
1877
+ "epoch": 0.03319054778587841,
1878
+ "learning_rate": 0.00019336472724178037,
1879
+ "loss": 1.084,
1880
+ "mean_token_accuracy": 0.7133280396461487,
1881
+ "num_tokens": 2399640.0,
1882
+ "step": 2340
1883
+ },
1884
+ {
1885
+ "epoch": 0.033332387733681315,
1886
+ "learning_rate": 0.0001933363592522198,
1887
+ "loss": 1.0789,
1888
+ "mean_token_accuracy": 0.70665163397789,
1889
+ "num_tokens": 2410039.0,
1890
+ "step": 2350
1891
+ }
1892
+ ],
1893
+ "logging_steps": 10,
1894
+ "max_steps": 70502,
1895
+ "num_input_tokens_seen": 0,
1896
+ "num_train_epochs": 9223372036854775807,
1897
+ "save_steps": 2350,
1898
+ "stateful_callbacks": {
1899
+ "TrainerControl": {
1900
+ "args": {
1901
+ "should_epoch_stop": false,
1902
+ "should_evaluate": false,
1903
+ "should_log": false,
1904
+ "should_save": true,
1905
+ "should_training_stop": false
1906
+ },
1907
+ "attributes": {}
1908
+ }
1909
+ },
1910
+ "total_flos": 1.2857716965310464e+17,
1911
+ "train_batch_size": 16,
1912
+ "trial_name": null,
1913
+ "trial_params": null
1914
+ }
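The trainer_state.json above is the standard Hugging Face Trainer checkpoint state: `log_history` holds one entry per `logging_steps` (10) optimizer steps, and the trailing keys record the run configuration (save interval 2350, batch size 16). A minimal sketch for inspecting it with only the standard library, assuming a local clone of this repository at the path shown:

```python
import json

# Hypothetical local path; adjust to wherever this checkpoint was downloaded.
state_path = "gama/gama-20250422_171856/checkpoint-2350/trainer_state.json"

with open(state_path) as f:
    state = json.load(f)

# Each log_history entry carries the metrics logged every `logging_steps` steps.
for entry in state["log_history"][-5:]:
    print(entry["step"], entry["loss"], entry["mean_token_accuracy"])
```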
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.0
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/adapter_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 8,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "q_proj",
28
+ "v_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "trainable_token_indices": null,
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
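This adapter_config.json describes a rank-8 LoRA adapter (alpha 16, no dropout) applied to the `q_proj` and `v_proj` projections of the base model. A minimal sketch of attaching it with PEFT; note that `base_model_name_or_path` points at a cluster-local path (`/scratch-shared/...`), so you would substitute your own copy of the GAMA/Llama-2 base model:

```python
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM

# Hypothetical local path to this checkpoint directory.
adapter_dir = "gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685"

config = PeftConfig.from_pretrained(adapter_dir)
# Replace config.base_model_name_or_path with a reachable base-model path if needed.
base = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(base, adapter_dir)  # attaches the r=8 LoRA weights
```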
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "padding_side": "right",
40
+ "sp_model_kwargs": {},
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
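The `chat_template` embedded above is the Llama-2 `[INST] ... [/INST]` format, and, per the special_tokens_map.json earlier, `</s>` doubles as the pad token with right-side padding. A minimal sketch of rendering a prompt with this template, assuming the same local checkpoint directory as before:

```python
from transformers import AutoTokenizer

tok_dir = "gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685"
tokenizer = AutoTokenizer.from_pretrained(tok_dir)

messages = [
    {"role": "system", "content": "You answer questions about audio clips."},
    {"role": "user", "content": "Which sound event occurs first?"},
]
# Renders the Llama-2 [INST] prompt defined by chat_template in this config.
print(tokenizer.apply_chat_template(messages, tokenize=False))
```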
gama/gama-semantic_mc_qa--semantic_elements-20250423_222017/checkpoint-1685/trainer_state.json ADDED
@@ -0,0 +1,1378 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.03332542225386654,
6
+ "eval_steps": 500,
7
+ "global_step": 1685,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0001977769866698311,
14
+ "learning_rate": 0.00019996440014239942,
15
+ "loss": 0.3352,
16
+ "mean_token_accuracy": 0.9023264050483704,
17
+ "num_tokens": 19163.0,
18
+ "step": 10
19
+ },
20
+ {
21
+ "epoch": 0.0003955539733396622,
22
+ "learning_rate": 0.0001999248447450655,
23
+ "loss": 0.1524,
24
+ "mean_token_accuracy": 0.947095412015915,
25
+ "num_tokens": 38071.0,
26
+ "step": 20
27
+ },
28
+ {
29
+ "epoch": 0.0005933309600094933,
30
+ "learning_rate": 0.0001998852893477315,
31
+ "loss": 0.1461,
32
+ "mean_token_accuracy": 0.9463379800319671,
33
+ "num_tokens": 57017.0,
34
+ "step": 30
35
+ },
36
+ {
37
+ "epoch": 0.0007911079466793244,
38
+ "learning_rate": 0.00019984573395039754,
39
+ "loss": 0.1024,
40
+ "mean_token_accuracy": 0.9588611423969269,
41
+ "num_tokens": 76047.0,
42
+ "step": 40
43
+ },
44
+ {
45
+ "epoch": 0.0009888849333491555,
46
+ "learning_rate": 0.00019980617855306357,
47
+ "loss": 0.117,
48
+ "mean_token_accuracy": 0.9553473949432373,
49
+ "num_tokens": 94824.0,
50
+ "step": 50
51
+ },
52
+ {
53
+ "epoch": 0.0011866619200189867,
54
+ "learning_rate": 0.0001997666231557296,
55
+ "loss": 0.1281,
56
+ "mean_token_accuracy": 0.9538084208965302,
57
+ "num_tokens": 113467.0,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.0013844389066888178,
62
+ "learning_rate": 0.00019972706775839565,
63
+ "loss": 0.1075,
64
+ "mean_token_accuracy": 0.961921775341034,
65
+ "num_tokens": 132229.0,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.0015822158933586489,
70
+ "learning_rate": 0.00019968751236106166,
71
+ "loss": 0.1057,
72
+ "mean_token_accuracy": 0.9676864743232727,
73
+ "num_tokens": 150811.0,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.00177999288002848,
78
+ "learning_rate": 0.00019964795696372772,
79
+ "loss": 0.1048,
80
+ "mean_token_accuracy": 0.9597055971622467,
81
+ "num_tokens": 169804.0,
82
+ "step": 90
83
+ },
84
+ {
85
+ "epoch": 0.001977769866698311,
86
+ "learning_rate": 0.00019960840156639376,
87
+ "loss": 0.1147,
88
+ "mean_token_accuracy": 0.9561040580272675,
89
+ "num_tokens": 188503.0,
90
+ "step": 100
91
+ },
92
+ {
93
+ "epoch": 0.002175546853368142,
94
+ "learning_rate": 0.00019956884616905977,
95
+ "loss": 0.0904,
96
+ "mean_token_accuracy": 0.9663344562053681,
97
+ "num_tokens": 207370.0,
98
+ "step": 110
99
+ },
100
+ {
101
+ "epoch": 0.0023733238400379733,
102
+ "learning_rate": 0.0001995292907717258,
103
+ "loss": 0.085,
104
+ "mean_token_accuracy": 0.9692767798900604,
105
+ "num_tokens": 226639.0,
106
+ "step": 120
107
+ },
108
+ {
109
+ "epoch": 0.0025711008267078044,
110
+ "learning_rate": 0.00019948973537439185,
111
+ "loss": 0.1055,
112
+ "mean_token_accuracy": 0.962373024225235,
113
+ "num_tokens": 245318.0,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.0027688778133776355,
118
+ "learning_rate": 0.00019945017997705788,
119
+ "loss": 0.1023,
120
+ "mean_token_accuracy": 0.9603676617145538,
121
+ "num_tokens": 264089.0,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.0029666548000474666,
126
+ "learning_rate": 0.0001994106245797239,
127
+ "loss": 0.106,
128
+ "mean_token_accuracy": 0.9619979500770569,
129
+ "num_tokens": 282977.0,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 0.0031644317867172977,
134
+ "learning_rate": 0.00019937106918238996,
135
+ "loss": 0.1021,
136
+ "mean_token_accuracy": 0.9600433588027955,
137
+ "num_tokens": 301615.0,
138
+ "step": 160
139
+ },
140
+ {
141
+ "epoch": 0.003362208773387129,
142
+ "learning_rate": 0.000199331513785056,
143
+ "loss": 0.095,
144
+ "mean_token_accuracy": 0.9641285121440888,
145
+ "num_tokens": 320502.0,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 0.00355998576005696,
150
+ "learning_rate": 0.000199291958387722,
151
+ "loss": 0.0722,
152
+ "mean_token_accuracy": 0.9708887040615082,
153
+ "num_tokens": 339616.0,
154
+ "step": 180
155
+ },
156
+ {
157
+ "epoch": 0.003757762746726791,
158
+ "learning_rate": 0.00019925240299038804,
159
+ "loss": 0.0951,
160
+ "mean_token_accuracy": 0.9712291181087493,
161
+ "num_tokens": 358474.0,
162
+ "step": 190
163
+ },
164
+ {
165
+ "epoch": 0.003955539733396622,
166
+ "learning_rate": 0.00019921284759305408,
167
+ "loss": 0.1194,
168
+ "mean_token_accuracy": 0.9630812525749206,
169
+ "num_tokens": 377094.0,
170
+ "step": 200
171
+ },
172
+ {
173
+ "epoch": 0.004153316720066453,
174
+ "learning_rate": 0.00019917329219572012,
175
+ "loss": 0.1002,
176
+ "mean_token_accuracy": 0.9660979807376862,
177
+ "num_tokens": 396000.0,
178
+ "step": 210
179
+ },
180
+ {
181
+ "epoch": 0.004351093706736284,
182
+ "learning_rate": 0.00019913373679838613,
183
+ "loss": 0.0954,
184
+ "mean_token_accuracy": 0.9636943399906158,
185
+ "num_tokens": 415019.0,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.0045488706934061155,
190
+ "learning_rate": 0.0001990941814010522,
191
+ "loss": 0.1114,
192
+ "mean_token_accuracy": 0.9662698566913605,
193
+ "num_tokens": 433711.0,
194
+ "step": 230
195
+ },
196
+ {
197
+ "epoch": 0.004746647680075947,
198
+ "learning_rate": 0.00019905462600371823,
199
+ "loss": 0.0915,
200
+ "mean_token_accuracy": 0.9679243505001068,
201
+ "num_tokens": 452483.0,
202
+ "step": 240
203
+ },
204
+ {
205
+ "epoch": 0.004944424666745778,
206
+ "learning_rate": 0.00019901507060638424,
207
+ "loss": 0.095,
208
+ "mean_token_accuracy": 0.9688079237937928,
209
+ "num_tokens": 471395.0,
210
+ "step": 250
211
+ },
212
+ {
213
+ "epoch": 0.005142201653415609,
214
+ "learning_rate": 0.00019897551520905028,
215
+ "loss": 0.1123,
216
+ "mean_token_accuracy": 0.962276142835617,
217
+ "num_tokens": 489983.0,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 0.00533997864008544,
222
+ "learning_rate": 0.00019893595981171632,
223
+ "loss": 0.0855,
224
+ "mean_token_accuracy": 0.9698696434497833,
225
+ "num_tokens": 509148.0,
226
+ "step": 270
227
+ },
228
+ {
229
+ "epoch": 0.005537755626755271,
230
+ "learning_rate": 0.00019889640441438235,
231
+ "loss": 0.0777,
232
+ "mean_token_accuracy": 0.9697826623916626,
233
+ "num_tokens": 528042.0,
234
+ "step": 280
235
+ },
236
+ {
237
+ "epoch": 0.005735532613425102,
238
+ "learning_rate": 0.0001988568490170484,
239
+ "loss": 0.0944,
240
+ "mean_token_accuracy": 0.9690817773342133,
241
+ "num_tokens": 546656.0,
242
+ "step": 290
243
+ },
244
+ {
245
+ "epoch": 0.005933309600094933,
246
+ "learning_rate": 0.00019881729361971443,
247
+ "loss": 0.0872,
248
+ "mean_token_accuracy": 0.9661558032035827,
249
+ "num_tokens": 565279.0,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.006131086586764764,
254
+ "learning_rate": 0.00019877773822238047,
255
+ "loss": 0.09,
256
+ "mean_token_accuracy": 0.9669564247131348,
257
+ "num_tokens": 584196.0,
258
+ "step": 310
259
+ },
260
+ {
261
+ "epoch": 0.0063288635734345955,
262
+ "learning_rate": 0.00019873818282504648,
263
+ "loss": 0.0702,
264
+ "mean_token_accuracy": 0.9722951114177704,
265
+ "num_tokens": 603050.0,
266
+ "step": 320
267
+ },
268
+ {
269
+ "epoch": 0.006526640560104427,
270
+ "learning_rate": 0.00019869862742771251,
271
+ "loss": 0.0923,
272
+ "mean_token_accuracy": 0.9684451401233674,
273
+ "num_tokens": 621880.0,
274
+ "step": 330
275
+ },
276
+ {
277
+ "epoch": 0.006724417546774258,
278
+ "learning_rate": 0.00019865907203037855,
279
+ "loss": 0.0976,
280
+ "mean_token_accuracy": 0.9660769879817963,
281
+ "num_tokens": 640657.0,
282
+ "step": 340
283
+ },
284
+ {
285
+ "epoch": 0.006922194533444089,
286
+ "learning_rate": 0.0001986195166330446,
287
+ "loss": 0.107,
288
+ "mean_token_accuracy": 0.9633386790752411,
289
+ "num_tokens": 659503.0,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 0.00711997152011392,
294
+ "learning_rate": 0.00019857996123571063,
295
+ "loss": 0.1058,
296
+ "mean_token_accuracy": 0.9641251742839814,
297
+ "num_tokens": 678570.0,
298
+ "step": 360
299
+ },
300
+ {
301
+ "epoch": 0.007317748506783751,
302
+ "learning_rate": 0.00019854040583837666,
303
+ "loss": 0.097,
304
+ "mean_token_accuracy": 0.9664280533790588,
305
+ "num_tokens": 697294.0,
306
+ "step": 370
307
+ },
308
+ {
309
+ "epoch": 0.007515525493453582,
310
+ "learning_rate": 0.0001985008504410427,
311
+ "loss": 0.0677,
312
+ "mean_token_accuracy": 0.9754213869571686,
313
+ "num_tokens": 716458.0,
314
+ "step": 380
315
+ },
316
+ {
317
+ "epoch": 0.007713302480123413,
318
+ "learning_rate": 0.0001984612950437087,
319
+ "loss": 0.0622,
320
+ "mean_token_accuracy": 0.9724574089050293,
321
+ "num_tokens": 735437.0,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 0.007911079466793244,
326
+ "learning_rate": 0.00019842173964637475,
327
+ "loss": 0.1004,
328
+ "mean_token_accuracy": 0.9710333228111268,
329
+ "num_tokens": 754416.0,
330
+ "step": 400
331
+ },
332
+ {
333
+ "epoch": 0.008108856453463075,
334
+ "learning_rate": 0.0001983821842490408,
335
+ "loss": 0.0922,
336
+ "mean_token_accuracy": 0.9718518137931824,
337
+ "num_tokens": 773091.0,
338
+ "step": 410
339
+ },
340
+ {
341
+ "epoch": 0.008306633440132907,
342
+ "learning_rate": 0.00019834262885170682,
343
+ "loss": 0.0835,
344
+ "mean_token_accuracy": 0.9694978713989257,
345
+ "num_tokens": 791900.0,
346
+ "step": 420
347
+ },
348
+ {
349
+ "epoch": 0.008504410426802738,
350
+ "learning_rate": 0.00019830307345437286,
351
+ "loss": 0.0822,
352
+ "mean_token_accuracy": 0.974584549665451,
353
+ "num_tokens": 810547.0,
354
+ "step": 430
355
+ },
356
+ {
357
+ "epoch": 0.008702187413472569,
358
+ "learning_rate": 0.0001982635180570389,
359
+ "loss": 0.0733,
360
+ "mean_token_accuracy": 0.9770256340503692,
361
+ "num_tokens": 829753.0,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 0.0088999644001424,
366
+ "learning_rate": 0.00019822396265970494,
367
+ "loss": 0.0882,
368
+ "mean_token_accuracy": 0.9668483734130859,
369
+ "num_tokens": 848294.0,
370
+ "step": 450
371
+ },
372
+ {
373
+ "epoch": 0.009097741386812231,
374
+ "learning_rate": 0.00019818440726237095,
375
+ "loss": 0.0921,
376
+ "mean_token_accuracy": 0.9715612173080445,
377
+ "num_tokens": 867635.0,
378
+ "step": 460
379
+ },
380
+ {
381
+ "epoch": 0.009295518373482062,
382
+ "learning_rate": 0.00019814485186503699,
383
+ "loss": 0.0668,
384
+ "mean_token_accuracy": 0.9765562832355499,
385
+ "num_tokens": 886596.0,
386
+ "step": 470
387
+ },
388
+ {
389
+ "epoch": 0.009493295360151893,
390
+ "learning_rate": 0.00019810529646770302,
391
+ "loss": 0.0797,
392
+ "mean_token_accuracy": 0.972789865732193,
393
+ "num_tokens": 905312.0,
394
+ "step": 480
395
+ },
396
+ {
397
+ "epoch": 0.009691072346821724,
398
+ "learning_rate": 0.00019806574107036906,
399
+ "loss": 0.0792,
400
+ "mean_token_accuracy": 0.976115608215332,
401
+ "num_tokens": 924072.0,
402
+ "step": 490
403
+ },
404
+ {
405
+ "epoch": 0.009888849333491555,
406
+ "learning_rate": 0.0001980261856730351,
407
+ "loss": 0.0672,
408
+ "mean_token_accuracy": 0.9746761620044708,
409
+ "num_tokens": 942844.0,
410
+ "step": 500
411
+ },
412
+ {
413
+ "epoch": 0.010086626320161387,
414
+ "learning_rate": 0.00019798663027570113,
415
+ "loss": 0.1031,
416
+ "mean_token_accuracy": 0.9693170130252838,
417
+ "num_tokens": 961524.0,
418
+ "step": 510
419
+ },
420
+ {
421
+ "epoch": 0.010284403306831218,
422
+ "learning_rate": 0.00019794707487836717,
423
+ "loss": 0.0899,
424
+ "mean_token_accuracy": 0.9669535160064697,
425
+ "num_tokens": 980332.0,
426
+ "step": 520
427
+ },
428
+ {
429
+ "epoch": 0.010482180293501049,
430
+ "learning_rate": 0.00019790751948103318,
431
+ "loss": 0.0861,
432
+ "mean_token_accuracy": 0.9700904309749603,
433
+ "num_tokens": 999163.0,
434
+ "step": 530
435
+ },
436
+ {
437
+ "epoch": 0.01067995728017088,
438
+ "learning_rate": 0.00019786796408369922,
439
+ "loss": 0.0997,
440
+ "mean_token_accuracy": 0.9640280067920685,
441
+ "num_tokens": 1018053.0,
442
+ "step": 540
443
+ },
444
+ {
445
+ "epoch": 0.010877734266840711,
446
+ "learning_rate": 0.00019782840868636528,
447
+ "loss": 0.0795,
448
+ "mean_token_accuracy": 0.9717990577220916,
449
+ "num_tokens": 1036828.0,
450
+ "step": 550
451
+ },
452
+ {
453
+ "epoch": 0.011075511253510542,
454
+ "learning_rate": 0.0001977888532890313,
455
+ "loss": 0.0885,
456
+ "mean_token_accuracy": 0.9719638526439667,
457
+ "num_tokens": 1055843.0,
458
+ "step": 560
459
+ },
460
+ {
461
+ "epoch": 0.011273288240180373,
462
+ "learning_rate": 0.00019774929789169733,
463
+ "loss": 0.0823,
464
+ "mean_token_accuracy": 0.9698925375938415,
465
+ "num_tokens": 1074733.0,
466
+ "step": 570
467
+ },
468
+ {
469
+ "epoch": 0.011471065226850204,
470
+ "learning_rate": 0.00019770974249436337,
471
+ "loss": 0.0945,
472
+ "mean_token_accuracy": 0.9694046437740326,
473
+ "num_tokens": 1093519.0,
474
+ "step": 580
475
+ },
476
+ {
477
+ "epoch": 0.011668842213520035,
478
+ "learning_rate": 0.0001976701870970294,
479
+ "loss": 0.0797,
480
+ "mean_token_accuracy": 0.975245076417923,
481
+ "num_tokens": 1112394.0,
482
+ "step": 590
483
+ },
484
+ {
485
+ "epoch": 0.011866619200189867,
486
+ "learning_rate": 0.00019763063169969542,
487
+ "loss": 0.0816,
488
+ "mean_token_accuracy": 0.9766307890415191,
489
+ "num_tokens": 1131435.0,
490
+ "step": 600
491
+ },
492
+ {
493
+ "epoch": 0.012064396186859698,
494
+ "learning_rate": 0.00019759107630236146,
495
+ "loss": 0.089,
496
+ "mean_token_accuracy": 0.9677313148975373,
497
+ "num_tokens": 1150274.0,
498
+ "step": 610
499
+ },
500
+ {
501
+ "epoch": 0.012262173173529529,
502
+ "learning_rate": 0.00019755152090502752,
503
+ "loss": 0.096,
504
+ "mean_token_accuracy": 0.9663532435894012,
505
+ "num_tokens": 1169076.0,
506
+ "step": 620
507
+ },
508
+ {
509
+ "epoch": 0.01245995016019936,
510
+ "learning_rate": 0.00019751196550769353,
511
+ "loss": 0.0686,
512
+ "mean_token_accuracy": 0.9754896223545074,
513
+ "num_tokens": 1187908.0,
514
+ "step": 630
515
+ },
516
+ {
517
+ "epoch": 0.012657727146869191,
518
+ "learning_rate": 0.00019747241011035957,
519
+ "loss": 0.0902,
520
+ "mean_token_accuracy": 0.9659872591495514,
521
+ "num_tokens": 1206872.0,
522
+ "step": 640
523
+ },
524
+ {
525
+ "epoch": 0.012855504133539022,
526
+ "learning_rate": 0.0001974328547130256,
527
+ "loss": 0.0757,
528
+ "mean_token_accuracy": 0.9724668145179749,
529
+ "num_tokens": 1225767.0,
530
+ "step": 650
531
+ },
532
+ {
533
+ "epoch": 0.013053281120208853,
534
+ "learning_rate": 0.00019739329931569164,
535
+ "loss": 0.0798,
536
+ "mean_token_accuracy": 0.9697497248649597,
537
+ "num_tokens": 1244340.0,
538
+ "step": 660
539
+ },
540
+ {
541
+ "epoch": 0.013251058106878684,
542
+ "learning_rate": 0.00019735374391835765,
543
+ "loss": 0.0874,
544
+ "mean_token_accuracy": 0.9725019693374634,
545
+ "num_tokens": 1263304.0,
546
+ "step": 670
547
+ },
548
+ {
549
+ "epoch": 0.013448835093548515,
550
+ "learning_rate": 0.0001973141885210237,
551
+ "loss": 0.0894,
552
+ "mean_token_accuracy": 0.9702820897102356,
553
+ "num_tokens": 1282028.0,
554
+ "step": 680
555
+ },
556
+ {
557
+ "epoch": 0.013646612080218347,
558
+ "learning_rate": 0.00019727463312368976,
559
+ "loss": 0.0958,
560
+ "mean_token_accuracy": 0.9666474103927613,
561
+ "num_tokens": 1300780.0,
562
+ "step": 690
563
+ },
564
+ {
565
+ "epoch": 0.013844389066888178,
566
+ "learning_rate": 0.00019723507772635577,
567
+ "loss": 0.0808,
568
+ "mean_token_accuracy": 0.9751847207546234,
569
+ "num_tokens": 1319535.0,
570
+ "step": 700
571
+ },
572
+ {
573
+ "epoch": 0.014042166053558009,
574
+ "learning_rate": 0.0001971955223290218,
575
+ "loss": 0.1044,
576
+ "mean_token_accuracy": 0.9665757656097412,
577
+ "num_tokens": 1338391.0,
578
+ "step": 710
579
+ },
580
+ {
581
+ "epoch": 0.01423994304022784,
582
+ "learning_rate": 0.00019715596693168784,
583
+ "loss": 0.0798,
584
+ "mean_token_accuracy": 0.9741999268531799,
585
+ "num_tokens": 1357072.0,
586
+ "step": 720
587
+ },
588
+ {
589
+ "epoch": 0.014437720026897671,
590
+ "learning_rate": 0.00019711641153435388,
591
+ "loss": 0.0661,
592
+ "mean_token_accuracy": 0.9795381426811218,
593
+ "num_tokens": 1375999.0,
594
+ "step": 730
595
+ },
596
+ {
597
+ "epoch": 0.014635497013567502,
598
+ "learning_rate": 0.00019707685613701992,
599
+ "loss": 0.082,
600
+ "mean_token_accuracy": 0.9710344612598419,
601
+ "num_tokens": 1394654.0,
602
+ "step": 740
603
+ },
604
+ {
605
+ "epoch": 0.014833274000237333,
606
+ "learning_rate": 0.00019703730073968593,
607
+ "loss": 0.0825,
608
+ "mean_token_accuracy": 0.9703740537166595,
609
+ "num_tokens": 1413323.0,
610
+ "step": 750
611
+ },
612
+ {
613
+ "epoch": 0.015031050986907164,
614
+ "learning_rate": 0.000196997745342352,
615
+ "loss": 0.0797,
616
+ "mean_token_accuracy": 0.9717476069927216,
617
+ "num_tokens": 1432274.0,
618
+ "step": 760
619
+ },
620
+ {
621
+ "epoch": 0.015228827973576995,
622
+ "learning_rate": 0.000196958189945018,
623
+ "loss": 0.0811,
624
+ "mean_token_accuracy": 0.9687287509441376,
625
+ "num_tokens": 1451009.0,
626
+ "step": 770
627
+ },
628
+ {
629
+ "epoch": 0.015426604960246827,
630
+ "learning_rate": 0.00019691863454768404,
631
+ "loss": 0.0802,
632
+ "mean_token_accuracy": 0.9698404908180237,
633
+ "num_tokens": 1469741.0,
634
+ "step": 780
635
+ },
636
+ {
637
+ "epoch": 0.015624381946916658,
638
+ "learning_rate": 0.00019687907915035008,
639
+ "loss": 0.0698,
640
+ "mean_token_accuracy": 0.9755463302135468,
641
+ "num_tokens": 1488644.0,
642
+ "step": 790
643
+ },
644
+ {
645
+ "epoch": 0.01582215893358649,
646
+ "learning_rate": 0.0001968395237530161,
647
+ "loss": 0.0698,
648
+ "mean_token_accuracy": 0.9777807116508483,
649
+ "num_tokens": 1507384.0,
650
+ "step": 800
651
+ },
652
+ {
653
+ "epoch": 0.016019935920256318,
654
+ "learning_rate": 0.00019679996835568215,
655
+ "loss": 0.085,
656
+ "mean_token_accuracy": 0.9731873035430908,
657
+ "num_tokens": 1526208.0,
658
+ "step": 810
659
+ },
660
+ {
661
+ "epoch": 0.01621771290692615,
662
+ "learning_rate": 0.00019676041295834816,
663
+ "loss": 0.0682,
664
+ "mean_token_accuracy": 0.9786826431751251,
665
+ "num_tokens": 1545086.0,
666
+ "step": 820
667
+ },
668
+ {
669
+ "epoch": 0.01641548989359598,
670
+ "learning_rate": 0.00019672085756101423,
671
+ "loss": 0.0687,
672
+ "mean_token_accuracy": 0.9754264533519745,
673
+ "num_tokens": 1563972.0,
674
+ "step": 830
675
+ },
676
+ {
677
+ "epoch": 0.016613266880265813,
678
+ "learning_rate": 0.00019668130216368024,
679
+ "loss": 0.0847,
680
+ "mean_token_accuracy": 0.9757691383361816,
681
+ "num_tokens": 1582893.0,
682
+ "step": 840
683
+ },
684
+ {
685
+ "epoch": 0.016811043866935643,
686
+ "learning_rate": 0.00019664174676634627,
687
+ "loss": 0.0978,
688
+ "mean_token_accuracy": 0.9691124320030212,
689
+ "num_tokens": 1601719.0,
690
+ "step": 850
691
+ },
692
+ {
693
+ "epoch": 0.017008820853605475,
694
+ "learning_rate": 0.0001966021913690123,
695
+ "loss": 0.0834,
696
+ "mean_token_accuracy": 0.9732567369937897,
697
+ "num_tokens": 1620493.0,
698
+ "step": 860
699
+ },
700
+ {
701
+ "epoch": 0.017206597840275305,
702
+ "learning_rate": 0.00019656263597167835,
703
+ "loss": 0.0703,
704
+ "mean_token_accuracy": 0.9783392190933228,
705
+ "num_tokens": 1639336.0,
706
+ "step": 870
707
+ },
708
+ {
709
+ "epoch": 0.017404374826945138,
710
+ "learning_rate": 0.00019652308057434439,
711
+ "loss": 0.0916,
712
+ "mean_token_accuracy": 0.9702894032001496,
713
+ "num_tokens": 1658213.0,
714
+ "step": 880
715
+ },
716
+ {
717
+ "epoch": 0.017602151813614967,
718
+ "learning_rate": 0.0001964835251770104,
719
+ "loss": 0.0631,
720
+ "mean_token_accuracy": 0.9804218530654907,
721
+ "num_tokens": 1677260.0,
722
+ "step": 890
723
+ },
724
+ {
725
+ "epoch": 0.0177999288002848,
726
+ "learning_rate": 0.00019644396977967646,
727
+ "loss": 0.0942,
728
+ "mean_token_accuracy": 0.9701037347316742,
729
+ "num_tokens": 1696174.0,
730
+ "step": 900
731
+ },
732
+ {
733
+ "epoch": 0.01799770578695463,
734
+ "learning_rate": 0.00019640441438234247,
735
+ "loss": 0.0827,
736
+ "mean_token_accuracy": 0.9717662394046783,
737
+ "num_tokens": 1714822.0,
738
+ "step": 910
739
+ },
740
+ {
741
+ "epoch": 0.018195482773624462,
742
+ "learning_rate": 0.0001963648589850085,
743
+ "loss": 0.0728,
744
+ "mean_token_accuracy": 0.9710807025432586,
745
+ "num_tokens": 1733526.0,
746
+ "step": 920
747
+ },
748
+ {
749
+ "epoch": 0.01839325976029429,
750
+ "learning_rate": 0.00019632530358767455,
751
+ "loss": 0.0689,
752
+ "mean_token_accuracy": 0.9746571719646454,
753
+ "num_tokens": 1752469.0,
754
+ "step": 930
755
+ },
756
+ {
757
+ "epoch": 0.018591036746964124,
758
+ "learning_rate": 0.00019628574819034058,
759
+ "loss": 0.0648,
760
+ "mean_token_accuracy": 0.9815335392951965,
761
+ "num_tokens": 1771439.0,
762
+ "step": 940
763
+ },
764
+ {
765
+ "epoch": 0.018788813733633954,
766
+ "learning_rate": 0.00019624619279300662,
767
+ "loss": 0.0773,
768
+ "mean_token_accuracy": 0.9698800563812255,
769
+ "num_tokens": 1790105.0,
770
+ "step": 950
771
+ },
772
+ {
773
+ "epoch": 0.018986590720303786,
774
+ "learning_rate": 0.00019620663739567263,
775
+ "loss": 0.067,
776
+ "mean_token_accuracy": 0.9776150286197662,
777
+ "num_tokens": 1809000.0,
778
+ "step": 960
779
+ },
780
+ {
781
+ "epoch": 0.019184367706973616,
782
+ "learning_rate": 0.0001961670819983387,
783
+ "loss": 0.0771,
784
+ "mean_token_accuracy": 0.9723577439785004,
785
+ "num_tokens": 1827743.0,
786
+ "step": 970
787
+ },
788
+ {
789
+ "epoch": 0.01938214469364345,
790
+ "learning_rate": 0.0001961275266010047,
791
+ "loss": 0.0829,
792
+ "mean_token_accuracy": 0.9695183992385864,
793
+ "num_tokens": 1846483.0,
794
+ "step": 980
795
+ },
796
+ {
797
+ "epoch": 0.019579921680313278,
798
+ "learning_rate": 0.00019608797120367074,
799
+ "loss": 0.0544,
800
+ "mean_token_accuracy": 0.9838368058204651,
801
+ "num_tokens": 1865230.0,
802
+ "step": 990
803
+ },
804
+ {
805
+ "epoch": 0.01977769866698311,
806
+ "learning_rate": 0.00019604841580633678,
807
+ "loss": 0.0913,
808
+ "mean_token_accuracy": 0.9702223718166352,
809
+ "num_tokens": 1884295.0,
810
+ "step": 1000
811
+ },
812
+ {
813
+ "epoch": 0.01997547565365294,
814
+ "learning_rate": 0.00019600886040900282,
815
+ "loss": 0.0794,
816
+ "mean_token_accuracy": 0.9764923632144928,
817
+ "num_tokens": 1903297.0,
818
+ "step": 1010
819
+ },
820
+ {
821
+ "epoch": 0.020173252640322773,
822
+ "learning_rate": 0.00019596930501166886,
823
+ "loss": 0.0673,
824
+ "mean_token_accuracy": 0.9780684530735015,
825
+ "num_tokens": 1922020.0,
826
+ "step": 1020
827
+ },
828
+ {
829
+ "epoch": 0.020371029626992603,
830
+ "learning_rate": 0.00019592974961433487,
831
+ "loss": 0.0626,
832
+ "mean_token_accuracy": 0.9800810873508453,
833
+ "num_tokens": 1940771.0,
834
+ "step": 1030
835
+ },
836
+ {
837
+ "epoch": 0.020568806613662435,
838
+ "learning_rate": 0.00019589019421700093,
839
+ "loss": 0.0765,
840
+ "mean_token_accuracy": 0.9712104678153992,
841
+ "num_tokens": 1959790.0,
842
+ "step": 1040
843
+ },
844
+ {
845
+ "epoch": 0.020766583600332265,
846
+ "learning_rate": 0.00019585063881966694,
847
+ "loss": 0.0648,
848
+ "mean_token_accuracy": 0.978104192018509,
849
+ "num_tokens": 1978766.0,
850
+ "step": 1050
851
+ },
852
+ {
853
+ "epoch": 0.020964360587002098,
854
+ "learning_rate": 0.00019581108342233298,
855
+ "loss": 0.0938,
856
+ "mean_token_accuracy": 0.9689741492271423,
857
+ "num_tokens": 1997785.0,
858
+ "step": 1060
859
+ },
860
+ {
861
+ "epoch": 0.021162137573671927,
862
+ "learning_rate": 0.00019577152802499902,
863
+ "loss": 0.0891,
864
+ "mean_token_accuracy": 0.9727595269680023,
865
+ "num_tokens": 2016334.0,
866
+ "step": 1070
867
+ },
868
+ {
869
+ "epoch": 0.02135991456034176,
870
+ "learning_rate": 0.00019573197262766505,
871
+ "loss": 0.0818,
872
+ "mean_token_accuracy": 0.9725301325321197,
873
+ "num_tokens": 2035271.0,
874
+ "step": 1080
875
+ },
876
+ {
877
+ "epoch": 0.02155769154701159,
878
+ "learning_rate": 0.0001956924172303311,
879
+ "loss": 0.0721,
880
+ "mean_token_accuracy": 0.9739335179328918,
881
+ "num_tokens": 2054158.0,
882
+ "step": 1090
883
+ },
884
+ {
885
+ "epoch": 0.021755468533681422,
886
+ "learning_rate": 0.0001956528618329971,
887
+ "loss": 0.0785,
888
+ "mean_token_accuracy": 0.971097469329834,
889
+ "num_tokens": 2072794.0,
890
+ "step": 1100
891
+ },
892
+ {
893
+ "epoch": 0.02195324552035125,
894
+ "learning_rate": 0.00019561330643566317,
895
+ "loss": 0.0759,
896
+ "mean_token_accuracy": 0.9685551345348358,
897
+ "num_tokens": 2091583.0,
898
+ "step": 1110
899
+ },
900
+ {
901
+ "epoch": 0.022151022507021084,
902
+ "learning_rate": 0.0001955737510383292,
903
+ "loss": 0.0584,
904
+ "mean_token_accuracy": 0.9821391940116883,
905
+ "num_tokens": 2110207.0,
906
+ "step": 1120
907
+ },
908
+ {
909
+ "epoch": 0.022348799493690914,
910
+ "learning_rate": 0.00019553419564099521,
911
+ "loss": 0.0841,
912
+ "mean_token_accuracy": 0.9727806925773621,
913
+ "num_tokens": 2129006.0,
914
+ "step": 1130
915
+ },
916
+ {
917
+ "epoch": 0.022546576480360746,
918
+ "learning_rate": 0.00019549464024366125,
919
+ "loss": 0.0708,
920
+ "mean_token_accuracy": 0.9772637248039245,
921
+ "num_tokens": 2147866.0,
922
+ "step": 1140
923
+ },
924
+ {
925
+ "epoch": 0.022744353467030576,
926
+ "learning_rate": 0.0001954550848463273,
927
+ "loss": 0.0615,
928
+ "mean_token_accuracy": 0.9771132528781891,
929
+ "num_tokens": 2166678.0,
930
+ "step": 1150
931
+ },
932
+ {
933
+ "epoch": 0.02294213045370041,
934
+ "learning_rate": 0.00019541552944899333,
935
+ "loss": 0.0716,
936
+ "mean_token_accuracy": 0.97862588763237,
937
+ "num_tokens": 2185491.0,
938
+ "step": 1160
939
+ },
940
+ {
941
+ "epoch": 0.023139907440370238,
942
+ "learning_rate": 0.00019537597405165934,
943
+ "loss": 0.0835,
944
+ "mean_token_accuracy": 0.9729329645633698,
945
+ "num_tokens": 2204270.0,
946
+ "step": 1170
947
+ },
948
+ {
949
+ "epoch": 0.02333768442704007,
950
+ "learning_rate": 0.0001953364186543254,
951
+ "loss": 0.0825,
952
+ "mean_token_accuracy": 0.975412392616272,
953
+ "num_tokens": 2223262.0,
954
+ "step": 1180
955
+ },
956
+ {
957
+ "epoch": 0.0235354614137099,
958
+ "learning_rate": 0.00019529686325699144,
959
+ "loss": 0.0694,
960
+ "mean_token_accuracy": 0.9787419438362122,
961
+ "num_tokens": 2242082.0,
962
+ "step": 1190
963
+ },
964
+ {
965
+ "epoch": 0.023733238400379733,
966
+ "learning_rate": 0.00019525730785965745,
967
+ "loss": 0.0971,
968
+ "mean_token_accuracy": 0.9742420554161072,
969
+ "num_tokens": 2260874.0,
970
+ "step": 1200
971
+ },
972
+ {
973
+ "epoch": 0.023931015387049562,
974
+ "learning_rate": 0.00019521775246232349,
975
+ "loss": 0.0813,
976
+ "mean_token_accuracy": 0.973499870300293,
977
+ "num_tokens": 2279620.0,
978
+ "step": 1210
979
+ },
980
+ {
981
+ "epoch": 0.024128792373719395,
982
+ "learning_rate": 0.00019517819706498952,
983
+ "loss": 0.0591,
984
+ "mean_token_accuracy": 0.9806219100952148,
985
+ "num_tokens": 2298359.0,
986
+ "step": 1220
987
+ },
988
+ {
989
+ "epoch": 0.024326569360389225,
990
+ "learning_rate": 0.00019513864166765556,
991
+ "loss": 0.0784,
992
+ "mean_token_accuracy": 0.9767163157463074,
993
+ "num_tokens": 2317171.0,
994
+ "step": 1230
995
+ },
996
+ {
997
+ "epoch": 0.024524346347059058,
998
+ "learning_rate": 0.00019509908627032157,
999
+ "loss": 0.0867,
1000
+ "mean_token_accuracy": 0.9671182572841645,
1001
+ "num_tokens": 2335825.0,
1002
+ "step": 1240
1003
+ },
1004
+ {
1005
+ "epoch": 0.024722123333728887,
1006
+ "learning_rate": 0.00019505953087298764,
1007
+ "loss": 0.0727,
1008
+ "mean_token_accuracy": 0.9711700201034545,
1009
+ "num_tokens": 2354806.0,
1010
+ "step": 1250
1011
+ },
1012
+ {
1013
+ "epoch": 0.02491990032039872,
1014
+ "learning_rate": 0.00019501997547565367,
1015
+ "loss": 0.0901,
1016
+ "mean_token_accuracy": 0.9734555304050445,
1017
+ "num_tokens": 2373919.0,
1018
+ "step": 1260
1019
+ },
1020
+ {
1021
+ "epoch": 0.02511767730706855,
1022
+ "learning_rate": 0.00019498042007831968,
1023
+ "loss": 0.0579,
1024
+ "mean_token_accuracy": 0.9764434218406677,
1025
+ "num_tokens": 2393043.0,
1026
+ "step": 1270
1027
+ },
1028
+ {
1029
+ "epoch": 0.025315454293738382,
1030
+ "learning_rate": 0.00019494086468098575,
1031
+ "loss": 0.0623,
1032
+ "mean_token_accuracy": 0.9802065193653107,
1033
+ "num_tokens": 2411888.0,
1034
+ "step": 1280
1035
+ },
1036
+ {
1037
+ "epoch": 0.02551323128040821,
1038
+ "learning_rate": 0.00019490130928365176,
1039
+ "loss": 0.0739,
1040
+ "mean_token_accuracy": 0.9758767008781433,
1041
+ "num_tokens": 2430790.0,
1042
+ "step": 1290
1043
+ },
1044
+ {
1045
+ "epoch": 0.025711008267078044,
1046
+ "learning_rate": 0.0001948617538863178,
1047
+ "loss": 0.0676,
1048
+ "mean_token_accuracy": 0.9723714172840119,
1049
+ "num_tokens": 2449534.0,
1050
+ "step": 1300
1051
+ },
1052
+ {
1053
+ "epoch": 0.025908785253747874,
1054
+ "learning_rate": 0.0001948221984889838,
1055
+ "loss": 0.0842,
1056
+ "mean_token_accuracy": 0.9739417016506196,
1057
+ "num_tokens": 2468500.0,
1058
+ "step": 1310
1059
+ },
1060
+ {
1061
+ "epoch": 0.026106562240417706,
1062
+ "learning_rate": 0.00019478264309164987,
1063
+ "loss": 0.0812,
1064
+ "mean_token_accuracy": 0.9725883424282074,
1065
+ "num_tokens": 2487522.0,
1066
+ "step": 1320
1067
+ },
1068
+ {
1069
+ "epoch": 0.026304339227087536,
1070
+ "learning_rate": 0.0001947430876943159,
1071
+ "loss": 0.081,
1072
+ "mean_token_accuracy": 0.9718135535717011,
1073
+ "num_tokens": 2506228.0,
1074
+ "step": 1330
1075
+ },
1076
+ {
1077
+ "epoch": 0.02650211621375737,
1078
+ "learning_rate": 0.00019470353229698192,
1079
+ "loss": 0.0913,
1080
+ "mean_token_accuracy": 0.9727324843406677,
1081
+ "num_tokens": 2524993.0,
1082
+ "step": 1340
1083
+ },
1084
+ {
1085
+ "epoch": 0.026699893200427198,
1086
+ "learning_rate": 0.00019466397689964798,
1087
+ "loss": 0.0961,
1088
+ "mean_token_accuracy": 0.9686046600341797,
1089
+ "num_tokens": 2543814.0,
1090
+ "step": 1350
1091
+ },
1092
+ {
1093
+ "epoch": 0.02689767018709703,
1094
+ "learning_rate": 0.000194624421502314,
1095
+ "loss": 0.0961,
1096
+ "mean_token_accuracy": 0.9635930359363556,
1097
+ "num_tokens": 2562624.0,
1098
+ "step": 1360
1099
+ },
1100
+ {
1101
+ "epoch": 0.02709544717376686,
1102
+ "learning_rate": 0.00019458486610498003,
1103
+ "loss": 0.0882,
1104
+ "mean_token_accuracy": 0.971700656414032,
1105
+ "num_tokens": 2581229.0,
1106
+ "step": 1370
1107
+ },
1108
+ {
1109
+ "epoch": 0.027293224160436693,
1110
+ "learning_rate": 0.00019454531070764607,
1111
+ "loss": 0.0875,
1112
+ "mean_token_accuracy": 0.9696930944919586,
1113
+ "num_tokens": 2600073.0,
1114
+ "step": 1380
1115
+ },
1116
+ {
1117
+ "epoch": 0.027491001147106522,
1118
+ "learning_rate": 0.0001945057553103121,
1119
+ "loss": 0.0732,
1120
+ "mean_token_accuracy": 0.9732912659645081,
1121
+ "num_tokens": 2618837.0,
1122
+ "step": 1390
1123
+ },
1124
+ {
1125
+ "epoch": 0.027688778133776355,
1126
+ "learning_rate": 0.00019446619991297814,
1127
+ "loss": 0.0711,
1128
+ "mean_token_accuracy": 0.975910484790802,
1129
+ "num_tokens": 2637487.0,
1130
+ "step": 1400
1131
+ },
1132
+ {
1133
+ "epoch": 0.027886555120446185,
1134
+ "learning_rate": 0.00019442664451564415,
1135
+ "loss": 0.0944,
1136
+ "mean_token_accuracy": 0.96585413813591,
1137
+ "num_tokens": 2656004.0,
1138
+ "step": 1410
1139
+ },
1140
+ {
1141
+ "epoch": 0.028084332107116017,
1142
+ "learning_rate": 0.00019438708911831022,
1143
+ "loss": 0.0553,
1144
+ "mean_token_accuracy": 0.980397754907608,
1145
+ "num_tokens": 2675089.0,
1146
+ "step": 1420
1147
+ },
1148
+ {
1149
+ "epoch": 0.028282109093785847,
1150
+ "learning_rate": 0.00019434753372097623,
1151
+ "loss": 0.062,
1152
+ "mean_token_accuracy": 0.9789350926876068,
1153
+ "num_tokens": 2693904.0,
1154
+ "step": 1430
1155
+ },
1156
+ {
1157
+ "epoch": 0.02847988608045568,
1158
+ "learning_rate": 0.00019430797832364227,
1159
+ "loss": 0.08,
1160
+ "mean_token_accuracy": 0.9751649796962738,
1161
+ "num_tokens": 2712395.0,
1162
+ "step": 1440
1163
+ },
1164
+ {
1165
+ "epoch": 0.02867766306712551,
1166
+ "learning_rate": 0.0001942684229263083,
1167
+ "loss": 0.0662,
1168
+ "mean_token_accuracy": 0.9778707563877106,
1169
+ "num_tokens": 2731103.0,
1170
+ "step": 1450
1171
+ },
1172
+ {
1173
+ "epoch": 0.028875440053795342,
1174
+ "learning_rate": 0.00019422886752897434,
1175
+ "loss": 0.0834,
1176
+ "mean_token_accuracy": 0.9672891080379487,
1177
+ "num_tokens": 2749801.0,
1178
+ "step": 1460
1179
+ },
1180
+ {
1181
+ "epoch": 0.02907321704046517,
1182
+ "learning_rate": 0.00019418931213164038,
1183
+ "loss": 0.0719,
1184
+ "mean_token_accuracy": 0.9755631804466247,
1185
+ "num_tokens": 2768551.0,
1186
+ "step": 1470
1187
+ },
1188
+ {
1189
+ "epoch": 0.029270994027135004,
1190
+ "learning_rate": 0.0001941497567343064,
1191
+ "loss": 0.0499,
1192
+ "mean_token_accuracy": 0.9837284207344055,
1193
+ "num_tokens": 2787591.0,
1194
+ "step": 1480
1195
+ },
1196
+ {
1197
+ "epoch": 0.029468771013804834,
1198
+ "learning_rate": 0.00019411020133697245,
1199
+ "loss": 0.066,
1200
+ "mean_token_accuracy": 0.9761005103588104,
1201
+ "num_tokens": 2806455.0,
1202
+ "step": 1490
1203
+ },
1204
+ {
1205
+ "epoch": 0.029666548000474666,
1206
+ "learning_rate": 0.00019407064593963846,
1207
+ "loss": 0.0623,
1208
+ "mean_token_accuracy": 0.9766542613506317,
1209
+ "num_tokens": 2825280.0,
1210
+ "step": 1500
1211
+ },
1212
+ {
1213
+ "epoch": 0.029864324987144496,
1214
+ "learning_rate": 0.0001940310905423045,
1215
+ "loss": 0.0643,
1216
+ "mean_token_accuracy": 0.9814446032047272,
1217
+ "num_tokens": 2844178.0,
1218
+ "step": 1510
1219
+ },
1220
+ {
1221
+ "epoch": 0.03006210197381433,
1222
+ "learning_rate": 0.00019399153514497054,
1223
+ "loss": 0.0559,
1224
+ "mean_token_accuracy": 0.9753939032554626,
1225
+ "num_tokens": 2863373.0,
1226
+ "step": 1520
1227
+ },
1228
+ {
1229
+ "epoch": 0.030259878960484158,
1230
+ "learning_rate": 0.00019395197974763658,
1231
+ "loss": 0.0633,
1232
+ "mean_token_accuracy": 0.9776888847351074,
1233
+ "num_tokens": 2882166.0,
1234
+ "step": 1530
1235
+ },
1236
+ {
1237
+ "epoch": 0.03045765594715399,
1238
+ "learning_rate": 0.00019391242435030261,
1239
+ "loss": 0.0568,
1240
+ "mean_token_accuracy": 0.9781621396541595,
1241
+ "num_tokens": 2901050.0,
1242
+ "step": 1540
1243
+ },
1244
+ {
1245
+ "epoch": 0.03065543293382382,
1246
+ "learning_rate": 0.00019387286895296862,
1247
+ "loss": 0.0748,
1248
+ "mean_token_accuracy": 0.9726321280002594,
1249
+ "num_tokens": 2919953.0,
1250
+ "step": 1550
1251
+ },
1252
+ {
1253
+ "epoch": 0.030853209920493653,
1254
+ "learning_rate": 0.0001938333135556347,
1255
+ "loss": 0.0717,
1256
+ "mean_token_accuracy": 0.9703111469745636,
1257
+ "num_tokens": 2939003.0,
1258
+ "step": 1560
1259
+ },
1260
+ {
1261
+ "epoch": 0.031050986907163482,
1262
+ "learning_rate": 0.00019379375815830073,
1263
+ "loss": 0.065,
1264
+ "mean_token_accuracy": 0.97583766579628,
1265
+ "num_tokens": 2957853.0,
1266
+ "step": 1570
1267
+ },
1268
+ {
1269
+ "epoch": 0.031248763893833315,
1270
+ "learning_rate": 0.00019375420276096674,
1271
+ "loss": 0.0667,
1272
+ "mean_token_accuracy": 0.9808376967906952,
1273
+ "num_tokens": 2976557.0,
1274
+ "step": 1580
1275
+ },
1276
+ {
1277
+ "epoch": 0.031446540880503145,
1278
+ "learning_rate": 0.00019371464736363277,
1279
+ "loss": 0.0777,
1280
+ "mean_token_accuracy": 0.9767092704772949,
1281
+ "num_tokens": 2995058.0,
1282
+ "step": 1590
1283
+ },
1284
+ {
1285
+ "epoch": 0.03164431786717298,
1286
+ "learning_rate": 0.0001936750919662988,
1287
+ "loss": 0.0644,
1288
+ "mean_token_accuracy": 0.9776780724525451,
1289
+ "num_tokens": 3014165.0,
1290
+ "step": 1600
1291
+ },
1292
+ {
1293
+ "epoch": 0.03184209485384281,
1294
+ "learning_rate": 0.00019363553656896485,
1295
+ "loss": 0.0566,
1296
+ "mean_token_accuracy": 0.9828783690929412,
1297
+ "num_tokens": 3032924.0,
1298
+ "step": 1610
1299
+ },
1300
+ {
1301
+ "epoch": 0.032039871840512636,
1302
+ "learning_rate": 0.00019359598117163086,
1303
+ "loss": 0.0805,
1304
+ "mean_token_accuracy": 0.972636216878891,
1305
+ "num_tokens": 3051960.0,
1306
+ "step": 1620
1307
+ },
1308
+ {
1309
+ "epoch": 0.03223764882718247,
1310
+ "learning_rate": 0.00019355642577429692,
1311
+ "loss": 0.0831,
1312
+ "mean_token_accuracy": 0.9726630806922912,
1313
+ "num_tokens": 3070644.0,
1314
+ "step": 1630
1315
+ },
1316
+ {
1317
+ "epoch": 0.0324354258138523,
1318
+ "learning_rate": 0.00019351687037696296,
1319
+ "loss": 0.0766,
1320
+ "mean_token_accuracy": 0.9787626624107361,
1321
+ "num_tokens": 3089393.0,
1322
+ "step": 1640
1323
+ },
1324
+ {
1325
+ "epoch": 0.03263320280052213,
1326
+ "learning_rate": 0.00019347731497962897,
1327
+ "loss": 0.0665,
1328
+ "mean_token_accuracy": 0.9728036403656006,
1329
+ "num_tokens": 3108260.0,
1330
+ "step": 1650
1331
+ },
1332
+ {
1333
+ "epoch": 0.03283097978719196,
1334
+ "learning_rate": 0.000193437759582295,
1335
+ "loss": 0.0842,
1336
+ "mean_token_accuracy": 0.969732540845871,
1337
+ "num_tokens": 3127157.0,
1338
+ "step": 1660
1339
+ },
1340
+ {
1341
+ "epoch": 0.033028756773861793,
1342
+ "learning_rate": 0.00019339820418496105,
1343
+ "loss": 0.072,
1344
+ "mean_token_accuracy": 0.9739607870578766,
1345
+ "num_tokens": 3146126.0,
1346
+ "step": 1670
1347
+ },
1348
+ {
1349
+ "epoch": 0.033226533760531626,
1350
+ "learning_rate": 0.00019335864878762708,
1351
+ "loss": 0.0835,
1352
+ "mean_token_accuracy": 0.9699565410614014,
1353
+ "num_tokens": 3165091.0,
1354
+ "step": 1680
1355
+ }
1356
+ ],
1357
+ "logging_steps": 10,
1358
+ "max_steps": 50562,
1359
+ "num_input_tokens_seen": 0,
1360
+ "num_train_epochs": 9223372036854775807,
1361
+ "save_steps": 1685,
1362
+ "stateful_callbacks": {
1363
+ "TrainerControl": {
1364
+ "args": {
1365
+ "should_epoch_stop": false,
1366
+ "should_evaluate": false,
1367
+ "should_log": false,
1368
+ "should_save": true,
1369
+ "should_training_stop": false
1370
+ },
1371
+ "attributes": {}
1372
+ }
1373
+ },
1374
+ "total_flos": 1.5391152511647744e+17,
1375
+ "train_batch_size": 16,
1376
+ "trial_name": null,
1377
+ "trial_params": null
1378
+ }
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.0
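
The card above is the unfilled PEFT template; only the frontmatter (base model path, `library_name: peft`) and the framework version carry information. Since its "How to Get Started" section is left as a placeholder, the following is a generic, hedged sketch of the usual PEFT adapter-loading pattern, not the authors' code: whether the custom Llama-2 q-former base loads through `AutoModelForCausalLM` at all is an assumption, and GAMA's own model-loading code may be required instead.

```python
# Generic peft loading sketch; NOT taken from the model card. Paths are
# illustrative, and the custom q-former base may need GAMA's own loader
# rather than AutoModelForCausalLM.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_path = "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/"
adapter_path = "gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685"

base = AutoModelForCausalLM.from_pretrained(base_path)
model = PeftModel.from_pretrained(base, adapter_path)  # attaches the LoRA weights
tokenizer = AutoTokenizer.from_pretrained(adapter_path)
```
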
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/adapter_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 8,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "q_proj",
28
+ "v_proj"
29
+ ],
30
+ "task_type": "CAUSAL_LM",
31
+ "trainable_token_indices": null,
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
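
For readers reconstructing the training setup, the JSON above maps field for field onto a `peft.LoraConfig`; the sketch below is inferred from the config file (PEFT 0.15.0 per the model card), not taken from the repository's training code. With `r=8` and `lora_alpha=16` the effective LoRA scaling is alpha/r = 2.0, and only the attention query and value projections are adapted.

```python
# Sketch inferred from adapter_config.json above; not the repository's code.
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,                                  # LoRA rank
    lora_alpha=16,                        # scaling: lora_alpha / r = 2.0
    lora_dropout=0.0,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # attention query/value projections
    task_type="CAUSAL_LM",
)
```
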
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
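
One detail worth noting above: Llama-2 ships without a dedicated padding token, so this checkpoint reuses the EOS token `</s>` as `pad_token`. An illustrative check after loading (the directory name is a placeholder):

```python
# Illustrative only; the checkpoint directory is a placeholder.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-1685")
assert tok.pad_token == tok.eos_token == "</s>"
```

A common caveat of this aliasing is that collators which mask `pad_token_id` in the labels will also mask genuine EOS tokens unless they distinguish the two.
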
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": false,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": "</s>",
39
+ "padding_side": "right",
40
+ "sp_model_kwargs": {},
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
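
The `chat_template` above is the standard Llama-2 chat format: each user turn is wrapped in `[INST] ... [/INST]`, with an optional `<<SYS>>` block folded into the first user turn. A minimal sketch of what it renders (hypothetical messages, placeholder path):

```python
# Hypothetical messages; the tokenizer directory is a placeholder.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-1685")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe the audio clip."},
]
print(tok.apply_chat_template(messages, tokenize=False))
# <s>[INST] <<SYS>>
# You are a helpful assistant.
# <</SYS>>
#
# Describe the audio clip. [/INST]
```
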
gama/gama-semantic_mc_qa--semantic_elements-20250424_081253/checkpoint-1685/trainer_state.json ADDED
@@ -0,0 +1,1378 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.03332542225386654,
6
+ "eval_steps": 500,
7
+ "global_step": 1685,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0001977769866698311,
14
+ "learning_rate": 0.00019996440014239942,
15
+ "loss": 0.3352,
16
+ "mean_token_accuracy": 0.9023264050483704,
17
+ "num_tokens": 19163.0,
18
+ "step": 10
19
+ },
20
+ {
21
+ "epoch": 0.0003955539733396622,
22
+ "learning_rate": 0.0001999248447450655,
23
+ "loss": 0.1524,
24
+ "mean_token_accuracy": 0.947095412015915,
25
+ "num_tokens": 38071.0,
26
+ "step": 20
27
+ },
28
+ {
29
+ "epoch": 0.0005933309600094933,
30
+ "learning_rate": 0.0001998852893477315,
31
+ "loss": 0.1461,
32
+ "mean_token_accuracy": 0.9463379800319671,
33
+ "num_tokens": 57017.0,
34
+ "step": 30
35
+ },
36
+ {
37
+ "epoch": 0.0007911079466793244,
38
+ "learning_rate": 0.00019984573395039754,
39
+ "loss": 0.1024,
40
+ "mean_token_accuracy": 0.9588611423969269,
41
+ "num_tokens": 76047.0,
42
+ "step": 40
43
+ },
44
+ {
45
+ "epoch": 0.0009888849333491555,
46
+ "learning_rate": 0.00019980617855306357,
47
+ "loss": 0.117,
48
+ "mean_token_accuracy": 0.9553473949432373,
49
+ "num_tokens": 94824.0,
50
+ "step": 50
51
+ },
52
+ {
53
+ "epoch": 0.0011866619200189867,
54
+ "learning_rate": 0.0001997666231557296,
55
+ "loss": 0.1281,
56
+ "mean_token_accuracy": 0.9538084208965302,
57
+ "num_tokens": 113467.0,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.0013844389066888178,
62
+ "learning_rate": 0.00019972706775839565,
63
+ "loss": 0.1075,
64
+ "mean_token_accuracy": 0.961921775341034,
65
+ "num_tokens": 132229.0,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.0015822158933586489,
70
+ "learning_rate": 0.00019968751236106166,
71
+ "loss": 0.1057,
72
+ "mean_token_accuracy": 0.9676864743232727,
73
+ "num_tokens": 150811.0,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.00177999288002848,
78
+ "learning_rate": 0.00019964795696372772,
79
+ "loss": 0.1048,
80
+ "mean_token_accuracy": 0.9597055971622467,
81
+ "num_tokens": 169804.0,
82
+ "step": 90
83
+ },
84
+ {
85
+ "epoch": 0.001977769866698311,
86
+ "learning_rate": 0.00019960840156639376,
87
+ "loss": 0.1147,
88
+ "mean_token_accuracy": 0.9561040580272675,
89
+ "num_tokens": 188503.0,
90
+ "step": 100
91
+ },
92
+ {
93
+ "epoch": 0.002175546853368142,
94
+ "learning_rate": 0.00019956884616905977,
95
+ "loss": 0.0904,
96
+ "mean_token_accuracy": 0.9663344562053681,
97
+ "num_tokens": 207370.0,
98
+ "step": 110
99
+ },
100
+ {
101
+ "epoch": 0.0023733238400379733,
102
+ "learning_rate": 0.0001995292907717258,
103
+ "loss": 0.085,
104
+ "mean_token_accuracy": 0.9692767798900604,
105
+ "num_tokens": 226639.0,
106
+ "step": 120
107
+ },
108
+ {
109
+ "epoch": 0.0025711008267078044,
110
+ "learning_rate": 0.00019948973537439185,
111
+ "loss": 0.1055,
112
+ "mean_token_accuracy": 0.962373024225235,
113
+ "num_tokens": 245318.0,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.0027688778133776355,
118
+ "learning_rate": 0.00019945017997705788,
119
+ "loss": 0.1023,
120
+ "mean_token_accuracy": 0.9603676617145538,
121
+ "num_tokens": 264089.0,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.0029666548000474666,
126
+ "learning_rate": 0.0001994106245797239,
127
+ "loss": 0.1061,
128
+ "mean_token_accuracy": 0.9619979500770569,
129
+ "num_tokens": 282977.0,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 0.0031644317867172977,
134
+ "learning_rate": 0.00019937106918238996,
135
+ "loss": 0.1021,
136
+ "mean_token_accuracy": 0.9600433588027955,
137
+ "num_tokens": 301615.0,
138
+ "step": 160
139
+ },
140
+ {
141
+ "epoch": 0.003362208773387129,
142
+ "learning_rate": 0.000199331513785056,
143
+ "loss": 0.095,
144
+ "mean_token_accuracy": 0.9641285121440888,
145
+ "num_tokens": 320502.0,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 0.00355998576005696,
150
+ "learning_rate": 0.000199291958387722,
151
+ "loss": 0.0722,
152
+ "mean_token_accuracy": 0.9708887040615082,
153
+ "num_tokens": 339616.0,
154
+ "step": 180
155
+ },
156
+ {
157
+ "epoch": 0.003757762746726791,
158
+ "learning_rate": 0.00019925240299038804,
159
+ "loss": 0.0951,
160
+ "mean_token_accuracy": 0.9712291181087493,
161
+ "num_tokens": 358474.0,
162
+ "step": 190
163
+ },
164
+ {
165
+ "epoch": 0.003955539733396622,
166
+ "learning_rate": 0.00019921284759305408,
167
+ "loss": 0.1194,
168
+ "mean_token_accuracy": 0.9630812525749206,
169
+ "num_tokens": 377094.0,
170
+ "step": 200
171
+ },
172
+ {
173
+ "epoch": 0.004153316720066453,
174
+ "learning_rate": 0.00019917329219572012,
175
+ "loss": 0.1002,
176
+ "mean_token_accuracy": 0.9660979807376862,
177
+ "num_tokens": 396000.0,
178
+ "step": 210
179
+ },
180
+ {
181
+ "epoch": 0.004351093706736284,
182
+ "learning_rate": 0.00019913373679838613,
183
+ "loss": 0.0954,
184
+ "mean_token_accuracy": 0.9636943399906158,
185
+ "num_tokens": 415019.0,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.0045488706934061155,
190
+ "learning_rate": 0.0001990941814010522,
191
+ "loss": 0.1114,
192
+ "mean_token_accuracy": 0.9662698566913605,
193
+ "num_tokens": 433711.0,
194
+ "step": 230
195
+ },
196
+ {
197
+ "epoch": 0.004746647680075947,
198
+ "learning_rate": 0.00019905462600371823,
199
+ "loss": 0.0915,
200
+ "mean_token_accuracy": 0.9679243505001068,
201
+ "num_tokens": 452483.0,
202
+ "step": 240
203
+ },
204
+ {
205
+ "epoch": 0.004944424666745778,
206
+ "learning_rate": 0.00019901507060638424,
207
+ "loss": 0.0951,
208
+ "mean_token_accuracy": 0.9688079237937928,
209
+ "num_tokens": 471395.0,
210
+ "step": 250
211
+ },
212
+ {
213
+ "epoch": 0.005142201653415609,
214
+ "learning_rate": 0.00019897551520905028,
215
+ "loss": 0.1123,
216
+ "mean_token_accuracy": 0.962276142835617,
217
+ "num_tokens": 489983.0,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 0.00533997864008544,
222
+ "learning_rate": 0.00019893595981171632,
223
+ "loss": 0.0855,
224
+ "mean_token_accuracy": 0.9698696434497833,
225
+ "num_tokens": 509148.0,
226
+ "step": 270
227
+ },
228
+ {
229
+ "epoch": 0.005537755626755271,
230
+ "learning_rate": 0.00019889640441438235,
231
+ "loss": 0.0777,
232
+ "mean_token_accuracy": 0.9697826623916626,
233
+ "num_tokens": 528042.0,
234
+ "step": 280
235
+ },
236
+ {
237
+ "epoch": 0.005735532613425102,
238
+ "learning_rate": 0.0001988568490170484,
239
+ "loss": 0.0944,
240
+ "mean_token_accuracy": 0.9690817773342133,
241
+ "num_tokens": 546656.0,
242
+ "step": 290
243
+ },
244
+ {
245
+ "epoch": 0.005933309600094933,
246
+ "learning_rate": 0.00019881729361971443,
247
+ "loss": 0.0872,
248
+ "mean_token_accuracy": 0.9661558032035827,
249
+ "num_tokens": 565279.0,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.006131086586764764,
254
+ "learning_rate": 0.00019877773822238047,
255
+ "loss": 0.09,
256
+ "mean_token_accuracy": 0.9669564247131348,
257
+ "num_tokens": 584196.0,
258
+ "step": 310
259
+ },
260
+ {
261
+ "epoch": 0.0063288635734345955,
262
+ "learning_rate": 0.00019873818282504648,
263
+ "loss": 0.0701,
264
+ "mean_token_accuracy": 0.9722951114177704,
265
+ "num_tokens": 603050.0,
266
+ "step": 320
267
+ },
268
+ {
269
+ "epoch": 0.006526640560104427,
270
+ "learning_rate": 0.00019869862742771251,
271
+ "loss": 0.0922,
272
+ "mean_token_accuracy": 0.9692854762077332,
273
+ "num_tokens": 621880.0,
274
+ "step": 330
275
+ },
276
+ {
277
+ "epoch": 0.006724417546774258,
278
+ "learning_rate": 0.00019865907203037855,
279
+ "loss": 0.0976,
280
+ "mean_token_accuracy": 0.9660769879817963,
281
+ "num_tokens": 640657.0,
282
+ "step": 340
283
+ },
284
+ {
285
+ "epoch": 0.006922194533444089,
286
+ "learning_rate": 0.0001986195166330446,
287
+ "loss": 0.1071,
288
+ "mean_token_accuracy": 0.9633386790752411,
289
+ "num_tokens": 659503.0,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 0.00711997152011392,
294
+ "learning_rate": 0.00019857996123571063,
295
+ "loss": 0.1058,
296
+ "mean_token_accuracy": 0.9641251742839814,
297
+ "num_tokens": 678570.0,
298
+ "step": 360
299
+ },
300
+ {
301
+ "epoch": 0.007317748506783751,
302
+ "learning_rate": 0.00019854040583837666,
303
+ "loss": 0.097,
304
+ "mean_token_accuracy": 0.9664280533790588,
305
+ "num_tokens": 697294.0,
306
+ "step": 370
307
+ },
308
+ {
309
+ "epoch": 0.007515525493453582,
310
+ "learning_rate": 0.0001985008504410427,
311
+ "loss": 0.0677,
312
+ "mean_token_accuracy": 0.9754213869571686,
313
+ "num_tokens": 716458.0,
314
+ "step": 380
315
+ },
316
+ {
317
+ "epoch": 0.007713302480123413,
318
+ "learning_rate": 0.0001984612950437087,
319
+ "loss": 0.0622,
320
+ "mean_token_accuracy": 0.9724574089050293,
321
+ "num_tokens": 735437.0,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 0.007911079466793244,
326
+ "learning_rate": 0.00019842173964637475,
327
+ "loss": 0.1003,
328
+ "mean_token_accuracy": 0.9710333228111268,
329
+ "num_tokens": 754416.0,
330
+ "step": 400
331
+ },
332
+ {
333
+ "epoch": 0.008108856453463075,
334
+ "learning_rate": 0.0001983821842490408,
335
+ "loss": 0.0921,
336
+ "mean_token_accuracy": 0.9718518137931824,
337
+ "num_tokens": 773091.0,
338
+ "step": 410
339
+ },
340
+ {
341
+ "epoch": 0.008306633440132907,
342
+ "learning_rate": 0.00019834262885170682,
343
+ "loss": 0.0836,
344
+ "mean_token_accuracy": 0.9694978713989257,
345
+ "num_tokens": 791900.0,
346
+ "step": 420
347
+ },
348
+ {
349
+ "epoch": 0.008504410426802738,
350
+ "learning_rate": 0.00019830307345437286,
351
+ "loss": 0.0822,
352
+ "mean_token_accuracy": 0.974584549665451,
353
+ "num_tokens": 810547.0,
354
+ "step": 430
355
+ },
356
+ {
357
+ "epoch": 0.008702187413472569,
358
+ "learning_rate": 0.0001982635180570389,
359
+ "loss": 0.0735,
360
+ "mean_token_accuracy": 0.9770256340503692,
361
+ "num_tokens": 829753.0,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 0.0088999644001424,
366
+ "learning_rate": 0.00019822396265970494,
367
+ "loss": 0.0882,
368
+ "mean_token_accuracy": 0.9668483734130859,
369
+ "num_tokens": 848294.0,
370
+ "step": 450
371
+ },
372
+ {
373
+ "epoch": 0.009097741386812231,
374
+ "learning_rate": 0.00019818440726237095,
375
+ "loss": 0.0923,
376
+ "mean_token_accuracy": 0.9715612173080445,
377
+ "num_tokens": 867635.0,
378
+ "step": 460
379
+ },
380
+ {
381
+ "epoch": 0.009295518373482062,
382
+ "learning_rate": 0.00019814485186503699,
383
+ "loss": 0.0665,
384
+ "mean_token_accuracy": 0.9765240132808686,
385
+ "num_tokens": 886596.0,
386
+ "step": 470
387
+ },
388
+ {
389
+ "epoch": 0.009493295360151893,
390
+ "learning_rate": 0.00019810529646770302,
391
+ "loss": 0.08,
392
+ "mean_token_accuracy": 0.972789865732193,
393
+ "num_tokens": 905312.0,
394
+ "step": 480
395
+ },
396
+ {
397
+ "epoch": 0.009691072346821724,
398
+ "learning_rate": 0.00019806574107036906,
399
+ "loss": 0.0794,
400
+ "mean_token_accuracy": 0.9753748655319214,
401
+ "num_tokens": 924072.0,
402
+ "step": 490
403
+ },
404
+ {
405
+ "epoch": 0.009888849333491555,
406
+ "learning_rate": 0.0001980261856730351,
407
+ "loss": 0.0673,
408
+ "mean_token_accuracy": 0.9732649922370911,
409
+ "num_tokens": 942844.0,
410
+ "step": 500
411
+ },
412
+ {
413
+ "epoch": 0.010086626320161387,
414
+ "learning_rate": 0.00019798663027570113,
415
+ "loss": 0.1029,
416
+ "mean_token_accuracy": 0.9693170130252838,
417
+ "num_tokens": 961524.0,
418
+ "step": 510
419
+ },
420
+ {
421
+ "epoch": 0.010284403306831218,
422
+ "learning_rate": 0.00019794707487836717,
423
+ "loss": 0.0901,
424
+ "mean_token_accuracy": 0.9669535160064697,
425
+ "num_tokens": 980332.0,
426
+ "step": 520
427
+ },
428
+ {
429
+ "epoch": 0.010482180293501049,
430
+ "learning_rate": 0.00019790751948103318,
431
+ "loss": 0.0858,
432
+ "mean_token_accuracy": 0.9700904309749603,
433
+ "num_tokens": 999163.0,
434
+ "step": 530
435
+ },
436
+ {
437
+ "epoch": 0.01067995728017088,
438
+ "learning_rate": 0.00019786796408369922,
439
+ "loss": 0.1001,
440
+ "mean_token_accuracy": 0.9630756258964539,
441
+ "num_tokens": 1018053.0,
442
+ "step": 540
443
+ },
444
+ {
445
+ "epoch": 0.010877734266840711,
446
+ "learning_rate": 0.00019782840868636528,
447
+ "loss": 0.0798,
448
+ "mean_token_accuracy": 0.9710527896881104,
449
+ "num_tokens": 1036828.0,
450
+ "step": 550
451
+ },
452
+ {
453
+ "epoch": 0.011075511253510542,
454
+ "learning_rate": 0.0001977888532890313,
455
+ "loss": 0.0883,
456
+ "mean_token_accuracy": 0.9719638526439667,
457
+ "num_tokens": 1055843.0,
458
+ "step": 560
459
+ },
460
+ {
461
+ "epoch": 0.011273288240180373,
462
+ "learning_rate": 0.00019774929789169733,
463
+ "loss": 0.0817,
464
+ "mean_token_accuracy": 0.9706017553806305,
465
+ "num_tokens": 1074733.0,
466
+ "step": 570
467
+ },
468
+ {
469
+ "epoch": 0.011471065226850204,
470
+ "learning_rate": 0.00019770974249436337,
471
+ "loss": 0.0949,
472
+ "mean_token_accuracy": 0.9684431076049804,
473
+ "num_tokens": 1093519.0,
474
+ "step": 580
475
+ },
476
+ {
477
+ "epoch": 0.011668842213520035,
478
+ "learning_rate": 0.0001976701870970294,
479
+ "loss": 0.0797,
480
+ "mean_token_accuracy": 0.975245076417923,
481
+ "num_tokens": 1112394.0,
482
+ "step": 590
483
+ },
484
+ {
485
+ "epoch": 0.011866619200189867,
486
+ "learning_rate": 0.00019763063169969542,
487
+ "loss": 0.0814,
488
+ "mean_token_accuracy": 0.9766307890415191,
489
+ "num_tokens": 1131435.0,
490
+ "step": 600
491
+ },
492
+ {
493
+ "epoch": 0.012064396186859698,
494
+ "learning_rate": 0.00019759107630236146,
495
+ "loss": 0.0893,
496
+ "mean_token_accuracy": 0.9671396017074585,
497
+ "num_tokens": 1150274.0,
498
+ "step": 610
499
+ },
500
+ {
501
+ "epoch": 0.012262173173529529,
502
+ "learning_rate": 0.00019755152090502752,
503
+ "loss": 0.0965,
504
+ "mean_token_accuracy": 0.9663532435894012,
505
+ "num_tokens": 1169076.0,
506
+ "step": 620
507
+ },
508
+ {
509
+ "epoch": 0.01245995016019936,
510
+ "learning_rate": 0.00019751196550769353,
511
+ "loss": 0.0692,
512
+ "mean_token_accuracy": 0.9754896223545074,
513
+ "num_tokens": 1187908.0,
514
+ "step": 630
515
+ },
516
+ {
517
+ "epoch": 0.012657727146869191,
518
+ "learning_rate": 0.00019747241011035957,
519
+ "loss": 0.0893,
520
+ "mean_token_accuracy": 0.9667335331439972,
521
+ "num_tokens": 1206872.0,
522
+ "step": 640
523
+ },
524
+ {
525
+ "epoch": 0.012855504133539022,
526
+ "learning_rate": 0.0001974328547130256,
527
+ "loss": 0.0762,
528
+ "mean_token_accuracy": 0.9724668145179749,
529
+ "num_tokens": 1225767.0,
530
+ "step": 650
531
+ },
532
+ {
533
+ "epoch": 0.013053281120208853,
534
+ "learning_rate": 0.00019739329931569164,
535
+ "loss": 0.08,
536
+ "mean_token_accuracy": 0.9690144300460816,
537
+ "num_tokens": 1244340.0,
538
+ "step": 660
539
+ },
540
+ {
541
+ "epoch": 0.013251058106878684,
542
+ "learning_rate": 0.00019735374391835765,
543
+ "loss": 0.0872,
544
+ "mean_token_accuracy": 0.9725019693374634,
545
+ "num_tokens": 1263304.0,
546
+ "step": 670
547
+ },
548
+ {
549
+ "epoch": 0.013448835093548515,
550
+ "learning_rate": 0.0001973141885210237,
551
+ "loss": 0.0894,
552
+ "mean_token_accuracy": 0.9694939434528351,
553
+ "num_tokens": 1282028.0,
554
+ "step": 680
555
+ },
556
+ {
557
+ "epoch": 0.013646612080218347,
558
+ "learning_rate": 0.00019727463312368976,
559
+ "loss": 0.096,
560
+ "mean_token_accuracy": 0.9666180431842804,
561
+ "num_tokens": 1300780.0,
562
+ "step": 690
563
+ },
564
+ {
565
+ "epoch": 0.013844389066888178,
566
+ "learning_rate": 0.00019723507772635577,
567
+ "loss": 0.0804,
568
+ "mean_token_accuracy": 0.9751847207546234,
569
+ "num_tokens": 1319535.0,
570
+ "step": 700
571
+ },
572
+ {
573
+ "epoch": 0.014042166053558009,
574
+ "learning_rate": 0.0001971955223290218,
575
+ "loss": 0.1046,
576
+ "mean_token_accuracy": 0.9681148648262023,
577
+ "num_tokens": 1338391.0,
578
+ "step": 710
579
+ },
580
+ {
581
+ "epoch": 0.01423994304022784,
582
+ "learning_rate": 0.00019715596693168784,
583
+ "loss": 0.0804,
584
+ "mean_token_accuracy": 0.9726056635379792,
585
+ "num_tokens": 1357072.0,
586
+ "step": 720
587
+ },
588
+ {
589
+ "epoch": 0.014437720026897671,
590
+ "learning_rate": 0.00019711641153435388,
591
+ "loss": 0.0664,
592
+ "mean_token_accuracy": 0.9787862658500671,
593
+ "num_tokens": 1375999.0,
594
+ "step": 730
595
+ },
596
+ {
597
+ "epoch": 0.014635497013567502,
598
+ "learning_rate": 0.00019707685613701992,
599
+ "loss": 0.0823,
600
+ "mean_token_accuracy": 0.9705499827861785,
601
+ "num_tokens": 1394654.0,
602
+ "step": 740
603
+ },
604
+ {
605
+ "epoch": 0.014833274000237333,
606
+ "learning_rate": 0.00019703730073968593,
607
+ "loss": 0.0828,
608
+ "mean_token_accuracy": 0.9703374147415161,
609
+ "num_tokens": 1413323.0,
610
+ "step": 750
611
+ },
612
+ {
613
+ "epoch": 0.015031050986907164,
614
+ "learning_rate": 0.000196997745342352,
615
+ "loss": 0.0797,
616
+ "mean_token_accuracy": 0.9717476069927216,
617
+ "num_tokens": 1432274.0,
618
+ "step": 760
619
+ },
620
+ {
621
+ "epoch": 0.015228827973576995,
622
+ "learning_rate": 0.000196958189945018,
623
+ "loss": 0.0811,
624
+ "mean_token_accuracy": 0.9687287509441376,
625
+ "num_tokens": 1451009.0,
626
+ "step": 770
627
+ },
628
+ {
629
+ "epoch": 0.015426604960246827,
630
+ "learning_rate": 0.00019691863454768404,
631
+ "loss": 0.0799,
632
+ "mean_token_accuracy": 0.9712490200996399,
633
+ "num_tokens": 1469741.0,
634
+ "step": 780
635
+ },
636
+ {
637
+ "epoch": 0.015624381946916658,
638
+ "learning_rate": 0.00019687907915035008,
639
+ "loss": 0.0698,
640
+ "mean_token_accuracy": 0.9755463302135468,
641
+ "num_tokens": 1488644.0,
642
+ "step": 790
643
+ },
644
+ {
645
+ "epoch": 0.01582215893358649,
646
+ "learning_rate": 0.0001968395237530161,
647
+ "loss": 0.07,
648
+ "mean_token_accuracy": 0.9786295354366302,
649
+ "num_tokens": 1507384.0,
650
+ "step": 800
651
+ },
652
+ {
653
+ "epoch": 0.016019935920256318,
654
+ "learning_rate": 0.00019679996835568215,
655
+ "loss": 0.0842,
656
+ "mean_token_accuracy": 0.9716646075248718,
657
+ "num_tokens": 1526208.0,
658
+ "step": 810
659
+ },
660
+ {
661
+ "epoch": 0.01621771290692615,
662
+ "learning_rate": 0.00019676041295834816,
663
+ "loss": 0.0681,
664
+ "mean_token_accuracy": 0.9786826431751251,
665
+ "num_tokens": 1545086.0,
666
+ "step": 820
667
+ },
668
+ {
669
+ "epoch": 0.01641548989359598,
670
+ "learning_rate": 0.00019672085756101423,
671
+ "loss": 0.068,
672
+ "mean_token_accuracy": 0.9776684403419494,
673
+ "num_tokens": 1563972.0,
674
+ "step": 830
675
+ },
676
+ {
677
+ "epoch": 0.016613266880265813,
678
+ "learning_rate": 0.00019668130216368024,
679
+ "loss": 0.0855,
680
+ "mean_token_accuracy": 0.9758803486824036,
681
+ "num_tokens": 1582893.0,
682
+ "step": 840
683
+ },
684
+ {
685
+ "epoch": 0.016811043866935643,
686
+ "learning_rate": 0.00019664174676634627,
687
+ "loss": 0.0969,
688
+ "mean_token_accuracy": 0.9676210284233093,
689
+ "num_tokens": 1601719.0,
690
+ "step": 850
691
+ },
692
+ {
693
+ "epoch": 0.017008820853605475,
694
+ "learning_rate": 0.0001966021913690123,
695
+ "loss": 0.0836,
696
+ "mean_token_accuracy": 0.973316353559494,
697
+ "num_tokens": 1620493.0,
698
+ "step": 860
699
+ },
700
+ {
701
+ "epoch": 0.017206597840275305,
702
+ "learning_rate": 0.00019656263597167835,
703
+ "loss": 0.0707,
704
+ "mean_token_accuracy": 0.9776813209056854,
705
+ "num_tokens": 1639336.0,
706
+ "step": 870
707
+ },
708
+ {
709
+ "epoch": 0.017404374826945138,
710
+ "learning_rate": 0.00019652308057434439,
711
+ "loss": 0.0921,
712
+ "mean_token_accuracy": 0.9702894032001496,
713
+ "num_tokens": 1658213.0,
714
+ "step": 880
715
+ },
716
+ {
717
+ "epoch": 0.017602151813614967,
718
+ "learning_rate": 0.0001964835251770104,
719
+ "loss": 0.0629,
720
+ "mean_token_accuracy": 0.9804218530654907,
721
+ "num_tokens": 1677260.0,
722
+ "step": 890
723
+ },
724
+ {
725
+ "epoch": 0.0177999288002848,
726
+ "learning_rate": 0.00019644396977967646,
727
+ "loss": 0.0946,
728
+ "mean_token_accuracy": 0.9700180232524872,
729
+ "num_tokens": 1696174.0,
730
+ "step": 900
731
+ },
732
+ {
733
+ "epoch": 0.01799770578695463,
734
+ "learning_rate": 0.00019640441438234247,
735
+ "loss": 0.0822,
736
+ "mean_token_accuracy": 0.9717112898826599,
737
+ "num_tokens": 1714822.0,
738
+ "step": 910
739
+ },
740
+ {
741
+ "epoch": 0.018195482773624462,
742
+ "learning_rate": 0.0001963648589850085,
743
+ "loss": 0.0732,
744
+ "mean_token_accuracy": 0.9710357069969178,
745
+ "num_tokens": 1733526.0,
746
+ "step": 920
747
+ },
748
+ {
749
+ "epoch": 0.01839325976029429,
750
+ "learning_rate": 0.00019632530358767455,
751
+ "loss": 0.0697,
752
+ "mean_token_accuracy": 0.9731245815753937,
753
+ "num_tokens": 1752469.0,
754
+ "step": 930
755
+ },
756
+ {
757
+ "epoch": 0.018591036746964124,
758
+ "learning_rate": 0.00019628574819034058,
759
+ "loss": 0.065,
760
+ "mean_token_accuracy": 0.9799518942832947,
761
+ "num_tokens": 1771439.0,
762
+ "step": 940
763
+ },
764
+ {
765
+ "epoch": 0.018788813733633954,
766
+ "learning_rate": 0.00019624619279300662,
767
+ "loss": 0.0769,
768
+ "mean_token_accuracy": 0.9691688776016235,
769
+ "num_tokens": 1790105.0,
770
+ "step": 950
771
+ },
772
+ {
773
+ "epoch": 0.018986590720303786,
774
+ "learning_rate": 0.00019620663739567263,
775
+ "loss": 0.0675,
776
+ "mean_token_accuracy": 0.9775746822357178,
777
+ "num_tokens": 1809000.0,
778
+ "step": 960
779
+ },
780
+ {
781
+ "epoch": 0.019184367706973616,
782
+ "learning_rate": 0.0001961670819983387,
783
+ "loss": 0.0761,
784
+ "mean_token_accuracy": 0.9722946584224701,
785
+ "num_tokens": 1827743.0,
786
+ "step": 970
787
+ },
788
+ {
789
+ "epoch": 0.01938214469364345,
790
+ "learning_rate": 0.0001961275266010047,
791
+ "loss": 0.0827,
792
+ "mean_token_accuracy": 0.9710648536682129,
793
+ "num_tokens": 1846483.0,
794
+ "step": 980
795
+ },
796
+ {
797
+ "epoch": 0.019579921680313278,
798
+ "learning_rate": 0.00019608797120367074,
799
+ "loss": 0.0548,
800
+ "mean_token_accuracy": 0.9855950713157654,
801
+ "num_tokens": 1865230.0,
802
+ "step": 990
803
+ },
804
+ {
805
+ "epoch": 0.01977769866698311,
806
+ "learning_rate": 0.00019604841580633678,
807
+ "loss": 0.0934,
808
+ "mean_token_accuracy": 0.9695265829563141,
809
+ "num_tokens": 1884295.0,
810
+ "step": 1000
811
+ },
812
+ {
813
+ "epoch": 0.01997547565365294,
814
+ "learning_rate": 0.00019600886040900282,
815
+ "loss": 0.0806,
816
+ "mean_token_accuracy": 0.9750065445899964,
817
+ "num_tokens": 1903297.0,
818
+ "step": 1010
819
+ },
820
+ {
821
+ "epoch": 0.020173252640322773,
822
+ "learning_rate": 0.00019596930501166886,
823
+ "loss": 0.0683,
824
+ "mean_token_accuracy": 0.9779970288276673,
825
+ "num_tokens": 1922020.0,
826
+ "step": 1020
827
+ },
828
+ {
829
+ "epoch": 0.020371029626992603,
830
+ "learning_rate": 0.00019592974961433487,
831
+ "loss": 0.0636,
832
+ "mean_token_accuracy": 0.9801307320594788,
833
+ "num_tokens": 1940771.0,
834
+ "step": 1030
835
+ },
836
+ {
837
+ "epoch": 0.020568806613662435,
838
+ "learning_rate": 0.00019589019421700093,
839
+ "loss": 0.078,
840
+ "mean_token_accuracy": 0.9704040169715882,
841
+ "num_tokens": 1959790.0,
842
+ "step": 1040
843
+ },
844
+ {
845
+ "epoch": 0.020766583600332265,
846
+ "learning_rate": 0.00019585063881966694,
847
+ "loss": 0.0641,
848
+ "mean_token_accuracy": 0.978104192018509,
849
+ "num_tokens": 1978766.0,
850
+ "step": 1050
851
+ },
852
+ {
853
+ "epoch": 0.020964360587002098,
854
+ "learning_rate": 0.00019581108342233298,
855
+ "loss": 0.0931,
856
+ "mean_token_accuracy": 0.967569786310196,
857
+ "num_tokens": 1997785.0,
858
+ "step": 1060
859
+ },
860
+ {
861
+ "epoch": 0.021162137573671927,
862
+ "learning_rate": 0.00019577152802499902,
863
+ "loss": 0.0877,
864
+ "mean_token_accuracy": 0.9735791981220245,
865
+ "num_tokens": 2016334.0,
866
+ "step": 1070
867
+ },
868
+ {
869
+ "epoch": 0.02135991456034176,
870
+ "learning_rate": 0.00019573197262766505,
871
+ "loss": 0.0833,
872
+ "mean_token_accuracy": 0.9717927515506745,
873
+ "num_tokens": 2035271.0,
874
+ "step": 1080
875
+ },
876
+ {
877
+ "epoch": 0.02155769154701159,
878
+ "learning_rate": 0.0001956924172303311,
879
+ "loss": 0.0732,
880
+ "mean_token_accuracy": 0.97304065823555,
881
+ "num_tokens": 2054158.0,
882
+ "step": 1090
883
+ },
884
+ {
885
+ "epoch": 0.021755468533681422,
886
+ "learning_rate": 0.0001956528618329971,
887
+ "loss": 0.0782,
888
+ "mean_token_accuracy": 0.971097469329834,
889
+ "num_tokens": 2072794.0,
890
+ "step": 1100
891
+ },
892
+ {
893
+ "epoch": 0.02195324552035125,
894
+ "learning_rate": 0.00019561330643566317,
895
+ "loss": 0.0745,
896
+ "mean_token_accuracy": 0.9693615853786468,
897
+ "num_tokens": 2091583.0,
898
+ "step": 1110
899
+ },
900
+ {
901
+ "epoch": 0.022151022507021084,
902
+ "learning_rate": 0.0001955737510383292,
903
+ "loss": 0.0575,
904
+ "mean_token_accuracy": 0.9811938166618347,
905
+ "num_tokens": 2110207.0,
906
+ "step": 1120
907
+ },
908
+ {
909
+ "epoch": 0.022348799493690914,
910
+ "learning_rate": 0.00019553419564099521,
911
+ "loss": 0.0838,
912
+ "mean_token_accuracy": 0.9719610214233398,
913
+ "num_tokens": 2129006.0,
914
+ "step": 1130
915
+ },
916
+ {
917
+ "epoch": 0.022546576480360746,
918
+ "learning_rate": 0.00019549464024366125,
919
+ "loss": 0.0704,
920
+ "mean_token_accuracy": 0.9780212998390198,
921
+ "num_tokens": 2147866.0,
922
+ "step": 1140
923
+ },
924
+ {
925
+ "epoch": 0.022744353467030576,
926
+ "learning_rate": 0.0001954550848463273,
927
+ "loss": 0.0626,
928
+ "mean_token_accuracy": 0.9765089929103852,
929
+ "num_tokens": 2166678.0,
930
+ "step": 1150
931
+ },
932
+ {
933
+ "epoch": 0.02294213045370041,
934
+ "learning_rate": 0.00019541552944899333,
935
+ "loss": 0.071,
936
+ "mean_token_accuracy": 0.9786867916584014,
937
+ "num_tokens": 2185491.0,
938
+ "step": 1160
939
+ },
940
+ {
941
+ "epoch": 0.023139907440370238,
942
+ "learning_rate": 0.00019537597405165934,
943
+ "loss": 0.0855,
944
+ "mean_token_accuracy": 0.9737950384616851,
945
+ "num_tokens": 2204270.0,
946
+ "step": 1170
947
+ },
948
+ {
949
+ "epoch": 0.02333768442704007,
950
+ "learning_rate": 0.0001953364186543254,
951
+ "loss": 0.0817,
952
+ "mean_token_accuracy": 0.9746371984481812,
953
+ "num_tokens": 2223262.0,
954
+ "step": 1180
955
+ },
956
+ {
957
+ "epoch": 0.0235354614137099,
958
+ "learning_rate": 0.00019529686325699144,
959
+ "loss": 0.069,
960
+ "mean_token_accuracy": 0.9796195566654206,
961
+ "num_tokens": 2242082.0,
962
+ "step": 1190
963
+ },
964
+ {
965
+ "epoch": 0.023733238400379733,
966
+ "learning_rate": 0.00019525730785965745,
967
+ "loss": 0.0976,
968
+ "mean_token_accuracy": 0.9742420554161072,
969
+ "num_tokens": 2260874.0,
970
+ "step": 1200
971
+ },
972
+ {
973
+ "epoch": 0.023931015387049562,
974
+ "learning_rate": 0.00019521775246232349,
975
+ "loss": 0.0816,
976
+ "mean_token_accuracy": 0.972790652513504,
977
+ "num_tokens": 2279620.0,
978
+ "step": 1210
979
+ },
980
+ {
981
+ "epoch": 0.024128792373719395,
982
+ "learning_rate": 0.00019517819706498952,
983
+ "loss": 0.0602,
984
+ "mean_token_accuracy": 0.9790996849536896,
985
+ "num_tokens": 2298359.0,
986
+ "step": 1220
987
+ },
988
+ {
989
+ "epoch": 0.024326569360389225,
990
+ "learning_rate": 0.00019513864166765556,
991
+ "loss": 0.0773,
992
+ "mean_token_accuracy": 0.9750834167003631,
993
+ "num_tokens": 2317171.0,
994
+ "step": 1230
995
+ },
996
+ {
997
+ "epoch": 0.024524346347059058,
998
+ "learning_rate": 0.00019509908627032157,
999
+ "loss": 0.0863,
1000
+ "mean_token_accuracy": 0.9679585933685303,
1001
+ "num_tokens": 2335825.0,
1002
+ "step": 1240
1003
+ },
1004
+ {
1005
+ "epoch": 0.024722123333728887,
1006
+ "learning_rate": 0.00019505953087298764,
1007
+ "loss": 0.0712,
1008
+ "mean_token_accuracy": 0.9711700201034545,
1009
+ "num_tokens": 2354806.0,
1010
+ "step": 1250
1011
+ },
1012
+ {
1013
+ "epoch": 0.02491990032039872,
1014
+ "learning_rate": 0.00019501997547565367,
1015
+ "loss": 0.0905,
1016
+ "mean_token_accuracy": 0.9734555304050445,
1017
+ "num_tokens": 2373919.0,
1018
+ "step": 1260
1019
+ },
1020
+ {
1021
+ "epoch": 0.02511767730706855,
1022
+ "learning_rate": 0.00019498042007831968,
1023
+ "loss": 0.0582,
1024
+ "mean_token_accuracy": 0.9755668938159943,
1025
+ "num_tokens": 2393043.0,
1026
+ "step": 1270
1027
+ },
1028
+ {
1029
+ "epoch": 0.025315454293738382,
1030
+ "learning_rate": 0.00019494086468098575,
1031
+ "loss": 0.0627,
1032
+ "mean_token_accuracy": 0.9795541882514953,
1033
+ "num_tokens": 2411888.0,
1034
+ "step": 1280
1035
+ },
1036
+ {
1037
+ "epoch": 0.02551323128040821,
1038
+ "learning_rate": 0.00019490130928365176,
1039
+ "loss": 0.0729,
1040
+ "mean_token_accuracy": 0.976466304063797,
1041
+ "num_tokens": 2430790.0,
1042
+ "step": 1290
1043
+ },
1044
+ {
1045
+ "epoch": 0.025711008267078044,
1046
+ "learning_rate": 0.0001948617538863178,
1047
+ "loss": 0.0676,
1048
+ "mean_token_accuracy": 0.9730806350708008,
1049
+ "num_tokens": 2449534.0,
1050
+ "step": 1300
1051
+ },
1052
+ {
1053
+ "epoch": 0.025908785253747874,
1054
+ "learning_rate": 0.0001948221984889838,
1055
+ "loss": 0.0849,
1056
+ "mean_token_accuracy": 0.9746219754219055,
1057
+ "num_tokens": 2468500.0,
1058
+ "step": 1310
1059
+ },
1060
+ {
1061
+ "epoch": 0.026106562240417706,
1062
+ "learning_rate": 0.00019478264309164987,
1063
+ "loss": 0.082,
1064
+ "mean_token_accuracy": 0.9718350946903229,
1065
+ "num_tokens": 2487522.0,
1066
+ "step": 1320
1067
+ },
1068
+ {
1069
+ "epoch": 0.026304339227087536,
1070
+ "learning_rate": 0.0001947430876943159,
1071
+ "loss": 0.0814,
1072
+ "mean_token_accuracy": 0.9727078378200531,
1073
+ "num_tokens": 2506228.0,
1074
+ "step": 1330
1075
+ },
1076
+ {
1077
+ "epoch": 0.02650211621375737,
1078
+ "learning_rate": 0.00019470353229698192,
1079
+ "loss": 0.0908,
1080
+ "mean_token_accuracy": 0.972163724899292,
1081
+ "num_tokens": 2524993.0,
1082
+ "step": 1340
1083
+ },
1084
+ {
1085
+ "epoch": 0.026699893200427198,
1086
+ "learning_rate": 0.00019466397689964798,
1087
+ "loss": 0.0967,
1088
+ "mean_token_accuracy": 0.9677857100963593,
1089
+ "num_tokens": 2543814.0,
1090
+ "step": 1350
1091
+ },
1092
+ {
1093
+ "epoch": 0.02689767018709703,
1094
+ "learning_rate": 0.000194624421502314,
1095
+ "loss": 0.0965,
1096
+ "mean_token_accuracy": 0.9627455770969391,
1097
+ "num_tokens": 2562624.0,
1098
+ "step": 1360
1099
+ },
1100
+ {
1101
+ "epoch": 0.02709544717376686,
1102
+ "learning_rate": 0.00019458486610498003,
1103
+ "loss": 0.0866,
1104
+ "mean_token_accuracy": 0.9717130541801453,
1105
+ "num_tokens": 2581229.0,
1106
+ "step": 1370
1107
+ },
1108
+ {
1109
+ "epoch": 0.027293224160436693,
1110
+ "learning_rate": 0.00019454531070764607,
1111
+ "loss": 0.0864,
1112
+ "mean_token_accuracy": 0.9688866436481476,
1113
+ "num_tokens": 2600073.0,
1114
+ "step": 1380
1115
+ },
1116
+ {
1117
+ "epoch": 0.027491001147106522,
1118
+ "learning_rate": 0.0001945057553103121,
1119
+ "loss": 0.0743,
1120
+ "mean_token_accuracy": 0.9723983883857727,
1121
+ "num_tokens": 2618837.0,
1122
+ "step": 1390
1123
+ },
1124
+ {
1125
+ "epoch": 0.027688778133776355,
1126
+ "learning_rate": 0.00019446619991297814,
1127
+ "loss": 0.0719,
1128
+ "mean_token_accuracy": 0.9758836448192596,
1129
+ "num_tokens": 2637487.0,
1130
+ "step": 1400
1131
+ },
1132
+ {
1133
+ "epoch": 0.027886555120446185,
1134
+ "learning_rate": 0.00019442664451564415,
1135
+ "loss": 0.0925,
1136
+ "mean_token_accuracy": 0.969123649597168,
1137
+ "num_tokens": 2656004.0,
1138
+ "step": 1410
1139
+ },
1140
+ {
1141
+ "epoch": 0.028084332107116017,
1142
+ "learning_rate": 0.00019438708911831022,
1143
+ "loss": 0.0566,
1144
+ "mean_token_accuracy": 0.9795430541038513,
1145
+ "num_tokens": 2675089.0,
1146
+ "step": 1420
1147
+ },
1148
+ {
1149
+ "epoch": 0.028282109093785847,
1150
+ "learning_rate": 0.00019434753372097623,
1151
+ "loss": 0.0618,
1152
+ "mean_token_accuracy": 0.9796153604984283,
1153
+ "num_tokens": 2693904.0,
1154
+ "step": 1430
1155
+ },
1156
+ {
1157
+ "epoch": 0.02847988608045568,
1158
+ "learning_rate": 0.00019430797832364227,
1159
+ "loss": 0.0808,
1160
+ "mean_token_accuracy": 0.9735188663005829,
1161
+ "num_tokens": 2712395.0,
1162
+ "step": 1440
1163
+ },
1164
+ {
1165
+ "epoch": 0.02867766306712551,
1166
+ "learning_rate": 0.0001942684229263083,
1167
+ "loss": 0.0657,
1168
+ "mean_token_accuracy": 0.9771307945251465,
1169
+ "num_tokens": 2731103.0,
1170
+ "step": 1450
1171
+ },
1172
+ {
1173
+ "epoch": 0.028875440053795342,
1174
+ "learning_rate": 0.00019422886752897434,
1175
+ "loss": 0.083,
1176
+ "mean_token_accuracy": 0.9664826571941376,
1177
+ "num_tokens": 2749801.0,
1178
+ "step": 1460
1179
+ },
1180
+ {
1181
+ "epoch": 0.02907321704046517,
1182
+ "learning_rate": 0.00019418931213164038,
1183
+ "loss": 0.0721,
1184
+ "mean_token_accuracy": 0.9747998178005218,
1185
+ "num_tokens": 2768551.0,
1186
+ "step": 1470
1187
+ },
1188
+ {
1189
+ "epoch": 0.029270994027135004,
1190
+ "learning_rate": 0.0001941497567343064,
1191
+ "loss": 0.0499,
1192
+ "mean_token_accuracy": 0.9845096707344055,
1193
+ "num_tokens": 2787591.0,
1194
+ "step": 1480
1195
+ },
1196
+ {
1197
+ "epoch": 0.029468771013804834,
1198
+ "learning_rate": 0.00019411020133697245,
1199
+ "loss": 0.0665,
1200
+ "mean_token_accuracy": 0.9782804071903228,
1201
+ "num_tokens": 2806455.0,
1202
+ "step": 1490
1203
+ },
1204
+ {
1205
+ "epoch": 0.029666548000474666,
1206
+ "learning_rate": 0.00019407064593963846,
1207
+ "loss": 0.0633,
1208
+ "mean_token_accuracy": 0.9759399771690369,
1209
+ "num_tokens": 2825280.0,
1210
+ "step": 1500
1211
+ },
1212
+ {
1213
+ "epoch": 0.029864324987144496,
1214
+ "learning_rate": 0.0001940310905423045,
1215
+ "loss": 0.0641,
1216
+ "mean_token_accuracy": 0.980692720413208,
1217
+ "num_tokens": 2844178.0,
1218
+ "step": 1510
1219
+ },
1220
+ {
1221
+ "epoch": 0.03006210197381433,
1222
+ "learning_rate": 0.00019399153514497054,
1223
+ "loss": 0.0568,
1224
+ "mean_token_accuracy": 0.9760835587978363,
1225
+ "num_tokens": 2863373.0,
1226
+ "step": 1520
1227
+ },
1228
+ {
1229
+ "epoch": 0.030259878960484158,
1230
+ "learning_rate": 0.00019395197974763658,
1231
+ "loss": 0.0611,
1232
+ "mean_token_accuracy": 0.9767693936824798,
1233
+ "num_tokens": 2882166.0,
1234
+ "step": 1530
1235
+ },
1236
+ {
1237
+ "epoch": 0.03045765594715399,
1238
+ "learning_rate": 0.00019391242435030261,
1239
+ "loss": 0.0557,
1240
+ "mean_token_accuracy": 0.9800930917263031,
1241
+ "num_tokens": 2901050.0,
1242
+ "step": 1540
1243
+ },
1244
+ {
1245
+ "epoch": 0.03065543293382382,
1246
+ "learning_rate": 0.00019387286895296862,
1247
+ "loss": 0.0739,
1248
+ "mean_token_accuracy": 0.9733968496322631,
1249
+ "num_tokens": 2919953.0,
1250
+ "step": 1550
1251
+ },
1252
+ {
1253
+ "epoch": 0.030853209920493653,
1254
+ "learning_rate": 0.0001938333135556347,
1255
+ "loss": 0.0706,
1256
+ "mean_token_accuracy": 0.9725773215293885,
1257
+ "num_tokens": 2939003.0,
1258
+ "step": 1560
1259
+ },
1260
+ {
1261
+ "epoch": 0.031050986907163482,
1262
+ "learning_rate": 0.00019379375815830073,
1263
+ "loss": 0.0655,
1264
+ "mean_token_accuracy": 0.9753368616104126,
1265
+ "num_tokens": 2957853.0,
1266
+ "step": 1570
1267
+ },
1268
+ {
1269
+ "epoch": 0.031248763893833315,
1270
+ "learning_rate": 0.00019375420276096674,
1271
+ "loss": 0.0667,
1272
+ "mean_token_accuracy": 0.9791980743408203,
1273
+ "num_tokens": 2976557.0,
1274
+ "step": 1580
1275
+ },
1276
+ {
1277
+ "epoch": 0.031446540880503145,
1278
+ "learning_rate": 0.00019371464736363277,
1279
+ "loss": 0.0748,
1280
+ "mean_token_accuracy": 0.9767062723636627,
1281
+ "num_tokens": 2995058.0,
1282
+ "step": 1590
1283
+ },
1284
+ {
1285
+ "epoch": 0.03164431786717298,
1286
+ "learning_rate": 0.0001936750919662988,
1287
+ "loss": 0.0628,
1288
+ "mean_token_accuracy": 0.9783797085285186,
1289
+ "num_tokens": 3014165.0,
1290
+ "step": 1600
1291
+ },
1292
+ {
1293
+ "epoch": 0.03184209485384281,
1294
+ "learning_rate": 0.00019363553656896485,
1295
+ "loss": 0.0564,
1296
+ "mean_token_accuracy": 0.9830330014228821,
1297
+ "num_tokens": 3032924.0,
1298
+ "step": 1610
1299
+ },
1300
+ {
1301
+ "epoch": 0.032039871840512636,
1302
+ "learning_rate": 0.00019359598117163086,
1303
+ "loss": 0.0783,
1304
+ "mean_token_accuracy": 0.9734647035598755,
1305
+ "num_tokens": 3051960.0,
1306
+ "step": 1620
1307
+ },
1308
+ {
1309
+ "epoch": 0.03223764882718247,
1310
+ "learning_rate": 0.00019355642577429692,
1311
+ "loss": 0.0827,
1312
+ "mean_token_accuracy": 0.9718622207641602,
1313
+ "num_tokens": 3070644.0,
1314
+ "step": 1630
1315
+ },
1316
+ {
1317
+ "epoch": 0.0324354258138523,
1318
+ "learning_rate": 0.00019351687037696296,
1319
+ "loss": 0.071,
1320
+ "mean_token_accuracy": 0.980438482761383,
1321
+ "num_tokens": 3089393.0,
1322
+ "step": 1640
1323
+ },
1324
+ {
1325
+ "epoch": 0.03263320280052213,
1326
+ "learning_rate": 0.00019347731497962897,
1327
+ "loss": 0.0656,
1328
+ "mean_token_accuracy": 0.9727302372455597,
1329
+ "num_tokens": 3108260.0,
1330
+ "step": 1650
1331
+ },
1332
+ {
1333
+ "epoch": 0.03283097978719196,
1334
+ "learning_rate": 0.000193437759582295,
1335
+ "loss": 0.0847,
1336
+ "mean_token_accuracy": 0.9706348955631257,
1337
+ "num_tokens": 3127157.0,
1338
+ "step": 1660
1339
+ },
1340
+ {
1341
+ "epoch": 0.033028756773861793,
1342
+ "learning_rate": 0.00019339820418496105,
1343
+ "loss": 0.0721,
1344
+ "mean_token_accuracy": 0.9732413589954376,
1345
+ "num_tokens": 3146126.0,
1346
+ "step": 1670
1347
+ },
1348
+ {
1349
+ "epoch": 0.033226533760531626,
1350
+ "learning_rate": 0.00019335864878762708,
1351
+ "loss": 0.0836,
1352
+ "mean_token_accuracy": 0.9706507205963135,
1353
+ "num_tokens": 3165091.0,
1354
+ "step": 1680
1355
+ }
1356
+ ],
1357
+ "logging_steps": 10,
1358
+ "max_steps": 50562,
1359
+ "num_input_tokens_seen": 0,
1360
+ "num_train_epochs": 9223372036854775807,
1361
+ "save_steps": 1685,
1362
+ "stateful_callbacks": {
1363
+ "TrainerControl": {
1364
+ "args": {
1365
+ "should_epoch_stop": false,
1366
+ "should_evaluate": false,
1367
+ "should_log": false,
1368
+ "should_save": true,
1369
+ "should_training_stop": false
1370
+ },
1371
+ "attributes": {}
1372
+ }
1373
+ },
1374
+ "total_flos": 1.5391152511647744e+17,
1375
+ "train_batch_size": 16,
1376
+ "trial_name": null,
1377
+ "trial_params": null
1378
+ }
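Since trainer_state.json stores its log_history as plain JSON, the loss and token-accuracy curves in a checkpoint like the one above can be inspected without TensorBoard. A minimal sketch using only the standard library; the checkpoint path is illustrative and would be replaced with a local copy:

```python
import json

# Illustrative path: point this at any checkpoint's trainer_state.json.
with open("checkpoint-1685/trainer_state.json") as f:
    state = json.load(f)

# Entries are appended every `logging_steps` (10 in this run) optimizer steps.
for entry in state["log_history"]:
    print(entry["step"], entry["loss"], entry["mean_token_accuracy"])
```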
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.15.0
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+ }
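The config above is a standard PEFT LoRA setup (r=8, lora_alpha=16, adapters on the attention q_proj/v_proj only), so the checkpoint can in principle be re-attached with the peft library. A minimal sketch, assuming a reachable copy of the base model (the recorded path is cluster-local scratch) and noting that GAMA wraps Llama-2 with a Q-Former audio front-end, so the text-only loader below is only indicative for the language-model weights:

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

# The recorded base path is cluster-local; substitute your own copy of the
# GAMA-IT Llama-2-7b-chat-hf-qformer weights.
base = AutoModelForCausalLM.from_pretrained(
    "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/"
)

# Attach the LoRA adapter stored in this checkpoint directory (illustrative path).
model = PeftModel.from_pretrained(base, "checkpoint-2247")
model.eval()
```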
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
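Note that pad_token reuses `</s>`: Llama-2 ships without a dedicated padding token, so the map above points padding at eos. A minimal sketch of the equivalent runtime setup (the path is illustrative):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-2247")  # illustrative path
# Mirrors special_tokens_map.json: pad with eos (</s>) since Llama-2
# defines no pad token of its own.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
```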
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
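The chat_template field above is the standard Llama-2 [INST]/<<SYS>> prompt format, which transformers can render directly via apply_chat_template. A minimal sketch, with an illustrative path and made-up messages:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-2247")  # illustrative path

messages = [
    {"role": "system", "content": "You answer questions about audio clips."},
    {"role": "user", "content": "Which semantic elements are present?"},
]
# Produces '<s>[INST] <<SYS>>\n...\n<</SYS>>\n\n... [/INST]' per the
# template stored in tokenizer_config.json above.
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)
```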
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-2247/trainer_state.json ADDED
@@ -0,0 +1,1826 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.033329872287405256,
6
+ "eval_steps": 500,
7
+ "global_step": 2247,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00014833053977483424,
14
+ "learning_rate": 0.00019997330050284054,
15
+ "loss": 0.3338,
16
+ "mean_token_accuracy": 0.8992965638637542,
17
+ "num_tokens": 14338.0,
18
+ "step": 10
19
+ },
20
+ {
21
+ "epoch": 0.0002966610795496685,
22
+ "learning_rate": 0.00019994363439488556,
23
+ "loss": 0.1339,
24
+ "mean_token_accuracy": 0.9485014617443085,
25
+ "num_tokens": 28640.0,
26
+ "step": 20
27
+ },
28
+ {
29
+ "epoch": 0.0004449916193245027,
30
+ "learning_rate": 0.0001999139682869306,
31
+ "loss": 0.1615,
32
+ "mean_token_accuracy": 0.9449628531932831,
33
+ "num_tokens": 42898.0,
34
+ "step": 30
35
+ },
36
+ {
37
+ "epoch": 0.000593322159099337,
38
+ "learning_rate": 0.00019988430217897563,
39
+ "loss": 0.1449,
40
+ "mean_token_accuracy": 0.9448729813098907,
41
+ "num_tokens": 57046.0,
42
+ "step": 40
43
+ },
44
+ {
45
+ "epoch": 0.0007416526988741712,
46
+ "learning_rate": 0.00019985463607102066,
47
+ "loss": 0.1058,
48
+ "mean_token_accuracy": 0.95491161942482,
49
+ "num_tokens": 71338.0,
50
+ "step": 50
51
+ },
52
+ {
53
+ "epoch": 0.0008899832386490054,
54
+ "learning_rate": 0.0001998249699630657,
55
+ "loss": 0.0956,
56
+ "mean_token_accuracy": 0.9604991674423218,
57
+ "num_tokens": 85472.0,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.0010383137784238396,
62
+ "learning_rate": 0.00019979530385511073,
63
+ "loss": 0.121,
64
+ "mean_token_accuracy": 0.9559885621070862,
65
+ "num_tokens": 99522.0,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.001186644318198674,
70
+ "learning_rate": 0.00019976563774715578,
71
+ "loss": 0.136,
72
+ "mean_token_accuracy": 0.9489055037498474,
73
+ "num_tokens": 113467.0,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.0013349748579735083,
78
+ "learning_rate": 0.0001997359716392008,
79
+ "loss": 0.1069,
80
+ "mean_token_accuracy": 0.9645210564136505,
81
+ "num_tokens": 127613.0,
82
+ "step": 90
83
+ },
84
+ {
85
+ "epoch": 0.0014833053977483424,
86
+ "learning_rate": 0.00019970630553124583,
87
+ "loss": 0.102,
88
+ "mean_token_accuracy": 0.9665270745754242,
89
+ "num_tokens": 141419.0,
90
+ "step": 100
91
+ },
92
+ {
93
+ "epoch": 0.0016316359375231767,
94
+ "learning_rate": 0.00019967663942329088,
95
+ "loss": 0.1031,
96
+ "mean_token_accuracy": 0.9620752274990082,
97
+ "num_tokens": 155506.0,
98
+ "step": 110
99
+ },
100
+ {
101
+ "epoch": 0.0017799664772980108,
102
+ "learning_rate": 0.0001996469733153359,
103
+ "loss": 0.1227,
104
+ "mean_token_accuracy": 0.9612519204616546,
105
+ "num_tokens": 169783.0,
106
+ "step": 120
107
+ },
108
+ {
109
+ "epoch": 0.0019282970170728451,
110
+ "learning_rate": 0.00019961730720738095,
111
+ "loss": 0.1132,
112
+ "mean_token_accuracy": 0.9575821399688721,
113
+ "num_tokens": 183864.0,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.0020766275568476792,
118
+ "learning_rate": 0.00019958764109942597,
119
+ "loss": 0.099,
120
+ "mean_token_accuracy": 0.9662568092346191,
121
+ "num_tokens": 197941.0,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.0022249580966225138,
126
+ "learning_rate": 0.000199557974991471,
127
+ "loss": 0.0861,
128
+ "mean_token_accuracy": 0.9733373045921325,
129
+ "num_tokens": 212305.0,
130
+ "step": 150
131
+ },
132
+ {
133
+ "epoch": 0.002373288636397348,
134
+ "learning_rate": 0.00019952830888351605,
135
+ "loss": 0.0786,
136
+ "mean_token_accuracy": 0.9724234759807586,
137
+ "num_tokens": 226639.0,
138
+ "step": 160
139
+ },
140
+ {
141
+ "epoch": 0.002521619176172182,
142
+ "learning_rate": 0.00019949864277556107,
143
+ "loss": 0.1153,
144
+ "mean_token_accuracy": 0.9579457581043244,
145
+ "num_tokens": 240608.0,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 0.0026699497159470165,
150
+ "learning_rate": 0.00019946897666760612,
151
+ "loss": 0.099,
152
+ "mean_token_accuracy": 0.9629672944545746,
153
+ "num_tokens": 254679.0,
154
+ "step": 180
155
+ },
156
+ {
157
+ "epoch": 0.0028182802557218506,
158
+ "learning_rate": 0.00019943931055965111,
159
+ "loss": 0.1052,
160
+ "mean_token_accuracy": 0.958859795331955,
161
+ "num_tokens": 268884.0,
162
+ "step": 190
163
+ },
164
+ {
165
+ "epoch": 0.0029666107954966848,
166
+ "learning_rate": 0.00019940964445169616,
167
+ "loss": 0.103,
168
+ "mean_token_accuracy": 0.9612604022026062,
169
+ "num_tokens": 283062.0,
170
+ "step": 200
171
+ },
172
+ {
173
+ "epoch": 0.003114941335271519,
174
+ "learning_rate": 0.00019937997834374121,
175
+ "loss": 0.0959,
176
+ "mean_token_accuracy": 0.9604475021362304,
177
+ "num_tokens": 297014.0,
178
+ "step": 210
179
+ },
180
+ {
181
+ "epoch": 0.0032632718750463534,
182
+ "learning_rate": 0.00019935031223578624,
183
+ "loss": 0.101,
184
+ "mean_token_accuracy": 0.9652803599834442,
185
+ "num_tokens": 311066.0,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.0034116024148211875,
190
+ "learning_rate": 0.0001993206461278313,
191
+ "loss": 0.0996,
192
+ "mean_token_accuracy": 0.957928591966629,
193
+ "num_tokens": 325289.0,
194
+ "step": 230
195
+ },
196
+ {
197
+ "epoch": 0.0035599329545960216,
198
+ "learning_rate": 0.00019929098001987628,
199
+ "loss": 0.0692,
200
+ "mean_token_accuracy": 0.9736397683620452,
201
+ "num_tokens": 339616.0,
202
+ "step": 240
203
+ },
204
+ {
205
+ "epoch": 0.003708263494370856,
206
+ "learning_rate": 0.00019926131391192133,
207
+ "loss": 0.095,
208
+ "mean_token_accuracy": 0.9670307815074921,
209
+ "num_tokens": 353777.0,
210
+ "step": 250
211
+ },
212
+ {
213
+ "epoch": 0.0038565940341456903,
214
+ "learning_rate": 0.00019923164780396636,
215
+ "loss": 0.0968,
216
+ "mean_token_accuracy": 0.9648948311805725,
217
+ "num_tokens": 367871.0,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 0.004004924573920525,
222
+ "learning_rate": 0.0001992019816960114,
223
+ "loss": 0.1252,
224
+ "mean_token_accuracy": 0.9623154461383819,
225
+ "num_tokens": 381768.0,
226
+ "step": 270
227
+ },
228
+ {
229
+ "epoch": 0.0041532551136953585,
230
+ "learning_rate": 0.00019917231558805643,
231
+ "loss": 0.0933,
232
+ "mean_token_accuracy": 0.9669747233390809,
233
+ "num_tokens": 396018.0,
234
+ "step": 280
235
+ },
236
+ {
237
+ "epoch": 0.004301585653470193,
238
+ "learning_rate": 0.00019914264948010145,
239
+ "loss": 0.0903,
240
+ "mean_token_accuracy": 0.9654496192932129,
241
+ "num_tokens": 410375.0,
242
+ "step": 290
243
+ },
244
+ {
245
+ "epoch": 0.0044499161932450276,
246
+ "learning_rate": 0.0001991129833721465,
247
+ "loss": 0.1308,
248
+ "mean_token_accuracy": 0.9515012204647064,
249
+ "num_tokens": 424339.0,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.004598246733019861,
254
+ "learning_rate": 0.00019908331726419153,
255
+ "loss": 0.0989,
256
+ "mean_token_accuracy": 0.9694083333015442,
257
+ "num_tokens": 438444.0,
258
+ "step": 310
259
+ },
260
+ {
261
+ "epoch": 0.004746577272794696,
262
+ "learning_rate": 0.00019905365115623658,
263
+ "loss": 0.093,
264
+ "mean_token_accuracy": 0.971742856502533,
265
+ "num_tokens": 452483.0,
266
+ "step": 320
267
+ },
268
+ {
269
+ "epoch": 0.00489490781256953,
270
+ "learning_rate": 0.0001990239850482816,
271
+ "loss": 0.102,
272
+ "mean_token_accuracy": 0.9643003046512604,
273
+ "num_tokens": 466532.0,
274
+ "step": 330
275
+ },
276
+ {
277
+ "epoch": 0.005043238352344364,
278
+ "learning_rate": 0.00019899431894032662,
279
+ "loss": 0.0964,
280
+ "mean_token_accuracy": 0.9672096133232116,
281
+ "num_tokens": 480771.0,
282
+ "step": 340
283
+ },
284
+ {
285
+ "epoch": 0.0051915688921191985,
286
+ "learning_rate": 0.00019896465283237167,
287
+ "loss": 0.1035,
288
+ "mean_token_accuracy": 0.9658852636814117,
289
+ "num_tokens": 494954.0,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 0.005339899431894033,
294
+ "learning_rate": 0.0001989349867244167,
295
+ "loss": 0.1085,
296
+ "mean_token_accuracy": 0.9633121192455292,
297
+ "num_tokens": 509115.0,
298
+ "step": 360
299
+ },
300
+ {
301
+ "epoch": 0.005488229971668867,
302
+ "learning_rate": 0.00019890532061646175,
303
+ "loss": 0.067,
304
+ "mean_token_accuracy": 0.9755404174327851,
305
+ "num_tokens": 523364.0,
306
+ "step": 370
307
+ },
308
+ {
309
+ "epoch": 0.005636560511443701,
310
+ "learning_rate": 0.00019887565450850677,
311
+ "loss": 0.0846,
312
+ "mean_token_accuracy": 0.9748851418495178,
313
+ "num_tokens": 537483.0,
314
+ "step": 380
315
+ },
316
+ {
317
+ "epoch": 0.005784891051218535,
318
+ "learning_rate": 0.0001988459884005518,
319
+ "loss": 0.09,
320
+ "mean_token_accuracy": 0.9707007527351379,
321
+ "num_tokens": 551329.0,
322
+ "step": 390
323
+ },
324
+ {
325
+ "epoch": 0.0059332215909933695,
326
+ "learning_rate": 0.00019881632229259684,
327
+ "loss": 0.0923,
328
+ "mean_token_accuracy": 0.9627965211868286,
329
+ "num_tokens": 565279.0,
330
+ "step": 400
331
+ },
332
+ {
333
+ "epoch": 0.006081552130768204,
334
+ "learning_rate": 0.00019878665618464186,
335
+ "loss": 0.1026,
336
+ "mean_token_accuracy": 0.9648779332637787,
337
+ "num_tokens": 579408.0,
338
+ "step": 410
339
+ },
340
+ {
341
+ "epoch": 0.006229882670543038,
342
+ "learning_rate": 0.0001987569900766869,
343
+ "loss": 0.0626,
344
+ "mean_token_accuracy": 0.9769876301288605,
345
+ "num_tokens": 593666.0,
346
+ "step": 420
347
+ },
348
+ {
349
+ "epoch": 0.006378213210317872,
350
+ "learning_rate": 0.00019872732396873194,
351
+ "loss": 0.0824,
352
+ "mean_token_accuracy": 0.9722626864910126,
353
+ "num_tokens": 607862.0,
354
+ "step": 430
355
+ },
356
+ {
357
+ "epoch": 0.006526543750092707,
358
+ "learning_rate": 0.00019869765786077696,
359
+ "loss": 0.0815,
360
+ "mean_token_accuracy": 0.9756675064563751,
361
+ "num_tokens": 621860.0,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 0.0066748742898675405,
366
+ "learning_rate": 0.000198667991752822,
367
+ "loss": 0.0864,
368
+ "mean_token_accuracy": 0.9753140985965729,
369
+ "num_tokens": 635967.0,
370
+ "step": 450
371
+ },
372
+ {
373
+ "epoch": 0.006823204829642375,
374
+ "learning_rate": 0.00019863832564486703,
375
+ "loss": 0.1159,
376
+ "mean_token_accuracy": 0.9547139942646027,
377
+ "num_tokens": 650037.0,
378
+ "step": 460
379
+ },
380
+ {
381
+ "epoch": 0.0069715353694172096,
382
+ "learning_rate": 0.00019860865953691206,
383
+ "loss": 0.1035,
384
+ "mean_token_accuracy": 0.9647024512290955,
385
+ "num_tokens": 664415.0,
386
+ "step": 470
387
+ },
388
+ {
389
+ "epoch": 0.007119865909192043,
390
+ "learning_rate": 0.0001985789934289571,
391
+ "loss": 0.1115,
392
+ "mean_token_accuracy": 0.9622772753238678,
393
+ "num_tokens": 678570.0,
394
+ "step": 480
395
+ },
396
+ {
397
+ "epoch": 0.007268196448966878,
398
+ "learning_rate": 0.00019854932732100213,
399
+ "loss": 0.1018,
400
+ "mean_token_accuracy": 0.9635290026664733,
401
+ "num_tokens": 692639.0,
402
+ "step": 490
403
+ },
404
+ {
405
+ "epoch": 0.007416526988741712,
406
+ "learning_rate": 0.00019851966121304718,
407
+ "loss": 0.073,
408
+ "mean_token_accuracy": 0.9765343546867371,
409
+ "num_tokens": 706812.0,
410
+ "step": 500
411
+ },
412
+ {
413
+ "epoch": 0.007564857528516546,
414
+ "learning_rate": 0.0001984899951050922,
415
+ "loss": 0.0673,
416
+ "mean_token_accuracy": 0.9734417915344238,
417
+ "num_tokens": 721155.0,
418
+ "step": 510
419
+ },
420
+ {
421
+ "epoch": 0.0077131880682913805,
422
+ "learning_rate": 0.00019846032899713723,
423
+ "loss": 0.0523,
424
+ "mean_token_accuracy": 0.9798437833786011,
425
+ "num_tokens": 735500.0,
426
+ "step": 520
427
+ },
428
+ {
429
+ "epoch": 0.007861518608066215,
430
+ "learning_rate": 0.00019843066288918225,
431
+ "loss": 0.0972,
432
+ "mean_token_accuracy": 0.9699061930179596,
433
+ "num_tokens": 749718.0,
434
+ "step": 530
435
+ },
436
+ {
437
+ "epoch": 0.00800984914784105,
438
+ "learning_rate": 0.0001984009967812273,
439
+ "loss": 0.0948,
440
+ "mean_token_accuracy": 0.9745772778987885,
441
+ "num_tokens": 763762.0,
442
+ "step": 540
443
+ },
444
+ {
445
+ "epoch": 0.008158179687615882,
446
+ "learning_rate": 0.00019837133067327235,
447
+ "loss": 0.0879,
448
+ "mean_token_accuracy": 0.9725308179855346,
449
+ "num_tokens": 777808.0,
450
+ "step": 550
451
+ },
452
+ {
453
+ "epoch": 0.008306510227390717,
454
+ "learning_rate": 0.00019834166456531734,
455
+ "loss": 0.083,
456
+ "mean_token_accuracy": 0.9699824392795563,
457
+ "num_tokens": 791900.0,
458
+ "step": 560
459
+ },
460
+ {
461
+ "epoch": 0.008454840767165551,
462
+ "learning_rate": 0.0001983119984573624,
463
+ "loss": 0.0724,
464
+ "mean_token_accuracy": 0.9774831712245942,
465
+ "num_tokens": 805873.0,
466
+ "step": 570
467
+ },
468
+ {
469
+ "epoch": 0.008603171306940386,
470
+ "learning_rate": 0.00019828233234940742,
471
+ "loss": 0.0867,
472
+ "mean_token_accuracy": 0.9701934456825256,
473
+ "num_tokens": 820203.0,
474
+ "step": 580
475
+ },
476
+ {
477
+ "epoch": 0.00875150184671522,
478
+ "learning_rate": 0.00019825266624145247,
479
+ "loss": 0.0868,
480
+ "mean_token_accuracy": 0.9678167402744293,
481
+ "num_tokens": 834478.0,
482
+ "step": 590
483
+ },
484
+ {
485
+ "epoch": 0.008899832386490055,
486
+ "learning_rate": 0.00019822300013349752,
487
+ "loss": 0.087,
488
+ "mean_token_accuracy": 0.9668359816074371,
489
+ "num_tokens": 848321.0,
490
+ "step": 600
491
+ },
492
+ {
493
+ "epoch": 0.009048162926264888,
494
+ "learning_rate": 0.00019819333402554251,
495
+ "loss": 0.0579,
496
+ "mean_token_accuracy": 0.980604612827301,
497
+ "num_tokens": 862770.0,
498
+ "step": 610
499
+ },
500
+ {
501
+ "epoch": 0.009196493466039722,
502
+ "learning_rate": 0.00019816366791758756,
503
+ "loss": 0.1058,
504
+ "mean_token_accuracy": 0.9675537347793579,
505
+ "num_tokens": 877137.0,
506
+ "step": 620
507
+ },
508
+ {
509
+ "epoch": 0.009344824005814557,
510
+ "learning_rate": 0.0001981340018096326,
511
+ "loss": 0.0641,
512
+ "mean_token_accuracy": 0.9790356934070588,
513
+ "num_tokens": 891203.0,
514
+ "step": 630
515
+ },
516
+ {
517
+ "epoch": 0.009493154545589392,
518
+ "learning_rate": 0.00019810433570167764,
519
+ "loss": 0.0783,
520
+ "mean_token_accuracy": 0.9747758269309997,
521
+ "num_tokens": 905312.0,
522
+ "step": 640
523
+ },
524
+ {
525
+ "epoch": 0.009641485085364226,
526
+ "learning_rate": 0.00019807466959372266,
527
+ "loss": 0.0792,
528
+ "mean_token_accuracy": 0.9732525169849395,
529
+ "num_tokens": 919371.0,
530
+ "step": 650
531
+ },
532
+ {
533
+ "epoch": 0.00978981562513906,
534
+ "learning_rate": 0.00019804500348576768,
535
+ "loss": 0.065,
536
+ "mean_token_accuracy": 0.9778533697128295,
537
+ "num_tokens": 933687.0,
538
+ "step": 660
539
+ },
540
+ {
541
+ "epoch": 0.009938146164913893,
542
+ "learning_rate": 0.00019801533737781273,
543
+ "loss": 0.0754,
544
+ "mean_token_accuracy": 0.9709767520427703,
545
+ "num_tokens": 947526.0,
546
+ "step": 670
547
+ },
548
+ {
549
+ "epoch": 0.010086476704688728,
550
+ "learning_rate": 0.00019798567126985776,
551
+ "loss": 0.1087,
552
+ "mean_token_accuracy": 0.9693429231643677,
553
+ "num_tokens": 961548.0,
554
+ "step": 680
555
+ },
556
+ {
557
+ "epoch": 0.010234807244463563,
558
+ "learning_rate": 0.0001979560051619028,
559
+ "loss": 0.0875,
560
+ "mean_token_accuracy": 0.9671817898750306,
561
+ "num_tokens": 975556.0,
562
+ "step": 690
563
+ },
564
+ {
565
+ "epoch": 0.010383137784238397,
566
+ "learning_rate": 0.00019792633905394783,
567
+ "loss": 0.0931,
568
+ "mean_token_accuracy": 0.9703514873981476,
569
+ "num_tokens": 989813.0,
570
+ "step": 700
571
+ },
572
+ {
573
+ "epoch": 0.010531468324013232,
574
+ "learning_rate": 0.00019789667294599285,
575
+ "loss": 0.0768,
576
+ "mean_token_accuracy": 0.973369836807251,
577
+ "num_tokens": 1003889.0,
578
+ "step": 710
579
+ },
580
+ {
581
+ "epoch": 0.010679798863788066,
582
+ "learning_rate": 0.0001978670068380379,
583
+ "loss": 0.1042,
584
+ "mean_token_accuracy": 0.9632522821426391,
585
+ "num_tokens": 1018053.0,
586
+ "step": 720
587
+ },
588
+ {
589
+ "epoch": 0.010828129403562899,
590
+ "learning_rate": 0.00019783734073008293,
591
+ "loss": 0.0748,
592
+ "mean_token_accuracy": 0.9709040760993958,
593
+ "num_tokens": 1032016.0,
594
+ "step": 730
595
+ },
596
+ {
597
+ "epoch": 0.010976459943337733,
598
+ "learning_rate": 0.00019780767462212798,
599
+ "loss": 0.0944,
600
+ "mean_token_accuracy": 0.9654510498046875,
601
+ "num_tokens": 1046250.0,
602
+ "step": 740
603
+ },
604
+ {
605
+ "epoch": 0.011124790483112568,
606
+ "learning_rate": 0.000197778008514173,
607
+ "loss": 0.0982,
608
+ "mean_token_accuracy": 0.966586810350418,
609
+ "num_tokens": 1060619.0,
610
+ "step": 750
611
+ },
612
+ {
613
+ "epoch": 0.011273121022887403,
614
+ "learning_rate": 0.00019774834240621802,
615
+ "loss": 0.0694,
616
+ "mean_token_accuracy": 0.9705922186374665,
617
+ "num_tokens": 1074706.0,
618
+ "step": 760
619
+ },
620
+ {
621
+ "epoch": 0.011421451562662237,
622
+ "learning_rate": 0.00019771867629826307,
623
+ "loss": 0.0875,
624
+ "mean_token_accuracy": 0.9699962615966797,
625
+ "num_tokens": 1088843.0,
626
+ "step": 770
627
+ },
628
+ {
629
+ "epoch": 0.01156978210243707,
630
+ "learning_rate": 0.0001976890101903081,
631
+ "loss": 0.0962,
632
+ "mean_token_accuracy": 0.9701560854911804,
633
+ "num_tokens": 1102921.0,
634
+ "step": 780
635
+ },
636
+ {
637
+ "epoch": 0.011718112642211904,
638
+ "learning_rate": 0.00019765934408235312,
639
+ "loss": 0.0641,
640
+ "mean_token_accuracy": 0.9813205659389496,
641
+ "num_tokens": 1117155.0,
642
+ "step": 790
643
+ },
644
+ {
645
+ "epoch": 0.011866443181986739,
646
+ "learning_rate": 0.00019762967797439814,
647
+ "loss": 0.0923,
648
+ "mean_token_accuracy": 0.9718149483203888,
649
+ "num_tokens": 1131435.0,
650
+ "step": 800
651
+ },
652
+ {
653
+ "epoch": 0.012014773721761574,
654
+ "learning_rate": 0.0001976000118664432,
655
+ "loss": 0.0798,
656
+ "mean_token_accuracy": 0.9723540186882019,
657
+ "num_tokens": 1145444.0,
658
+ "step": 810
659
+ },
660
+ {
661
+ "epoch": 0.012163104261536408,
662
+ "learning_rate": 0.00019757034575848824,
663
+ "loss": 0.1028,
664
+ "mean_token_accuracy": 0.9645315170288086,
665
+ "num_tokens": 1159625.0,
666
+ "step": 820
667
+ },
668
+ {
669
+ "epoch": 0.012311434801311243,
670
+ "learning_rate": 0.00019754067965053326,
671
+ "loss": 0.0893,
672
+ "mean_token_accuracy": 0.9674223959445953,
673
+ "num_tokens": 1173790.0,
674
+ "step": 830
675
+ },
676
+ {
677
+ "epoch": 0.012459765341086075,
678
+ "learning_rate": 0.0001975110135425783,
679
+ "loss": 0.0703,
680
+ "mean_token_accuracy": 0.9750796139240265,
681
+ "num_tokens": 1187787.0,
682
+ "step": 840
683
+ },
684
+ {
685
+ "epoch": 0.01260809588086091,
686
+ "learning_rate": 0.0001974813474346233,
687
+ "loss": 0.0885,
688
+ "mean_token_accuracy": 0.9665023148059845,
689
+ "num_tokens": 1202159.0,
690
+ "step": 850
691
+ },
692
+ {
693
+ "epoch": 0.012756426420635745,
694
+ "learning_rate": 0.00019745168132666836,
695
+ "loss": 0.0793,
696
+ "mean_token_accuracy": 0.9742439985275269,
697
+ "num_tokens": 1216318.0,
698
+ "step": 860
699
+ },
700
+ {
701
+ "epoch": 0.012904756960410579,
702
+ "learning_rate": 0.0001974220152187134,
703
+ "loss": 0.0654,
704
+ "mean_token_accuracy": 0.9788648605346679,
705
+ "num_tokens": 1230383.0,
706
+ "step": 870
707
+ },
708
+ {
709
+ "epoch": 0.013053087500185414,
710
+ "learning_rate": 0.00019739234911075843,
711
+ "loss": 0.0862,
712
+ "mean_token_accuracy": 0.9693156242370605,
713
+ "num_tokens": 1244340.0,
714
+ "step": 880
715
+ },
716
+ {
717
+ "epoch": 0.013201418039960248,
718
+ "learning_rate": 0.00019736268300280346,
719
+ "loss": 0.091,
720
+ "mean_token_accuracy": 0.969611394405365,
721
+ "num_tokens": 1258576.0,
722
+ "step": 890
723
+ },
724
+ {
725
+ "epoch": 0.013349748579735081,
726
+ "learning_rate": 0.00019733301689484848,
727
+ "loss": 0.0907,
728
+ "mean_token_accuracy": 0.9679802298545838,
729
+ "num_tokens": 1272705.0,
730
+ "step": 900
731
+ },
732
+ {
733
+ "epoch": 0.013498079119509915,
734
+ "learning_rate": 0.00019730335078689353,
735
+ "loss": 0.098,
736
+ "mean_token_accuracy": 0.9632154762744903,
737
+ "num_tokens": 1286700.0,
738
+ "step": 910
739
+ },
740
+ {
741
+ "epoch": 0.01364640965928475,
742
+ "learning_rate": 0.00019727368467893855,
743
+ "loss": 0.0713,
744
+ "mean_token_accuracy": 0.973181027173996,
745
+ "num_tokens": 1300730.0,
746
+ "step": 920
747
+ },
748
+ {
749
+ "epoch": 0.013794740199059585,
750
+ "learning_rate": 0.00019724401857098357,
751
+ "loss": 0.0926,
752
+ "mean_token_accuracy": 0.9724487364292145,
753
+ "num_tokens": 1314843.0,
754
+ "step": 930
755
+ },
756
+ {
757
+ "epoch": 0.013943070738834419,
758
+ "learning_rate": 0.00019721435246302862,
759
+ "loss": 0.078,
760
+ "mean_token_accuracy": 0.9706206858158112,
761
+ "num_tokens": 1328967.0,
762
+ "step": 940
763
+ },
764
+ {
765
+ "epoch": 0.014091401278609254,
766
+ "learning_rate": 0.00019718468635507365,
767
+ "loss": 0.11,
768
+ "mean_token_accuracy": 0.962195897102356,
769
+ "num_tokens": 1343046.0,
770
+ "step": 950
771
+ },
772
+ {
773
+ "epoch": 0.014239731818384086,
774
+ "learning_rate": 0.0001971550202471187,
775
+ "loss": 0.0722,
776
+ "mean_token_accuracy": 0.9760941386222839,
777
+ "num_tokens": 1357072.0,
778
+ "step": 960
779
+ },
780
+ {
781
+ "epoch": 0.014388062358158921,
782
+ "learning_rate": 0.00019712535413916372,
783
+ "loss": 0.0626,
784
+ "mean_token_accuracy": 0.9810939252376556,
785
+ "num_tokens": 1371301.0,
786
+ "step": 970
787
+ },
788
+ {
789
+ "epoch": 0.014536392897933756,
790
+ "learning_rate": 0.00019709568803120874,
791
+ "loss": 0.0972,
792
+ "mean_token_accuracy": 0.9752714991569519,
793
+ "num_tokens": 1385224.0,
794
+ "step": 980
795
+ },
796
+ {
797
+ "epoch": 0.01468472343770859,
798
+ "learning_rate": 0.0001970660219232538,
799
+ "loss": 0.065,
800
+ "mean_token_accuracy": 0.9749573111534119,
801
+ "num_tokens": 1399236.0,
802
+ "step": 990
803
+ },
804
+ {
805
+ "epoch": 0.014833053977483425,
806
+ "learning_rate": 0.00019703635581529882,
807
+ "loss": 0.0733,
808
+ "mean_token_accuracy": 0.971584141254425,
809
+ "num_tokens": 1413329.0,
810
+ "step": 1000
811
+ },
812
+ {
813
+ "epoch": 0.01498138451725826,
814
+ "learning_rate": 0.00019700668970734387,
815
+ "loss": 0.0984,
816
+ "mean_token_accuracy": 0.9667350590229035,
817
+ "num_tokens": 1427449.0,
818
+ "step": 1010
819
+ },
820
+ {
821
+ "epoch": 0.015129715057033092,
822
+ "learning_rate": 0.0001969770235993889,
823
+ "loss": 0.0758,
824
+ "mean_token_accuracy": 0.9693971753120423,
825
+ "num_tokens": 1441671.0,
826
+ "step": 1020
827
+ },
828
+ {
829
+ "epoch": 0.015278045596807927,
830
+ "learning_rate": 0.0001969473574914339,
831
+ "loss": 0.0877,
832
+ "mean_token_accuracy": 0.9640409171581268,
833
+ "num_tokens": 1455599.0,
834
+ "step": 1030
835
+ },
836
+ {
837
+ "epoch": 0.015426376136582761,
838
+ "learning_rate": 0.00019691769138347896,
839
+ "loss": 0.078,
840
+ "mean_token_accuracy": 0.973029488325119,
841
+ "num_tokens": 1469741.0,
842
+ "step": 1040
843
+ },
844
+ {
845
+ "epoch": 0.015574706676357596,
846
+ "learning_rate": 0.00019688802527552399,
847
+ "loss": 0.0709,
848
+ "mean_token_accuracy": 0.97868133187294,
849
+ "num_tokens": 1483981.0,
850
+ "step": 1050
851
+ },
852
+ {
853
+ "epoch": 0.01572303721613243,
854
+ "learning_rate": 0.00019685835916756904,
855
+ "loss": 0.0718,
856
+ "mean_token_accuracy": 0.9745873928070068,
857
+ "num_tokens": 1497968.0,
858
+ "step": 1060
859
+ },
860
+ {
861
+ "epoch": 0.015871367755907265,
862
+ "learning_rate": 0.00019682869305961403,
863
+ "loss": 0.073,
864
+ "mean_token_accuracy": 0.9762903869152069,
865
+ "num_tokens": 1511999.0,
866
+ "step": 1070
867
+ },
868
+ {
869
+ "epoch": 0.0160196982956821,
870
+ "learning_rate": 0.00019679902695165908,
871
+ "loss": 0.0854,
872
+ "mean_token_accuracy": 0.9715777993202209,
873
+ "num_tokens": 1526208.0,
874
+ "step": 1080
875
+ },
876
+ {
877
+ "epoch": 0.016168028835456934,
878
+ "learning_rate": 0.00019676936084370413,
879
+ "loss": 0.0589,
880
+ "mean_token_accuracy": 0.9842264592647553,
881
+ "num_tokens": 1540482.0,
882
+ "step": 1090
883
+ },
884
+ {
885
+ "epoch": 0.016316359375231765,
886
+ "learning_rate": 0.00019673969473574916,
887
+ "loss": 0.0702,
888
+ "mean_token_accuracy": 0.9759678483009339,
889
+ "num_tokens": 1554508.0,
890
+ "step": 1100
891
+ },
892
+ {
893
+ "epoch": 0.0164646899150066,
894
+ "learning_rate": 0.0001967100286277942,
895
+ "loss": 0.0768,
896
+ "mean_token_accuracy": 0.9776164293289185,
897
+ "num_tokens": 1568694.0,
898
+ "step": 1110
899
+ },
900
+ {
901
+ "epoch": 0.016613020454781434,
902
+ "learning_rate": 0.0001966803625198392,
903
+ "loss": 0.0879,
904
+ "mean_token_accuracy": 0.973237669467926,
905
+ "num_tokens": 1582893.0,
906
+ "step": 1120
907
+ },
908
+ {
909
+ "epoch": 0.01676135099455627,
910
+ "learning_rate": 0.00019665069641188425,
911
+ "loss": 0.0974,
912
+ "mean_token_accuracy": 0.974377167224884,
913
+ "num_tokens": 1597044.0,
914
+ "step": 1130
915
+ },
916
+ {
917
+ "epoch": 0.016909681534331103,
918
+ "learning_rate": 0.00019662103030392927,
919
+ "loss": 0.0856,
920
+ "mean_token_accuracy": 0.9713896453380585,
921
+ "num_tokens": 1611013.0,
922
+ "step": 1140
923
+ },
924
+ {
925
+ "epoch": 0.017058012074105938,
926
+ "learning_rate": 0.00019659136419597432,
927
+ "loss": 0.079,
928
+ "mean_token_accuracy": 0.9759809911251068,
929
+ "num_tokens": 1625267.0,
930
+ "step": 1150
931
+ },
932
+ {
933
+ "epoch": 0.017206342613880772,
934
+ "learning_rate": 0.00019656169808801935,
935
+ "loss": 0.0837,
936
+ "mean_token_accuracy": 0.9738470792770386,
937
+ "num_tokens": 1639399.0,
938
+ "step": 1160
939
+ },
940
+ {
941
+ "epoch": 0.017354673153655607,
942
+ "learning_rate": 0.00019653203198006437,
943
+ "loss": 0.0851,
944
+ "mean_token_accuracy": 0.9707063674926758,
945
+ "num_tokens": 1653578.0,
946
+ "step": 1170
947
+ },
948
+ {
949
+ "epoch": 0.01750300369343044,
950
+ "learning_rate": 0.00019650236587210942,
951
+ "loss": 0.0848,
952
+ "mean_token_accuracy": 0.9718270719051361,
953
+ "num_tokens": 1667589.0,
954
+ "step": 1180
955
+ },
956
+ {
957
+ "epoch": 0.017651334233205276,
958
+ "learning_rate": 0.00019647269976415444,
959
+ "loss": 0.0594,
960
+ "mean_token_accuracy": 0.9799826085567475,
961
+ "num_tokens": 1682134.0,
962
+ "step": 1190
963
+ },
964
+ {
965
+ "epoch": 0.01779966477298011,
966
+ "learning_rate": 0.0001964430336561995,
967
+ "loss": 0.0945,
968
+ "mean_token_accuracy": 0.9730036854743958,
969
+ "num_tokens": 1696174.0,
970
+ "step": 1200
971
+ },
972
+ {
973
+ "epoch": 0.017947995312754945,
974
+ "learning_rate": 0.00019641336754824452,
975
+ "loss": 0.077,
976
+ "mean_token_accuracy": 0.9751695334911347,
977
+ "num_tokens": 1710134.0,
978
+ "step": 1210
979
+ },
980
+ {
981
+ "epoch": 0.018096325852529776,
982
+ "learning_rate": 0.00019638370144028954,
983
+ "loss": 0.0812,
984
+ "mean_token_accuracy": 0.9705005526542664,
985
+ "num_tokens": 1724066.0,
986
+ "step": 1220
987
+ },
988
+ {
989
+ "epoch": 0.01824465639230461,
990
+ "learning_rate": 0.0001963540353323346,
991
+ "loss": 0.0704,
992
+ "mean_token_accuracy": 0.9745881497859955,
993
+ "num_tokens": 1738123.0,
994
+ "step": 1230
995
+ },
996
+ {
997
+ "epoch": 0.018392986932079445,
998
+ "learning_rate": 0.0001963243692243796,
999
+ "loss": 0.0702,
1000
+ "mean_token_accuracy": 0.975337028503418,
1001
+ "num_tokens": 1752512.0,
1002
+ "step": 1240
1003
+ },
1004
+ {
1005
+ "epoch": 0.01854131747185428,
1006
+ "learning_rate": 0.00019629470311642466,
1007
+ "loss": 0.0658,
1008
+ "mean_token_accuracy": 0.9756161510944367,
1009
+ "num_tokens": 1766708.0,
1010
+ "step": 1250
1011
+ },
1012
+ {
1013
+ "epoch": 0.018689648011629114,
1014
+ "learning_rate": 0.00019626503700846969,
1015
+ "loss": 0.0774,
1016
+ "mean_token_accuracy": 0.9716881215572357,
1017
+ "num_tokens": 1780735.0,
1018
+ "step": 1260
1019
+ },
1020
+ {
1021
+ "epoch": 0.01883797855140395,
1022
+ "learning_rate": 0.0001962353709005147,
1023
+ "loss": 0.0714,
1024
+ "mean_token_accuracy": 0.9714816689491272,
1025
+ "num_tokens": 1794777.0,
1026
+ "step": 1270
1027
+ },
1028
+ {
1029
+ "epoch": 0.018986309091178783,
1030
+ "learning_rate": 0.00019620570479255976,
1031
+ "loss": 0.0668,
1032
+ "mean_token_accuracy": 0.9810837268829345,
1033
+ "num_tokens": 1809000.0,
1034
+ "step": 1280
1035
+ },
1036
+ {
1037
+ "epoch": 0.019134639630953618,
1038
+ "learning_rate": 0.00019617603868460478,
1039
+ "loss": 0.0793,
1040
+ "mean_token_accuracy": 0.9687060952186585,
1041
+ "num_tokens": 1822994.0,
1042
+ "step": 1290
1043
+ },
1044
+ {
1045
+ "epoch": 0.019282970170728452,
1046
+ "learning_rate": 0.0001961463725766498,
1047
+ "loss": 0.1032,
1048
+ "mean_token_accuracy": 0.9625421404838562,
1049
+ "num_tokens": 1836944.0,
1050
+ "step": 1300
1051
+ },
1052
+ {
1053
+ "epoch": 0.019431300710503287,
1054
+ "learning_rate": 0.00019611670646869485,
1055
+ "loss": 0.0492,
1056
+ "mean_token_accuracy": 0.9850762605667114,
1057
+ "num_tokens": 1851184.0,
1058
+ "step": 1310
1059
+ },
1060
+ {
1061
+ "epoch": 0.01957963125027812,
1062
+ "learning_rate": 0.00019608704036073988,
1063
+ "loss": 0.057,
1064
+ "mean_token_accuracy": 0.9841500043869018,
1065
+ "num_tokens": 1865381.0,
1066
+ "step": 1320
1067
+ },
1068
+ {
1069
+ "epoch": 0.019727961790052952,
1070
+ "learning_rate": 0.00019605737425278493,
1071
+ "loss": 0.0896,
1072
+ "mean_token_accuracy": 0.9729780375957489,
1073
+ "num_tokens": 1879523.0,
1074
+ "step": 1330
1075
+ },
1076
+ {
1077
+ "epoch": 0.019876292329827787,
1078
+ "learning_rate": 0.00019602770814482995,
1079
+ "loss": 0.0853,
1080
+ "mean_token_accuracy": 0.9706455945968628,
1081
+ "num_tokens": 1893719.0,
1082
+ "step": 1340
1083
+ },
1084
+ {
1085
+ "epoch": 0.02002462286960262,
1086
+ "learning_rate": 0.00019599804203687497,
1087
+ "loss": 0.0721,
1088
+ "mean_token_accuracy": 0.9733758211135864,
1089
+ "num_tokens": 1908020.0,
1090
+ "step": 1350
1091
+ },
1092
+ {
1093
+ "epoch": 0.020172953409377456,
1094
+ "learning_rate": 0.00019596837592892002,
1095
+ "loss": 0.0645,
1096
+ "mean_token_accuracy": 0.9787311971187591,
1097
+ "num_tokens": 1922020.0,
1098
+ "step": 1360
1099
+ },
1100
+ {
1101
+ "epoch": 0.02032128394915229,
1102
+ "learning_rate": 0.00019593870982096505,
1103
+ "loss": 0.0669,
1104
+ "mean_token_accuracy": 0.974337100982666,
1105
+ "num_tokens": 1936105.0,
1106
+ "step": 1370
1107
+ },
1108
+ {
1109
+ "epoch": 0.020469614488927125,
1110
+ "learning_rate": 0.0001959090437130101,
1111
+ "loss": 0.076,
1112
+ "mean_token_accuracy": 0.9723476529121399,
1113
+ "num_tokens": 1950298.0,
1114
+ "step": 1380
1115
+ },
1116
+ {
1117
+ "epoch": 0.02061794502870196,
1118
+ "learning_rate": 0.0001958793776050551,
1119
+ "loss": 0.0691,
1120
+ "mean_token_accuracy": 0.9764381349086761,
1121
+ "num_tokens": 1964529.0,
1122
+ "step": 1390
1123
+ },
1124
+ {
1125
+ "epoch": 0.020766275568476794,
1126
+ "learning_rate": 0.00019584971149710014,
1127
+ "loss": 0.0593,
1128
+ "mean_token_accuracy": 0.9796685576438904,
1129
+ "num_tokens": 1978773.0,
1130
+ "step": 1400
1131
+ },
1132
+ {
1133
+ "epoch": 0.02091460610825163,
1134
+ "learning_rate": 0.00019582004538914517,
1135
+ "loss": 0.0995,
1136
+ "mean_token_accuracy": 0.9613340020179748,
1137
+ "num_tokens": 1993002.0,
1138
+ "step": 1410
1139
+ },
1140
+ {
1141
+ "epoch": 0.021062936648026463,
1142
+ "learning_rate": 0.00019579037928119022,
1143
+ "loss": 0.1051,
1144
+ "mean_token_accuracy": 0.9684881687164306,
1145
+ "num_tokens": 2007048.0,
1146
+ "step": 1420
1147
+ },
1148
+ {
1149
+ "epoch": 0.021211267187801298,
1150
+ "learning_rate": 0.00019576071317323527,
1151
+ "loss": 0.0869,
1152
+ "mean_token_accuracy": 0.9721337735652924,
1153
+ "num_tokens": 2021156.0,
1154
+ "step": 1430
1155
+ },
1156
+ {
1157
+ "epoch": 0.021359597727576132,
1158
+ "learning_rate": 0.00019573104706528026,
1159
+ "loss": 0.0897,
1160
+ "mean_token_accuracy": 0.9715201795101166,
1161
+ "num_tokens": 2035271.0,
1162
+ "step": 1440
1163
+ },
1164
+ {
1165
+ "epoch": 0.021507928267350963,
1166
+ "learning_rate": 0.0001957013809573253,
1167
+ "loss": 0.0706,
1168
+ "mean_token_accuracy": 0.9721573770046235,
1169
+ "num_tokens": 2049388.0,
1170
+ "step": 1450
1171
+ },
1172
+ {
1173
+ "epoch": 0.021656258807125798,
1174
+ "learning_rate": 0.00019567171484937034,
1175
+ "loss": 0.0846,
1176
+ "mean_token_accuracy": 0.9714311838150025,
1177
+ "num_tokens": 2063480.0,
1178
+ "step": 1460
1179
+ },
1180
+ {
1181
+ "epoch": 0.021804589346900632,
1182
+ "learning_rate": 0.00019564204874141539,
1183
+ "loss": 0.0765,
1184
+ "mean_token_accuracy": 0.9730508744716644,
1185
+ "num_tokens": 2077370.0,
1186
+ "step": 1470
1187
+ },
1188
+ {
1189
+ "epoch": 0.021952919886675467,
1190
+ "learning_rate": 0.00019561238263346044,
1191
+ "loss": 0.0791,
1192
+ "mean_token_accuracy": 0.9674266993999481,
1193
+ "num_tokens": 2091582.0,
1194
+ "step": 1480
1195
+ },
1196
+ {
1197
+ "epoch": 0.0221012504264503,
1198
+ "learning_rate": 0.00019558271652550543,
1199
+ "loss": 0.0528,
1200
+ "mean_token_accuracy": 0.9833854794502258,
1201
+ "num_tokens": 2105530.0,
1202
+ "step": 1490
1203
+ },
1204
+ {
1205
+ "epoch": 0.022249580966225136,
1206
+ "learning_rate": 0.00019555305041755048,
1207
+ "loss": 0.0687,
1208
+ "mean_token_accuracy": 0.9794368088245392,
1209
+ "num_tokens": 2119632.0,
1210
+ "step": 1500
1211
+ },
1212
+ {
1213
+ "epoch": 0.02239791150599997,
1214
+ "learning_rate": 0.0001955233843095955,
1215
+ "loss": 0.0636,
1216
+ "mean_token_accuracy": 0.9763643145561218,
1217
+ "num_tokens": 2133725.0,
1218
+ "step": 1510
1219
+ },
1220
+ {
1221
+ "epoch": 0.022546242045774805,
1222
+ "learning_rate": 0.00019549371820164055,
1223
+ "loss": 0.0827,
1224
+ "mean_token_accuracy": 0.9748819410800934,
1225
+ "num_tokens": 2147866.0,
1226
+ "step": 1520
1227
+ },
1228
+ {
1229
+ "epoch": 0.02269457258554964,
1230
+ "learning_rate": 0.00019546405209368558,
1231
+ "loss": 0.0582,
1232
+ "mean_token_accuracy": 0.9838931798934937,
1233
+ "num_tokens": 2161994.0,
1234
+ "step": 1530
1235
+ },
1236
+ {
1237
+ "epoch": 0.022842903125324474,
1238
+ "learning_rate": 0.0001954343859857306,
1239
+ "loss": 0.059,
1240
+ "mean_token_accuracy": 0.9762311697006225,
1241
+ "num_tokens": 2176227.0,
1242
+ "step": 1540
1243
+ },
1244
+ {
1245
+ "epoch": 0.02299123366509931,
1246
+ "learning_rate": 0.00019540471987777565,
1247
+ "loss": 0.0773,
1248
+ "mean_token_accuracy": 0.977224487066269,
1249
+ "num_tokens": 2190154.0,
1250
+ "step": 1550
1251
+ },
1252
+ {
1253
+ "epoch": 0.02313956420487414,
1254
+ "learning_rate": 0.00019537505376982067,
1255
+ "loss": 0.0893,
1256
+ "mean_token_accuracy": 0.9703286468982697,
1257
+ "num_tokens": 2204261.0,
1258
+ "step": 1560
1259
+ },
1260
+ {
1261
+ "epoch": 0.023287894744648974,
1262
+ "learning_rate": 0.00019534538766186572,
1263
+ "loss": 0.0752,
1264
+ "mean_token_accuracy": 0.974709951877594,
1265
+ "num_tokens": 2218636.0,
1266
+ "step": 1570
1267
+ },
1268
+ {
1269
+ "epoch": 0.02343622528442381,
1270
+ "learning_rate": 0.00019531572155391075,
1271
+ "loss": 0.0841,
1272
+ "mean_token_accuracy": 0.9769960045814514,
1273
+ "num_tokens": 2232764.0,
1274
+ "step": 1580
1275
+ },
1276
+ {
1277
+ "epoch": 0.023584555824198643,
1278
+ "learning_rate": 0.00019528605544595577,
1279
+ "loss": 0.0642,
1280
+ "mean_token_accuracy": 0.977448046207428,
1281
+ "num_tokens": 2246804.0,
1282
+ "step": 1590
1283
+ },
1284
+ {
1285
+ "epoch": 0.023732886363973478,
1286
+ "learning_rate": 0.00019525638933800082,
1287
+ "loss": 0.1069,
1288
+ "mean_token_accuracy": 0.9677329897880554,
1289
+ "num_tokens": 2260874.0,
1290
+ "step": 1600
1291
+ },
1292
+ {
1293
+ "epoch": 0.023881216903748313,
1294
+ "learning_rate": 0.00019522672323004584,
1295
+ "loss": 0.0833,
1296
+ "mean_token_accuracy": 0.9727708697319031,
1297
+ "num_tokens": 2275017.0,
1298
+ "step": 1610
1299
+ },
1300
+ {
1301
+ "epoch": 0.024029547443523147,
1302
+ "learning_rate": 0.0001951970571220909,
1303
+ "loss": 0.0785,
1304
+ "mean_token_accuracy": 0.9746884107589722,
1305
+ "num_tokens": 2289036.0,
1306
+ "step": 1620
1307
+ },
1308
+ {
1309
+ "epoch": 0.02417787798329798,
1310
+ "learning_rate": 0.00019516739101413592,
1311
+ "loss": 0.0609,
1312
+ "mean_token_accuracy": 0.9793126463890076,
1313
+ "num_tokens": 2303029.0,
1314
+ "step": 1630
1315
+ },
1316
+ {
1317
+ "epoch": 0.024326208523072816,
1318
+ "learning_rate": 0.00019513772490618094,
1319
+ "loss": 0.0772,
1320
+ "mean_token_accuracy": 0.9758535027503967,
1321
+ "num_tokens": 2317190.0,
1322
+ "step": 1640
1323
+ },
1324
+ {
1325
+ "epoch": 0.02447453906284765,
1326
+ "learning_rate": 0.000195108058798226,
1327
+ "loss": 0.08,
1328
+ "mean_token_accuracy": 0.9721879363059998,
1329
+ "num_tokens": 2331217.0,
1330
+ "step": 1650
1331
+ },
1332
+ {
1333
+ "epoch": 0.024622869602622485,
1334
+ "learning_rate": 0.000195078392690271,
1335
+ "loss": 0.0801,
1336
+ "mean_token_accuracy": 0.9673756003379822,
1337
+ "num_tokens": 2345300.0,
1338
+ "step": 1660
1339
+ },
1340
+ {
1341
+ "epoch": 0.02477120014239732,
1342
+ "learning_rate": 0.00019504872658231603,
1343
+ "loss": 0.0749,
1344
+ "mean_token_accuracy": 0.9750915884971618,
1345
+ "num_tokens": 2359603.0,
1346
+ "step": 1670
1347
+ },
1348
+ {
1349
+ "epoch": 0.02491953068217215,
1350
+ "learning_rate": 0.00019501906047436106,
1351
+ "loss": 0.0913,
1352
+ "mean_token_accuracy": 0.975084537267685,
1353
+ "num_tokens": 2373919.0,
1354
+ "step": 1680
1355
+ },
1356
+ {
1357
+ "epoch": 0.025067861221946985,
1358
+ "learning_rate": 0.0001949893943664061,
1359
+ "loss": 0.057,
1360
+ "mean_token_accuracy": 0.9772068738937378,
1361
+ "num_tokens": 2388316.0,
1362
+ "step": 1690
1363
+ },
1364
+ {
1365
+ "epoch": 0.02521619176172182,
1366
+ "learning_rate": 0.00019495972825845116,
1367
+ "loss": 0.0667,
1368
+ "mean_token_accuracy": 0.9791467607021331,
1369
+ "num_tokens": 2402418.0,
1370
+ "step": 1700
1371
+ },
1372
+ {
1373
+ "epoch": 0.025364522301496654,
1374
+ "learning_rate": 0.00019493006215049618,
1375
+ "loss": 0.0688,
1376
+ "mean_token_accuracy": 0.9734372615814209,
1377
+ "num_tokens": 2416607.0,
1378
+ "step": 1710
1379
+ },
1380
+ {
1381
+ "epoch": 0.02551285284127149,
1382
+ "learning_rate": 0.0001949003960425412,
1383
+ "loss": 0.0738,
1384
+ "mean_token_accuracy": 0.9727434098720551,
1385
+ "num_tokens": 2430760.0,
1386
+ "step": 1720
1387
+ },
1388
+ {
1389
+ "epoch": 0.025661183381046324,
1390
+ "learning_rate": 0.00019487072993458623,
1391
+ "loss": 0.0631,
1392
+ "mean_token_accuracy": 0.9753291308879852,
1393
+ "num_tokens": 2444839.0,
1394
+ "step": 1730
1395
+ },
1396
+ {
1397
+ "epoch": 0.025809513920821158,
1398
+ "learning_rate": 0.00019484106382663128,
1399
+ "loss": 0.0688,
1400
+ "mean_token_accuracy": 0.9749854207038879,
1401
+ "num_tokens": 2459179.0,
1402
+ "step": 1740
1403
+ },
1404
+ {
1405
+ "epoch": 0.025957844460595993,
1406
+ "learning_rate": 0.00019481139771867633,
1407
+ "loss": 0.0798,
1408
+ "mean_token_accuracy": 0.9702626287937164,
1409
+ "num_tokens": 2473205.0,
1410
+ "step": 1750
1411
+ },
1412
+ {
1413
+ "epoch": 0.026106175000370827,
1414
+ "learning_rate": 0.00019478173161072132,
1415
+ "loss": 0.09,
1416
+ "mean_token_accuracy": 0.9717526614665986,
1417
+ "num_tokens": 2487522.0,
1418
+ "step": 1760
1419
+ },
1420
+ {
1421
+ "epoch": 0.026254505540145662,
1422
+ "learning_rate": 0.00019475206550276637,
1423
+ "loss": 0.0785,
1424
+ "mean_token_accuracy": 0.9719234526157379,
1425
+ "num_tokens": 2501550.0,
1426
+ "step": 1770
1427
+ },
1428
+ {
1429
+ "epoch": 0.026402836079920496,
1430
+ "learning_rate": 0.0001947223993948114,
1431
+ "loss": 0.0924,
1432
+ "mean_token_accuracy": 0.9727768480777741,
1433
+ "num_tokens": 2515563.0,
1434
+ "step": 1780
1435
+ },
1436
+ {
1437
+ "epoch": 0.026551166619695327,
1438
+ "learning_rate": 0.00019469273328685645,
1439
+ "loss": 0.0961,
1440
+ "mean_token_accuracy": 0.9694974541664123,
1441
+ "num_tokens": 2529739.0,
1442
+ "step": 1790
1443
+ },
1444
+ {
1445
+ "epoch": 0.026699497159470162,
1446
+ "learning_rate": 0.00019466306717890147,
1447
+ "loss": 0.0959,
1448
+ "mean_token_accuracy": 0.9663477540016174,
1449
+ "num_tokens": 2543804.0,
1450
+ "step": 1800
1451
+ },
1452
+ {
1453
+ "epoch": 0.026847827699244996,
1454
+ "learning_rate": 0.0001946334010709465,
1455
+ "loss": 0.0963,
1456
+ "mean_token_accuracy": 0.9670729875564575,
1457
+ "num_tokens": 2557916.0,
1458
+ "step": 1810
1459
+ },
1460
+ {
1461
+ "epoch": 0.02699615823901983,
1462
+ "learning_rate": 0.00019460373496299154,
1463
+ "loss": 0.0897,
1464
+ "mean_token_accuracy": 0.9735462188720703,
1465
+ "num_tokens": 2571905.0,
1466
+ "step": 1820
1467
+ },
1468
+ {
1469
+ "epoch": 0.027144488778794666,
1470
+ "learning_rate": 0.00019457406885503657,
1471
+ "loss": 0.0811,
1472
+ "mean_token_accuracy": 0.9717726945877075,
1473
+ "num_tokens": 2585903.0,
1474
+ "step": 1830
1475
+ },
1476
+ {
1477
+ "epoch": 0.0272928193185695,
1478
+ "learning_rate": 0.00019454440274708162,
1479
+ "loss": 0.0845,
1480
+ "mean_token_accuracy": 0.9706952691078186,
1481
+ "num_tokens": 2600073.0,
1482
+ "step": 1840
1483
+ },
1484
+ {
1485
+ "epoch": 0.027441149858344335,
1486
+ "learning_rate": 0.00019451473663912664,
1487
+ "loss": 0.0768,
1488
+ "mean_token_accuracy": 0.9728710412979126,
1489
+ "num_tokens": 2614150.0,
1490
+ "step": 1850
1491
+ },
1492
+ {
1493
+ "epoch": 0.02758948039811917,
1494
+ "learning_rate": 0.00019448507053117166,
1495
+ "loss": 0.0758,
1496
+ "mean_token_accuracy": 0.9708646655082702,
1497
+ "num_tokens": 2628224.0,
1498
+ "step": 1860
1499
+ },
1500
+ {
1501
+ "epoch": 0.027737810937894004,
1502
+ "learning_rate": 0.0001944554044232167,
1503
+ "loss": 0.0773,
1504
+ "mean_token_accuracy": 0.9729286253452301,
1505
+ "num_tokens": 2642113.0,
1506
+ "step": 1870
1507
+ },
1508
+ {
1509
+ "epoch": 0.027886141477668838,
1510
+ "learning_rate": 0.00019442573831526173,
1511
+ "loss": 0.0927,
1512
+ "mean_token_accuracy": 0.9710256516933441,
1513
+ "num_tokens": 2656078.0,
1514
+ "step": 1880
1515
+ },
1516
+ {
1517
+ "epoch": 0.028034472017443673,
1518
+ "learning_rate": 0.00019439607220730678,
1519
+ "loss": 0.0632,
1520
+ "mean_token_accuracy": 0.978791344165802,
1521
+ "num_tokens": 2670128.0,
1522
+ "step": 1890
1523
+ },
1524
+ {
1525
+ "epoch": 0.028182802557218507,
1526
+ "learning_rate": 0.0001943664060993518,
1527
+ "loss": 0.0543,
1528
+ "mean_token_accuracy": 0.9814699769020081,
1529
+ "num_tokens": 2684474.0,
1530
+ "step": 1900
1531
+ },
1532
+ {
1533
+ "epoch": 0.02833113309699334,
1534
+ "learning_rate": 0.00019433673999139683,
1535
+ "loss": 0.063,
1536
+ "mean_token_accuracy": 0.9756844103336334,
1537
+ "num_tokens": 2698436.0,
1538
+ "step": 1910
1539
+ },
1540
+ {
1541
+ "epoch": 0.028479463636768173,
1542
+ "learning_rate": 0.00019430707388344188,
1543
+ "loss": 0.0945,
1544
+ "mean_token_accuracy": 0.9691503286361695,
1545
+ "num_tokens": 2712395.0,
1546
+ "step": 1920
1547
+ },
1548
+ {
1549
+ "epoch": 0.028627794176543007,
1550
+ "learning_rate": 0.0001942774077754869,
1551
+ "loss": 0.0601,
1552
+ "mean_token_accuracy": 0.9762123942375183,
1553
+ "num_tokens": 2726506.0,
1554
+ "step": 1930
1555
+ },
1556
+ {
1557
+ "epoch": 0.028776124716317842,
1558
+ "learning_rate": 0.00019424774166753195,
1559
+ "loss": 0.0912,
1560
+ "mean_token_accuracy": 0.9622864723205566,
1561
+ "num_tokens": 2740293.0,
1562
+ "step": 1940
1563
+ },
1564
+ {
1565
+ "epoch": 0.028924455256092677,
1566
+ "learning_rate": 0.00019421807555957695,
1567
+ "loss": 0.0758,
1568
+ "mean_token_accuracy": 0.9752536177635193,
1569
+ "num_tokens": 2754509.0,
1570
+ "step": 1950
1571
+ },
1572
+ {
1573
+ "epoch": 0.02907278579586751,
1574
+ "learning_rate": 0.000194188409451622,
1575
+ "loss": 0.0666,
1576
+ "mean_token_accuracy": 0.9746933698654174,
1577
+ "num_tokens": 2768540.0,
1578
+ "step": 1960
1579
+ },
1580
+ {
1581
+ "epoch": 0.029221116335642346,
1582
+ "learning_rate": 0.00019415874334366705,
1583
+ "loss": 0.0546,
1584
+ "mean_token_accuracy": 0.9834832668304443,
1585
+ "num_tokens": 2782780.0,
1586
+ "step": 1970
1587
+ },
1588
+ {
1589
+ "epoch": 0.02936944687541718,
1590
+ "learning_rate": 0.00019412907723571207,
1591
+ "loss": 0.0671,
1592
+ "mean_token_accuracy": 0.9755673408508301,
1593
+ "num_tokens": 2796973.0,
1594
+ "step": 1980
1595
+ },
1596
+ {
1597
+ "epoch": 0.029517777415192015,
1598
+ "learning_rate": 0.00019409941112775712,
1599
+ "loss": 0.0639,
1600
+ "mean_token_accuracy": 0.9805159747600556,
1601
+ "num_tokens": 2811149.0,
1602
+ "step": 1990
1603
+ },
1604
+ {
1605
+ "epoch": 0.02966610795496685,
1606
+ "learning_rate": 0.00019406974501980212,
1607
+ "loss": 0.0627,
1608
+ "mean_token_accuracy": 0.9782679080963135,
1609
+ "num_tokens": 2825280.0,
1610
+ "step": 2000
1611
+ },
1612
+ {
1613
+ "epoch": 0.029814438494741684,
1614
+ "learning_rate": 0.00019404007891184717,
1615
+ "loss": 0.0661,
1616
+ "mean_token_accuracy": 0.9821899354457855,
1617
+ "num_tokens": 2839485.0,
1618
+ "step": 2010
1619
+ },
1620
+ {
1621
+ "epoch": 0.02996276903451652,
1622
+ "learning_rate": 0.00019401041280389222,
1623
+ "loss": 0.0564,
1624
+ "mean_token_accuracy": 0.9785571038722992,
1625
+ "num_tokens": 2853745.0,
1626
+ "step": 2020
1627
+ },
1628
+ {
1629
+ "epoch": 0.03011109957429135,
1630
+ "learning_rate": 0.00019398074669593724,
1631
+ "loss": 0.0562,
1632
+ "mean_token_accuracy": 0.9799038827419281,
1633
+ "num_tokens": 2867981.0,
1634
+ "step": 2030
1635
+ },
1636
+ {
1637
+ "epoch": 0.030259430114066184,
1638
+ "learning_rate": 0.00019395108058798226,
1639
+ "loss": 0.0535,
1640
+ "mean_token_accuracy": 0.9810381293296814,
1641
+ "num_tokens": 2882309.0,
1642
+ "step": 2040
1643
+ },
1644
+ {
1645
+ "epoch": 0.03040776065384102,
1646
+ "learning_rate": 0.0001939214144800273,
1647
+ "loss": 0.0713,
1648
+ "mean_token_accuracy": 0.9679700911045075,
1649
+ "num_tokens": 2896302.0,
1650
+ "step": 2050
1651
+ },
1652
+ {
1653
+ "epoch": 0.030556091193615853,
1654
+ "learning_rate": 0.00019389174837207234,
1655
+ "loss": 0.064,
1656
+ "mean_token_accuracy": 0.9801303565502166,
1657
+ "num_tokens": 2910539.0,
1658
+ "step": 2060
1659
+ },
1660
+ {
1661
+ "epoch": 0.030704421733390688,
1662
+ "learning_rate": 0.00019386208226411736,
1663
+ "loss": 0.0705,
1664
+ "mean_token_accuracy": 0.9750793814659119,
1665
+ "num_tokens": 2924693.0,
1666
+ "step": 2070
1667
+ },
1668
+ {
1669
+ "epoch": 0.030852752273165522,
1670
+ "learning_rate": 0.0001938324161561624,
1671
+ "loss": 0.0782,
1672
+ "mean_token_accuracy": 0.9702070772647857,
1673
+ "num_tokens": 2939003.0,
1674
+ "step": 2080
1675
+ },
1676
+ {
1677
+ "epoch": 0.031001082812940357,
1678
+ "learning_rate": 0.00019380275004820743,
1679
+ "loss": 0.0673,
1680
+ "mean_token_accuracy": 0.9760224461555481,
1681
+ "num_tokens": 2953090.0,
1682
+ "step": 2090
1683
+ },
1684
+ {
1685
+ "epoch": 0.03114941335271519,
1686
+ "learning_rate": 0.00019377308394025246,
1687
+ "loss": 0.0589,
1688
+ "mean_token_accuracy": 0.9770624697208404,
1689
+ "num_tokens": 2967200.0,
1690
+ "step": 2100
1691
+ },
1692
+ {
1693
+ "epoch": 0.03129774389249002,
1694
+ "learning_rate": 0.0001937434178322975,
1695
+ "loss": 0.0711,
1696
+ "mean_token_accuracy": 0.9783848404884339,
1697
+ "num_tokens": 2981214.0,
1698
+ "step": 2110
1699
+ },
1700
+ {
1701
+ "epoch": 0.03144607443226486,
1702
+ "learning_rate": 0.00019371375172434253,
1703
+ "loss": 0.074,
1704
+ "mean_token_accuracy": 0.9754434108734131,
1705
+ "num_tokens": 2995111.0,
1706
+ "step": 2120
1707
+ },
1708
+ {
1709
+ "epoch": 0.03159440497203969,
1710
+ "learning_rate": 0.00019368408561638758,
1711
+ "loss": 0.0717,
1712
+ "mean_token_accuracy": 0.9739742994308471,
1713
+ "num_tokens": 3009288.0,
1714
+ "step": 2130
1715
+ },
1716
+ {
1717
+ "epoch": 0.03174273551181453,
1718
+ "learning_rate": 0.0001936544195084326,
1719
+ "loss": 0.0722,
1720
+ "mean_token_accuracy": 0.9753423690795898,
1721
+ "num_tokens": 3023460.0,
1722
+ "step": 2140
1723
+ },
1724
+ {
1725
+ "epoch": 0.03189106605158936,
1726
+ "learning_rate": 0.00019362475340047763,
1727
+ "loss": 0.0463,
1728
+ "mean_token_accuracy": 0.982510793209076,
1729
+ "num_tokens": 3037765.0,
1730
+ "step": 2150
1731
+ },
1732
+ {
1733
+ "epoch": 0.0320393965913642,
1734
+ "learning_rate": 0.00019359508729252268,
1735
+ "loss": 0.0858,
1736
+ "mean_token_accuracy": 0.9690262913703919,
1737
+ "num_tokens": 3051960.0,
1738
+ "step": 2160
1739
+ },
1740
+ {
1741
+ "epoch": 0.03218772713113903,
1742
+ "learning_rate": 0.0001935654211845677,
1743
+ "loss": 0.0836,
1744
+ "mean_token_accuracy": 0.9711355090141296,
1745
+ "num_tokens": 3065890.0,
1746
+ "step": 2170
1747
+ },
1748
+ {
1749
+ "epoch": 0.03233605767091387,
1750
+ "learning_rate": 0.00019353575507661272,
1751
+ "loss": 0.0631,
1752
+ "mean_token_accuracy": 0.9817444443702698,
1753
+ "num_tokens": 3080088.0,
1754
+ "step": 2180
1755
+ },
1756
+ {
1757
+ "epoch": 0.0324843882106887,
1758
+ "learning_rate": 0.00019350608896865777,
1759
+ "loss": 0.076,
1760
+ "mean_token_accuracy": 0.974403315782547,
1761
+ "num_tokens": 3094137.0,
1762
+ "step": 2190
1763
+ },
1764
+ {
1765
+ "epoch": 0.03263271875046353,
1766
+ "learning_rate": 0.0001934764228607028,
1767
+ "loss": 0.0728,
1768
+ "mean_token_accuracy": 0.9692744731903076,
1769
+ "num_tokens": 3108250.0,
1770
+ "step": 2200
1771
+ },
1772
+ {
1773
+ "epoch": 0.03278104929023837,
1774
+ "learning_rate": 0.00019344675675274785,
1775
+ "loss": 0.0605,
1776
+ "mean_token_accuracy": 0.9794303894042968,
1777
+ "num_tokens": 3122535.0,
1778
+ "step": 2210
1779
+ },
1780
+ {
1781
+ "epoch": 0.0329293798300132,
1782
+ "learning_rate": 0.00019341709064479287,
1783
+ "loss": 0.0932,
1784
+ "mean_token_accuracy": 0.9737550437450408,
1785
+ "num_tokens": 3136693.0,
1786
+ "step": 2220
1787
+ },
1788
+ {
1789
+ "epoch": 0.03307771036978804,
1790
+ "learning_rate": 0.0001933874245368379,
1791
+ "loss": 0.0726,
1792
+ "mean_token_accuracy": 0.972658348083496,
1793
+ "num_tokens": 3150833.0,
1794
+ "step": 2230
1795
+ },
1796
+ {
1797
+ "epoch": 0.03322604090956287,
1798
+ "learning_rate": 0.00019335775842888294,
1799
+ "loss": 0.0894,
1800
+ "mean_token_accuracy": 0.9667915642261505,
1801
+ "num_tokens": 3165091.0,
1802
+ "step": 2240
1803
+ }
1804
+ ],
1805
+ "logging_steps": 10,
1806
+ "max_steps": 67417,
1807
+ "num_input_tokens_seen": 0,
1808
+ "num_train_epochs": 9223372036854775807,
1809
+ "save_steps": 2247,
1810
+ "stateful_callbacks": {
1811
+ "TrainerControl": {
1812
+ "args": {
1813
+ "should_epoch_stop": false,
1814
+ "should_evaluate": false,
1815
+ "should_log": false,
1816
+ "should_save": true,
1817
+ "should_training_stop": false
1818
+ },
1819
+ "attributes": {}
1820
+ }
1821
+ },
1822
+ "total_flos": 1.5068943178039296e+17,
1823
+ "train_batch_size": 12,
1824
+ "trial_name": null,
1825
+ "trial_params": null
1826
+ }
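
The JSON above is the tail of a standard `transformers` `trainer_state.json`: an array of per-interval metric records (one every 10 optimizer steps, per `"logging_steps": 10`) followed by run-level fields such as `max_steps`, `save_steps`, and `total_flos`. A minimal sketch of inspecting such a file offline; the checkpoint path is illustrative, and the `log_history` key is assumed from the standard `TrainerState` layout rather than shown in this diff:

```python
# Minimal sketch (not part of the commit): summarize the metrics logged in a
# trainer_state.json like the one above. The path and the "log_history" key
# are assumptions based on the standard transformers TrainerState layout.
import json

path = "checkpoint-2247/trainer_state.json"  # illustrative
with open(path) as f:
    state = json.load(f)

# Each record carries epoch, learning_rate, loss, mean_token_accuracy,
# num_tokens, and step; the same loop extends naturally to plotting loss
# against step.
for record in state["log_history"][-5:]:
    print(record["step"], record["loss"], record["mean_token_accuracy"])
```
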
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: /scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.15.0
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+ }
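
This `adapter_config.json` describes a rank-8 LoRA adapter (`lora_alpha` 16, dropout 0.0) on the `q_proj`/`v_proj` attention projections of the GAMA base model. A hedged sketch of attaching it with PEFT follows; the base path is local to the original cluster, and plain `AutoModelForCausalLM` is only a text-side approximation, since GAMA's audio Q-Former front-end needs the project's own loading code:

```python
# Hedged sketch (assumptions noted inline): attach this LoRA adapter to its
# base model with PEFT. The base path comes from adapter_config.json and only
# exists on the original cluster; substitute your own copy of the base model.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_path = "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/"
adapter_path = "gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494"

base = AutoModelForCausalLM.from_pretrained(base_path)   # text side only; no audio Q-Former here
tokenizer = AutoTokenizer.from_pretrained(adapter_path)  # tokenizer files ship with each checkpoint
model = PeftModel.from_pretrained(base, adapter_path)    # applies the r=8 LoRA to q_proj/v_proj
model.eval()
```
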
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
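
The `chat_template` in this `tokenizer_config.json` is the stock Llama-2 chat format: alternating `[INST] ... [/INST]` turns, with an optional `<<SYS>>` system block folded into the first user message. A minimal sketch of rendering it (checkpoint path illustrative):

```python
# Minimal sketch: render the Llama-2 chat template shipped in this
# tokenizer_config.json. The checkpoint directory path is illustrative.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-4494")  # any checkpoint dir holding these tokenizer files
messages = [
    {"role": "system", "content": "You answer questions about audio clips."},
    {"role": "user", "content": "Which sound event is most prominent?"},
]
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)  # -> "<s>[INST] <<SYS>>\n...\n<</SYS>>\n\n... [/INST]"
```
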
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-4494/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/README.md ADDED
@@ -0,0 +1,202 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/adapter_config.json ADDED
@@ -0,0 +1,34 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
+ size 14244
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb8112171b5385c5b37366ef9bade4f4b9781d2d1470892eab36e63919d55a16
+ size 1064
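
The `rng_state.pth` and `scheduler.pt` entries above are stored through Git LFS, so the diff shows only the three-line pointer file (LFS spec version, SHA-256 of the payload, size in bytes), not the tensors themselves. A small sketch of reading such a pointer (path illustrative):

```python
# Minimal sketch: parse a Git LFS pointer file like the rng_state.pth /
# scheduler.pt entries above. Only the pointer lives in the repo; the binary
# payload is fetched from LFS storage on `git lfs pull`.
from pathlib import Path

pointer = Path("checkpoint-6741/scheduler.pt").read_text()  # illustrative path
fields = dict(line.split(" ", 1) for line in pointer.strip().splitlines())
print(fields["oid"], fields["size"])  # e.g. "sha256:bb81..." "1064"
```
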
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_102413/checkpoint-6741/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/README.md ADDED
@@ -0,0 +1,202 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/adapter_config.json ADDED
@@ -0,0 +1,34 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-10110/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/README.md ADDED
@@ -0,0 +1,202 @@
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "/scratch-shared/gwijngaard/models/GAMA-IT/Llama-2-7b-chat-hf-qformer/",
+   "bias": "none",
+   "corda_config": null,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 8,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "v_proj",
+     "q_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
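
For orientation, the sketch below reconstructs an equivalent LoRA configuration in code with `peft.LoraConfig`. The values mirror the JSON above; the snippet is illustrative only, assuming PEFT 0.15.x as listed in the model card.

```python
# Sketch (assumption): a LoraConfig equivalent to the adapter_config.json above.
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,                                  # LoRA rank
    lora_alpha=16,                        # scaling factor
    lora_dropout=0.0,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # attention query/value projections
    task_type="CAUSAL_LM",
)
print(lora_config.peft_type)  # PeftType.LORA
```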
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "</s>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
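
One practical consequence of this mapping is that padding reuses the EOS token, since the Llama-2 vocabulary ships no dedicated pad token. A small sketch (the checkpoint path is a local placeholder):

```python
# Sketch (assumption): confirm that padding falls back to </s> for this tokenizer.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480"
)
assert tok.pad_token == tok.eos_token == "</s>"
batch = tok(["short", "a longer example"], padding=True, return_tensors="pt")
print(batch["attention_mask"])  # zeros mark the padded (EOS-filled) positions
```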
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
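
The `chat_template` above is the standard Llama-2 `[INST]`/`<<SYS>>` format. As an illustration of how it renders (a sketch; the checkpoint path is a local placeholder):

```python
# Sketch (assumption): render a conversation through the stored Llama-2 chat template.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "gama/gama-semantic_mc_qa--semantic_elements-20250424_130127/checkpoint-13480"
)
messages = [
    {"role": "system", "content": "You answer questions about audio clips."},
    {"role": "user", "content": "Which semantic elements are audible?"},
]
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)  # "<s>[INST] <<SYS>>\n...system...\n<</SYS>>\n\n...user... [/INST]"
```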