Upload folder using huggingface_hub
- .gitattributes +1 -0
- .hydra/config.yaml +81 -0
- .hydra/hydra.yaml +178 -0
- .hydra/overrides.yaml +16 -0
- config.yaml +83 -0
- model@0.pt +3 -0
- model@300.pt +3 -0
- model@600.pt +3 -0
- model@900.pt +3 -0
- nanogpt.log +916 -0
- wandb/debug-internal.log +52 -0
- wandb/debug.log +23 -0
- wandb/run-20250911_200644-y9v5i9gr/files/config.yaml +195 -0
- wandb/run-20250911_200644-y9v5i9gr/files/output.log +936 -0
- wandb/run-20250911_200644-y9v5i9gr/files/requirements.txt +125 -0
- wandb/run-20250911_200644-y9v5i9gr/files/wandb-metadata.json +114 -0
- wandb/run-20250911_200644-y9v5i9gr/files/wandb-summary.json +1 -0
- wandb/run-20250911_200644-y9v5i9gr/logs/debug-internal.log +52 -0
- wandb/run-20250911_200644-y9v5i9gr/logs/debug.log +23 -0
- wandb/run-20250911_200644-y9v5i9gr/run-y9v5i9gr.wandb +3 -0
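
The commit title above is the default message produced by the `upload_folder` helper in the `huggingface_hub` Python client. A minimal sketch of how a Hydra run directory like this one can be pushed; the folder path and repo_id below are placeholders, not values from this commit:

# Minimal sketch: push a local Hydra output directory to the Hub.
# folder_path and repo_id are placeholders, not taken from this commit.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login`
api.upload_folder(
    folder_path="logs/2025-09-11/20-06-16",   # local run directory
    repo_id="your-username/your-repo",        # placeholder target repo
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)

Large binaries in the folder (the `.pt` checkpoints and the `.wandb` file) go through Git LFS, which is why the commit also touches `.gitattributes`.
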
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250911_200644-y9v5i9gr/run-y9v5i9gr.wandb filter=lfs diff=lfs merge=lfs -text

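The appended rule routes the binary `.wandb` run file through Git LFS, matching the existing patterns above it. A small illustrative sketch (not part of the repo) that lists which patterns a `.gitattributes` file tracks with LFS:

# Illustrative only: list the LFS-tracked patterns in a .gitattributes file.
from pathlib import Path

def lfs_patterns(path: str) -> list[str]:
    patterns = []
    for line in Path(path).read_text().splitlines():
        parts = line.split()
        if len(parts) > 1 and "filter=lfs" in parts[1:]:
            patterns.append(parts[0])
    return patterns

print(lfs_patterns(".gitattributes"))
# [..., '*.zip', '*.zst', '*tfevents*',
#  'wandb/run-20250911_200644-y9v5i9gr/run-y9v5i9gr.wandb']
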
.hydra/config.yaml
ADDED
@@ -0,0 +1,81 @@
+compile: true
+device: cuda
+from_checkpoint: null
+name: nanogpt
+training:
+  random_seed: 13
+  batch_size: 256
+  device_batch_size: 1
+  sequence_length: 8192
+  num_iterations: 900
+  learning_rate: 0.0003
+  use_scheduler: false
+  save_model: true
+  save_optimizer: true
+  save_model_every: 100
+  val_loss_every: 100
+  val_tokens: 4194304
+  expname: lr-3e-4-no-lora-cp-n-8-r-8
+model:
+  name: mtp
+  beta: 0.0
+  gamma: 1
+  kl_algorithm: full
+  kl_type: forward
+  model:
+    _target_: mtp.models.mtp.MultiTokenLM
+    lm: ${lm.model}
+    circuit: ${circuit.model}
+    mt_head_kwargs: ${mt_head.hyperparameters}
+    init_from_lm_head: true
+    kl_type: ${model.kl_type}
+    kl_algorithm: ${model.kl_algorithm}
+    beta: 0
+    gamma: 0.9
+circuit:
+  name: cp
+  n_token: 8
+  n_component: 8
+  model:
+    _target_: mtp.models.circuits.CircuitModel
+    vocab_size: ${data.vocab_size}
+    n_token: ${circuit.n_token}
+    n_component: ${circuit.n_component}
+    kind: cp
+mt_head:
+  name: linear-evabyte
+  hyperparameters:
+    type: evabyte
+    n_embd: ${lm.n_embd}
+    transformer_n_head: ${lm.n_head}
+    transformer_n_layer: 0
+    expander_type: linear
+    expander_n_layer: 1
+    freeze_vocab_unembedding: false
+    share_sum_weights: false
+    contextual_hmm_weights: true
+    init_hmm_identity: true
+adaptor:
+  name: none
+  hyperparameters: null
+lm:
+  name: evabyte
+  n_embd: 4096
+  n_head: 32
+  model:
+    _target_: mtp.models.lm.LM
+    lm: null
+    encoder_only: true
+    from_checkpoint: null
+    from_huggingface: EvaByte/EvaByte-SFT
+    adaptor_kwargs: ${adaptor.hyperparameters}
+    ref_enc: model
+    ref_head: lm_head
+    freeze: true
+data:
+  name: tulu3-evabyte
+  train_bin: agrv/tulu-v3-sft-evabyte-packed-seq-len-8192
+  val_bin: null
+  vocab_size: 320
+generate:
+  speculative: false

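This composed config still contains unresolved interpolations such as `${lm.n_embd}` and `${data.vocab_size}`; the flat `config.yaml` later in this commit carries the resolved values. A sketch of inspecting them with OmegaConf, the config library Hydra is built on, assuming a local copy of the file:

# Sketch: load the composed config and resolve its ${...} interpolations
# with OmegaConf (the library underlying Hydra). Assumes a local download.
from omegaconf import OmegaConf

cfg = OmegaConf.load(".hydra/config.yaml")
# Attribute access resolves interpolations lazily: ${lm.n_embd} -> 4096
print(cfg.mt_head.hyperparameters.n_embd)          # 4096
# Materialize everything, e.g. to diff against the flat config.yaml
resolved = OmegaConf.to_container(cfg, resolve=True)
print(resolved["circuit"]["model"]["vocab_size"])  # 320, via ${data.vocab_size}
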
.hydra/hydra.yaml
ADDED
@@ -0,0 +1,178 @@
+hydra:
+  run:
+    dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task:
+    - data=tulu3-evabyte-packed
+    - training=tulu3-evabyte-1epoch
+    - lm=evabyte
+    - model=mtp
+    - adaptor=none
+    - mt_head=linear-evabyte
+    - circuit=cp
+    - circuit.n_token=8
+    - circuit.n_component=8
+    - training.device_batch_size=1
+    - data.vocab_size=320
+    - model.model.beta=0
+    - model.model.gamma=0.9
+    - data.val_bin=null
+    - training.learning_rate=0.0003
+    - training.expname=lr-3e-4-no-lora-cp-n-8-r-8
+  job:
+    name: ${name}
+    chdir: true
+    override_dirname: adaptor=none,circuit.n_component=8,circuit.n_token=8,circuit=cp,data.val_bin=null,data.vocab_size=320,data=tulu3-evabyte-packed,lm=evabyte,model.model.beta=0,model.model.gamma=0.9,model=mtp,mt_head=linear-evabyte,training.device_batch_size=1,training.expname=lr-3e-4-no-lora-cp-n-8-r-8,training.learning_rate=0.0003,training=tulu3-evabyte-1epoch
+    id: ???
+    num: ???
+    config_name: config
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.3'
+    cwd: /disk/scratch/agrivas/nanoGPT
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /disk/scratch/agrivas/nanoGPT/configs
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16
+    choices:
+      generate: default
+      data: tulu3-evabyte-packed
+      lm: evabyte
+      adaptor: none
+      mt_head: linear-evabyte
+      circuit: cp
+      model: mtp
+      training: tulu3-evabyte-1epoch
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false

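The `job_logging` section is a standard `logging.config.dictConfig` schema; its `'[%(asctime)s] - %(message)s'` format is exactly the shape of the lines in `nanogpt.log` below. A reduced, self-contained sketch with the output file name hard-coded for illustration:

# Sketch: the job_logging block above as a plain dictConfig. Applying it
# yields "[timestamp] - message" lines like those in nanogpt.log.
import logging.config

logging.config.dictConfig({
    "version": 1,
    "formatters": {"simple": {"format": "[%(asctime)s] - %(message)s"}},
    "handlers": {
        "console": {"class": "logging.StreamHandler",
                    "formatter": "simple", "stream": "ext://sys.stdout"},
        "file": {"class": "logging.FileHandler",
                 "formatter": "simple", "filename": "nanogpt.log"},
    },
    "root": {"level": "INFO", "handlers": ["console", "file"]},
    "disable_existing_loggers": False,
})
logging.getLogger(__name__).info("step:1/900 train_loss:0.5686")
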
.hydra/overrides.yaml
ADDED
@@ -0,0 +1,16 @@
+- data=tulu3-evabyte-packed
+- training=tulu3-evabyte-1epoch
+- lm=evabyte
+- model=mtp
+- adaptor=none
+- mt_head=linear-evabyte
+- circuit=cp
+- circuit.n_token=8
+- circuit.n_component=8
+- training.device_batch_size=1
+- data.vocab_size=320
+- model.model.beta=0
+- model.model.gamma=0.9
+- data.val_bin=null
+- training.learning_rate=0.0003
+- training.expname=lr-3e-4-no-lora-cp-n-8-r-8

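These are the exact command-line overrides recorded for the run, so the same composition can be rebuilt offline with Hydra's compose API. A sketch assuming a checkout of the project with its `configs/` directory (listed under `config_sources` in `.hydra/hydra.yaml` above):

# Sketch: rebuild this run's config from the recorded overrides.
# Assumes the project checkout with its configs/ directory on disk.
import yaml
from hydra import compose, initialize

with open(".hydra/overrides.yaml") as f:
    overrides = yaml.safe_load(f)   # the 16 strings listed above

with initialize(config_path="configs", version_base="1.3"):
    cfg = compose(config_name="config", overrides=overrides)

print(cfg.training.learning_rate)   # 0.0003
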
config.yaml
ADDED
@@ -0,0 +1,83 @@
+compile: true
+device: cuda
+from_checkpoint: null
+name: nanogpt
+training:
+  random_seed: 13
+  batch_size: 256
+  device_batch_size: 1
+  sequence_length: 8192
+  num_iterations: 900
+  learning_rate: 0.0003
+  use_scheduler: false
+  save_model: true
+  save_optimizer: true
+  save_model_every: 100
+  val_loss_every: 100
+  val_tokens: 4194304
+  expname: lr-3e-4-no-lora-cp-n-8-r-8
+model:
+  name: mtp
+  beta: 0.0
+  gamma: 1
+  kl_algorithm: full
+  kl_type: forward
+  model:
+    _target_: mtp.models.mtp.MultiTokenLM
+    lm: ${lm.model}
+    circuit: ${circuit.model}
+    mt_head_kwargs: ${mt_head.hyperparameters}
+    init_from_lm_head: true
+    kl_type: ${model.kl_type}
+    kl_algorithm: ${model.kl_algorithm}
+    beta: 0
+    gamma: 0.9
+circuit:
+  name: cp
+  n_token: 8
+  n_component: 8
+  model:
+    _target_: mtp.models.circuits.CircuitModel
+    vocab_size: 320
+    n_token: 8
+    n_component: 8
+    kind: cp
+mt_head:
+  name: linear-evabyte
+  hyperparameters:
+    type: evabyte
+    n_embd: 4096
+    transformer_n_head: 32
+    transformer_n_layer: 0
+    expander_type: linear
+    expander_n_layer: 1
+    freeze_vocab_unembedding: false
+    share_sum_weights: false
+    contextual_hmm_weights: true
+    init_hmm_identity: true
+adaptor:
+  name: none
+  hyperparameters: null
+lm:
+  name: evabyte
+  n_embd: 4096
+  n_head: 32
+  model:
+    _target_: mtp.models.lm.LM
+    lm: null
+    encoder_only: true
+    from_checkpoint: null
+    from_huggingface: EvaByte/EvaByte-SFT
+    adaptor_kwargs: null
+    ref_enc: model
+    ref_head: lm_head
+    freeze: true
+data:
+  name: tulu3-evabyte
+  train_bin: agrv/tulu-v3-sft-evabyte-packed-seq-len-8192
+  val_bin: null
+  vocab_size: 320
+generate:
+  speculative: false
+expname: lr-3e-4-no-lora-cp-n-8-r-8
+wandb_run_id: y9v5i9gr

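Unlike `.hydra/config.yaml`, this flat copy carries mostly resolved values plus the `wandb_run_id`. One consequence of the training block worth spelling out: with `batch_size: 256` and `device_batch_size: 1`, each optimizer step presumably spans 256 micro-batches, and the token budget follows directly (assuming `batch_size` counts sequences of `sequence_length` tokens, which the config does not state explicitly):

# Back-of-the-envelope budget implied by the training block, assuming
# batch_size counts sequences of sequence_length tokens (an assumption,
# not stated in the config).
batch_size = 256         # sequences per optimizer step
device_batch_size = 1    # sequences per forward/backward pass
sequence_length = 8192   # tokens per sequence
num_iterations = 900     # optimizer steps

grad_accum_steps = batch_size // device_batch_size  # 256
tokens_per_step = batch_size * sequence_length      # 2,097,152
total_tokens = tokens_per_step * num_iterations     # 1,887,436,800 (~1.9B)
print(grad_accum_steps, tokens_per_step, total_tokens)
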
model@0.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49b32f3432aedda946c39f670a3092b4ba8e1afcf352a260eb91a75f760c00cf
+size 167886916

model@300.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94a448ea9a2443052525acd82e48458ada80b6e74225312c21193404445233db
+size 503564433

model@600.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfdccabb110a7f91c95f4f688b9ae8d557017f0b664c64d1abca769d9eedca70
+size 503564433

model@900.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fd7891bdf8e6767e17d7276d3f0b7eca933e4f66740662f9d3cef90eb6f10a2
+size 503564433

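Each `model@*.pt` entry above is a Git LFS pointer (spec version, SHA-256 oid, byte size); the checkpoint payloads live in LFS storage and are fetched on clone or through the Hub API. A sketch of pulling the final checkpoint without cloning the whole repo; the `repo_id` is a placeholder:

# Sketch: fetch one checkpoint through the Hub API; huggingface_hub
# follows the LFS pointer and downloads the real ~503 MB file.
import torch
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="your-username/your-repo",  # placeholder
    filename="model@900.pt",
)
state = torch.load(path, map_location="cpu")
print(type(state))
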
nanogpt.log
ADDED
|
@@ -0,0 +1,916 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2025-09-11 20:06:35,177] - Setting up model... compile=True...
|
| 2 |
+
[2025-09-11 20:06:46,551] - Saving config and checkpoints to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16...
|
| 3 |
+
[2025-09-11 20:06:46,551] - Save model: True...
|
| 4 |
+
[2025-09-11 20:06:46,552] - Save optimizer: True...
|
| 5 |
+
[2025-09-11 20:06:46,558] - Training on agrv/tulu-v3-sft-evabyte-packed-seq-len-8192...
|
| 6 |
+
[2025-09-11 20:07:21,844] - Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
|
| 7 |
+
[2025-09-11 20:07:23,771] - step:0/900 Saving model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@0.pt...
|
| 8 |
+
[2025-09-11 20:10:21,957] - step:1/900 train_loss:0.5686 lr:0.0003000000 time/step:177.94s
|
| 9 |
+
[2025-09-11 20:12:19,200] - step:2/900 train_loss:0.5480 lr:0.0003000000 time/step:117.24s
|
| 10 |
+
[2025-09-11 20:14:16,539] - step:3/900 train_loss:0.5220 lr:0.0003000000 time/step:117.34s
|
| 11 |
+
[2025-09-11 20:16:13,861] - step:4/900 train_loss:0.5383 lr:0.0003000000 time/step:117.32s
|
| 12 |
+
[2025-09-11 20:18:10,435] - step:5/900 train_loss:0.5371 lr:0.0003000000 time/step:116.57s
|
| 13 |
+
[2025-09-11 20:20:08,627] - step:6/900 train_loss:0.5227 lr:0.0003000000 time/step:118.19s
|
| 14 |
+
[2025-09-11 20:22:06,149] - step:7/900 train_loss:0.5128 lr:0.0003000000 time/step:117.51s
|
| 15 |
+
[2025-09-11 20:24:03,890] - step:8/900 train_loss:0.5420 lr:0.0003000000 time/step:117.74s
|
| 16 |
+
[2025-09-11 20:26:01,252] - step:9/900 train_loss:0.5426 lr:0.0003000000 time/step:117.36s
|
| 17 |
+
[2025-09-11 20:27:59,602] - step:10/900 train_loss:0.5236 lr:0.0003000000 time/step:118.34s
|
| 18 |
+
[2025-09-11 20:29:56,227] - step:11/900 train_loss:0.4860 lr:0.0003000000 time/step:116.61s
|
| 19 |
+
[2025-09-11 20:31:54,042] - step:12/900 train_loss:0.5105 lr:0.0003000000 time/step:117.81s
|
| 20 |
+
[2025-09-11 20:33:51,406] - step:13/900 train_loss:0.4993 lr:0.0003000000 time/step:117.36s
|
| 21 |
+
[2025-09-11 20:35:48,717] - step:14/900 train_loss:0.4925 lr:0.0003000000 time/step:117.31s
|
| 22 |
+
[2025-09-11 20:37:47,558] - step:15/900 train_loss:0.5207 lr:0.0003000000 time/step:118.83s
|
| 23 |
+
[2025-09-11 20:39:45,850] - step:16/900 train_loss:0.4827 lr:0.0003000000 time/step:118.28s
|
| 24 |
+
[2025-09-11 20:41:42,738] - step:17/900 train_loss:0.5033 lr:0.0003000000 time/step:116.88s
|
| 25 |
+
[2025-09-11 20:43:39,898] - step:18/900 train_loss:0.5082 lr:0.0003000000 time/step:117.15s
|
| 26 |
+
[2025-09-11 20:45:37,029] - step:19/900 train_loss:0.4910 lr:0.0003000000 time/step:117.13s
|
| 27 |
+
[2025-09-11 20:47:34,571] - step:20/900 train_loss:0.5006 lr:0.0003000000 time/step:117.54s
|
| 28 |
+
[2025-09-11 20:49:32,312] - step:21/900 train_loss:0.4936 lr:0.0003000000 time/step:117.73s
|
| 29 |
+
[2025-09-11 20:51:29,213] - step:22/900 train_loss:0.4941 lr:0.0003000000 time/step:116.90s
|
| 30 |
+
[2025-09-11 20:53:26,056] - step:23/900 train_loss:0.5131 lr:0.0003000000 time/step:116.83s
|
| 31 |
+
[2025-09-11 20:55:22,982] - step:24/900 train_loss:0.4826 lr:0.0003000000 time/step:116.92s
|
| 32 |
+
[2025-09-11 20:57:20,427] - step:25/900 train_loss:0.4913 lr:0.0003000000 time/step:117.44s
|
| 33 |
+
[2025-09-11 20:59:18,626] - step:26/900 train_loss:0.4607 lr:0.0003000000 time/step:118.18s
|
| 34 |
+
[2025-09-11 21:01:15,710] - step:27/900 train_loss:0.4908 lr:0.0003000000 time/step:117.08s
|
| 35 |
+
[2025-09-11 21:03:12,633] - step:28/900 train_loss:0.4910 lr:0.0003000000 time/step:116.91s
|
| 36 |
+
[2025-09-11 21:05:09,636] - step:29/900 train_loss:0.4657 lr:0.0003000000 time/step:117.00s
|
| 37 |
+
[2025-09-11 21:07:06,700] - step:30/900 train_loss:0.4594 lr:0.0003000000 time/step:117.06s
|
| 38 |
+
[2025-09-11 21:09:04,683] - step:31/900 train_loss:0.4755 lr:0.0003000000 time/step:117.97s
|
| 39 |
+
[2025-09-11 21:11:01,763] - step:32/900 train_loss:0.4541 lr:0.0003000000 time/step:117.08s
|
| 40 |
+
[2025-09-11 21:12:59,791] - step:33/900 train_loss:0.4807 lr:0.0003000000 time/step:118.02s
|
| 41 |
+
[2025-09-11 21:14:55,836] - step:34/900 train_loss:0.4870 lr:0.0003000000 time/step:116.03s
|
| 42 |
+
[2025-09-11 21:16:52,899] - step:35/900 train_loss:0.4625 lr:0.0003000000 time/step:117.06s
|
| 43 |
+
[2025-09-11 21:18:51,003] - step:36/900 train_loss:0.4791 lr:0.0003000000 time/step:118.09s
|
| 44 |
+
[2025-09-11 21:20:48,545] - step:37/900 train_loss:0.4473 lr:0.0003000000 time/step:117.53s
|
| 45 |
+
[2025-09-11 21:22:45,589] - step:38/900 train_loss:0.4752 lr:0.0003000000 time/step:117.04s
|
| 46 |
+
[2025-09-11 21:24:43,273] - step:39/900 train_loss:0.4637 lr:0.0003000000 time/step:117.68s
|
| 47 |
+
[2025-09-11 21:26:39,295] - step:40/900 train_loss:0.4792 lr:0.0003000000 time/step:116.01s
|
| 48 |
+
[2025-09-11 21:28:36,435] - step:41/900 train_loss:0.4486 lr:0.0003000000 time/step:117.13s
|
| 49 |
+
[2025-09-11 21:30:33,920] - step:42/900 train_loss:0.4401 lr:0.0003000000 time/step:117.48s
|
| 50 |
+
[2025-09-11 21:32:30,825] - step:43/900 train_loss:0.4647 lr:0.0003000000 time/step:116.90s
|
| 51 |
+
[2025-09-11 21:34:28,329] - step:44/900 train_loss:0.4925 lr:0.0003000000 time/step:117.50s
|
| 52 |
+
[2025-09-11 21:36:25,926] - step:45/900 train_loss:0.4660 lr:0.0003000000 time/step:117.59s
|
| 53 |
+
[2025-09-11 21:38:22,375] - step:46/900 train_loss:0.4459 lr:0.0003000000 time/step:116.44s
|
| 54 |
+
[2025-09-11 21:40:19,319] - step:47/900 train_loss:0.4487 lr:0.0003000000 time/step:116.93s
|
| 55 |
+
[2025-09-11 21:42:17,801] - step:48/900 train_loss:0.4378 lr:0.0003000000 time/step:118.48s
|
| 56 |
+
[2025-09-11 21:44:15,250] - step:49/900 train_loss:0.4623 lr:0.0003000000 time/step:117.44s
|
| 57 |
+
[2025-09-11 21:46:12,028] - step:50/900 train_loss:0.4788 lr:0.0003000000 time/step:116.77s
|
| 58 |
+
[2025-09-11 21:48:08,924] - step:51/900 train_loss:0.4612 lr:0.0003000000 time/step:116.89s
|
| 59 |
+
[2025-09-11 21:50:05,277] - step:52/900 train_loss:0.4670 lr:0.0003000000 time/step:116.34s
|
| 60 |
+
[2025-09-11 21:52:03,579] - step:53/900 train_loss:0.4948 lr:0.0003000000 time/step:118.20s
|
| 61 |
+
[2025-09-11 21:54:00,439] - step:54/900 train_loss:0.4474 lr:0.0003000000 time/step:116.86s
|
| 62 |
+
[2025-09-11 21:55:57,226] - step:55/900 train_loss:0.4696 lr:0.0003000000 time/step:116.78s
|
| 63 |
+
[2025-09-11 21:57:54,070] - step:56/900 train_loss:0.4636 lr:0.0003000000 time/step:116.84s
|
| 64 |
+
[2025-09-11 21:59:51,015] - step:57/900 train_loss:0.4567 lr:0.0003000000 time/step:116.93s
|
| 65 |
+
[2025-09-11 22:01:48,416] - step:58/900 train_loss:0.4600 lr:0.0003000000 time/step:117.40s
|
| 66 |
+
[2025-09-11 22:03:46,720] - step:59/900 train_loss:0.4678 lr:0.0003000000 time/step:118.30s
|
| 67 |
+
[2025-09-11 22:05:43,544] - step:60/900 train_loss:0.4619 lr:0.0003000000 time/step:116.82s
|
| 68 |
+
[2025-09-11 22:07:40,424] - step:61/900 train_loss:0.4553 lr:0.0003000000 time/step:116.87s
|
| 69 |
+
[2025-09-11 22:09:37,873] - step:62/900 train_loss:0.4719 lr:0.0003000000 time/step:117.43s
|
| 70 |
+
[2025-09-11 22:11:34,969] - step:63/900 train_loss:0.4582 lr:0.0003000000 time/step:117.09s
|
| 71 |
+
[2025-09-11 22:13:31,914] - step:64/900 train_loss:0.4430 lr:0.0003000000 time/step:116.94s
|
| 72 |
+
[2025-09-11 22:15:28,799] - step:65/900 train_loss:0.4268 lr:0.0003000000 time/step:116.88s
|
| 73 |
+
[2025-09-11 22:17:25,704] - step:66/900 train_loss:0.4669 lr:0.0003000000 time/step:116.90s
|
| 74 |
+
[2025-09-11 22:19:22,827] - step:67/900 train_loss:0.4380 lr:0.0003000000 time/step:117.11s
|
| 75 |
+
[2025-09-11 22:21:20,150] - step:68/900 train_loss:0.4785 lr:0.0003000000 time/step:117.32s
|
| 76 |
+
[2025-09-11 22:23:16,126] - step:69/900 train_loss:0.4678 lr:0.0003000000 time/step:115.97s
|
| 77 |
+
[2025-09-11 22:25:13,659] - step:70/900 train_loss:0.4456 lr:0.0003000000 time/step:117.53s
|
| 78 |
+
[2025-09-11 22:27:10,581] - step:71/900 train_loss:0.4403 lr:0.0003000000 time/step:116.91s
|
| 79 |
+
[2025-09-11 22:29:07,930] - step:72/900 train_loss:0.4318 lr:0.0003000000 time/step:117.34s
|
| 80 |
+
[2025-09-11 22:31:05,566] - step:73/900 train_loss:0.4546 lr:0.0003000000 time/step:117.63s
|
| 81 |
+
[2025-09-11 22:33:02,531] - step:74/900 train_loss:0.4860 lr:0.0003000000 time/step:116.96s
|
| 82 |
+
[2025-09-11 22:34:59,254] - step:75/900 train_loss:0.4499 lr:0.0003000000 time/step:116.72s
|
| 83 |
+
[2025-09-11 22:36:57,138] - step:76/900 train_loss:0.4490 lr:0.0003000000 time/step:117.88s
|
| 84 |
+
[2025-09-11 22:38:54,164] - step:77/900 train_loss:0.4490 lr:0.0003000000 time/step:117.02s
|
| 85 |
+
[2025-09-11 22:40:51,448] - step:78/900 train_loss:0.4455 lr:0.0003000000 time/step:117.27s
|
| 86 |
+
[2025-09-11 22:42:48,430] - step:79/900 train_loss:0.4274 lr:0.0003000000 time/step:116.98s
|
| 87 |
+
[2025-09-11 22:44:45,934] - step:80/900 train_loss:0.4519 lr:0.0003000000 time/step:117.50s
|
| 88 |
+
[2025-09-11 22:46:42,798] - step:81/900 train_loss:0.4429 lr:0.0003000000 time/step:116.85s
|
| 89 |
+
[2025-09-11 22:48:39,720] - step:82/900 train_loss:0.4436 lr:0.0003000000 time/step:116.92s
|
| 90 |
+
[2025-09-11 22:50:37,164] - step:83/900 train_loss:0.4713 lr:0.0003000000 time/step:117.43s
|
| 91 |
+
[2025-09-11 22:52:33,983] - step:84/900 train_loss:0.4399 lr:0.0003000000 time/step:116.82s
|
| 92 |
+
[2025-09-11 22:54:31,605] - step:85/900 train_loss:0.4343 lr:0.0003000000 time/step:117.62s
|
| 93 |
+
[2025-09-11 22:56:29,383] - step:86/900 train_loss:0.4587 lr:0.0003000000 time/step:117.77s
|
| 94 |
+
[2025-09-11 22:58:26,338] - step:87/900 train_loss:0.4550 lr:0.0003000000 time/step:116.95s
|
| 95 |
+
[2025-09-11 23:00:23,614] - step:88/900 train_loss:0.4437 lr:0.0003000000 time/step:117.26s
|
| 96 |
+
[2025-09-11 23:02:20,358] - step:89/900 train_loss:0.4575 lr:0.0003000000 time/step:116.74s
|
| 97 |
+
[2025-09-11 23:04:17,289] - step:90/900 train_loss:0.4361 lr:0.0003000000 time/step:116.93s
|
| 98 |
+
[2025-09-11 23:06:15,307] - step:91/900 train_loss:0.4259 lr:0.0003000000 time/step:118.02s
|
| 99 |
+
[2025-09-11 23:08:12,562] - step:92/900 train_loss:0.4340 lr:0.0003000000 time/step:117.25s
|
| 100 |
+
[2025-09-11 23:10:10,001] - step:93/900 train_loss:0.4424 lr:0.0003000000 time/step:117.43s
|
| 101 |
+
[2025-09-11 23:12:07,171] - step:94/900 train_loss:0.4240 lr:0.0003000000 time/step:117.16s
|
| 102 |
+
[2025-09-11 23:14:05,158] - step:95/900 train_loss:0.4425 lr:0.0003000000 time/step:117.99s
|
| 103 |
+
[2025-09-11 23:16:02,641] - step:96/900 train_loss:0.4575 lr:0.0003000000 time/step:117.48s
|
| 104 |
+
[2025-09-11 23:17:59,591] - step:97/900 train_loss:0.4435 lr:0.0003000000 time/step:116.94s
|
| 105 |
+
[2025-09-11 23:19:55,399] - step:98/900 train_loss:0.4466 lr:0.0003000000 time/step:115.80s
|
| 106 |
+
[2025-09-11 23:21:53,531] - step:99/900 train_loss:0.4469 lr:0.0003000000 time/step:118.12s
|
| 107 |
+
[2025-09-11 23:23:52,424] - step:100/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@100.pt...
|
| 108 |
+
[2025-09-11 23:23:52,425] - step:100/900 train_loss:0.4467 lr:0.0003000000 time/step:118.25s
|
| 109 |
+
[2025-09-11 23:25:49,555] - step:101/900 train_loss:0.4462 lr:0.0003000000 time/step:117.13s
|
| 110 |
+
[2025-09-11 23:27:46,681] - step:102/900 train_loss:0.4479 lr:0.0003000000 time/step:117.12s
|
| 111 |
+
[2025-09-11 23:29:43,985] - step:103/900 train_loss:0.4212 lr:0.0003000000 time/step:117.30s
|
| 112 |
+
[2025-09-11 23:31:40,749] - step:104/900 train_loss:0.4385 lr:0.0003000000 time/step:116.76s
|
| 113 |
+
[2025-09-11 23:33:37,611] - step:105/900 train_loss:0.4490 lr:0.0003000000 time/step:116.86s
|
| 114 |
+
[2025-09-11 23:35:34,674] - step:106/900 train_loss:0.4537 lr:0.0003000000 time/step:117.06s
|
| 115 |
+
[2025-09-11 23:37:32,277] - step:107/900 train_loss:0.4278 lr:0.0003000000 time/step:117.60s
|
| 116 |
+
[2025-09-11 23:39:29,569] - step:108/900 train_loss:0.4413 lr:0.0003000000 time/step:117.28s
|
| 117 |
+
[2025-09-11 23:41:26,965] - step:109/900 train_loss:0.4219 lr:0.0003000000 time/step:117.39s
|
| 118 |
+
[2025-09-11 23:43:23,608] - step:110/900 train_loss:0.4455 lr:0.0003000000 time/step:116.64s
|
| 119 |
+
[2025-09-11 23:45:20,608] - step:111/900 train_loss:0.4581 lr:0.0003000000 time/step:117.00s
|
| 120 |
+
[2025-09-11 23:47:17,496] - step:112/900 train_loss:0.4501 lr:0.0003000000 time/step:116.89s
|
| 121 |
+
[2025-09-11 23:49:15,179] - step:113/900 train_loss:0.4332 lr:0.0003000000 time/step:117.66s
|
| 122 |
+
[2025-09-11 23:51:13,020] - step:114/900 train_loss:0.4311 lr:0.0003000000 time/step:117.83s
|
| 123 |
+
[2025-09-11 23:53:10,646] - step:115/900 train_loss:0.4449 lr:0.0003000000 time/step:117.62s
|
| 124 |
+
[2025-09-11 23:55:06,688] - step:116/900 train_loss:0.4424 lr:0.0003000000 time/step:116.04s
|
| 125 |
+
[2025-09-11 23:57:03,652] - step:117/900 train_loss:0.4392 lr:0.0003000000 time/step:116.96s
|
| 126 |
+
[2025-09-11 23:59:01,394] - step:118/900 train_loss:0.4246 lr:0.0003000000 time/step:117.74s
|
| 127 |
+
[2025-09-12 00:00:58,798] - step:119/900 train_loss:0.4339 lr:0.0003000000 time/step:117.39s
|
| 128 |
+
[2025-09-12 00:02:56,142] - step:120/900 train_loss:0.4064 lr:0.0003000000 time/step:117.33s
|
| 129 |
+
[2025-09-12 00:04:53,044] - step:121/900 train_loss:0.4421 lr:0.0003000000 time/step:116.90s
|
| 130 |
+
[2025-09-12 00:06:49,048] - step:122/900 train_loss:0.4306 lr:0.0003000000 time/step:116.00s
|
| 131 |
+
[2025-09-12 00:08:46,671] - step:123/900 train_loss:0.4163 lr:0.0003000000 time/step:117.62s
|
| 132 |
+
[2025-09-12 00:10:44,735] - step:124/900 train_loss:0.4428 lr:0.0003000000 time/step:118.05s
|
| 133 |
+
[2025-09-12 00:12:42,019] - step:125/900 train_loss:0.4188 lr:0.0003000000 time/step:117.27s
|
| 134 |
+
[2025-09-12 00:14:38,901] - step:126/900 train_loss:0.4226 lr:0.0003000000 time/step:116.88s
|
| 135 |
+
[2025-09-12 00:16:35,356] - step:127/900 train_loss:0.4379 lr:0.0003000000 time/step:116.45s
|
| 136 |
+
[2025-09-12 00:18:31,808] - step:128/900 train_loss:0.4475 lr:0.0003000000 time/step:116.45s
|
| 137 |
+
[2025-09-12 00:20:31,092] - step:129/900 train_loss:0.4579 lr:0.0003000000 time/step:119.27s
|
| 138 |
+
[2025-09-12 00:22:28,417] - step:130/900 train_loss:0.4504 lr:0.0003000000 time/step:117.31s
|
| 139 |
+
[2025-09-12 00:24:25,417] - step:131/900 train_loss:0.4345 lr:0.0003000000 time/step:116.99s
|
| 140 |
+
[2025-09-12 00:26:22,282] - step:132/900 train_loss:0.4567 lr:0.0003000000 time/step:116.86s
|
| 141 |
+
[2025-09-12 00:28:18,304] - step:133/900 train_loss:0.4396 lr:0.0003000000 time/step:116.02s
|
| 142 |
+
[2025-09-12 00:30:15,628] - step:134/900 train_loss:0.4440 lr:0.0003000000 time/step:117.32s
|
| 143 |
+
[2025-09-12 00:32:13,051] - step:135/900 train_loss:0.4384 lr:0.0003000000 time/step:117.42s
|
| 144 |
+
[2025-09-12 00:34:10,336] - step:136/900 train_loss:0.4276 lr:0.0003000000 time/step:117.28s
|
| 145 |
+
[2025-09-12 00:36:07,098] - step:137/900 train_loss:0.4424 lr:0.0003000000 time/step:116.76s
|
| 146 |
+
[2025-09-12 00:38:03,861] - step:138/900 train_loss:0.4288 lr:0.0003000000 time/step:116.76s
|
| 147 |
+
[2025-09-12 00:40:00,304] - step:139/900 train_loss:0.4333 lr:0.0003000000 time/step:116.43s
|
| 148 |
+
[2025-09-12 00:41:57,928] - step:140/900 train_loss:0.4347 lr:0.0003000000 time/step:117.62s
|
| 149 |
+
[2025-09-12 00:43:56,252] - step:141/900 train_loss:0.4515 lr:0.0003000000 time/step:118.32s
|
| 150 |
+
[2025-09-12 00:45:53,156] - step:142/900 train_loss:0.4531 lr:0.0003000000 time/step:116.90s
|
| 151 |
+
[2025-09-12 00:47:50,037] - step:143/900 train_loss:0.4426 lr:0.0003000000 time/step:116.88s
|
| 152 |
+
[2025-09-12 00:49:46,863] - step:144/900 train_loss:0.4100 lr:0.0003000000 time/step:116.81s
|
| 153 |
+
[2025-09-12 00:51:42,986] - step:145/900 train_loss:0.4185 lr:0.0003000000 time/step:116.12s
|
| 154 |
+
[2025-09-12 00:53:40,748] - step:146/900 train_loss:0.4556 lr:0.0003000000 time/step:117.75s
|
| 155 |
+
[2025-09-12 00:55:38,614] - step:147/900 train_loss:0.4580 lr:0.0003000000 time/step:117.86s
|
| 156 |
+
[2025-09-12 00:57:35,395] - step:148/900 train_loss:0.4432 lr:0.0003000000 time/step:116.77s
|
| 157 |
+
[2025-09-12 00:59:32,300] - step:149/900 train_loss:0.4260 lr:0.0003000000 time/step:116.90s
|
| 158 |
+
[2025-09-12 01:01:29,963] - step:150/900 train_loss:0.4369 lr:0.0003000000 time/step:117.65s
|
| 159 |
+
[2025-09-12 01:03:26,107] - step:151/900 train_loss:0.4121 lr:0.0003000000 time/step:116.14s
|
| 160 |
+
[2025-09-12 01:05:23,232] - step:152/900 train_loss:0.4488 lr:0.0003000000 time/step:117.12s
|
| 161 |
+
[2025-09-12 01:07:21,054] - step:153/900 train_loss:0.4290 lr:0.0003000000 time/step:117.82s
|
| 162 |
+
[2025-09-12 01:09:17,934] - step:154/900 train_loss:0.4126 lr:0.0003000000 time/step:116.88s
|
| 163 |
+
[2025-09-12 01:11:15,437] - step:155/900 train_loss:0.4201 lr:0.0003000000 time/step:117.49s
|
| 164 |
+
[2025-09-12 01:13:12,295] - step:156/900 train_loss:0.4294 lr:0.0003000000 time/step:116.85s
|
| 165 |
+
[2025-09-12 01:15:08,687] - step:157/900 train_loss:0.4340 lr:0.0003000000 time/step:116.38s
|
| 166 |
+
[2025-09-12 01:17:05,708] - step:158/900 train_loss:0.4543 lr:0.0003000000 time/step:117.01s
|
| 167 |
+
[2025-09-12 01:19:03,353] - step:159/900 train_loss:0.4211 lr:0.0003000000 time/step:117.64s
|
| 168 |
+
[2025-09-12 01:21:00,871] - step:160/900 train_loss:0.4400 lr:0.0003000000 time/step:117.51s
|
| 169 |
+
[2025-09-12 01:22:57,738] - step:161/900 train_loss:0.4259 lr:0.0003000000 time/step:116.86s
|
| 170 |
+
[2025-09-12 01:24:55,051] - step:162/900 train_loss:0.4150 lr:0.0003000000 time/step:117.31s
|
| 171 |
+
[2025-09-12 01:26:51,147] - step:163/900 train_loss:0.4168 lr:0.0003000000 time/step:116.09s
|
| 172 |
+
[2025-09-12 01:28:48,833] - step:164/900 train_loss:0.4024 lr:0.0003000000 time/step:117.68s
|
| 173 |
+
[2025-09-12 01:30:46,610] - step:165/900 train_loss:0.4476 lr:0.0003000000 time/step:117.77s
|
| 174 |
+
[2025-09-12 01:32:43,517] - step:166/900 train_loss:0.4241 lr:0.0003000000 time/step:116.90s
|
| 175 |
+
[2025-09-12 01:34:41,001] - step:167/900 train_loss:0.4268 lr:0.0003000000 time/step:117.48s
|
| 176 |
+
[2025-09-12 01:36:37,582] - step:168/900 train_loss:0.3846 lr:0.0003000000 time/step:116.57s
|
| 177 |
+
[2025-09-12 01:38:34,908] - step:169/900 train_loss:0.4199 lr:0.0003000000 time/step:117.32s
|
| 178 |
+
[2025-09-12 01:40:33,014] - step:170/900 train_loss:0.4037 lr:0.0003000000 time/step:118.09s
|
| 179 |
+
[2025-09-12 01:42:29,854] - step:171/900 train_loss:0.4579 lr:0.0003000000 time/step:116.84s
|
| 180 |
+
[2025-09-12 01:44:27,350] - step:172/900 train_loss:0.4435 lr:0.0003000000 time/step:117.48s
|
| 181 |
+
[2025-09-12 01:46:24,704] - step:173/900 train_loss:0.4139 lr:0.0003000000 time/step:117.34s
|
| 182 |
+
[2025-09-12 01:48:21,009] - step:174/900 train_loss:0.4308 lr:0.0003000000 time/step:116.30s
|
| 183 |
+
[2025-09-12 01:50:19,086] - step:175/900 train_loss:0.4156 lr:0.0003000000 time/step:118.06s
|
| 184 |
+
[2025-09-12 01:52:16,506] - step:176/900 train_loss:0.4204 lr:0.0003000000 time/step:117.41s
|
| 185 |
+
[2025-09-12 01:54:14,395] - step:177/900 train_loss:0.4211 lr:0.0003000000 time/step:117.87s
|
| 186 |
+
[2025-09-12 01:56:11,781] - step:178/900 train_loss:0.4399 lr:0.0003000000 time/step:117.38s
|
| 187 |
+
[2025-09-12 01:58:09,165] - step:179/900 train_loss:0.4327 lr:0.0003000000 time/step:117.38s
|
| 188 |
+
[2025-09-12 02:00:05,670] - step:180/900 train_loss:0.4362 lr:0.0003000000 time/step:116.49s
|
| 189 |
+
[2025-09-12 02:02:03,683] - step:181/900 train_loss:0.4204 lr:0.0003000000 time/step:118.01s
|
| 190 |
+
[2025-09-12 02:04:01,525] - step:182/900 train_loss:0.4528 lr:0.0003000000 time/step:117.84s
|
| 191 |
+
[2025-09-12 02:05:59,256] - step:183/900 train_loss:0.4115 lr:0.0003000000 time/step:117.72s
|
| 192 |
+
[2025-09-12 02:07:56,456] - step:184/900 train_loss:0.4527 lr:0.0003000000 time/step:117.20s
|
| 193 |
+
[2025-09-12 02:09:53,692] - step:185/900 train_loss:0.4378 lr:0.0003000000 time/step:117.23s
|
| 194 |
+
[2025-09-12 02:11:50,835] - step:186/900 train_loss:0.4322 lr:0.0003000000 time/step:117.14s
|
| 195 |
+
[2025-09-12 02:13:49,249] - step:187/900 train_loss:0.4503 lr:0.0003000000 time/step:118.41s
|
| 196 |
+
[2025-09-12 02:15:46,708] - step:188/900 train_loss:0.4137 lr:0.0003000000 time/step:117.45s
|
| 197 |
+
[2025-09-12 02:17:44,588] - step:189/900 train_loss:0.4373 lr:0.0003000000 time/step:117.87s
|
| 198 |
+
[2025-09-12 02:19:41,640] - step:190/900 train_loss:0.4390 lr:0.0003000000 time/step:117.04s
|
| 199 |
+
[2025-09-12 02:21:38,674] - step:191/900 train_loss:0.4540 lr:0.0003000000 time/step:117.02s
|
| 200 |
+
[2025-09-12 02:23:35,317] - step:192/900 train_loss:0.4401 lr:0.0003000000 time/step:116.64s
|
| 201 |
+
[2025-09-12 02:25:32,403] - step:193/900 train_loss:0.4325 lr:0.0003000000 time/step:117.08s
|
| 202 |
+
[2025-09-12 02:27:29,545] - step:194/900 train_loss:0.4249 lr:0.0003000000 time/step:117.13s
|
| 203 |
+
[2025-09-12 02:29:26,648] - step:195/900 train_loss:0.4074 lr:0.0003000000 time/step:117.09s
|
| 204 |
+
[2025-09-12 02:31:23,432] - step:196/900 train_loss:0.4212 lr:0.0003000000 time/step:116.77s
|
| 205 |
+
[2025-09-12 02:33:21,256] - step:197/900 train_loss:0.4408 lr:0.0003000000 time/step:117.82s
|
| 206 |
+
[2025-09-12 02:35:18,019] - step:198/900 train_loss:0.4229 lr:0.0003000000 time/step:116.76s
|
| 207 |
+
[2025-09-12 02:37:15,403] - step:199/900 train_loss:0.4517 lr:0.0003000000 time/step:117.38s
|
| 208 |
+
[2025-09-12 02:39:13,125] - step:200/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@200.pt...
|
| 209 |
+
[2025-09-12 02:39:13,129] - step:200/900 train_loss:0.4149 lr:0.0003000000 time/step:117.11s
|
| 210 |
+
[2025-09-12 02:41:09,907] - step:201/900 train_loss:0.4258 lr:0.0003000000 time/step:116.76s
|
| 211 |
+
[2025-09-12 02:43:06,972] - step:202/900 train_loss:0.4207 lr:0.0003000000 time/step:117.06s
|
| 212 |
+
[2025-09-12 02:45:03,575] - step:203/900 train_loss:0.4432 lr:0.0003000000 time/step:116.60s
|
| 213 |
+
[2025-09-12 02:47:00,062] - step:204/900 train_loss:0.4072 lr:0.0003000000 time/step:116.48s
|
| 214 |
+
[2025-09-12 02:48:57,543] - step:205/900 train_loss:0.4404 lr:0.0003000000 time/step:117.47s
|
| 215 |
+
[2025-09-12 02:50:54,165] - step:206/900 train_loss:0.4151 lr:0.0003000000 time/step:116.61s
|
| 216 |
+
[2025-09-12 02:52:50,609] - step:207/900 train_loss:0.4256 lr:0.0003000000 time/step:116.44s
|
| 217 |
+
[2025-09-12 02:54:48,171] - step:208/900 train_loss:0.4200 lr:0.0003000000 time/step:117.56s
|
| 218 |
+
[2025-09-12 02:56:44,244] - step:209/900 train_loss:0.4159 lr:0.0003000000 time/step:116.06s
|
| 219 |
+
[2025-09-12 02:58:41,740] - step:210/900 train_loss:0.4080 lr:0.0003000000 time/step:117.49s
|
| 220 |
+
[2025-09-12 03:00:38,362] - step:211/900 train_loss:0.4394 lr:0.0003000000 time/step:116.61s
|
| 221 |
+
[2025-09-12 03:02:34,864] - step:212/900 train_loss:0.4461 lr:0.0003000000 time/step:116.49s
|
| 222 |
+
[2025-09-12 03:04:32,289] - step:213/900 train_loss:0.4310 lr:0.0003000000 time/step:117.42s
|
| 223 |
+
[2025-09-12 03:06:29,834] - step:214/900 train_loss:0.4458 lr:0.0003000000 time/step:117.53s
|
| 224 |
+
[2025-09-12 03:08:26,395] - step:215/900 train_loss:0.4322 lr:0.0003000000 time/step:116.56s
|
| 225 |
+
[2025-09-12 03:10:23,441] - step:216/900 train_loss:0.3979 lr:0.0003000000 time/step:117.03s
|
| 226 |
+
[2025-09-12 03:12:19,963] - step:217/900 train_loss:0.4011 lr:0.0003000000 time/step:116.51s
|
| 227 |
+
[2025-09-12 03:14:17,627] - step:218/900 train_loss:0.4372 lr:0.0003000000 time/step:117.66s
|
| 228 |
+
[2025-09-12 03:16:15,332] - step:219/900 train_loss:0.4281 lr:0.0003000000 time/step:117.70s
|
| 229 |
+
[2025-09-12 03:18:11,833] - step:220/900 train_loss:0.4330 lr:0.0003000000 time/step:116.49s
|
| 230 |
+
[2025-09-12 03:20:08,497] - step:221/900 train_loss:0.4534 lr:0.0003000000 time/step:116.65s
|
| 231 |
+
[2025-09-12 03:22:05,021] - step:222/900 train_loss:0.4076 lr:0.0003000000 time/step:116.52s
|
| 232 |
+
[2025-09-12 03:24:01,826] - step:223/900 train_loss:0.4211 lr:0.0003000000 time/step:116.79s
|
| 233 |
+
[2025-09-12 03:25:58,807] - step:224/900 train_loss:0.4075 lr:0.0003000000 time/step:116.98s
|
| 234 |
+
[2025-09-12 03:27:56,427] - step:225/900 train_loss:0.3977 lr:0.0003000000 time/step:117.61s
|
| 235 |
+
[2025-09-12 03:29:53,271] - step:226/900 train_loss:0.4331 lr:0.0003000000 time/step:116.84s
|
| 236 |
+
[2025-09-12 03:31:48,818] - step:227/900 train_loss:0.4424 lr:0.0003000000 time/step:115.53s
|
| 237 |
+
[2025-09-12 03:33:46,260] - step:228/900 train_loss:0.4265 lr:0.0003000000 time/step:117.44s
|
| 238 |
+
[2025-09-12 03:35:42,726] - step:229/900 train_loss:0.4018 lr:0.0003000000 time/step:116.46s
|
| 239 |
+
[2025-09-12 03:37:39,927] - step:230/900 train_loss:0.4277 lr:0.0003000000 time/step:117.20s
|
| 240 |
+
[2025-09-12 03:39:37,253] - step:231/900 train_loss:0.4229 lr:0.0003000000 time/step:117.32s
|
| 241 |
+
[2025-09-12 03:41:34,210] - step:232/900 train_loss:0.4231 lr:0.0003000000 time/step:116.94s
|
| 242 |
+
[2025-09-12 03:43:30,497] - step:233/900 train_loss:0.4125 lr:0.0003000000 time/step:116.28s
|
| 243 |
+
[2025-09-12 03:45:27,022] - step:234/900 train_loss:0.4181 lr:0.0003000000 time/step:116.52s
|
| 244 |
+
[2025-09-12 03:47:23,505] - step:235/900 train_loss:0.4364 lr:0.0003000000 time/step:116.48s
|
| 245 |
+
[2025-09-12 03:49:21,967] - step:236/900 train_loss:0.4135 lr:0.0003000000 time/step:118.46s
|
| 246 |
+
[2025-09-12 03:51:18,413] - step:237/900 train_loss:0.4139 lr:0.0003000000 time/step:116.43s
|
| 247 |
+
[2025-09-12 03:53:14,453] - step:238/900 train_loss:0.4341 lr:0.0003000000 time/step:116.03s
|
| 248 |
+
[2025-09-12 03:55:11,117] - step:239/900 train_loss:0.4174 lr:0.0003000000 time/step:116.66s
|
| 249 |
+
[2025-09-12 03:57:08,642] - step:240/900 train_loss:0.4449 lr:0.0003000000 time/step:117.52s
|
| 250 |
+
[2025-09-12 03:59:06,595] - step:241/900 train_loss:0.4303 lr:0.0003000000 time/step:117.95s
|
| 251 |
+
[2025-09-12 04:01:02,667] - step:242/900 train_loss:0.4350 lr:0.0003000000 time/step:116.06s
|
| 252 |
+
[2025-09-12 04:02:58,652] - step:243/900 train_loss:0.4332 lr:0.0003000000 time/step:115.97s
|
| 253 |
+
[2025-09-12 04:04:55,158] - step:244/900 train_loss:0.4170 lr:0.0003000000 time/step:116.50s
|
| 254 |
+
[2025-09-12 04:06:52,523] - step:245/900 train_loss:0.4325 lr:0.0003000000 time/step:117.35s
|
| 255 |
+
[2025-09-12 04:08:49,506] - step:246/900 train_loss:0.4140 lr:0.0003000000 time/step:116.98s
|
| 256 |
+
[2025-09-12 04:10:46,625] - step:247/900 train_loss:0.4244 lr:0.0003000000 time/step:117.10s
|
| 257 |
+
[2025-09-12 04:12:43,060] - step:248/900 train_loss:0.4435 lr:0.0003000000 time/step:116.43s
|
| 258 |
+
[2025-09-12 04:14:39,932] - step:249/900 train_loss:0.4188 lr:0.0003000000 time/step:116.87s
|
| 259 |
+
[2025-09-12 04:16:36,428] - step:250/900 train_loss:0.4138 lr:0.0003000000 time/step:116.49s
|
| 260 |
+
[2025-09-12 04:18:34,283] - step:251/900 train_loss:0.4045 lr:0.0003000000 time/step:117.84s
|
| 261 |
+
[2025-09-12 04:20:32,264] - step:252/900 train_loss:0.4128 lr:0.0003000000 time/step:117.96s
|
| 262 |
+
[2025-09-12 04:22:28,905] - step:253/900 train_loss:0.4352 lr:0.0003000000 time/step:116.63s
|
| 263 |
+
[2025-09-12 04:24:25,744] - step:254/900 train_loss:0.4090 lr:0.0003000000 time/step:116.83s
|
| 264 |
+
[2025-09-12 04:26:22,527] - step:255/900 train_loss:0.4125 lr:0.0003000000 time/step:116.78s
|
| 265 |
+
[2025-09-12 04:28:18,535] - step:256/900 train_loss:0.3974 lr:0.0003000000 time/step:116.00s
|
| 266 |
+
[2025-09-12 04:30:16,548] - step:257/900 train_loss:0.4056 lr:0.0003000000 time/step:118.00s
|
| 267 |
+
[2025-09-12 04:32:14,016] - step:258/900 train_loss:0.4158 lr:0.0003000000 time/step:117.45s
|
| 268 |
+
[2025-09-12 04:34:10,993] - step:259/900 train_loss:0.4080 lr:0.0003000000 time/step:116.97s
|
| 269 |
+
[2025-09-12 04:36:07,637] - step:260/900 train_loss:0.4217 lr:0.0003000000 time/step:116.64s
|
| 270 |
+
[2025-09-12 04:38:05,072] - step:261/900 train_loss:0.4157 lr:0.0003000000 time/step:117.43s
|
| 271 |
+
[2025-09-12 04:40:01,843] - step:262/900 train_loss:0.4139 lr:0.0003000000 time/step:116.76s
|
| 272 |
+
[2025-09-12 04:41:58,873] - step:263/900 train_loss:0.4401 lr:0.0003000000 time/step:117.01s
|
| 273 |
+
[2025-09-12 04:43:56,795] - step:264/900 train_loss:0.4272 lr:0.0003000000 time/step:117.92s
|
| 274 |
+
[2025-09-12 04:45:53,571] - step:265/900 train_loss:0.4228 lr:0.0003000000 time/step:116.76s
|
| 275 |
+
[2025-09-12 04:47:50,269] - step:266/900 train_loss:0.4242 lr:0.0003000000 time/step:116.69s
|
| 276 |
+
[2025-09-12 04:49:47,027] - step:267/900 train_loss:0.4361 lr:0.0003000000 time/step:116.75s
|
| 277 |
+
[2025-09-12 04:51:43,112] - step:268/900 train_loss:0.4224 lr:0.0003000000 time/step:116.07s
|
| 278 |
+
[2025-09-12 04:53:41,046] - step:269/900 train_loss:0.4076 lr:0.0003000000 time/step:117.92s
|
| 279 |
+
[2025-09-12 04:55:37,470] - step:270/900 train_loss:0.4172 lr:0.0003000000 time/step:116.42s
|
| 280 |
+
[2025-09-12 04:57:33,853] - step:271/900 train_loss:0.4219 lr:0.0003000000 time/step:116.38s
|
| 281 |
+
[2025-09-12 04:59:30,265] - step:272/900 train_loss:0.4281 lr:0.0003000000 time/step:116.41s
|
| 282 |
+
[2025-09-12 05:01:26,500] - step:273/900 train_loss:0.4105 lr:0.0003000000 time/step:116.22s
|
| 283 |
+
[2025-09-12 05:03:24,415] - step:274/900 train_loss:0.4247 lr:0.0003000000 time/step:117.91s
|
| 284 |
+
[2025-09-12 05:05:21,825] - step:275/900 train_loss:0.4172 lr:0.0003000000 time/step:117.40s
|
| 285 |
+
[2025-09-12 05:07:18,643] - step:276/900 train_loss:0.4281 lr:0.0003000000 time/step:116.81s
|
| 286 |
+
[2025-09-12 05:09:15,889] - step:277/900 train_loss:0.4140 lr:0.0003000000 time/step:117.23s
|
| 287 |
+
[2025-09-12 05:11:13,080] - step:278/900 train_loss:0.4459 lr:0.0003000000 time/step:117.18s
|
| 288 |
+
[2025-09-12 05:13:09,433] - step:279/900 train_loss:0.4128 lr:0.0003000000 time/step:116.35s
|
| 289 |
+
[2025-09-12 05:15:07,057] - step:280/900 train_loss:0.4171 lr:0.0003000000 time/step:117.62s
|
| 290 |
+
[2025-09-12 05:17:03,780] - step:281/900 train_loss:0.4083 lr:0.0003000000 time/step:116.71s
|
| 291 |
+
[2025-09-12 05:19:00,703] - step:282/900 train_loss:0.4214 lr:0.0003000000 time/step:116.92s
|
| 292 |
+
[2025-09-12 05:20:57,932] - step:283/900 train_loss:0.4072 lr:0.0003000000 time/step:117.19s
|
| 293 |
+
[2025-09-12 05:22:54,350] - step:284/900 train_loss:0.4471 lr:0.0003000000 time/step:116.39s
|
| 294 |
+
[2025-09-12 05:24:50,794] - step:285/900 train_loss:0.3946 lr:0.0003000000 time/step:116.44s
|
| 295 |
+
[2025-09-12 05:26:47,657] - step:286/900 train_loss:0.4510 lr:0.0003000000 time/step:116.86s
|
| 296 |
+
[2025-09-12 05:28:43,717] - step:287/900 train_loss:0.4409 lr:0.0003000000 time/step:116.05s
|
| 297 |
+
[2025-09-12 05:30:40,741] - step:288/900 train_loss:0.3887 lr:0.0003000000 time/step:117.01s
|
| 298 |
+
[2025-09-12 05:32:38,986] - step:289/900 train_loss:0.4207 lr:0.0003000000 time/step:118.24s
|
| 299 |
+
[2025-09-12 05:34:35,229] - step:290/900 train_loss:0.4018 lr:0.0003000000 time/step:116.24s
|
| 300 |
+
[2025-09-12 05:36:31,796] - step:291/900 train_loss:0.4233 lr:0.0003000000 time/step:116.56s
|
| 301 |
+
[2025-09-12 05:38:28,659] - step:292/900 train_loss:0.4223 lr:0.0003000000 time/step:116.86s
|
| 302 |
+
[2025-09-12 05:40:25,842] - step:293/900 train_loss:0.4412 lr:0.0003000000 time/step:117.18s
|
| 303 |
+
[2025-09-12 05:42:22,767] - step:294/900 train_loss:0.3965 lr:0.0003000000 time/step:116.91s
|
| 304 |
+
[2025-09-12 05:44:20,588] - step:295/900 train_loss:0.4155 lr:0.0003000000 time/step:117.81s
|
| 305 |
+
[2025-09-12 05:46:17,250] - step:296/900 train_loss:0.4051 lr:0.0003000000 time/step:116.66s
|
| 306 |
+
[2025-09-12 05:48:13,495] - step:297/900 train_loss:0.4186 lr:0.0003000000 time/step:116.24s
|
| 307 |
+
[2025-09-12 05:50:10,418] - step:298/900 train_loss:0.4280 lr:0.0003000000 time/step:116.91s
|
| 308 |
+
[2025-09-12 05:52:07,903] - step:299/900 train_loss:0.4225 lr:0.0003000000 time/step:117.46s
|
| 309 |
+
[2025-09-12 05:54:04,575] - step:300/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@300.pt...
|
| 310 |
+
[2025-09-12 05:54:04,576] - step:300/900 train_loss:0.4086 lr:0.0003000000 time/step:116.07s
|
| 311 |
+
[2025-09-12 05:56:02,285] - step:301/900 train_loss:0.4136 lr:0.0003000000 time/step:117.71s
|
| 312 |
+
[2025-09-12 05:58:00,171] - step:302/900 train_loss:0.4114 lr:0.0003000000 time/step:117.88s
|
| 313 |
+
[2025-09-12 05:59:55,795] - step:303/900 train_loss:0.4200 lr:0.0003000000 time/step:115.62s
|
| 314 |
+
[2025-09-12 06:01:52,652] - step:304/900 train_loss:0.4085 lr:0.0003000000 time/step:116.84s
|
| 315 |
+
[2025-09-12 06:03:49,824] - step:305/900 train_loss:0.4311 lr:0.0003000000 time/step:117.16s
|
| 316 |
+
[2025-09-12 06:05:46,285] - step:306/900 train_loss:0.4367 lr:0.0003000000 time/step:116.45s
|
| 317 |
+
[2025-09-12 06:07:44,209] - step:307/900 train_loss:0.4345 lr:0.0003000000 time/step:117.92s
|
| 318 |
+
[2025-09-12 06:09:41,568] - step:308/900 train_loss:0.4016 lr:0.0003000000 time/step:117.35s
|
| 319 |
+
[2025-09-12 06:11:38,074] - step:309/900 train_loss:0.4102 lr:0.0003000000 time/step:116.49s
|
| 320 |
+
[2025-09-12 06:13:34,857] - step:310/900 train_loss:0.4332 lr:0.0003000000 time/step:116.77s
|
| 321 |
+
[2025-09-12 06:15:32,302] - step:311/900 train_loss:0.4186 lr:0.0003000000 time/step:117.43s
|
| 322 |
+
[2025-09-12 06:17:29,124] - step:312/900 train_loss:0.4371 lr:0.0003000000 time/step:116.82s
|
| 323 |
+
[2025-09-12 06:19:26,289] - step:313/900 train_loss:0.4130 lr:0.0003000000 time/step:117.16s
|
| 324 |
+
[2025-09-12 06:21:22,830] - step:314/900 train_loss:0.4031 lr:0.0003000000 time/step:116.53s
|
| 325 |
+
[2025-09-12 06:23:19,454] - step:315/900 train_loss:0.4286 lr:0.0003000000 time/step:116.62s
|
| 326 |
+
[2025-09-12 06:25:17,324] - step:316/900 train_loss:0.4007 lr:0.0003000000 time/step:117.86s
|
| 327 |
+
[2025-09-12 06:27:14,242] - step:317/900 train_loss:0.4114 lr:0.0003000000 time/step:116.91s
|
| 328 |
+
[2025-09-12 06:29:11,325] - step:318/900 train_loss:0.4251 lr:0.0003000000 time/step:117.08s
|
| 329 |
+
[2025-09-12 06:31:08,368] - step:319/900 train_loss:0.4448 lr:0.0003000000 time/step:117.03s
|
| 330 |
+
[2025-09-12 06:33:04,509] - step:320/900 train_loss:0.4103 lr:0.0003000000 time/step:116.14s
|
| 331 |
+
[2025-09-12 06:35:02,658] - step:321/900 train_loss:0.4142 lr:0.0003000000 time/step:118.14s
|
| 332 |
+
[2025-09-12 06:36:59,639] - step:322/900 train_loss:0.3985 lr:0.0003000000 time/step:116.97s
|
| 333 |
+
[2025-09-12 06:38:56,063] - step:323/900 train_loss:0.4057 lr:0.0003000000 time/step:116.42s
|
| 334 |
+
[2025-09-12 06:40:53,684] - step:324/900 train_loss:0.4223 lr:0.0003000000 time/step:117.62s
|
| 335 |
+
[2025-09-12 06:42:50,547] - step:325/900 train_loss:0.4205 lr:0.0003000000 time/step:116.85s
|
| 336 |
+
[2025-09-12 06:44:46,896] - step:326/900 train_loss:0.4172 lr:0.0003000000 time/step:116.34s
|
| 337 |
+
[2025-09-12 06:46:45,176] - step:327/900 train_loss:0.4186 lr:0.0003000000 time/step:118.27s
|
| 338 |
+
[2025-09-12 06:48:42,119] - step:328/900 train_loss:0.4294 lr:0.0003000000 time/step:116.93s
|
| 339 |
+
[2025-09-12 06:50:38,781] - step:329/900 train_loss:0.4072 lr:0.0003000000 time/step:116.66s
|
| 340 |
+
[2025-09-12 06:52:36,425] - step:330/900 train_loss:0.4248 lr:0.0003000000 time/step:117.63s
|
| 341 |
+
[2025-09-12 06:54:33,431] - step:331/900 train_loss:0.4141 lr:0.0003000000 time/step:117.00s
|
| 342 |
+
[2025-09-12 06:56:30,074] - step:332/900 train_loss:0.4124 lr:0.0003000000 time/step:116.64s
[2025-09-12 06:58:26,556] - step:333/900 train_loss:0.4281 lr:0.0003000000 time/step:116.47s
[2025-09-12 07:00:23,620] - step:334/900 train_loss:0.4141 lr:0.0003000000 time/step:117.06s
[2025-09-12 07:02:21,404] - step:335/900 train_loss:0.4197 lr:0.0003000000 time/step:117.77s
[2025-09-12 07:04:17,967] - step:336/900 train_loss:0.4356 lr:0.0003000000 time/step:116.56s
[2025-09-12 07:06:16,192] - step:337/900 train_loss:0.3934 lr:0.0003000000 time/step:118.22s
[2025-09-12 07:08:12,291] - step:338/900 train_loss:0.3917 lr:0.0003000000 time/step:116.09s
[2025-09-12 07:10:08,910] - step:339/900 train_loss:0.4353 lr:0.0003000000 time/step:116.61s
[2025-09-12 07:12:06,665] - step:340/900 train_loss:0.4537 lr:0.0003000000 time/step:117.74s
[2025-09-12 07:14:03,621] - step:341/900 train_loss:0.4146 lr:0.0003000000 time/step:116.95s
[2025-09-12 07:16:00,835] - step:342/900 train_loss:0.4194 lr:0.0003000000 time/step:117.20s
[2025-09-12 07:17:57,387] - step:343/900 train_loss:0.4117 lr:0.0003000000 time/step:116.54s
[2025-09-12 07:19:53,951] - step:344/900 train_loss:0.3925 lr:0.0003000000 time/step:116.56s
[2025-09-12 07:21:50,959] - step:345/900 train_loss:0.4268 lr:0.0003000000 time/step:117.00s
[2025-09-12 07:23:49,546] - step:346/900 train_loss:0.4113 lr:0.0003000000 time/step:118.58s
[2025-09-12 07:25:46,639] - step:347/900 train_loss:0.4211 lr:0.0003000000 time/step:117.08s
[2025-09-12 07:27:43,350] - step:348/900 train_loss:0.4183 lr:0.0003000000 time/step:116.70s
[2025-09-12 07:29:39,127] - step:349/900 train_loss:0.4313 lr:0.0003000000 time/step:115.77s
[2025-09-12 07:31:35,852] - step:350/900 train_loss:0.3881 lr:0.0003000000 time/step:116.71s
[2025-09-12 07:33:34,104] - step:351/900 train_loss:0.4243 lr:0.0003000000 time/step:118.24s
[2025-09-12 07:35:31,118] - step:352/900 train_loss:0.4273 lr:0.0003000000 time/step:117.00s
[2025-09-12 07:37:28,208] - step:353/900 train_loss:0.3925 lr:0.0003000000 time/step:117.06s
[2025-09-12 07:39:25,351] - step:354/900 train_loss:0.4223 lr:0.0003000000 time/step:117.14s
[2025-09-12 07:41:21,430] - step:355/900 train_loss:0.3996 lr:0.0003000000 time/step:116.07s
[2025-09-12 07:43:18,880] - step:356/900 train_loss:0.4095 lr:0.0003000000 time/step:117.45s
[2025-09-12 07:45:16,716] - step:357/900 train_loss:0.4204 lr:0.0003000000 time/step:117.83s
[2025-09-12 07:47:14,287] - step:358/900 train_loss:0.4157 lr:0.0003000000 time/step:117.56s
[2025-09-12 07:49:11,022] - step:359/900 train_loss:0.4179 lr:0.0003000000 time/step:116.72s
[2025-09-12 07:51:08,126] - step:360/900 train_loss:0.4490 lr:0.0003000000 time/step:117.10s
[2025-09-12 07:53:04,336] - step:361/900 train_loss:0.4100 lr:0.0003000000 time/step:116.20s
[2025-09-12 07:55:00,814] - step:362/900 train_loss:0.4050 lr:0.0003000000 time/step:116.47s
[2025-09-12 07:56:58,814] - step:363/900 train_loss:0.4299 lr:0.0003000000 time/step:117.99s
[2025-09-12 07:58:55,677] - step:364/900 train_loss:0.3970 lr:0.0003000000 time/step:116.85s
[2025-09-12 08:00:53,062] - step:365/900 train_loss:0.4180 lr:0.0003000000 time/step:117.38s
[2025-09-12 08:02:49,522] - step:366/900 train_loss:0.4307 lr:0.0003000000 time/step:116.45s
[2025-09-12 08:04:45,597] - step:367/900 train_loss:0.4335 lr:0.0003000000 time/step:116.07s
[2025-09-12 08:06:43,333] - step:368/900 train_loss:0.3967 lr:0.0003000000 time/step:117.73s
[2025-09-12 08:08:40,432] - step:369/900 train_loss:0.4226 lr:0.0003000000 time/step:117.09s
[2025-09-12 08:10:38,337] - step:370/900 train_loss:0.4086 lr:0.0003000000 time/step:117.90s
[2025-09-12 08:12:35,283] - step:371/900 train_loss:0.3949 lr:0.0003000000 time/step:116.93s
[2025-09-12 08:14:31,782] - step:372/900 train_loss:0.4219 lr:0.0003000000 time/step:116.49s
[2025-09-12 08:16:29,230] - step:373/900 train_loss:0.4088 lr:0.0003000000 time/step:117.44s
[2025-09-12 08:18:26,952] - step:374/900 train_loss:0.4184 lr:0.0003000000 time/step:117.71s
[2025-09-12 08:20:23,596] - step:375/900 train_loss:0.4110 lr:0.0003000000 time/step:116.64s
[2025-09-12 08:22:20,047] - step:376/900 train_loss:0.4305 lr:0.0003000000 time/step:116.44s
[2025-09-12 08:24:16,398] - step:377/900 train_loss:0.4143 lr:0.0003000000 time/step:116.35s
[2025-09-12 08:26:13,665] - step:378/900 train_loss:0.4139 lr:0.0003000000 time/step:117.26s
[2025-09-12 08:28:09,796] - step:379/900 train_loss:0.4060 lr:0.0003000000 time/step:116.13s
[2025-09-12 08:30:07,613] - step:380/900 train_loss:0.3921 lr:0.0003000000 time/step:117.81s
[2025-09-12 08:32:04,597] - step:381/900 train_loss:0.4239 lr:0.0003000000 time/step:116.97s
[2025-09-12 08:34:01,394] - step:382/900 train_loss:0.4041 lr:0.0003000000 time/step:116.79s
[2025-09-12 08:35:58,263] - step:383/900 train_loss:0.4115 lr:0.0003000000 time/step:116.86s
[2025-09-12 08:37:54,649] - step:384/900 train_loss:0.4216 lr:0.0003000000 time/step:116.38s
[2025-09-12 08:39:51,866] - step:385/900 train_loss:0.4057 lr:0.0003000000 time/step:117.21s
[2025-09-12 08:41:49,473] - step:386/900 train_loss:0.4021 lr:0.0003000000 time/step:117.60s
[2025-09-12 08:43:46,456] - step:387/900 train_loss:0.4235 lr:0.0003000000 time/step:116.98s
[2025-09-12 08:45:42,939] - step:388/900 train_loss:0.4309 lr:0.0003000000 time/step:116.48s
[2025-09-12 08:47:40,164] - step:389/900 train_loss:0.3930 lr:0.0003000000 time/step:117.22s
[2025-09-12 08:49:36,386] - step:390/900 train_loss:0.4063 lr:0.0003000000 time/step:116.22s
[2025-09-12 08:51:33,830] - step:391/900 train_loss:0.4034 lr:0.0003000000 time/step:117.43s
[2025-09-12 08:53:30,812] - step:392/900 train_loss:0.4071 lr:0.0003000000 time/step:116.97s
[2025-09-12 08:55:28,574] - step:393/900 train_loss:0.4296 lr:0.0003000000 time/step:117.75s
[2025-09-12 08:57:25,899] - step:394/900 train_loss:0.4171 lr:0.0003000000 time/step:117.31s
[2025-09-12 08:59:22,463] - step:395/900 train_loss:0.4167 lr:0.0003000000 time/step:116.56s
[2025-09-12 09:01:19,086] - step:396/900 train_loss:0.4119 lr:0.0003000000 time/step:116.62s
[2025-09-12 09:03:16,267] - step:397/900 train_loss:0.4057 lr:0.0003000000 time/step:117.17s
[2025-09-12 09:05:13,175] - step:398/900 train_loss:0.4064 lr:0.0003000000 time/step:116.90s
[2025-09-12 09:07:10,958] - step:399/900 train_loss:0.3913 lr:0.0003000000 time/step:117.77s
[2025-09-12 09:09:08,523] - step:400/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@400.pt...
[2025-09-12 09:09:08,524] - step:400/900 train_loss:0.4028 lr:0.0003000000 time/step:116.93s
[2025-09-12 09:11:04,902] - step:401/900 train_loss:0.3889 lr:0.0003000000 time/step:116.38s
[2025-09-12 09:13:01,467] - step:402/900 train_loss:0.4192 lr:0.0003000000 time/step:116.55s
[2025-09-12 09:14:58,472] - step:403/900 train_loss:0.4211 lr:0.0003000000 time/step:117.00s
[2025-09-12 09:16:55,036] - step:404/900 train_loss:0.4354 lr:0.0003000000 time/step:116.56s
[2025-09-12 09:18:52,741] - step:405/900 train_loss:0.4290 lr:0.0003000000 time/step:117.69s
[2025-09-12 09:20:49,701] - step:406/900 train_loss:0.4290 lr:0.0003000000 time/step:116.95s
[2025-09-12 09:22:46,403] - step:407/900 train_loss:0.4257 lr:0.0003000000 time/step:116.69s
[2025-09-12 09:24:43,048] - step:408/900 train_loss:0.4252 lr:0.0003000000 time/step:116.64s
[2025-09-12 09:26:39,532] - step:409/900 train_loss:0.3992 lr:0.0003000000 time/step:116.48s
[2025-09-12 09:28:37,397] - step:410/900 train_loss:0.4191 lr:0.0003000000 time/step:117.86s
[2025-09-12 09:30:33,697] - step:411/900 train_loss:0.3892 lr:0.0003000000 time/step:116.29s
[2025-09-12 09:32:30,021] - step:412/900 train_loss:0.3843 lr:0.0003000000 time/step:116.32s
[2025-09-12 09:34:27,365] - step:413/900 train_loss:0.4010 lr:0.0003000000 time/step:117.34s
[2025-09-12 09:36:24,146] - step:414/900 train_loss:0.4190 lr:0.0003000000 time/step:116.77s
[2025-09-12 09:38:20,888] - step:415/900 train_loss:0.4182 lr:0.0003000000 time/step:116.73s
[2025-09-12 09:40:17,896] - step:416/900 train_loss:0.4236 lr:0.0003000000 time/step:117.00s
[2025-09-12 09:42:14,418] - step:417/900 train_loss:0.4016 lr:0.0003000000 time/step:116.51s
[2025-09-12 09:44:11,142] - step:418/900 train_loss:0.4054 lr:0.0003000000 time/step:116.72s
[2025-09-12 09:46:07,906] - step:419/900 train_loss:0.4162 lr:0.0003000000 time/step:116.75s
[2025-09-12 09:48:05,609] - step:420/900 train_loss:0.3856 lr:0.0003000000 time/step:117.70s
[2025-09-12 09:50:02,634] - step:421/900 train_loss:0.3832 lr:0.0003000000 time/step:117.02s
[2025-09-12 09:51:59,099] - step:422/900 train_loss:0.4000 lr:0.0003000000 time/step:116.45s
[2025-09-12 09:53:56,083] - step:423/900 train_loss:0.4182 lr:0.0003000000 time/step:116.98s
[2025-09-12 09:55:53,683] - step:424/900 train_loss:0.4064 lr:0.0003000000 time/step:117.60s
[2025-09-12 09:57:49,838] - step:425/900 train_loss:0.4186 lr:0.0003000000 time/step:116.14s
[2025-09-12 09:59:47,210] - step:426/900 train_loss:0.4251 lr:0.0003000000 time/step:117.36s
[2025-09-12 10:01:43,887] - step:427/900 train_loss:0.3975 lr:0.0003000000 time/step:116.67s
[2025-09-12 10:03:40,560] - step:428/900 train_loss:0.4212 lr:0.0003000000 time/step:116.66s
[2025-09-12 10:05:37,859] - step:429/900 train_loss:0.4118 lr:0.0003000000 time/step:117.29s
[2025-09-12 10:07:35,749] - step:430/900 train_loss:0.3981 lr:0.0003000000 time/step:117.88s
[2025-09-12 10:09:32,291] - step:431/900 train_loss:0.4237 lr:0.0003000000 time/step:116.53s
[2025-09-12 10:11:29,229] - step:432/900 train_loss:0.3926 lr:0.0003000000 time/step:116.93s
[2025-09-12 10:13:26,136] - step:433/900 train_loss:0.4208 lr:0.0003000000 time/step:116.90s
[2025-09-12 10:15:22,577] - step:434/900 train_loss:0.4102 lr:0.0003000000 time/step:116.44s
[2025-09-12 10:17:19,961] - step:435/900 train_loss:0.4373 lr:0.0003000000 time/step:117.38s
[2025-09-12 10:19:18,170] - step:436/900 train_loss:0.4159 lr:0.0003000000 time/step:118.20s
[2025-09-12 10:21:13,810] - step:437/900 train_loss:0.4083 lr:0.0003000000 time/step:115.63s
[2025-09-12 10:23:10,450] - step:438/900 train_loss:0.4361 lr:0.0003000000 time/step:116.63s
[2025-09-12 10:25:07,257] - step:439/900 train_loss:0.4152 lr:0.0003000000 time/step:116.80s
[2025-09-12 10:27:04,621] - step:440/900 train_loss:0.4100 lr:0.0003000000 time/step:117.36s
[2025-09-12 10:29:01,561] - step:441/900 train_loss:0.4003 lr:0.0003000000 time/step:116.93s
[2025-09-12 10:30:58,928] - step:442/900 train_loss:0.4296 lr:0.0003000000 time/step:117.36s
[2025-09-12 10:32:54,885] - step:443/900 train_loss:0.4175 lr:0.0003000000 time/step:115.95s
[2025-09-12 10:34:51,250] - step:444/900 train_loss:0.4220 lr:0.0003000000 time/step:116.36s
[2025-09-12 10:36:48,671] - step:445/900 train_loss:0.4361 lr:0.0003000000 time/step:117.42s
[2025-09-12 10:38:46,902] - step:446/900 train_loss:0.4034 lr:0.0003000000 time/step:118.22s
[2025-09-12 10:40:44,143] - step:447/900 train_loss:0.4121 lr:0.0003000000 time/step:117.22s
[2025-09-12 10:42:40,558] - step:448/900 train_loss:0.4247 lr:0.0003000000 time/step:116.40s
[2025-09-12 10:44:37,203] - step:449/900 train_loss:0.4502 lr:0.0003000000 time/step:116.64s
[2025-09-12 10:46:34,074] - step:450/900 train_loss:0.4202 lr:0.0003000000 time/step:116.87s
[2025-09-12 10:48:32,574] - step:451/900 train_loss:0.4115 lr:0.0003000000 time/step:118.50s
[2025-09-12 10:50:30,519] - step:452/900 train_loss:0.4416 lr:0.0003000000 time/step:117.93s
[2025-09-12 10:52:27,400] - step:453/900 train_loss:0.4589 lr:0.0003000000 time/step:116.87s
[2025-09-12 10:54:23,502] - step:454/900 train_loss:0.4104 lr:0.0003000000 time/step:116.09s
[2025-09-12 10:56:20,043] - step:455/900 train_loss:0.4428 lr:0.0003000000 time/step:116.54s
[2025-09-12 10:58:18,649] - step:456/900 train_loss:0.3869 lr:0.0003000000 time/step:118.60s
[2025-09-12 11:00:16,434] - step:457/900 train_loss:0.3896 lr:0.0003000000 time/step:117.77s
[2025-09-12 11:02:12,853] - step:458/900 train_loss:0.4199 lr:0.0003000000 time/step:116.41s
[2025-09-12 11:04:09,871] - step:459/900 train_loss:0.4109 lr:0.0003000000 time/step:117.00s
[2025-09-12 11:06:05,943] - step:460/900 train_loss:0.4113 lr:0.0003000000 time/step:116.07s
[2025-09-12 11:08:02,527] - step:461/900 train_loss:0.3895 lr:0.0003000000 time/step:116.58s
[2025-09-12 11:10:00,790] - step:462/900 train_loss:0.4033 lr:0.0003000000 time/step:118.26s
[2025-09-12 11:11:58,115] - step:463/900 train_loss:0.4269 lr:0.0003000000 time/step:117.32s
[2025-09-12 11:13:54,593] - step:464/900 train_loss:0.4080 lr:0.0003000000 time/step:116.46s
[2025-09-12 11:15:51,480] - step:465/900 train_loss:0.4208 lr:0.0003000000 time/step:116.88s
[2025-09-12 11:17:48,283] - step:466/900 train_loss:0.4146 lr:0.0003000000 time/step:116.80s
[2025-09-12 11:19:44,666] - step:467/900 train_loss:0.4178 lr:0.0003000000 time/step:116.38s
[2025-09-12 11:21:43,091] - step:468/900 train_loss:0.4065 lr:0.0003000000 time/step:118.42s
[2025-09-12 11:23:40,099] - step:469/900 train_loss:0.4158 lr:0.0003000000 time/step:117.00s
[2025-09-12 11:25:36,537] - step:470/900 train_loss:0.3969 lr:0.0003000000 time/step:116.43s
[2025-09-12 11:27:34,080] - step:471/900 train_loss:0.4355 lr:0.0003000000 time/step:117.54s
[2025-09-12 11:29:30,162] - step:472/900 train_loss:0.3901 lr:0.0003000000 time/step:116.08s
[2025-09-12 11:31:28,047] - step:473/900 train_loss:0.4142 lr:0.0003000000 time/step:117.88s
[2025-09-12 11:33:24,570] - step:474/900 train_loss:0.4396 lr:0.0003000000 time/step:116.51s
[2025-09-12 11:35:21,454] - step:475/900 train_loss:0.3944 lr:0.0003000000 time/step:116.88s
[2025-09-12 11:37:18,778] - step:476/900 train_loss:0.4112 lr:0.0003000000 time/step:117.32s
[2025-09-12 11:39:15,275] - step:477/900 train_loss:0.4239 lr:0.0003000000 time/step:116.49s
[2025-09-12 11:41:11,285] - step:478/900 train_loss:0.4200 lr:0.0003000000 time/step:116.01s
[2025-09-12 11:43:08,711] - step:479/900 train_loss:0.4177 lr:0.0003000000 time/step:117.41s
[2025-09-12 11:45:05,127] - step:480/900 train_loss:0.3939 lr:0.0003000000 time/step:116.41s
[2025-09-12 11:47:02,193] - step:481/900 train_loss:0.4138 lr:0.0003000000 time/step:117.06s
[2025-09-12 11:48:59,561] - step:482/900 train_loss:0.4252 lr:0.0003000000 time/step:117.36s
[2025-09-12 11:50:55,554] - step:483/900 train_loss:0.4048 lr:0.0003000000 time/step:115.99s
[2025-09-12 11:52:52,805] - step:484/900 train_loss:0.4000 lr:0.0003000000 time/step:117.24s
[2025-09-12 11:54:49,667] - step:485/900 train_loss:0.4216 lr:0.0003000000 time/step:116.85s
[2025-09-12 11:56:46,072] - step:486/900 train_loss:0.4095 lr:0.0003000000 time/step:116.40s
[2025-09-12 11:58:43,074] - step:487/900 train_loss:0.4027 lr:0.0003000000 time/step:117.00s
[2025-09-12 12:00:40,979] - step:488/900 train_loss:0.4245 lr:0.0003000000 time/step:117.90s
[2025-09-12 12:02:38,064] - step:489/900 train_loss:0.3942 lr:0.0003000000 time/step:117.08s
[2025-09-12 12:04:34,804] - step:490/900 train_loss:0.4239 lr:0.0003000000 time/step:116.72s
[2025-09-12 12:06:31,269] - step:491/900 train_loss:0.3853 lr:0.0003000000 time/step:116.46s
[2025-09-12 12:08:28,111] - step:492/900 train_loss:0.4141 lr:0.0003000000 time/step:116.84s
[2025-09-12 12:10:24,954] - step:493/900 train_loss:0.4139 lr:0.0003000000 time/step:116.84s
[2025-09-12 12:12:22,937] - step:494/900 train_loss:0.4166 lr:0.0003000000 time/step:117.98s
[2025-09-12 12:14:20,061] - step:495/900 train_loss:0.3974 lr:0.0003000000 time/step:117.11s
[2025-09-12 12:16:16,526] - step:496/900 train_loss:0.4149 lr:0.0003000000 time/step:116.46s
[2025-09-12 12:18:13,009] - step:497/900 train_loss:0.4181 lr:0.0003000000 time/step:116.48s
[2025-09-12 12:20:09,790] - step:498/900 train_loss:0.4166 lr:0.0003000000 time/step:116.78s
[2025-09-12 12:22:06,615] - step:499/900 train_loss:0.4216 lr:0.0003000000 time/step:116.82s
[2025-09-12 12:24:05,337] - step:500/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@500.pt...
[2025-09-12 12:24:05,343] - step:500/900 train_loss:0.4161 lr:0.0003000000 time/step:118.13s
[2025-09-12 12:26:01,342] - step:501/900 train_loss:0.4010 lr:0.0003000000 time/step:116.00s
[2025-09-12 12:27:58,345] - step:502/900 train_loss:0.4042 lr:0.0003000000 time/step:116.99s
[2025-09-12 12:29:54,389] - step:503/900 train_loss:0.4216 lr:0.0003000000 time/step:116.04s
[2025-09-12 12:31:51,252] - step:504/900 train_loss:0.4127 lr:0.0003000000 time/step:116.86s
[2025-09-12 12:33:49,558] - step:505/900 train_loss:0.4019 lr:0.0003000000 time/step:118.29s
[2025-09-12 12:35:46,199] - step:506/900 train_loss:0.4076 lr:0.0003000000 time/step:116.64s
[2025-09-12 12:37:42,246] - step:507/900 train_loss:0.4207 lr:0.0003000000 time/step:116.04s
[2025-09-12 12:39:39,229] - step:508/900 train_loss:0.4258 lr:0.0003000000 time/step:116.98s
[2025-09-12 12:41:35,709] - step:509/900 train_loss:0.3826 lr:0.0003000000 time/step:116.48s
[2025-09-12 12:43:32,441] - step:510/900 train_loss:0.4092 lr:0.0003000000 time/step:116.72s
[2025-09-12 12:45:30,539] - step:511/900 train_loss:0.3954 lr:0.0003000000 time/step:118.09s
[2025-09-12 12:47:27,041] - step:512/900 train_loss:0.4335 lr:0.0003000000 time/step:116.49s
[2025-09-12 12:49:23,522] - step:513/900 train_loss:0.4216 lr:0.0003000000 time/step:116.47s
[2025-09-12 12:51:20,467] - step:514/900 train_loss:0.3952 lr:0.0003000000 time/step:116.94s
[2025-09-12 12:53:17,452] - step:515/900 train_loss:0.4052 lr:0.0003000000 time/step:116.98s
[2025-09-12 12:55:14,098] - step:516/900 train_loss:0.4145 lr:0.0003000000 time/step:116.64s
[2025-09-12 12:57:11,620] - step:517/900 train_loss:0.4292 lr:0.0003000000 time/step:117.51s
[2025-09-12 12:59:09,139] - step:518/900 train_loss:0.4204 lr:0.0003000000 time/step:117.51s
[2025-09-12 13:01:05,186] - step:519/900 train_loss:0.3932 lr:0.0003000000 time/step:116.04s
[2025-09-12 13:03:01,731] - step:520/900 train_loss:0.4226 lr:0.0003000000 time/step:116.54s
[2025-09-12 13:04:59,398] - step:521/900 train_loss:0.4080 lr:0.0003000000 time/step:117.65s
[2025-09-12 13:06:56,876] - step:522/900 train_loss:0.4079 lr:0.0003000000 time/step:117.47s
[2025-09-12 13:08:53,784] - step:523/900 train_loss:0.4375 lr:0.0003000000 time/step:116.90s
[2025-09-12 13:11:18,031] - step:524/900 train_loss:0.3876 lr:0.0003000000 time/step:144.24s
[2025-09-12 13:13:14,894] - step:525/900 train_loss:0.4133 lr:0.0003000000 time/step:116.82s
[2025-09-12 13:15:16,203] - step:526/900 train_loss:0.3961 lr:0.0003000000 time/step:118.95s
[2025-09-12 13:17:12,922] - step:527/900 train_loss:0.3895 lr:0.0003000000 time/step:116.71s
[2025-09-12 13:19:09,906] - step:528/900 train_loss:0.4204 lr:0.0003000000 time/step:116.98s
[2025-09-12 13:21:08,032] - step:529/900 train_loss:0.4078 lr:0.0003000000 time/step:118.12s
[2025-09-12 13:23:04,450] - step:530/900 train_loss:0.3973 lr:0.0003000000 time/step:116.41s
[2025-09-12 13:25:02,156] - step:531/900 train_loss:0.3875 lr:0.0003000000 time/step:117.69s
[2025-09-12 13:26:58,851] - step:532/900 train_loss:0.3979 lr:0.0003000000 time/step:116.69s
[2025-09-12 13:28:55,552] - step:533/900 train_loss:0.4210 lr:0.0003000000 time/step:116.69s
[2025-09-12 13:30:52,352] - step:534/900 train_loss:0.4016 lr:0.0003000000 time/step:116.80s
[2025-09-12 13:32:50,584] - step:535/900 train_loss:0.3971 lr:0.0003000000 time/step:118.23s
[2025-09-12 13:34:47,330] - step:536/900 train_loss:0.4167 lr:0.0003000000 time/step:116.73s
[2025-09-12 13:36:44,747] - step:537/900 train_loss:0.4366 lr:0.0003000000 time/step:117.39s
[2025-09-12 13:38:42,456] - step:538/900 train_loss:0.4267 lr:0.0003000000 time/step:117.71s
[2025-09-12 13:40:38,661] - step:539/900 train_loss:0.4092 lr:0.0003000000 time/step:116.20s
[2025-09-12 13:42:38,305] - step:540/900 train_loss:0.4273 lr:0.0003000000 time/step:119.62s
[2025-09-12 13:44:37,524] - step:541/900 train_loss:0.4157 lr:0.0003000000 time/step:119.17s
[2025-09-12 13:46:33,425] - step:542/900 train_loss:0.4237 lr:0.0003000000 time/step:115.89s
[2025-09-12 13:48:30,101] - step:543/900 train_loss:0.4052 lr:0.0003000000 time/step:116.67s
[2025-09-12 13:50:27,196] - step:544/900 train_loss:0.4260 lr:0.0003000000 time/step:117.09s
[2025-09-12 13:52:24,079] - step:545/900 train_loss:0.4021 lr:0.0003000000 time/step:116.88s
[2025-09-12 13:54:21,661] - step:546/900 train_loss:0.3897 lr:0.0003000000 time/step:117.57s
[2025-09-12 13:56:19,479] - step:547/900 train_loss:0.4029 lr:0.0003000000 time/step:117.81s
[2025-09-12 13:58:15,488] - step:548/900 train_loss:0.4107 lr:0.0003000000 time/step:116.00s
[2025-09-12 14:00:11,893] - step:549/900 train_loss:0.4159 lr:0.0003000000 time/step:116.40s
[2025-09-12 14:02:08,916] - step:550/900 train_loss:0.4075 lr:0.0003000000 time/step:117.01s
[2025-09-12 14:04:06,359] - step:551/900 train_loss:0.3932 lr:0.0003000000 time/step:117.43s
[2025-09-12 14:06:02,862] - step:552/900 train_loss:0.4110 lr:0.0003000000 time/step:116.49s
[2025-09-12 14:08:00,226] - step:553/900 train_loss:0.4250 lr:0.0003000000 time/step:117.36s
[2025-09-12 14:09:56,780] - step:554/900 train_loss:0.3990 lr:0.0003000000 time/step:116.54s
[2025-09-12 14:11:53,353] - step:555/900 train_loss:0.4041 lr:0.0003000000 time/step:116.56s
[2025-09-12 14:13:50,235] - step:556/900 train_loss:0.4062 lr:0.0003000000 time/step:116.87s
[2025-09-12 14:15:47,160] - step:557/900 train_loss:0.4144 lr:0.0003000000 time/step:116.92s
[2025-09-12 14:17:44,967] - step:558/900 train_loss:0.4032 lr:0.0003000000 time/step:117.80s
[2025-09-12 14:19:40,685] - step:559/900 train_loss:0.4082 lr:0.0003000000 time/step:115.71s
[2025-09-12 14:21:37,889] - step:560/900 train_loss:0.4140 lr:0.0003000000 time/step:117.20s
[2025-09-12 14:23:34,834] - step:561/900 train_loss:0.4284 lr:0.0003000000 time/step:116.94s
[2025-09-12 14:25:31,517] - step:562/900 train_loss:0.4096 lr:0.0003000000 time/step:116.67s
[2025-09-12 14:27:29,793] - step:563/900 train_loss:0.4017 lr:0.0003000000 time/step:118.26s
[2025-09-12 14:29:26,683] - step:564/900 train_loss:0.4014 lr:0.0003000000 time/step:116.88s
[2025-09-12 14:31:22,468] - step:565/900 train_loss:0.4061 lr:0.0003000000 time/step:115.78s
[2025-09-12 14:33:19,190] - step:566/900 train_loss:0.4188 lr:0.0003000000 time/step:116.72s
[2025-09-12 14:35:16,130] - step:567/900 train_loss:0.4305 lr:0.0003000000 time/step:116.93s
[2025-09-12 14:37:13,373] - step:568/900 train_loss:0.3922 lr:0.0003000000 time/step:117.24s
[2025-09-12 14:39:10,305] - step:569/900 train_loss:0.4190 lr:0.0003000000 time/step:116.92s
[2025-09-12 14:41:07,121] - step:570/900 train_loss:0.4047 lr:0.0003000000 time/step:116.81s
[2025-09-12 14:43:03,948] - step:571/900 train_loss:0.4152 lr:0.0003000000 time/step:116.82s
[2025-09-12 14:45:00,151] - step:572/900 train_loss:0.3946 lr:0.0003000000 time/step:116.19s
[2025-09-12 14:46:57,634] - step:573/900 train_loss:0.4138 lr:0.0003000000 time/step:117.48s
[2025-09-12 14:48:55,022] - step:574/900 train_loss:0.4231 lr:0.0003000000 time/step:117.37s
[2025-09-12 14:50:50,877] - step:575/900 train_loss:0.3978 lr:0.0003000000 time/step:115.85s
[2025-09-12 14:52:49,128] - step:576/900 train_loss:0.4169 lr:0.0003000000 time/step:118.25s
[2025-09-12 14:54:45,289] - step:577/900 train_loss:0.3971 lr:0.0003000000 time/step:116.15s
[2025-09-12 14:56:41,851] - step:578/900 train_loss:0.4058 lr:0.0003000000 time/step:116.56s
[2025-09-12 14:58:38,779] - step:579/900 train_loss:0.4105 lr:0.0003000000 time/step:116.92s
[2025-09-12 15:00:35,657] - step:580/900 train_loss:0.4145 lr:0.0003000000 time/step:116.87s
[2025-09-12 15:02:33,021] - step:581/900 train_loss:0.4067 lr:0.0003000000 time/step:117.36s
[2025-09-12 15:04:29,564] - step:582/900 train_loss:0.4209 lr:0.0003000000 time/step:116.53s
[2025-09-12 15:06:26,089] - step:583/900 train_loss:0.4106 lr:0.0003000000 time/step:116.52s
[2025-09-12 15:08:22,953] - step:584/900 train_loss:0.4220 lr:0.0003000000 time/step:116.86s
[2025-09-12 15:10:19,376] - step:585/900 train_loss:0.4001 lr:0.0003000000 time/step:116.41s
[2025-09-12 15:12:16,440] - step:586/900 train_loss:0.3963 lr:0.0003000000 time/step:117.06s
[2025-09-12 15:14:14,343] - step:587/900 train_loss:0.4118 lr:0.0003000000 time/step:117.89s
[2025-09-12 15:16:10,568] - step:588/900 train_loss:0.4285 lr:0.0003000000 time/step:116.22s
[2025-09-12 15:18:06,609] - step:589/900 train_loss:0.4177 lr:0.0003000000 time/step:116.04s
[2025-09-12 15:20:03,934] - step:590/900 train_loss:0.4256 lr:0.0003000000 time/step:117.32s
[2025-09-12 15:22:00,505] - step:591/900 train_loss:0.4258 lr:0.0003000000 time/step:116.57s
[2025-09-12 15:23:57,739] - step:592/900 train_loss:0.4031 lr:0.0003000000 time/step:117.19s
[2025-09-12 15:25:55,502] - step:593/900 train_loss:0.3975 lr:0.0003000000 time/step:117.76s
[2025-09-12 15:27:51,604] - step:594/900 train_loss:0.4098 lr:0.0003000000 time/step:116.10s
[2025-09-12 15:29:48,152] - step:595/900 train_loss:0.4044 lr:0.0003000000 time/step:116.54s
[2025-09-12 15:31:45,056] - step:596/900 train_loss:0.4394 lr:0.0003000000 time/step:116.89s
[2025-09-12 15:33:42,598] - step:597/900 train_loss:0.4166 lr:0.0003000000 time/step:117.54s
[2025-09-12 15:35:38,903] - step:598/900 train_loss:0.3857 lr:0.0003000000 time/step:116.29s
[2025-09-12 15:37:35,947] - step:599/900 train_loss:0.3944 lr:0.0003000000 time/step:117.04s
[2025-09-12 15:39:32,999] - step:600/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@600.pt...
[2025-09-12 15:39:33,008] - step:600/900 train_loss:0.4121 lr:0.0003000000 time/step:116.45s
[2025-09-12 15:41:29,871] - step:601/900 train_loss:0.4055 lr:0.0003000000 time/step:116.86s
[2025-09-12 15:43:27,496] - step:602/900 train_loss:0.4131 lr:0.0003000000 time/step:117.62s
[2025-09-12 15:45:24,775] - step:603/900 train_loss:0.4117 lr:0.0003000000 time/step:117.27s
[2025-09-12 15:47:21,843] - step:604/900 train_loss:0.4073 lr:0.0003000000 time/step:117.06s
[2025-09-12 15:49:19,207] - step:605/900 train_loss:0.3994 lr:0.0003000000 time/step:117.35s
[2025-09-12 15:51:15,705] - step:606/900 train_loss:0.4006 lr:0.0003000000 time/step:116.49s
[2025-09-12 15:53:12,651] - step:607/900 train_loss:0.4087 lr:0.0003000000 time/step:116.94s
[2025-09-12 15:55:10,370] - step:608/900 train_loss:0.4194 lr:0.0003000000 time/step:117.71s
[2025-09-12 15:57:07,183] - step:609/900 train_loss:0.4059 lr:0.0003000000 time/step:116.80s
[2025-09-12 15:59:03,945] - step:610/900 train_loss:0.3960 lr:0.0003000000 time/step:116.75s
[2025-09-12 16:01:01,845] - step:611/900 train_loss:0.4203 lr:0.0003000000 time/step:117.89s
[2025-09-12 16:02:57,870] - step:612/900 train_loss:0.4208 lr:0.0003000000 time/step:116.02s
[2025-09-12 16:04:55,209] - step:613/900 train_loss:0.4205 lr:0.0003000000 time/step:117.33s
[2025-09-12 16:06:52,213] - step:614/900 train_loss:0.4023 lr:0.0003000000 time/step:117.00s
[2025-09-12 16:08:49,316] - step:615/900 train_loss:0.4011 lr:0.0003000000 time/step:117.09s
[2025-09-12 16:10:47,341] - step:616/900 train_loss:0.3898 lr:0.0003000000 time/step:118.02s
[2025-09-12 16:12:44,306] - step:617/900 train_loss:0.4223 lr:0.0003000000 time/step:116.96s
[2025-09-12 16:14:41,166] - step:618/900 train_loss:0.4022 lr:0.0003000000 time/step:116.85s
[2025-09-12 16:16:38,068] - step:619/900 train_loss:0.4259 lr:0.0003000000 time/step:116.89s
[2025-09-12 16:18:35,272] - step:620/900 train_loss:0.4129 lr:0.0003000000 time/step:117.20s
[2025-09-12 16:20:32,436] - step:621/900 train_loss:0.4122 lr:0.0003000000 time/step:117.13s
[2025-09-12 16:22:30,553] - step:622/900 train_loss:0.4185 lr:0.0003000000 time/step:118.10s
[2025-09-12 16:24:27,881] - step:623/900 train_loss:0.3991 lr:0.0003000000 time/step:117.28s
[2025-09-12 16:26:24,425] - step:624/900 train_loss:0.4208 lr:0.0003000000 time/step:116.53s
[2025-09-12 16:28:21,471] - step:625/900 train_loss:0.4276 lr:0.0003000000 time/step:117.04s
[2025-09-12 16:30:19,129] - step:626/900 train_loss:0.4259 lr:0.0003000000 time/step:117.64s
[2025-09-12 16:32:19,616] - step:627/900 train_loss:0.3848 lr:0.0003000000 time/step:120.47s
[2025-09-12 16:34:17,638] - step:628/900 train_loss:0.4005 lr:0.0003000000 time/step:118.02s
[2025-09-12 16:36:14,359] - step:629/900 train_loss:0.3988 lr:0.0003000000 time/step:116.71s
[2025-09-12 16:38:11,222] - step:630/900 train_loss:0.4181 lr:0.0003000000 time/step:116.86s
[2025-09-12 16:40:08,509] - step:631/900 train_loss:0.4042 lr:0.0003000000 time/step:117.28s
[2025-09-12 16:42:06,712] - step:632/900 train_loss:0.4010 lr:0.0003000000 time/step:118.19s
[2025-09-12 16:44:03,814] - step:633/900 train_loss:0.4108 lr:0.0003000000 time/step:117.10s
[2025-09-12 16:46:01,576] - step:634/900 train_loss:0.4218 lr:0.0003000000 time/step:117.65s
[2025-09-12 16:47:57,601] - step:635/900 train_loss:0.4339 lr:0.0003000000 time/step:116.02s
[2025-09-12 16:49:54,473] - step:636/900 train_loss:0.4252 lr:0.0003000000 time/step:116.86s
[2025-09-12 16:51:52,707] - step:637/900 train_loss:0.3961 lr:0.0003000000 time/step:118.19s
[2025-09-12 16:53:50,406] - step:638/900 train_loss:0.4049 lr:0.0003000000 time/step:117.69s
[2025-09-12 16:55:48,233] - step:639/900 train_loss:0.4217 lr:0.0003000000 time/step:117.81s
[2025-09-12 16:57:44,596] - step:640/900 train_loss:0.4046 lr:0.0003000000 time/step:116.35s
[2025-09-12 16:59:40,200] - step:641/900 train_loss:0.4136 lr:0.0003000000 time/step:115.60s
[2025-09-12 17:01:37,286] - step:642/900 train_loss:0.4027 lr:0.0003000000 time/step:117.08s
[2025-09-12 17:03:35,226] - step:643/900 train_loss:0.3820 lr:0.0003000000 time/step:117.93s
[2025-09-12 17:05:33,570] - step:644/900 train_loss:0.4089 lr:0.0003000000 time/step:118.33s
[2025-09-12 17:07:30,395] - step:645/900 train_loss:0.3874 lr:0.0003000000 time/step:116.82s
[2025-09-12 17:09:27,297] - step:646/900 train_loss:0.4146 lr:0.0003000000 time/step:116.90s
[2025-09-12 17:11:23,362] - step:647/900 train_loss:0.3988 lr:0.0003000000 time/step:116.06s
[2025-09-12 17:13:20,787] - step:648/900 train_loss:0.4128 lr:0.0003000000 time/step:117.42s
[2025-09-12 17:15:18,588] - step:649/900 train_loss:0.4332 lr:0.0003000000 time/step:117.79s
[2025-09-12 17:17:16,062] - step:650/900 train_loss:0.4214 lr:0.0003000000 time/step:117.47s
[2025-09-12 17:19:12,730] - step:651/900 train_loss:0.4074 lr:0.0003000000 time/step:116.66s
[2025-09-12 17:21:09,550] - step:652/900 train_loss:0.4025 lr:0.0003000000 time/step:116.81s
[2025-09-12 17:23:05,702] - step:653/900 train_loss:0.4008 lr:0.0003000000 time/step:116.15s
[2025-09-12 17:25:03,925] - step:654/900 train_loss:0.4060 lr:0.0003000000 time/step:118.18s
[2025-09-12 17:27:02,401] - step:655/900 train_loss:0.3931 lr:0.0003000000 time/step:118.47s
[2025-09-12 17:28:59,392] - step:656/900 train_loss:0.3985 lr:0.0003000000 time/step:116.97s
[2025-09-12 17:30:56,335] - step:657/900 train_loss:0.4319 lr:0.0003000000 time/step:116.93s
[2025-09-12 17:32:52,897] - step:658/900 train_loss:0.4200 lr:0.0003000000 time/step:116.56s
[2025-09-12 17:34:50,643] - step:659/900 train_loss:0.3811 lr:0.0003000000 time/step:117.73s
[2025-09-12 17:36:47,661] - step:660/900 train_loss:0.3960 lr:0.0003000000 time/step:117.00s
[2025-09-12 17:38:45,367] - step:661/900 train_loss:0.3810 lr:0.0003000000 time/step:117.70s
[2025-09-12 17:40:42,471] - step:662/900 train_loss:0.3948 lr:0.0003000000 time/step:117.10s
[2025-09-12 17:42:39,354] - step:663/900 train_loss:0.4221 lr:0.0003000000 time/step:116.86s
[2025-09-12 17:44:37,177] - step:664/900 train_loss:0.4021 lr:0.0003000000 time/step:117.82s
[2025-09-12 17:46:33,621] - step:665/900 train_loss:0.4521 lr:0.0003000000 time/step:116.43s
[2025-09-12 17:48:31,225] - step:666/900 train_loss:0.4265 lr:0.0003000000 time/step:117.60s
[2025-09-12 17:50:28,126] - step:667/900 train_loss:0.4109 lr:0.0003000000 time/step:116.89s
[2025-09-12 17:52:25,032] - step:668/900 train_loss:0.4247 lr:0.0003000000 time/step:116.90s
[2025-09-12 17:54:22,433] - step:669/900 train_loss:0.4024 lr:0.0003000000 time/step:117.40s
[2025-09-12 17:56:19,263] - step:670/900 train_loss:0.4238 lr:0.0003000000 time/step:116.81s
[2025-09-12 17:58:15,840] - step:671/900 train_loss:0.4240 lr:0.0003000000 time/step:116.57s
[2025-09-12 18:00:13,196] - step:672/900 train_loss:0.4079 lr:0.0003000000 time/step:117.35s
[2025-09-12 18:02:09,946] - step:673/900 train_loss:0.4152 lr:0.0003000000 time/step:116.74s
[2025-09-12 18:04:08,272] - step:674/900 train_loss:0.4386 lr:0.0003000000 time/step:118.32s
[2025-09-12 18:06:05,695] - step:675/900 train_loss:0.3944 lr:0.0003000000 time/step:117.41s
[2025-09-12 18:08:01,761] - step:676/900 train_loss:0.3997 lr:0.0003000000 time/step:116.05s
[2025-09-12 18:09:59,340] - step:677/900 train_loss:0.4081 lr:0.0003000000 time/step:117.57s
[2025-09-12 18:11:56,223] - step:678/900 train_loss:0.4326 lr:0.0003000000 time/step:116.88s
[2025-09-12 18:13:53,528] - step:679/900 train_loss:0.4058 lr:0.0003000000 time/step:117.30s
[2025-09-12 18:15:51,604] - step:680/900 train_loss:0.4257 lr:0.0003000000 time/step:118.06s
[2025-09-12 18:17:48,495] - step:681/900 train_loss:0.4226 lr:0.0003000000 time/step:116.88s
[2025-09-12 18:19:44,618] - step:682/900 train_loss:0.3978 lr:0.0003000000 time/step:116.12s
[2025-09-12 18:21:41,760] - step:683/900 train_loss:0.4064 lr:0.0003000000 time/step:117.14s
[2025-09-12 18:23:38,665] - step:684/900 train_loss:0.3959 lr:0.0003000000 time/step:116.90s
[2025-09-12 18:25:36,029] - step:685/900 train_loss:0.4136 lr:0.0003000000 time/step:117.35s
[2025-09-12 18:27:33,774] - step:686/900 train_loss:0.4058 lr:0.0003000000 time/step:117.62s
[2025-09-12 18:29:30,658] - step:687/900 train_loss:0.4132 lr:0.0003000000 time/step:116.88s
[2025-09-12 18:31:27,420] - step:688/900 train_loss:0.4048 lr:0.0003000000 time/step:116.76s
[2025-09-12 18:33:24,361] - step:689/900 train_loss:0.4023 lr:0.0003000000 time/step:116.94s
[2025-09-12 18:35:21,754] - step:690/900 train_loss:0.3715 lr:0.0003000000 time/step:117.38s
[2025-09-12 18:37:19,552] - step:691/900 train_loss:0.4017 lr:0.0003000000 time/step:117.78s
[2025-09-12 18:39:16,412] - step:692/900 train_loss:0.4232 lr:0.0003000000 time/step:116.85s
[2025-09-12 18:41:13,974] - step:693/900 train_loss:0.4196 lr:0.0003000000 time/step:117.55s
[2025-09-12 18:43:10,197] - step:694/900 train_loss:0.4010 lr:0.0003000000 time/step:116.22s
[2025-09-12 18:45:07,263] - step:695/900 train_loss:0.3904 lr:0.0003000000 time/step:117.06s
[2025-09-12 18:47:05,813] - step:696/900 train_loss:0.4152 lr:0.0003000000 time/step:118.53s
[2025-09-12 18:49:02,863] - step:697/900 train_loss:0.4064 lr:0.0003000000 time/step:117.04s
[2025-09-12 18:50:59,812] - step:698/900 train_loss:0.3980 lr:0.0003000000 time/step:116.94s
[2025-09-12 18:52:57,370] - step:699/900 train_loss:0.3884 lr:0.0003000000 time/step:117.55s
[2025-09-12 18:54:54,648] - step:700/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@700.pt...
[2025-09-12 18:54:54,648] - step:700/900 train_loss:0.3973 lr:0.0003000000 time/step:116.56s
[2025-09-12 18:56:52,677] - step:701/900 train_loss:0.4030 lr:0.0003000000 time/step:118.01s
[2025-09-12 18:58:49,207] - step:702/900 train_loss:0.3937 lr:0.0003000000 time/step:116.52s
[2025-09-12 19:00:46,170] - step:703/900 train_loss:0.4356 lr:0.0003000000 time/step:116.96s
[2025-09-12 19:02:43,288] - step:704/900 train_loss:0.4294 lr:0.0003000000 time/step:117.11s
[2025-09-12 19:04:40,157] - step:705/900 train_loss:0.4150 lr:0.0003000000 time/step:116.86s
[2025-09-12 19:06:38,620] - step:706/900 train_loss:0.4153 lr:0.0003000000 time/step:118.45s
[2025-09-12 19:08:35,564] - step:707/900 train_loss:0.3966 lr:0.0003000000 time/step:116.92s
[2025-09-12 19:10:32,548] - step:708/900 train_loss:0.4221 lr:0.0003000000 time/step:116.98s
[2025-09-12 19:12:29,132] - step:709/900 train_loss:0.3952 lr:0.0003000000 time/step:116.58s
[2025-09-12 19:14:26,936] - step:710/900 train_loss:0.3849 lr:0.0003000000 time/step:117.80s
[2025-09-12 19:16:24,310] - step:711/900 train_loss:0.4114 lr:0.0003000000 time/step:117.36s
[2025-09-12 19:18:20,956] - step:712/900 train_loss:0.4173 lr:0.0003000000 time/step:116.64s
[2025-09-12 19:20:18,010] - step:713/900 train_loss:0.3898 lr:0.0003000000 time/step:117.05s
[2025-09-12 19:22:14,781] - step:714/900 train_loss:0.4088 lr:0.0003000000 time/step:116.76s
[2025-09-12 19:24:11,349] - step:715/900 train_loss:0.3975 lr:0.0003000000 time/step:116.56s
[2025-09-12 19:26:09,929] - step:716/900 train_loss:0.4089 lr:0.0003000000 time/step:118.57s
[2025-09-12 19:28:06,399] - step:717/900 train_loss:0.3964 lr:0.0003000000 time/step:116.46s
[2025-09-12 19:30:02,907] - step:718/900 train_loss:0.4063 lr:0.0003000000 time/step:116.49s
[2025-09-12 19:31:59,817] - step:719/900 train_loss:0.3934 lr:0.0003000000 time/step:116.90s
[2025-09-12 19:33:56,770] - step:720/900 train_loss:0.3953 lr:0.0003000000 time/step:116.95s
[2025-09-12 19:35:54,685] - step:721/900 train_loss:0.4275 lr:0.0003000000 time/step:117.90s
[2025-09-12 19:37:52,051] - step:722/900 train_loss:0.4074 lr:0.0003000000 time/step:117.36s
[2025-09-12 19:39:48,232] - step:723/900 train_loss:0.4163 lr:0.0003000000 time/step:116.18s
[2025-09-12 19:41:45,737] - step:724/900 train_loss:0.4015 lr:0.0003000000 time/step:117.50s
[2025-09-12 19:43:42,903] - step:725/900 train_loss:0.4202 lr:0.0003000000 time/step:117.16s
[2025-09-12 19:45:40,142] - step:726/900 train_loss:0.4291 lr:0.0003000000 time/step:117.23s
[2025-09-12 19:47:38,767] - step:727/900 train_loss:0.4219 lr:0.0003000000 time/step:118.52s
[2025-09-12 19:49:35,311] - step:728/900 train_loss:0.4267 lr:0.0003000000 time/step:116.54s
[2025-09-12 19:51:31,352] - step:729/900 train_loss:0.4008 lr:0.0003000000 time/step:116.03s
[2025-09-12 19:53:29,152] - step:730/900 train_loss:0.4191 lr:0.0003000000 time/step:117.79s
[2025-09-12 19:55:25,960] - step:731/900 train_loss:0.4093 lr:0.0003000000 time/step:116.80s
[2025-09-12 19:57:23,584] - step:732/900 train_loss:0.4230 lr:0.0003000000 time/step:117.61s
[2025-09-12 19:59:20,804] - step:733/900 train_loss:0.4213 lr:0.0003000000 time/step:117.21s
[2025-09-12 20:01:17,340] - step:734/900 train_loss:0.4071 lr:0.0003000000 time/step:116.53s
[2025-09-12 20:03:14,230] - step:735/900 train_loss:0.3944 lr:0.0003000000 time/step:116.88s
[2025-09-12 20:05:11,216] - step:736/900 train_loss:0.3971 lr:0.0003000000 time/step:116.98s
[2025-09-12 20:07:08,901] - step:737/900 train_loss:0.4144 lr:0.0003000000 time/step:117.67s
[2025-09-12 20:09:06,344] - step:738/900 train_loss:0.4349 lr:0.0003000000 time/step:117.44s
[2025-09-12 20:11:03,286] - step:739/900 train_loss:0.3967 lr:0.0003000000 time/step:116.94s
[2025-09-12 20:12:59,810] - step:740/900 train_loss:0.4104 lr:0.0003000000 time/step:116.52s
[2025-09-12 20:14:56,833] - step:741/900 train_loss:0.4195 lr:0.0003000000 time/step:117.01s
[2025-09-12 20:16:53,836] - step:742/900 train_loss:0.4083 lr:0.0003000000 time/step:116.99s
[2025-09-12 20:18:51,981] - step:743/900 train_loss:0.4021 lr:0.0003000000 time/step:118.14s
[2025-09-12 20:20:48,901] - step:744/900 train_loss:0.4182 lr:0.0003000000 time/step:116.91s
[2025-09-12 20:22:46,747] - step:745/900 train_loss:0.3946 lr:0.0003000000 time/step:117.84s
[2025-09-12 20:24:42,792] - step:746/900 train_loss:0.3826 lr:0.0003000000 time/step:116.03s
[2025-09-12 20:26:39,772] - step:747/900 train_loss:0.4267 lr:0.0003000000 time/step:116.97s
[2025-09-12 20:28:37,994] - step:748/900 train_loss:0.3935 lr:0.0003000000 time/step:118.21s
[2025-09-12 20:30:34,939] - step:749/900 train_loss:0.3979 lr:0.0003000000 time/step:116.93s
[2025-09-12 20:32:32,443] - step:750/900 train_loss:0.4253 lr:0.0003000000 time/step:117.50s
[2025-09-12 20:34:29,466] - step:751/900 train_loss:0.4006 lr:0.0003000000 time/step:117.01s
[2025-09-12 20:36:25,608] - step:752/900 train_loss:0.4219 lr:0.0003000000 time/step:116.13s
[2025-09-12 20:38:22,452] - step:753/900 train_loss:0.3919 lr:0.0003000000 time/step:116.84s
[2025-09-12 20:40:21,076] - step:754/900 train_loss:0.4138 lr:0.0003000000 time/step:118.62s
[2025-09-12 20:42:18,879] - step:755/900 train_loss:0.4144 lr:0.0003000000 time/step:117.79s
[2025-09-12 20:44:15,840] - step:756/900 train_loss:0.4077 lr:0.0003000000 time/step:116.95s
[2025-09-12 20:46:12,684] - step:757/900 train_loss:0.4420 lr:0.0003000000 time/step:116.84s
[2025-09-12 20:48:08,449] - step:758/900 train_loss:0.4310 lr:0.0003000000 time/step:115.75s
[2025-09-12 20:50:06,514] - step:759/900 train_loss:0.4193 lr:0.0003000000 time/step:118.06s
[2025-09-12 20:52:03,393] - step:760/900 train_loss:0.4097 lr:0.0003000000 time/step:116.87s
[2025-09-12 20:54:01,558] - step:761/900 train_loss:0.4206 lr:0.0003000000 time/step:118.16s
[2025-09-12 20:55:58,603] - step:762/900 train_loss:0.4123 lr:0.0003000000 time/step:117.04s
[2025-09-12 20:57:55,067] - step:763/900 train_loss:0.3960 lr:0.0003000000 time/step:116.45s
[2025-09-12 20:59:51,936] - step:764/900 train_loss:0.4299 lr:0.0003000000 time/step:116.85s
[2025-09-12 21:01:50,033] - step:765/900 train_loss:0.4122 lr:0.0003000000 time/step:118.09s
[2025-09-12 21:03:47,856] - step:766/900 train_loss:0.3942 lr:0.0003000000 time/step:117.82s
[2025-09-12 21:05:44,878] - step:767/900 train_loss:0.3948 lr:0.0003000000 time/step:117.01s
[2025-09-12 21:07:41,799] - step:768/900 train_loss:0.3943 lr:0.0003000000 time/step:116.91s
[2025-09-12 21:09:38,205] - step:769/900 train_loss:0.4122 lr:0.0003000000 time/step:116.40s
[2025-09-12 21:11:35,911] - step:770/900 train_loss:0.4029 lr:0.0003000000 time/step:117.70s
[2025-09-12 21:13:33,673] - step:771/900 train_loss:0.3994 lr:0.0003000000 time/step:117.75s
[2025-09-12 21:15:30,614] - step:772/900 train_loss:0.4263 lr:0.0003000000 time/step:116.93s
[2025-09-12 21:17:27,398] - step:773/900 train_loss:0.4199 lr:0.0003000000 time/step:116.77s
[2025-09-12 21:19:24,243] - step:774/900 train_loss:0.4126 lr:0.0003000000 time/step:116.84s
[2025-09-12 21:21:21,644] - step:775/900 train_loss:0.3885 lr:0.0003000000 time/step:117.39s
[2025-09-12 21:23:18,489] - step:776/900 train_loss:0.4123 lr:0.0003000000 time/step:116.84s
[2025-09-12 21:25:16,373] - step:777/900 train_loss:0.3887 lr:0.0003000000 time/step:117.88s
[2025-09-12 21:27:13,296] - step:778/900 train_loss:0.4256 lr:0.0003000000 time/step:116.91s
[2025-09-12 21:29:10,200] - step:779/900 train_loss:0.4090 lr:0.0003000000 time/step:116.90s
[2025-09-12 21:31:07,409] - step:780/900 train_loss:0.3895 lr:0.0003000000 time/step:117.20s
[2025-09-12 21:33:04,490] - step:781/900 train_loss:0.4134 lr:0.0003000000 time/step:117.07s
[2025-09-12 21:35:01,686] - step:782/900 train_loss:0.4317 lr:0.0003000000 time/step:117.19s
[2025-09-12 21:36:58,773] - step:783/900 train_loss:0.4093 lr:0.0003000000 time/step:117.07s
[2025-09-12 21:38:55,697] - step:784/900 train_loss:0.4052 lr:0.0003000000 time/step:116.92s
[2025-09-12 21:40:52,704] - step:785/900 train_loss:0.4158 lr:0.0003000000 time/step:117.00s
[2025-09-12 21:42:51,059] - step:786/900 train_loss:0.3933 lr:0.0003000000 time/step:118.35s
[2025-09-12 21:44:47,908] - step:787/900 train_loss:0.4167 lr:0.0003000000 time/step:116.84s
[2025-09-12 21:46:44,911] - step:788/900 train_loss:0.3970 lr:0.0003000000 time/step:116.99s
[2025-09-12 21:48:41,985] - step:789/900 train_loss:0.3789 lr:0.0003000000 time/step:117.06s
[2025-09-12 21:50:38,911] - step:790/900 train_loss:0.4033 lr:0.0003000000 time/step:116.92s
[2025-09-12 21:52:36,518] - step:791/900 train_loss:0.3703 lr:0.0003000000 time/step:117.60s
[2025-09-12 21:54:35,924] - step:792/900 train_loss:0.3987 lr:0.0003000000 time/step:119.40s
[2025-09-12 21:56:32,089] - step:793/900 train_loss:0.4103 lr:0.0003000000 time/step:116.16s
[2025-09-12 21:58:29,152] - step:794/900 train_loss:0.4121 lr:0.0003000000 time/step:117.05s
[2025-09-12 22:00:26,076] - step:795/900 train_loss:0.3756 lr:0.0003000000 time/step:116.92s
[2025-09-12 22:02:23,114] - step:796/900 train_loss:0.4195 lr:0.0003000000 time/step:117.03s
[2025-09-12 22:04:21,556] - step:797/900 train_loss:0.3852 lr:0.0003000000 time/step:118.43s
[2025-09-12 22:06:19,445] - step:798/900 train_loss:0.4343 lr:0.0003000000 time/step:117.88s
[2025-09-12 22:08:15,683] - step:799/900 train_loss:0.4024 lr:0.0003000000 time/step:116.22s
[2025-09-12 22:10:13,431] - step:800/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@800.pt...
[2025-09-12 22:10:13,432] - step:800/900 train_loss:0.4081 lr:0.0003000000 time/step:117.14s
[2025-09-12 22:12:09,931] - step:801/900 train_loss:0.4091 lr:0.0003000000 time/step:116.49s
[2025-09-12 22:14:08,180] - step:802/900 train_loss:0.4188 lr:0.0003000000 time/step:118.24s
[2025-09-12 22:16:05,937] - step:803/900 train_loss:0.4227 lr:0.0003000000 time/step:117.74s
[2025-09-12 22:18:02,679] - step:804/900 train_loss:0.3994 lr:0.0003000000 time/step:116.73s
[2025-09-12 22:19:59,511] - step:805/900 train_loss:0.3885 lr:0.0003000000 time/step:116.82s
[2025-09-12 22:21:56,602] - step:806/900 train_loss:0.3937 lr:0.0003000000 time/step:117.08s
[2025-09-12 22:23:54,831] - step:807/900 train_loss:0.4143 lr:0.0003000000 time/step:118.22s
[2025-09-12 22:25:52,068] - step:808/900 train_loss:0.4324 lr:0.0003000000 time/step:117.23s
[2025-09-12 22:27:49,499] - step:809/900 train_loss:0.3988 lr:0.0003000000 time/step:117.42s
[2025-09-12 22:29:45,897] - step:810/900 train_loss:0.4016 lr:0.0003000000 time/step:116.39s
[2025-09-12 22:31:42,993] - step:811/900 train_loss:0.4106 lr:0.0003000000 time/step:117.08s
[2025-09-12 22:33:41,172] - step:812/900 train_loss:0.4097 lr:0.0003000000 time/step:118.17s
[2025-09-12 22:35:38,349] - step:813/900 train_loss:0.3838 lr:0.0003000000 time/step:117.17s
[2025-09-12 22:37:36,103] - step:814/900 train_loss:0.3802 lr:0.0003000000 time/step:117.74s
[2025-09-12 22:39:33,507] - step:815/900 train_loss:0.4195 lr:0.0003000000 time/step:117.40s
[2025-09-12 22:41:29,750] - step:816/900 train_loss:0.4333 lr:0.0003000000 time/step:116.23s
[2025-09-12 22:43:26,622] - step:817/900 train_loss:0.4108 lr:0.0003000000 time/step:116.87s
[2025-09-12 22:45:25,127] - step:818/900 train_loss:0.3866 lr:0.0003000000 time/step:118.49s
[2025-09-12 22:47:22,168] - step:819/900 train_loss:0.4197 lr:0.0003000000 time/step:117.03s
[2025-09-12 22:49:19,672] - step:820/900 train_loss:0.3791 lr:0.0003000000 time/step:117.50s
[2025-09-12 22:51:17,438] - step:821/900 train_loss:0.4053 lr:0.0003000000 time/step:117.76s
[2025-09-12 22:53:13,613] - step:822/900 train_loss:0.4096 lr:0.0003000000 time/step:116.16s
[2025-09-12 22:55:11,085] - step:823/900 train_loss:0.4086 lr:0.0003000000 time/step:117.46s
[2025-09-12 22:57:08,006] - step:824/900 train_loss:0.4028 lr:0.0003000000 time/step:116.90s
[2025-09-12 22:59:05,729] - step:825/900 train_loss:0.3960 lr:0.0003000000 time/step:117.72s
[2025-09-12 23:01:03,331] - step:826/900 train_loss:0.4060 lr:0.0003000000 time/step:117.59s
[2025-09-12 23:03:00,051] - step:827/900 train_loss:0.4147 lr:0.0003000000 time/step:116.71s
[2025-09-12 23:04:56,347] - step:828/900 train_loss:0.4173 lr:0.0003000000 time/step:116.28s
[2025-09-12 23:06:53,382] - step:829/900 train_loss:0.4136 lr:0.0003000000 time/step:117.02s
[2025-09-12 23:08:50,925] - step:830/900 train_loss:0.4135 lr:0.0003000000 time/step:117.53s
[2025-09-12 23:10:48,709] - step:831/900 train_loss:0.3960 lr:0.0003000000 time/step:117.78s
[2025-09-12 23:12:45,852] - step:832/900 train_loss:0.3999 lr:0.0003000000 time/step:117.13s
[2025-09-12 23:14:43,195] - step:833/900 train_loss:0.4046 lr:0.0003000000 time/step:117.33s
[2025-09-12 23:16:39,299] - step:834/900 train_loss:0.4188 lr:0.0003000000 time/step:116.10s
[2025-09-12 23:18:36,142] - step:835/900 train_loss:0.3957 lr:0.0003000000 time/step:116.83s
[2025-09-12 23:20:34,486] - step:836/900 train_loss:0.4188 lr:0.0003000000 time/step:118.34s
[2025-09-12 23:22:31,489] - step:837/900 train_loss:0.3849 lr:0.0003000000 time/step:116.99s
[2025-09-12 23:24:28,392] - step:838/900 train_loss:0.4255 lr:0.0003000000 time/step:116.90s
[2025-09-12 23:26:24,998] - step:839/900 train_loss:0.4019 lr:0.0003000000 time/step:116.59s
[2025-09-12 23:28:21,798] - step:840/900 train_loss:0.4149 lr:0.0003000000 time/step:116.78s
[2025-09-12 23:30:20,342] - step:841/900 train_loss:0.3937 lr:0.0003000000 time/step:118.54s
[2025-09-12 23:32:17,286] - step:842/900 train_loss:0.3996 lr:0.0003000000 time/step:116.94s
[2025-09-12 23:34:14,169] - step:843/900 train_loss:0.3911 lr:0.0003000000 time/step:116.88s
[2025-09-12 23:36:11,513] - step:844/900 train_loss:0.4199 lr:0.0003000000 time/step:117.34s
[2025-09-12 23:38:07,515] - step:845/900 train_loss:0.3990 lr:0.0003000000 time/step:115.99s
[2025-09-12 23:40:05,382] - step:846/900 train_loss:0.4059 lr:0.0003000000 time/step:117.86s
[2025-09-12 23:42:03,341] - step:847/900 train_loss:0.4217 lr:0.0003000000 time/step:117.95s
[2025-09-12 23:44:00,267] - step:848/900 train_loss:0.4059 lr:0.0003000000 time/step:116.92s
[2025-09-12 23:45:57,550] - step:849/900 train_loss:0.4140 lr:0.0003000000 time/step:117.28s
[2025-09-12 23:47:54,492] - step:850/900 train_loss:0.3920 lr:0.0003000000 time/step:116.93s
[2025-09-12 23:49:50,997] - step:851/900 train_loss:0.4194 lr:0.0003000000 time/step:116.50s
[2025-09-12 23:51:48,718] - step:852/900 train_loss:0.3914 lr:0.0003000000 time/step:117.71s
[2025-09-12 23:53:45,683] - step:853/900 train_loss:0.4012 lr:0.0003000000 time/step:116.96s
[2025-09-12 23:55:43,182] - step:854/900 train_loss:0.4198 lr:0.0003000000 time/step:117.47s
[2025-09-12 23:57:40,227] - step:855/900 train_loss:0.4059 lr:0.0003000000 time/step:117.03s
[2025-09-12 23:59:37,792] - step:856/900 train_loss:0.4026 lr:0.0003000000 time/step:117.56s
[2025-09-13 00:01:34,695] - step:857/900 train_loss:0.4171 lr:0.0003000000 time/step:116.89s
[2025-09-13 00:03:32,341] - step:858/900 train_loss:0.4017 lr:0.0003000000 time/step:117.64s
[2025-09-13 00:05:29,421] - step:859/900 train_loss:0.4011 lr:0.0003000000 time/step:117.07s
[2025-09-13 00:07:26,749] - step:860/900 train_loss:0.3910 lr:0.0003000000 time/step:117.32s
[2025-09-13 00:09:23,608] - step:861/900 train_loss:0.4093 lr:0.0003000000 time/step:116.85s
[2025-09-13 00:11:21,037] - step:862/900 train_loss:0.4295 lr:0.0003000000 time/step:117.42s
[2025-09-13 00:13:17,816] - step:863/900 train_loss:0.4025 lr:0.0003000000 time/step:116.77s
[2025-09-13 00:15:14,919] - step:864/900 train_loss:0.3978 lr:0.0003000000 time/step:117.10s
[2025-09-13 00:17:12,309] - step:865/900 train_loss:0.3941 lr:0.0003000000 time/step:117.38s
[2025-09-13 00:19:09,330] - step:866/900 train_loss:0.4150 lr:0.0003000000 time/step:117.01s
[2025-09-13 00:21:06,411] - step:867/900 train_loss:0.4101 lr:0.0003000000 time/step:117.01s
[2025-09-13 00:23:03,516] - step:868/900 train_loss:0.4156 lr:0.0003000000 time/step:117.10s
|
| 884 |
+
[2025-09-13 00:25:00,493] - step:869/900 train_loss:0.4128 lr:0.0003000000 time/step:116.97s
|
| 885 |
+
[2025-09-13 00:26:57,821] - step:870/900 train_loss:0.4182 lr:0.0003000000 time/step:117.31s
|
| 886 |
+
[2025-09-13 00:28:54,768] - step:871/900 train_loss:0.3940 lr:0.0003000000 time/step:116.93s
|
| 887 |
+
[2025-09-13 00:30:51,704] - step:872/900 train_loss:0.4091 lr:0.0003000000 time/step:116.93s
|
| 888 |
+
[2025-09-13 00:32:48,692] - step:873/900 train_loss:0.4066 lr:0.0003000000 time/step:116.98s
|
| 889 |
+
[2025-09-13 00:34:47,091] - step:874/900 train_loss:0.4061 lr:0.0003000000 time/step:118.39s
|
| 890 |
+
[2025-09-13 00:36:44,116] - step:875/900 train_loss:0.3712 lr:0.0003000000 time/step:117.01s
|
| 891 |
+
[2025-09-13 00:38:41,019] - step:876/900 train_loss:0.4040 lr:0.0003000000 time/step:116.89s
|
| 892 |
+
[2025-09-13 00:40:38,506] - step:877/900 train_loss:0.3807 lr:0.0003000000 time/step:117.48s
|
| 893 |
+
[2025-09-13 00:42:35,384] - step:878/900 train_loss:0.4103 lr:0.0003000000 time/step:116.87s
|
| 894 |
+
[2025-09-13 00:44:33,175] - step:879/900 train_loss:0.4001 lr:0.0003000000 time/step:117.79s
|
| 895 |
+
[2025-09-13 00:46:29,986] - step:880/900 train_loss:0.3966 lr:0.0003000000 time/step:116.79s
|
| 896 |
+
[2025-09-13 00:48:27,354] - step:881/900 train_loss:0.4188 lr:0.0003000000 time/step:117.29s
|
| 897 |
+
[2025-09-13 00:50:24,406] - step:882/900 train_loss:0.4164 lr:0.0003000000 time/step:117.05s
|
| 898 |
+
[2025-09-13 00:52:22,291] - step:883/900 train_loss:0.3936 lr:0.0003000000 time/step:117.88s
|
| 899 |
+
[2025-09-13 00:54:20,651] - step:884/900 train_loss:0.4148 lr:0.0003000000 time/step:118.35s
|
| 900 |
+
[2025-09-13 00:56:17,788] - step:885/900 train_loss:0.4173 lr:0.0003000000 time/step:117.13s
|
| 901 |
+
[2025-09-13 00:58:14,279] - step:886/900 train_loss:0.4260 lr:0.0003000000 time/step:116.46s
|
| 902 |
+
[2025-09-13 01:00:11,090] - step:887/900 train_loss:0.4037 lr:0.0003000000 time/step:116.80s
|
| 903 |
+
[2025-09-13 01:02:08,948] - step:888/900 train_loss:0.4117 lr:0.0003000000 time/step:117.85s
|
| 904 |
+
[2025-09-13 01:04:07,249] - step:889/900 train_loss:0.4068 lr:0.0003000000 time/step:118.29s
|
| 905 |
+
[2025-09-13 01:06:04,130] - step:890/900 train_loss:0.4187 lr:0.0003000000 time/step:116.87s
|
| 906 |
+
[2025-09-13 01:08:01,508] - step:891/900 train_loss:0.4159 lr:0.0003000000 time/step:117.36s
|
| 907 |
+
[2025-09-13 01:09:57,620] - step:892/900 train_loss:0.3978 lr:0.0003000000 time/step:116.10s
|
| 908 |
+
[2025-09-13 01:11:55,493] - step:893/900 train_loss:0.3925 lr:0.0003000000 time/step:117.86s
|
| 909 |
+
[2025-09-13 01:13:52,516] - step:894/900 train_loss:0.3845 lr:0.0003000000 time/step:117.01s
|
| 910 |
+
[2025-09-13 01:15:50,321] - step:895/900 train_loss:0.4062 lr:0.0003000000 time/step:117.80s
|
| 911 |
+
[2025-09-13 01:17:47,232] - step:896/900 train_loss:0.3879 lr:0.0003000000 time/step:116.90s
|
| 912 |
+
[2025-09-13 01:19:44,630] - step:897/900 train_loss:0.4272 lr:0.0003000000 time/step:117.39s
|
| 913 |
+
[2025-09-13 01:21:41,559] - step:898/900 train_loss:0.4121 lr:0.0003000000 time/step:116.92s
|
| 914 |
+
[2025-09-13 01:23:39,154] - step:899/900 train_loss:0.4079 lr:0.0003000000 time/step:117.59s
|
| 915 |
+
[2025-09-13 01:25:37,577] - step:900/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@900.pt...
|
| 916 |
+
[2025-09-13 01:25:37,578] - step:900/900 train_loss:0.3995 lr:0.0003000000 time/step:117.81s
|
wandb/debug-internal.log
ADDED
@@ -0,0 +1,52 @@
{"time":"2025-09-11T20:06:44.955449103+01:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug-core.log"}
{"time":"2025-09-11T20:06:45.176117844+01:00","level":"INFO","msg":"created new stream","id":"y9v5i9gr"}
{"time":"2025-09-11T20:06:45.176201537+01:00","level":"INFO","msg":"stream: started","id":"y9v5i9gr"}
{"time":"2025-09-11T20:06:45.176254637+01:00","level":"INFO","msg":"writer: Do: started","stream_id":"y9v5i9gr"}
{"time":"2025-09-11T20:06:45.176292219+01:00","level":"INFO","msg":"handler: started","stream_id":"y9v5i9gr"}
{"time":"2025-09-11T20:06:45.176341928+01:00","level":"INFO","msg":"sender: started","stream_id":"y9v5i9gr"}
{"time":"2025-09-11T20:06:45.680069036+01:00","level":"INFO","msg":"Starting system monitor"}
{"time":"2025-09-11T20:19:16.313200337+01:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/circuit-mtp/mtp/y9v5i9gr/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
{"time":"2025-09-12T00:53:29.590652615+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
{"time":"2025-09-12T01:30:18.032795292+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T01:30:50.327057066+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-12T01:31:25.000022545+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-12T01:32:03.267256543+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T02:41:34.535497308+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-12T02:42:24.914157379+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
{"time":"2025-09-12T02:42:57.41051518+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T10:36:51.38167595+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-12T10:38:06.370172425+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-12T10:38:38.465480726+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T10:39:07.484991796+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
{"time":"2025-09-12T10:39:41.575653023+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
{"time":"2025-09-12T20:16:55.628544216+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T20:21:25.750812333+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T22:43:55.97454382+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T00:52:11.684482933+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T00:54:54.045134291+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
{"time":"2025-09-13T00:55:26.197593179+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:05:42.010380611+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:06:14.056932921+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:06:48.575121732+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:07:28.074495024+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:09:42.005493483+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:10:14.454893184+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:10:49.419226595+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:11:21.445954263+01:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/circuit-mtp/mtp/y9v5i9gr/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
{"time":"2025-09-13T01:11:57.007348427+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:13:57.010172043+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:14:29.220923193+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:15:27.013535251+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:15:59.276998526+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:16:33.628210655+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:17:42.016257241+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:18:14.389776393+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:19:42.02019871+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:25:38.927499163+01:00","level":"INFO","msg":"stream: closing","id":"y9v5i9gr"}
{"time":"2025-09-13T01:25:38.930059685+01:00","level":"INFO","msg":"Stopping system monitor"}
{"time":"2025-09-13T01:25:38.990179981+01:00","level":"INFO","msg":"Stopped system monitor"}
{"time":"2025-09-13T01:25:39.717455712+01:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
{"time":"2025-09-13T01:25:39.93171592+01:00","level":"INFO","msg":"handler: closed","stream_id":"y9v5i9gr"}
{"time":"2025-09-13T01:25:39.931829957+01:00","level":"INFO","msg":"writer: Close: closed","stream_id":"y9v5i9gr"}
{"time":"2025-09-13T01:25:39.932853619+01:00","level":"INFO","msg":"sender: closed","stream_id":"y9v5i9gr"}
{"time":"2025-09-13T01:25:39.932961632+01:00","level":"INFO","msg":"stream: closed","id":"y9v5i9gr"}
wandb/debug.log
ADDED
@@ -0,0 +1,23 @@
2025-09-11 20:06:44,916 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Configure stats pid to 2716293
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from /home/agrivas/.config/wandb/settings
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/settings
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from environment variables
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug.log
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug-internal.log
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():761] calling init triggers
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
config: {'compile': True, 'device': 'cuda', 'from_checkpoint': None, 'name': 'nanogpt', 'training': {'random_seed': 13, 'batch_size': 256, 'device_batch_size': 1, 'sequence_length': 8192, 'num_iterations': 900, 'learning_rate': 0.0003, 'use_scheduler': False, 'save_model': True, 'save_optimizer': True, 'save_model_every': 100, 'val_loss_every': 100, 'val_tokens': 4194304, 'expname': 'lr-3e-4-no-lora-cp-n-8-r-8'}, 'model': {'name': 'mtp', 'beta': 0.0, 'gamma': 1, 'kl_algorithm': 'full', 'kl_type': 'forward', 'model': {'_target_': 'mtp.models.mtp.MultiTokenLM', 'lm': '${lm.model}', 'circuit': '${circuit.model}', 'mt_head_kwargs': '${mt_head.hyperparameters}', 'init_from_lm_head': True, 'kl_type': '${model.kl_type}', 'kl_algorithm': '${model.kl_algorithm}', 'beta': 0, 'gamma': 0.9}}, 'circuit': {'name': 'cp', 'n_token': 8, 'n_component': 8, 'model': {'_target_': 'mtp.models.circuits.CircuitModel', 'vocab_size': 320, 'n_token': 8, 'n_component': 8, 'kind': 'cp'}}, 'mt_head': {'name': 'linear-evabyte', 'hyperparameters': {'type': 'evabyte', 'n_embd': 4096, 'transformer_n_head': 32, 'transformer_n_layer': 0, 'expander_type': 'linear', 'expander_n_layer': 1, 'freeze_vocab_unembedding': False, 'share_sum_weights': False, 'contextual_hmm_weights': True, 'init_hmm_identity': True}}, 'adaptor': {'name': 'none', 'hyperparameters': None}, 'lm': {'name': 'evabyte', 'n_embd': 4096, 'n_head': 32, 'model': {'_target_': 'mtp.models.lm.LM', 'lm': None, 'encoder_only': True, 'from_checkpoint': None, 'from_huggingface': 'EvaByte/EvaByte-SFT', 'adaptor_kwargs': None, 'ref_enc': 'model', 'ref_head': 'lm_head', 'freeze': True}}, 'data': {'name': 'tulu3-evabyte', 'train_bin': 'agrv/tulu-v3-sft-evabyte-packed-seq-len-8192', 'val_bin': None, 'vocab_size': 320}, 'generate': {'speculative': False}, '_wandb': {}}
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():784] starting backend
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():788] sending inform_init request
2025-09-11 20:06:44,948 INFO MainThread:2716293 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2025-09-11 20:06:44,948 INFO MainThread:2716293 [wandb_init.py:init():798] backend started and connected
2025-09-11 20:06:44,953 INFO MainThread:2716293 [wandb_init.py:init():891] updated telemetry
2025-09-11 20:06:44,961 INFO MainThread:2716293 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
2025-09-11 20:06:45,675 INFO MainThread:2716293 [wandb_init.py:init():990] starting run threads in backend
2025-09-11 20:06:46,525 INFO MainThread:2716293 [wandb_run.py:_console_start():2375] atexit reg
2025-09-11 20:06:46,526 INFO MainThread:2716293 [wandb_run.py:_redirect():2227] redirect: wrap_raw
2025-09-11 20:06:46,533 INFO MainThread:2716293 [wandb_run.py:_redirect():2292] Wrapping output streams.
2025-09-11 20:06:46,533 INFO MainThread:2716293 [wandb_run.py:_redirect():2315] Redirects installed.
2025-09-11 20:06:46,549 INFO MainThread:2716293 [wandb_init.py:init():1032] run started, returning control to user process
2025-09-13 01:25:38,827 INFO MsgRouterThr:2716293 [mailbox.py:close():129] Closing mailbox, abandoning 1 handles.
wandb/run-20250911_200644-y9v5i9gr/files/config.yaml
ADDED
@@ -0,0 +1,195 @@
_wandb:
  value:
    cli_version: 0.19.8
    m:
      - "1": train/ce_loss_at_4
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": global_step
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": train/ce_loss_at_7
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": train/loss
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": train/ce_loss_at_5
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": train/ce_loss_at_2
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": train/ce_loss_at_6
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": train/ce_loss_at_8
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": train/ce_loss_at_1
        "5": 2
        "6":
          - 1
          - 3
        "7": []
      - "1": train/ce_loss_at_3
        "5": 2
        "6":
          - 1
          - 3
        "7": []
    python_version: 3.10.16
    t:
      "1":
        - 1
        - 11
        - 41
        - 49
        - 50
        - 51
        - 55
        - 71
        - 84
        - 98
      "2":
        - 1
        - 11
        - 41
        - 49
        - 50
        - 51
        - 55
        - 71
        - 84
        - 98
      "3":
        - 7
        - 13
        - 15
        - 16
        - 23
        - 55
      "4": 3.10.16
      "5": 0.19.8
      "6": 4.49.0
      "8":
        - 5
      "12": 0.19.8
      "13": linux-x86_64
adaptor:
  value:
    hyperparameters: null
    name: none
circuit:
  value:
    model:
      _target_: mtp.models.circuits.CircuitModel
      kind: cp
      n_component: 8
      n_token: 8
      vocab_size: 320
    n_component: 8
    n_token: 8
    name: cp
compile:
  value: true
data:
  value:
    name: tulu3-evabyte
    train_bin: agrv/tulu-v3-sft-evabyte-packed-seq-len-8192
    val_bin: null
    vocab_size: 320
device:
  value: cuda
from_checkpoint:
  value: null
generate:
  value:
    speculative: false
lm:
  value:
    model:
      _target_: mtp.models.lm.LM
      adaptor_kwargs: null
      encoder_only: true
      freeze: true
      from_checkpoint: null
      from_huggingface: EvaByte/EvaByte-SFT
      lm: null
      ref_enc: model
      ref_head: lm_head
    n_embd: 4096
    n_head: 32
    name: evabyte
model:
  value:
    beta: 0
    gamma: 1
    kl_algorithm: full
    kl_type: forward
    model:
      _target_: mtp.models.mtp.MultiTokenLM
      beta: 0
      circuit: ${circuit.model}
      gamma: 0.9
      init_from_lm_head: true
      kl_algorithm: ${model.kl_algorithm}
      kl_type: ${model.kl_type}
      lm: ${lm.model}
      mt_head_kwargs: ${mt_head.hyperparameters}
    name: mtp
mt_head:
  value:
    hyperparameters:
      contextual_hmm_weights: true
      expander_n_layer: 1
      expander_type: linear
      freeze_vocab_unembedding: false
      init_hmm_identity: true
      n_embd: 4096
      share_sum_weights: false
      transformer_n_head: 32
      transformer_n_layer: 0
      type: evabyte
    name: linear-evabyte
name:
  value: nanogpt
training:
  value:
    batch_size: 256
    device_batch_size: 1
    expname: lr-3e-4-no-lora-cp-n-8-r-8
    learning_rate: 0.0003
    num_iterations: 900
    random_seed: 13
    save_model: true
    save_model_every: 100
    save_optimizer: true
    sequence_length: 8192
    use_scheduler: false
    val_loss_every: 100
    val_tokens: 4194304
wandb/run-20250911_200644-y9v5i9gr/files/output.log
ADDED
@@ -0,0 +1,936 @@
| 1 |
+
[2025-09-11 20:06:46,551] - Saving config and checkpoints to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16...
|
| 2 |
+
[2025-09-11 20:06:46,551] - Save model: True...
|
| 3 |
+
[2025-09-11 20:06:46,552] - Save optimizer: True...
|
| 4 |
+
[2025-09-11 20:06:46,558] - Training on agrv/tulu-v3-sft-evabyte-packed-seq-len-8192...
|
| 5 |
+
Generating train split: 100%|██████████| 233628/233628 [00:06<00:00, 36437.72 examples/s]
|
| 6 |
+
Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
|
| 7 |
+
[2025-09-11 20:07:21,844] - Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
|
| 8 |
+
Generating valid split: 100%|██████████| 2360/2360 [00:00<00:00, 5736.16 examples/s]
|
| 9 |
+
[2025-09-11 20:07:23,771] - step:0/900 Saving model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@0.pt...
|
| 10 |
+
/home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:87: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
|
| 11 |
+
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
|
| 12 |
+
[rank0]:W0911 20:08:06.374000 2716293 torch/_dynamo/exc.py:304] [7/0] Backend compiler failed with a fake tensor exception at
|
| 13 |
+
[rank0]:W0911 20:08:06.374000 2716293 torch/_dynamo/exc.py:304] [7/0] File "/disk/scratch/agrivas/nanoGPT/mtp/utils/packing.py", line 39, in torch_dynamo_resume_in_packed_targets_to_target_windows_at_34
|
| 14 |
+
[rank0]:W0911 20:08:06.374000 2716293 torch/_dynamo/exc.py:304] [7/0] return torch.concat(parts, dim=0).reshape(B, S, n)
|
| 15 |
+
[rank0]:W0911 20:08:06.374000 2716293 torch/_dynamo/exc.py:304] [7/0] Adding a graph break.
|
| 16 |
+
[rank0]:W0911 20:08:06.437000 2716293 torch/_dynamo/exc.py:304] [7/0_1] Backend compiler failed with a fake tensor exception at
|
| 17 |
+
[rank0]:W0911 20:08:06.437000 2716293 torch/_dynamo/exc.py:304] [7/0_1] File "/disk/scratch/agrivas/nanoGPT/mtp/utils/packing.py", line 39, in torch_dynamo_resume_in_packed_targets_to_target_windows_at_34
|
| 18 |
+
[rank0]:W0911 20:08:06.437000 2716293 torch/_dynamo/exc.py:304] [7/0_1] return torch.concat(parts, dim=0).reshape(B, S, n)
|
| 19 |
+
[rank0]:W0911 20:08:06.437000 2716293 torch/_dynamo/exc.py:304] [7/0_1] Adding a graph break.
|
| 20 |
+
/home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:87: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
|
| 21 |
+
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
|
| 22 |
+
/home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:87: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
|
| 23 |
+
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
|
| 24 |
+
/home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:87: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
|
| 25 |
+
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
|
| 26 |
+
/home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:87: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
|
| 27 |
+
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
|
| 28 |
+
[2025-09-11 20:10:21,957] - step:1/900 train_loss:0.5686 lr:0.0003000000 time/step:177.94s
|
| 29 |
+
[2025-09-11 20:12:19,200] - step:2/900 train_loss:0.5480 lr:0.0003000000 time/step:117.24s
|
| 30 |
+
[2025-09-11 20:14:16,539] - step:3/900 train_loss:0.5220 lr:0.0003000000 time/step:117.34s
|
| 31 |
+
[2025-09-11 20:16:13,861] - step:4/900 train_loss:0.5383 lr:0.0003000000 time/step:117.32s
|
| 32 |
+
[2025-09-11 20:18:10,435] - step:5/900 train_loss:0.5371 lr:0.0003000000 time/step:116.57s
|
| 33 |
+
[2025-09-11 20:20:08,627] - step:6/900 train_loss:0.5227 lr:0.0003000000 time/step:118.19s
|
| 34 |
+
[2025-09-11 20:22:06,149] - step:7/900 train_loss:0.5128 lr:0.0003000000 time/step:117.51s
|
| 35 |
+
[2025-09-11 20:24:03,890] - step:8/900 train_loss:0.5420 lr:0.0003000000 time/step:117.74s
|
| 36 |
+
[2025-09-11 20:26:01,252] - step:9/900 train_loss:0.5426 lr:0.0003000000 time/step:117.36s
|
| 37 |
+
[2025-09-11 20:27:59,602] - step:10/900 train_loss:0.5236 lr:0.0003000000 time/step:118.34s
|
| 38 |
+
[2025-09-11 20:29:56,227] - step:11/900 train_loss:0.4860 lr:0.0003000000 time/step:116.61s
|
| 39 |
+
[2025-09-11 20:31:54,042] - step:12/900 train_loss:0.5105 lr:0.0003000000 time/step:117.81s
|
| 40 |
+
[2025-09-11 20:33:51,406] - step:13/900 train_loss:0.4993 lr:0.0003000000 time/step:117.36s
|
| 41 |
+
[2025-09-11 20:35:48,717] - step:14/900 train_loss:0.4925 lr:0.0003000000 time/step:117.31s
|
| 42 |
+
[2025-09-11 20:37:47,558] - step:15/900 train_loss:0.5207 lr:0.0003000000 time/step:118.83s
|
| 43 |
+
[2025-09-11 20:39:45,850] - step:16/900 train_loss:0.4827 lr:0.0003000000 time/step:118.28s
|
| 44 |
+
[2025-09-11 20:41:42,738] - step:17/900 train_loss:0.5033 lr:0.0003000000 time/step:116.88s
|
| 45 |
+
[2025-09-11 20:43:39,898] - step:18/900 train_loss:0.5082 lr:0.0003000000 time/step:117.15s
|
| 46 |
+
[2025-09-11 20:45:37,029] - step:19/900 train_loss:0.4910 lr:0.0003000000 time/step:117.13s
|
| 47 |
+
[2025-09-11 20:47:34,571] - step:20/900 train_loss:0.5006 lr:0.0003000000 time/step:117.54s
|
| 48 |
+
[2025-09-11 20:49:32,312] - step:21/900 train_loss:0.4936 lr:0.0003000000 time/step:117.73s
|
| 49 |
+
[2025-09-11 20:51:29,213] - step:22/900 train_loss:0.4941 lr:0.0003000000 time/step:116.90s
|
| 50 |
+
[2025-09-11 20:53:26,056] - step:23/900 train_loss:0.5131 lr:0.0003000000 time/step:116.83s
|
| 51 |
+
[2025-09-11 20:55:22,982] - step:24/900 train_loss:0.4826 lr:0.0003000000 time/step:116.92s
|
| 52 |
+
[2025-09-11 20:57:20,427] - step:25/900 train_loss:0.4913 lr:0.0003000000 time/step:117.44s
|
| 53 |
+
[2025-09-11 20:59:18,626] - step:26/900 train_loss:0.4607 lr:0.0003000000 time/step:118.18s
|
| 54 |
+
[2025-09-11 21:01:15,710] - step:27/900 train_loss:0.4908 lr:0.0003000000 time/step:117.08s
|
| 55 |
+
[2025-09-11 21:03:12,633] - step:28/900 train_loss:0.4910 lr:0.0003000000 time/step:116.91s
|
| 56 |
+
[2025-09-11 21:05:09,636] - step:29/900 train_loss:0.4657 lr:0.0003000000 time/step:117.00s
|
| 57 |
+
[2025-09-11 21:07:06,700] - step:30/900 train_loss:0.4594 lr:0.0003000000 time/step:117.06s
|
| 58 |
+
[2025-09-11 21:09:04,683] - step:31/900 train_loss:0.4755 lr:0.0003000000 time/step:117.97s
|
| 59 |
+
[2025-09-11 21:11:01,763] - step:32/900 train_loss:0.4541 lr:0.0003000000 time/step:117.08s
|
| 60 |
+
[2025-09-11 21:12:59,791] - step:33/900 train_loss:0.4807 lr:0.0003000000 time/step:118.02s
|
| 61 |
+
[2025-09-11 21:14:55,836] - step:34/900 train_loss:0.4870 lr:0.0003000000 time/step:116.03s
|
| 62 |
+
[2025-09-11 21:16:52,899] - step:35/900 train_loss:0.4625 lr:0.0003000000 time/step:117.06s
|
| 63 |
+
[2025-09-11 21:18:51,003] - step:36/900 train_loss:0.4791 lr:0.0003000000 time/step:118.09s
|
| 64 |
+
[2025-09-11 21:20:48,545] - step:37/900 train_loss:0.4473 lr:0.0003000000 time/step:117.53s
|
| 65 |
+
[2025-09-11 21:22:45,589] - step:38/900 train_loss:0.4752 lr:0.0003000000 time/step:117.04s
|
| 66 |
+
[2025-09-11 21:24:43,273] - step:39/900 train_loss:0.4637 lr:0.0003000000 time/step:117.68s
|
| 67 |
+
[2025-09-11 21:26:39,295] - step:40/900 train_loss:0.4792 lr:0.0003000000 time/step:116.01s
|
| 68 |
+
[2025-09-11 21:28:36,435] - step:41/900 train_loss:0.4486 lr:0.0003000000 time/step:117.13s
|
| 69 |
+
[2025-09-11 21:30:33,920] - step:42/900 train_loss:0.4401 lr:0.0003000000 time/step:117.48s
|
| 70 |
+
[2025-09-11 21:32:30,825] - step:43/900 train_loss:0.4647 lr:0.0003000000 time/step:116.90s
|
| 71 |
+
[2025-09-11 21:34:28,329] - step:44/900 train_loss:0.4925 lr:0.0003000000 time/step:117.50s
|
| 72 |
+
[2025-09-11 21:36:25,926] - step:45/900 train_loss:0.4660 lr:0.0003000000 time/step:117.59s
|
| 73 |
+
[2025-09-11 21:38:22,375] - step:46/900 train_loss:0.4459 lr:0.0003000000 time/step:116.44s
|
| 74 |
+
[2025-09-11 21:40:19,319] - step:47/900 train_loss:0.4487 lr:0.0003000000 time/step:116.93s
|
| 75 |
+
[2025-09-11 21:42:17,801] - step:48/900 train_loss:0.4378 lr:0.0003000000 time/step:118.48s
|
| 76 |
+
[2025-09-11 21:44:15,250] - step:49/900 train_loss:0.4623 lr:0.0003000000 time/step:117.44s
|
| 77 |
+
[2025-09-11 21:46:12,028] - step:50/900 train_loss:0.4788 lr:0.0003000000 time/step:116.77s
|
| 78 |
+
[2025-09-11 21:48:08,924] - step:51/900 train_loss:0.4612 lr:0.0003000000 time/step:116.89s
|
| 79 |
+
[2025-09-11 21:50:05,277] - step:52/900 train_loss:0.4670 lr:0.0003000000 time/step:116.34s
|
| 80 |
+
[2025-09-11 21:52:03,579] - step:53/900 train_loss:0.4948 lr:0.0003000000 time/step:118.20s
|
| 81 |
+
[2025-09-11 21:54:00,439] - step:54/900 train_loss:0.4474 lr:0.0003000000 time/step:116.86s
|
| 82 |
+
[2025-09-11 21:55:57,226] - step:55/900 train_loss:0.4696 lr:0.0003000000 time/step:116.78s
|
| 83 |
+
[2025-09-11 21:57:54,070] - step:56/900 train_loss:0.4636 lr:0.0003000000 time/step:116.84s
|
| 84 |
+
[2025-09-11 21:59:51,015] - step:57/900 train_loss:0.4567 lr:0.0003000000 time/step:116.93s
|
| 85 |
+
[2025-09-11 22:01:48,416] - step:58/900 train_loss:0.4600 lr:0.0003000000 time/step:117.40s
|
| 86 |
+
[2025-09-11 22:03:46,720] - step:59/900 train_loss:0.4678 lr:0.0003000000 time/step:118.30s
|
| 87 |
+
[2025-09-11 22:05:43,544] - step:60/900 train_loss:0.4619 lr:0.0003000000 time/step:116.82s
|
| 88 |
+
[2025-09-11 22:07:40,424] - step:61/900 train_loss:0.4553 lr:0.0003000000 time/step:116.87s
|
| 89 |
+
[2025-09-11 22:09:37,873] - step:62/900 train_loss:0.4719 lr:0.0003000000 time/step:117.43s
|
| 90 |
+
[2025-09-11 22:11:34,969] - step:63/900 train_loss:0.4582 lr:0.0003000000 time/step:117.09s
|
| 91 |
+
[2025-09-11 22:13:31,914] - step:64/900 train_loss:0.4430 lr:0.0003000000 time/step:116.94s
|
| 92 |
+
[2025-09-11 22:15:28,799] - step:65/900 train_loss:0.4268 lr:0.0003000000 time/step:116.88s
|
| 93 |
+
[2025-09-11 22:17:25,704] - step:66/900 train_loss:0.4669 lr:0.0003000000 time/step:116.90s
|
| 94 |
+
[2025-09-11 22:19:22,827] - step:67/900 train_loss:0.4380 lr:0.0003000000 time/step:117.11s
|
| 95 |
+
[2025-09-11 22:21:20,150] - step:68/900 train_loss:0.4785 lr:0.0003000000 time/step:117.32s
|
| 96 |
+
[2025-09-11 22:23:16,126] - step:69/900 train_loss:0.4678 lr:0.0003000000 time/step:115.97s
|
| 97 |
+
[2025-09-11 22:25:13,659] - step:70/900 train_loss:0.4456 lr:0.0003000000 time/step:117.53s
|
| 98 |
+
[2025-09-11 22:27:10,581] - step:71/900 train_loss:0.4403 lr:0.0003000000 time/step:116.91s
|
| 99 |
+
[2025-09-11 22:29:07,930] - step:72/900 train_loss:0.4318 lr:0.0003000000 time/step:117.34s
|
| 100 |
+
[2025-09-11 22:31:05,566] - step:73/900 train_loss:0.4546 lr:0.0003000000 time/step:117.63s
|
| 101 |
+
[2025-09-11 22:33:02,531] - step:74/900 train_loss:0.4860 lr:0.0003000000 time/step:116.96s
|
| 102 |
+
[2025-09-11 22:34:59,254] - step:75/900 train_loss:0.4499 lr:0.0003000000 time/step:116.72s
|
| 103 |
+
[2025-09-11 22:36:57,138] - step:76/900 train_loss:0.4490 lr:0.0003000000 time/step:117.88s
|
| 104 |
+
[2025-09-11 22:38:54,164] - step:77/900 train_loss:0.4490 lr:0.0003000000 time/step:117.02s
|
| 105 |
+
[2025-09-11 22:40:51,448] - step:78/900 train_loss:0.4455 lr:0.0003000000 time/step:117.27s
|
| 106 |
+
[2025-09-11 22:42:48,430] - step:79/900 train_loss:0.4274 lr:0.0003000000 time/step:116.98s
|
| 107 |
+
[2025-09-11 22:44:45,934] - step:80/900 train_loss:0.4519 lr:0.0003000000 time/step:117.50s
|
| 108 |
+
[2025-09-11 22:46:42,798] - step:81/900 train_loss:0.4429 lr:0.0003000000 time/step:116.85s
|
| 109 |
+
[2025-09-11 22:48:39,720] - step:82/900 train_loss:0.4436 lr:0.0003000000 time/step:116.92s
|
| 110 |
+
[2025-09-11 22:50:37,164] - step:83/900 train_loss:0.4713 lr:0.0003000000 time/step:117.43s
|
| 111 |
+
[2025-09-11 22:52:33,983] - step:84/900 train_loss:0.4399 lr:0.0003000000 time/step:116.82s
|
| 112 |
+
[2025-09-11 22:54:31,605] - step:85/900 train_loss:0.4343 lr:0.0003000000 time/step:117.62s
|
| 113 |
+
[2025-09-11 22:56:29,383] - step:86/900 train_loss:0.4587 lr:0.0003000000 time/step:117.77s
|
| 114 |
+
[2025-09-11 22:58:26,338] - step:87/900 train_loss:0.4550 lr:0.0003000000 time/step:116.95s
|
| 115 |
+
[2025-09-11 23:00:23,614] - step:88/900 train_loss:0.4437 lr:0.0003000000 time/step:117.26s
|
| 116 |
+
[2025-09-11 23:02:20,358] - step:89/900 train_loss:0.4575 lr:0.0003000000 time/step:116.74s
|
| 117 |
+
[2025-09-11 23:04:17,289] - step:90/900 train_loss:0.4361 lr:0.0003000000 time/step:116.93s
|
| 118 |
+
[2025-09-11 23:06:15,307] - step:91/900 train_loss:0.4259 lr:0.0003000000 time/step:118.02s
|
| 119 |
+
[2025-09-11 23:08:12,562] - step:92/900 train_loss:0.4340 lr:0.0003000000 time/step:117.25s
|
| 120 |
+
[2025-09-11 23:10:10,001] - step:93/900 train_loss:0.4424 lr:0.0003000000 time/step:117.43s
|
| 121 |
+
[2025-09-11 23:12:07,171] - step:94/900 train_loss:0.4240 lr:0.0003000000 time/step:117.16s
|
| 122 |
+
[2025-09-11 23:14:05,158] - step:95/900 train_loss:0.4425 lr:0.0003000000 time/step:117.99s
|
| 123 |
+
[2025-09-11 23:16:02,641] - step:96/900 train_loss:0.4575 lr:0.0003000000 time/step:117.48s
|
| 124 |
+
[2025-09-11 23:17:59,591] - step:97/900 train_loss:0.4435 lr:0.0003000000 time/step:116.94s
|
| 125 |
+
[2025-09-11 23:19:55,399] - step:98/900 train_loss:0.4466 lr:0.0003000000 time/step:115.80s
|
| 126 |
+
[2025-09-11 23:21:53,531] - step:99/900 train_loss:0.4469 lr:0.0003000000 time/step:118.12s
|
| 127 |
+
[2025-09-11 23:23:52,424] - step:100/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@100.pt...
|
| 128 |
+
[2025-09-11 23:23:52,425] - step:100/900 train_loss:0.4467 lr:0.0003000000 time/step:118.25s
|
| 129 |
+
[2025-09-11 23:25:49,555] - step:101/900 train_loss:0.4462 lr:0.0003000000 time/step:117.13s
|
| 130 |
+
[2025-09-11 23:27:46,681] - step:102/900 train_loss:0.4479 lr:0.0003000000 time/step:117.12s
|
| 131 |
+
[2025-09-11 23:29:43,985] - step:103/900 train_loss:0.4212 lr:0.0003000000 time/step:117.30s
|
| 132 |
+
[2025-09-11 23:31:40,749] - step:104/900 train_loss:0.4385 lr:0.0003000000 time/step:116.76s
|
| 133 |
+
[2025-09-11 23:33:37,611] - step:105/900 train_loss:0.4490 lr:0.0003000000 time/step:116.86s
|
| 134 |
+
[2025-09-11 23:35:34,674] - step:106/900 train_loss:0.4537 lr:0.0003000000 time/step:117.06s
|
| 135 |
+
[2025-09-11 23:37:32,277] - step:107/900 train_loss:0.4278 lr:0.0003000000 time/step:117.60s
|
| 136 |
+
[2025-09-11 23:39:29,569] - step:108/900 train_loss:0.4413 lr:0.0003000000 time/step:117.28s
|
| 137 |
+
[2025-09-11 23:41:26,965] - step:109/900 train_loss:0.4219 lr:0.0003000000 time/step:117.39s
|
| 138 |
+
[2025-09-11 23:43:23,608] - step:110/900 train_loss:0.4455 lr:0.0003000000 time/step:116.64s
|
| 139 |
+
[2025-09-11 23:45:20,608] - step:111/900 train_loss:0.4581 lr:0.0003000000 time/step:117.00s
|
| 140 |
+
[2025-09-11 23:47:17,496] - step:112/900 train_loss:0.4501 lr:0.0003000000 time/step:116.89s
|
| 141 |
+
[2025-09-11 23:49:15,179] - step:113/900 train_loss:0.4332 lr:0.0003000000 time/step:117.66s
|
| 142 |
+
[2025-09-11 23:51:13,020] - step:114/900 train_loss:0.4311 lr:0.0003000000 time/step:117.83s
|
| 143 |
+
[2025-09-11 23:53:10,646] - step:115/900 train_loss:0.4449 lr:0.0003000000 time/step:117.62s
|
| 144 |
+
[2025-09-11 23:55:06,688] - step:116/900 train_loss:0.4424 lr:0.0003000000 time/step:116.04s
|
| 145 |
+
[2025-09-11 23:57:03,652] - step:117/900 train_loss:0.4392 lr:0.0003000000 time/step:116.96s
|
| 146 |
+
[2025-09-11 23:59:01,394] - step:118/900 train_loss:0.4246 lr:0.0003000000 time/step:117.74s
|
| 147 |
+
[2025-09-12 00:00:58,798] - step:119/900 train_loss:0.4339 lr:0.0003000000 time/step:117.39s
|
| 148 |
+
[2025-09-12 00:02:56,142] - step:120/900 train_loss:0.4064 lr:0.0003000000 time/step:117.33s
|
| 149 |
+
[2025-09-12 00:04:53,044] - step:121/900 train_loss:0.4421 lr:0.0003000000 time/step:116.90s
|
| 150 |
+
[2025-09-12 00:06:49,048] - step:122/900 train_loss:0.4306 lr:0.0003000000 time/step:116.00s
|
| 151 |
+
[2025-09-12 00:08:46,671] - step:123/900 train_loss:0.4163 lr:0.0003000000 time/step:117.62s
|
| 152 |
+
[2025-09-12 00:10:44,735] - step:124/900 train_loss:0.4428 lr:0.0003000000 time/step:118.05s
|
| 153 |
+
[2025-09-12 00:12:42,019] - step:125/900 train_loss:0.4188 lr:0.0003000000 time/step:117.27s
|
| 154 |
+
[2025-09-12 00:14:38,901] - step:126/900 train_loss:0.4226 lr:0.0003000000 time/step:116.88s
|
| 155 |
+
[2025-09-12 00:16:35,356] - step:127/900 train_loss:0.4379 lr:0.0003000000 time/step:116.45s
|
| 156 |
+
[2025-09-12 00:18:31,808] - step:128/900 train_loss:0.4475 lr:0.0003000000 time/step:116.45s
|
| 157 |
+
[2025-09-12 00:20:31,092] - step:129/900 train_loss:0.4579 lr:0.0003000000 time/step:119.27s
|
| 158 |
+
[2025-09-12 00:22:28,417] - step:130/900 train_loss:0.4504 lr:0.0003000000 time/step:117.31s
|
| 159 |
+
[2025-09-12 00:24:25,417] - step:131/900 train_loss:0.4345 lr:0.0003000000 time/step:116.99s
|
| 160 |
+
[2025-09-12 00:26:22,282] - step:132/900 train_loss:0.4567 lr:0.0003000000 time/step:116.86s
|
| 161 |
+
[2025-09-12 00:28:18,304] - step:133/900 train_loss:0.4396 lr:0.0003000000 time/step:116.02s
|
| 162 |
+
[2025-09-12 00:30:15,628] - step:134/900 train_loss:0.4440 lr:0.0003000000 time/step:117.32s
|
| 163 |
+
[2025-09-12 00:32:13,051] - step:135/900 train_loss:0.4384 lr:0.0003000000 time/step:117.42s
|
| 164 |
+
[2025-09-12 00:34:10,336] - step:136/900 train_loss:0.4276 lr:0.0003000000 time/step:117.28s
|
| 165 |
+
[2025-09-12 00:36:07,098] - step:137/900 train_loss:0.4424 lr:0.0003000000 time/step:116.76s
|
| 166 |
+
[2025-09-12 00:38:03,861] - step:138/900 train_loss:0.4288 lr:0.0003000000 time/step:116.76s
|
| 167 |
+
[2025-09-12 00:40:00,304] - step:139/900 train_loss:0.4333 lr:0.0003000000 time/step:116.43s
|
| 168 |
+
[2025-09-12 00:41:57,928] - step:140/900 train_loss:0.4347 lr:0.0003000000 time/step:117.62s
|
| 169 |
+
[2025-09-12 00:43:56,252] - step:141/900 train_loss:0.4515 lr:0.0003000000 time/step:118.32s
|
| 170 |
+
[2025-09-12 00:45:53,156] - step:142/900 train_loss:0.4531 lr:0.0003000000 time/step:116.90s
|
| 171 |
+
[2025-09-12 00:47:50,037] - step:143/900 train_loss:0.4426 lr:0.0003000000 time/step:116.88s
|
| 172 |
+
[2025-09-12 00:49:46,863] - step:144/900 train_loss:0.4100 lr:0.0003000000 time/step:116.81s
|
| 173 |
+
[2025-09-12 00:51:42,986] - step:145/900 train_loss:0.4185 lr:0.0003000000 time/step:116.12s
|
| 174 |
+
[2025-09-12 00:53:40,748] - step:146/900 train_loss:0.4556 lr:0.0003000000 time/step:117.75s
|
| 175 |
+
[2025-09-12 00:55:38,614] - step:147/900 train_loss:0.4580 lr:0.0003000000 time/step:117.86s
|
| 176 |
+
[2025-09-12 00:57:35,395] - step:148/900 train_loss:0.4432 lr:0.0003000000 time/step:116.77s
|
| 177 |
+
[2025-09-12 00:59:32,300] - step:149/900 train_loss:0.4260 lr:0.0003000000 time/step:116.90s
|
| 178 |
+
[2025-09-12 01:01:29,963] - step:150/900 train_loss:0.4369 lr:0.0003000000 time/step:117.65s
|
| 179 |
+
[2025-09-12 01:03:26,107] - step:151/900 train_loss:0.4121 lr:0.0003000000 time/step:116.14s
|
| 180 |
+
[2025-09-12 01:05:23,232] - step:152/900 train_loss:0.4488 lr:0.0003000000 time/step:117.12s
|
| 181 |
+
[2025-09-12 01:07:21,054] - step:153/900 train_loss:0.4290 lr:0.0003000000 time/step:117.82s
|
| 182 |
+
[2025-09-12 01:09:17,934] - step:154/900 train_loss:0.4126 lr:0.0003000000 time/step:116.88s
|
| 183 |
+
[2025-09-12 01:11:15,437] - step:155/900 train_loss:0.4201 lr:0.0003000000 time/step:117.49s
|
| 184 |
+
[2025-09-12 01:13:12,295] - step:156/900 train_loss:0.4294 lr:0.0003000000 time/step:116.85s
|
| 185 |
+
[2025-09-12 01:15:08,687] - step:157/900 train_loss:0.4340 lr:0.0003000000 time/step:116.38s
|
| 186 |
+
[2025-09-12 01:17:05,708] - step:158/900 train_loss:0.4543 lr:0.0003000000 time/step:117.01s
|
| 187 |
+
[2025-09-12 01:19:03,353] - step:159/900 train_loss:0.4211 lr:0.0003000000 time/step:117.64s
|
| 188 |
+
[2025-09-12 01:21:00,871] - step:160/900 train_loss:0.4400 lr:0.0003000000 time/step:117.51s
|
| 189 |
+
[2025-09-12 01:22:57,738] - step:161/900 train_loss:0.4259 lr:0.0003000000 time/step:116.86s
|
| 190 |
+
[2025-09-12 01:24:55,051] - step:162/900 train_loss:0.4150 lr:0.0003000000 time/step:117.31s
|
| 191 |
+
[2025-09-12 01:26:51,147] - step:163/900 train_loss:0.4168 lr:0.0003000000 time/step:116.09s
|
| 192 |
+
[2025-09-12 01:28:48,833] - step:164/900 train_loss:0.4024 lr:0.0003000000 time/step:117.68s
|
| 193 |
+
[2025-09-12 01:30:46,610] - step:165/900 train_loss:0.4476 lr:0.0003000000 time/step:117.77s
|
| 194 |
+
[2025-09-12 01:32:43,517] - step:166/900 train_loss:0.4241 lr:0.0003000000 time/step:116.90s
|
| 195 |
+
[2025-09-12 01:34:41,001] - step:167/900 train_loss:0.4268 lr:0.0003000000 time/step:117.48s
|
| 196 |
+
[2025-09-12 01:36:37,582] - step:168/900 train_loss:0.3846 lr:0.0003000000 time/step:116.57s
|
| 197 |
+
[2025-09-12 01:38:34,908] - step:169/900 train_loss:0.4199 lr:0.0003000000 time/step:117.32s
|
| 198 |
+
[2025-09-12 01:40:33,014] - step:170/900 train_loss:0.4037 lr:0.0003000000 time/step:118.09s
|
| 199 |
+
[2025-09-12 01:42:29,854] - step:171/900 train_loss:0.4579 lr:0.0003000000 time/step:116.84s
|
| 200 |
+
[2025-09-12 01:44:27,350] - step:172/900 train_loss:0.4435 lr:0.0003000000 time/step:117.48s
|
| 201 |
+
[2025-09-12 01:46:24,704] - step:173/900 train_loss:0.4139 lr:0.0003000000 time/step:117.34s
|
| 202 |
+
[2025-09-12 01:48:21,009] - step:174/900 train_loss:0.4308 lr:0.0003000000 time/step:116.30s
|
| 203 |
+
[2025-09-12 01:50:19,086] - step:175/900 train_loss:0.4156 lr:0.0003000000 time/step:118.06s
|
| 204 |
+
[2025-09-12 01:52:16,506] - step:176/900 train_loss:0.4204 lr:0.0003000000 time/step:117.41s
|
| 205 |
+
[2025-09-12 01:54:14,395] - step:177/900 train_loss:0.4211 lr:0.0003000000 time/step:117.87s
|
| 206 |
+
[2025-09-12 01:56:11,781] - step:178/900 train_loss:0.4399 lr:0.0003000000 time/step:117.38s
|
| 207 |
+
[2025-09-12 01:58:09,165] - step:179/900 train_loss:0.4327 lr:0.0003000000 time/step:117.38s
|
| 208 |
+
[2025-09-12 02:00:05,670] - step:180/900 train_loss:0.4362 lr:0.0003000000 time/step:116.49s
|
| 209 |
+
[2025-09-12 02:02:03,683] - step:181/900 train_loss:0.4204 lr:0.0003000000 time/step:118.01s
|
| 210 |
+
[2025-09-12 02:04:01,525] - step:182/900 train_loss:0.4528 lr:0.0003000000 time/step:117.84s
|
| 211 |
+
[2025-09-12 02:05:59,256] - step:183/900 train_loss:0.4115 lr:0.0003000000 time/step:117.72s
|
| 212 |
+
[2025-09-12 02:07:56,456] - step:184/900 train_loss:0.4527 lr:0.0003000000 time/step:117.20s
|
| 213 |
+
[2025-09-12 02:09:53,692] - step:185/900 train_loss:0.4378 lr:0.0003000000 time/step:117.23s
|
| 214 |
+
[2025-09-12 02:11:50,835] - step:186/900 train_loss:0.4322 lr:0.0003000000 time/step:117.14s
|
| 215 |
+
[2025-09-12 02:13:49,249] - step:187/900 train_loss:0.4503 lr:0.0003000000 time/step:118.41s
|
| 216 |
+
[2025-09-12 02:15:46,708] - step:188/900 train_loss:0.4137 lr:0.0003000000 time/step:117.45s
|
| 217 |
+
[2025-09-12 02:17:44,588] - step:189/900 train_loss:0.4373 lr:0.0003000000 time/step:117.87s
|
| 218 |
+
[2025-09-12 02:19:41,640] - step:190/900 train_loss:0.4390 lr:0.0003000000 time/step:117.04s
|
| 219 |
+
[2025-09-12 02:21:38,674] - step:191/900 train_loss:0.4540 lr:0.0003000000 time/step:117.02s
|
| 220 |
+
[2025-09-12 02:23:35,317] - step:192/900 train_loss:0.4401 lr:0.0003000000 time/step:116.64s
|
| 221 |
+
[2025-09-12 02:25:32,403] - step:193/900 train_loss:0.4325 lr:0.0003000000 time/step:117.08s
|
| 222 |
+
[2025-09-12 02:27:29,545] - step:194/900 train_loss:0.4249 lr:0.0003000000 time/step:117.13s
|
| 223 |
+
[2025-09-12 02:29:26,648] - step:195/900 train_loss:0.4074 lr:0.0003000000 time/step:117.09s
|
| 224 |
+
[2025-09-12 02:31:23,432] - step:196/900 train_loss:0.4212 lr:0.0003000000 time/step:116.77s
|
| 225 |
+
[2025-09-12 02:33:21,256] - step:197/900 train_loss:0.4408 lr:0.0003000000 time/step:117.82s
|
| 226 |
+
[2025-09-12 02:35:18,019] - step:198/900 train_loss:0.4229 lr:0.0003000000 time/step:116.76s
|
| 227 |
+
[2025-09-12 02:37:15,403] - step:199/900 train_loss:0.4517 lr:0.0003000000 time/step:117.38s
|
| 228 |
+
[2025-09-12 02:39:13,125] - step:200/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@200.pt...
|
| 229 |
+
[2025-09-12 02:39:13,129] - step:200/900 train_loss:0.4149 lr:0.0003000000 time/step:117.11s
|
| 230 |
+
[2025-09-12 02:41:09,907] - step:201/900 train_loss:0.4258 lr:0.0003000000 time/step:116.76s
[2025-09-12 02:43:06,972] - step:202/900 train_loss:0.4207 lr:0.0003000000 time/step:117.06s
[2025-09-12 02:45:03,575] - step:203/900 train_loss:0.4432 lr:0.0003000000 time/step:116.60s
[2025-09-12 02:47:00,062] - step:204/900 train_loss:0.4072 lr:0.0003000000 time/step:116.48s
[2025-09-12 02:48:57,543] - step:205/900 train_loss:0.4404 lr:0.0003000000 time/step:117.47s
[2025-09-12 02:50:54,165] - step:206/900 train_loss:0.4151 lr:0.0003000000 time/step:116.61s
[2025-09-12 02:52:50,609] - step:207/900 train_loss:0.4256 lr:0.0003000000 time/step:116.44s
[2025-09-12 02:54:48,171] - step:208/900 train_loss:0.4200 lr:0.0003000000 time/step:117.56s
[2025-09-12 02:56:44,244] - step:209/900 train_loss:0.4159 lr:0.0003000000 time/step:116.06s
[2025-09-12 02:58:41,740] - step:210/900 train_loss:0.4080 lr:0.0003000000 time/step:117.49s
[2025-09-12 03:00:38,362] - step:211/900 train_loss:0.4394 lr:0.0003000000 time/step:116.61s
[2025-09-12 03:02:34,864] - step:212/900 train_loss:0.4461 lr:0.0003000000 time/step:116.49s
[2025-09-12 03:04:32,289] - step:213/900 train_loss:0.4310 lr:0.0003000000 time/step:117.42s
[2025-09-12 03:06:29,834] - step:214/900 train_loss:0.4458 lr:0.0003000000 time/step:117.53s
[2025-09-12 03:08:26,395] - step:215/900 train_loss:0.4322 lr:0.0003000000 time/step:116.56s
[2025-09-12 03:10:23,441] - step:216/900 train_loss:0.3979 lr:0.0003000000 time/step:117.03s
[2025-09-12 03:12:19,963] - step:217/900 train_loss:0.4011 lr:0.0003000000 time/step:116.51s
[2025-09-12 03:14:17,627] - step:218/900 train_loss:0.4372 lr:0.0003000000 time/step:117.66s
[2025-09-12 03:16:15,332] - step:219/900 train_loss:0.4281 lr:0.0003000000 time/step:117.70s
[2025-09-12 03:18:11,833] - step:220/900 train_loss:0.4330 lr:0.0003000000 time/step:116.49s
[2025-09-12 03:20:08,497] - step:221/900 train_loss:0.4534 lr:0.0003000000 time/step:116.65s
[2025-09-12 03:22:05,021] - step:222/900 train_loss:0.4076 lr:0.0003000000 time/step:116.52s
[2025-09-12 03:24:01,826] - step:223/900 train_loss:0.4211 lr:0.0003000000 time/step:116.79s
[2025-09-12 03:25:58,807] - step:224/900 train_loss:0.4075 lr:0.0003000000 time/step:116.98s
[2025-09-12 03:27:56,427] - step:225/900 train_loss:0.3977 lr:0.0003000000 time/step:117.61s
[2025-09-12 03:29:53,271] - step:226/900 train_loss:0.4331 lr:0.0003000000 time/step:116.84s
[2025-09-12 03:31:48,818] - step:227/900 train_loss:0.4424 lr:0.0003000000 time/step:115.53s
[2025-09-12 03:33:46,260] - step:228/900 train_loss:0.4265 lr:0.0003000000 time/step:117.44s
[2025-09-12 03:35:42,726] - step:229/900 train_loss:0.4018 lr:0.0003000000 time/step:116.46s
[2025-09-12 03:37:39,927] - step:230/900 train_loss:0.4277 lr:0.0003000000 time/step:117.20s
[2025-09-12 03:39:37,253] - step:231/900 train_loss:0.4229 lr:0.0003000000 time/step:117.32s
[2025-09-12 03:41:34,210] - step:232/900 train_loss:0.4231 lr:0.0003000000 time/step:116.94s
[2025-09-12 03:43:30,497] - step:233/900 train_loss:0.4125 lr:0.0003000000 time/step:116.28s
[2025-09-12 03:45:27,022] - step:234/900 train_loss:0.4181 lr:0.0003000000 time/step:116.52s
[2025-09-12 03:47:23,505] - step:235/900 train_loss:0.4364 lr:0.0003000000 time/step:116.48s
[2025-09-12 03:49:21,967] - step:236/900 train_loss:0.4135 lr:0.0003000000 time/step:118.46s
[2025-09-12 03:51:18,413] - step:237/900 train_loss:0.4139 lr:0.0003000000 time/step:116.43s
[2025-09-12 03:53:14,453] - step:238/900 train_loss:0.4341 lr:0.0003000000 time/step:116.03s
[2025-09-12 03:55:11,117] - step:239/900 train_loss:0.4174 lr:0.0003000000 time/step:116.66s
[2025-09-12 03:57:08,642] - step:240/900 train_loss:0.4449 lr:0.0003000000 time/step:117.52s
[2025-09-12 03:59:06,595] - step:241/900 train_loss:0.4303 lr:0.0003000000 time/step:117.95s
[2025-09-12 04:01:02,667] - step:242/900 train_loss:0.4350 lr:0.0003000000 time/step:116.06s
[2025-09-12 04:02:58,652] - step:243/900 train_loss:0.4332 lr:0.0003000000 time/step:115.97s
[2025-09-12 04:04:55,158] - step:244/900 train_loss:0.4170 lr:0.0003000000 time/step:116.50s
[2025-09-12 04:06:52,523] - step:245/900 train_loss:0.4325 lr:0.0003000000 time/step:117.35s
[2025-09-12 04:08:49,506] - step:246/900 train_loss:0.4140 lr:0.0003000000 time/step:116.98s
[2025-09-12 04:10:46,625] - step:247/900 train_loss:0.4244 lr:0.0003000000 time/step:117.10s
[2025-09-12 04:12:43,060] - step:248/900 train_loss:0.4435 lr:0.0003000000 time/step:116.43s
[2025-09-12 04:14:39,932] - step:249/900 train_loss:0.4188 lr:0.0003000000 time/step:116.87s
[2025-09-12 04:16:36,428] - step:250/900 train_loss:0.4138 lr:0.0003000000 time/step:116.49s
[2025-09-12 04:18:34,283] - step:251/900 train_loss:0.4045 lr:0.0003000000 time/step:117.84s
[2025-09-12 04:20:32,264] - step:252/900 train_loss:0.4128 lr:0.0003000000 time/step:117.96s
[2025-09-12 04:22:28,905] - step:253/900 train_loss:0.4352 lr:0.0003000000 time/step:116.63s
[2025-09-12 04:24:25,744] - step:254/900 train_loss:0.4090 lr:0.0003000000 time/step:116.83s
[2025-09-12 04:26:22,527] - step:255/900 train_loss:0.4125 lr:0.0003000000 time/step:116.78s
[2025-09-12 04:28:18,535] - step:256/900 train_loss:0.3974 lr:0.0003000000 time/step:116.00s
[2025-09-12 04:30:16,548] - step:257/900 train_loss:0.4056 lr:0.0003000000 time/step:118.00s
[2025-09-12 04:32:14,016] - step:258/900 train_loss:0.4158 lr:0.0003000000 time/step:117.45s
[2025-09-12 04:34:10,993] - step:259/900 train_loss:0.4080 lr:0.0003000000 time/step:116.97s
[2025-09-12 04:36:07,637] - step:260/900 train_loss:0.4217 lr:0.0003000000 time/step:116.64s
[2025-09-12 04:38:05,072] - step:261/900 train_loss:0.4157 lr:0.0003000000 time/step:117.43s
[2025-09-12 04:40:01,843] - step:262/900 train_loss:0.4139 lr:0.0003000000 time/step:116.76s
[2025-09-12 04:41:58,873] - step:263/900 train_loss:0.4401 lr:0.0003000000 time/step:117.01s
[2025-09-12 04:43:56,795] - step:264/900 train_loss:0.4272 lr:0.0003000000 time/step:117.92s
[2025-09-12 04:45:53,571] - step:265/900 train_loss:0.4228 lr:0.0003000000 time/step:116.76s
[2025-09-12 04:47:50,269] - step:266/900 train_loss:0.4242 lr:0.0003000000 time/step:116.69s
[2025-09-12 04:49:47,027] - step:267/900 train_loss:0.4361 lr:0.0003000000 time/step:116.75s
[2025-09-12 04:51:43,112] - step:268/900 train_loss:0.4224 lr:0.0003000000 time/step:116.07s
[2025-09-12 04:53:41,046] - step:269/900 train_loss:0.4076 lr:0.0003000000 time/step:117.92s
[2025-09-12 04:55:37,470] - step:270/900 train_loss:0.4172 lr:0.0003000000 time/step:116.42s
[2025-09-12 04:57:33,853] - step:271/900 train_loss:0.4219 lr:0.0003000000 time/step:116.38s
[2025-09-12 04:59:30,265] - step:272/900 train_loss:0.4281 lr:0.0003000000 time/step:116.41s
[2025-09-12 05:01:26,500] - step:273/900 train_loss:0.4105 lr:0.0003000000 time/step:116.22s
[2025-09-12 05:03:24,415] - step:274/900 train_loss:0.4247 lr:0.0003000000 time/step:117.91s
[2025-09-12 05:05:21,825] - step:275/900 train_loss:0.4172 lr:0.0003000000 time/step:117.40s
[2025-09-12 05:07:18,643] - step:276/900 train_loss:0.4281 lr:0.0003000000 time/step:116.81s
[2025-09-12 05:09:15,889] - step:277/900 train_loss:0.4140 lr:0.0003000000 time/step:117.23s
[2025-09-12 05:11:13,080] - step:278/900 train_loss:0.4459 lr:0.0003000000 time/step:117.18s
[2025-09-12 05:13:09,433] - step:279/900 train_loss:0.4128 lr:0.0003000000 time/step:116.35s
[2025-09-12 05:15:07,057] - step:280/900 train_loss:0.4171 lr:0.0003000000 time/step:117.62s
[2025-09-12 05:17:03,780] - step:281/900 train_loss:0.4083 lr:0.0003000000 time/step:116.71s
[2025-09-12 05:19:00,703] - step:282/900 train_loss:0.4214 lr:0.0003000000 time/step:116.92s
[2025-09-12 05:20:57,932] - step:283/900 train_loss:0.4072 lr:0.0003000000 time/step:117.19s
[2025-09-12 05:22:54,350] - step:284/900 train_loss:0.4471 lr:0.0003000000 time/step:116.39s
[2025-09-12 05:24:50,794] - step:285/900 train_loss:0.3946 lr:0.0003000000 time/step:116.44s
[2025-09-12 05:26:47,657] - step:286/900 train_loss:0.4510 lr:0.0003000000 time/step:116.86s
[2025-09-12 05:28:43,717] - step:287/900 train_loss:0.4409 lr:0.0003000000 time/step:116.05s
[2025-09-12 05:30:40,741] - step:288/900 train_loss:0.3887 lr:0.0003000000 time/step:117.01s
[2025-09-12 05:32:38,986] - step:289/900 train_loss:0.4207 lr:0.0003000000 time/step:118.24s
[2025-09-12 05:34:35,229] - step:290/900 train_loss:0.4018 lr:0.0003000000 time/step:116.24s
[2025-09-12 05:36:31,796] - step:291/900 train_loss:0.4233 lr:0.0003000000 time/step:116.56s
[2025-09-12 05:38:28,659] - step:292/900 train_loss:0.4223 lr:0.0003000000 time/step:116.86s
[2025-09-12 05:40:25,842] - step:293/900 train_loss:0.4412 lr:0.0003000000 time/step:117.18s
[2025-09-12 05:42:22,767] - step:294/900 train_loss:0.3965 lr:0.0003000000 time/step:116.91s
[2025-09-12 05:44:20,588] - step:295/900 train_loss:0.4155 lr:0.0003000000 time/step:117.81s
[2025-09-12 05:46:17,250] - step:296/900 train_loss:0.4051 lr:0.0003000000 time/step:116.66s
[2025-09-12 05:48:13,495] - step:297/900 train_loss:0.4186 lr:0.0003000000 time/step:116.24s
[2025-09-12 05:50:10,418] - step:298/900 train_loss:0.4280 lr:0.0003000000 time/step:116.91s
[2025-09-12 05:52:07,903] - step:299/900 train_loss:0.4225 lr:0.0003000000 time/step:117.46s
[2025-09-12 05:54:04,575] - step:300/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@300.pt...
[2025-09-12 05:54:04,576] - step:300/900 train_loss:0.4086 lr:0.0003000000 time/step:116.07s
[2025-09-12 05:56:02,285] - step:301/900 train_loss:0.4136 lr:0.0003000000 time/step:117.71s
[2025-09-12 05:58:00,171] - step:302/900 train_loss:0.4114 lr:0.0003000000 time/step:117.88s
[2025-09-12 05:59:55,795] - step:303/900 train_loss:0.4200 lr:0.0003000000 time/step:115.62s
[2025-09-12 06:01:52,652] - step:304/900 train_loss:0.4085 lr:0.0003000000 time/step:116.84s
[2025-09-12 06:03:49,824] - step:305/900 train_loss:0.4311 lr:0.0003000000 time/step:117.16s
[2025-09-12 06:05:46,285] - step:306/900 train_loss:0.4367 lr:0.0003000000 time/step:116.45s
[2025-09-12 06:07:44,209] - step:307/900 train_loss:0.4345 lr:0.0003000000 time/step:117.92s
[2025-09-12 06:09:41,568] - step:308/900 train_loss:0.4016 lr:0.0003000000 time/step:117.35s
[2025-09-12 06:11:38,074] - step:309/900 train_loss:0.4102 lr:0.0003000000 time/step:116.49s
[2025-09-12 06:13:34,857] - step:310/900 train_loss:0.4332 lr:0.0003000000 time/step:116.77s
[2025-09-12 06:15:32,302] - step:311/900 train_loss:0.4186 lr:0.0003000000 time/step:117.43s
[2025-09-12 06:17:29,124] - step:312/900 train_loss:0.4371 lr:0.0003000000 time/step:116.82s
[2025-09-12 06:19:26,289] - step:313/900 train_loss:0.4130 lr:0.0003000000 time/step:117.16s
[2025-09-12 06:21:22,830] - step:314/900 train_loss:0.4031 lr:0.0003000000 time/step:116.53s
[2025-09-12 06:23:19,454] - step:315/900 train_loss:0.4286 lr:0.0003000000 time/step:116.62s
[2025-09-12 06:25:17,324] - step:316/900 train_loss:0.4007 lr:0.0003000000 time/step:117.86s
[2025-09-12 06:27:14,242] - step:317/900 train_loss:0.4114 lr:0.0003000000 time/step:116.91s
[2025-09-12 06:29:11,325] - step:318/900 train_loss:0.4251 lr:0.0003000000 time/step:117.08s
[2025-09-12 06:31:08,368] - step:319/900 train_loss:0.4448 lr:0.0003000000 time/step:117.03s
[2025-09-12 06:33:04,509] - step:320/900 train_loss:0.4103 lr:0.0003000000 time/step:116.14s
[2025-09-12 06:35:02,658] - step:321/900 train_loss:0.4142 lr:0.0003000000 time/step:118.14s
[2025-09-12 06:36:59,639] - step:322/900 train_loss:0.3985 lr:0.0003000000 time/step:116.97s
[2025-09-12 06:38:56,063] - step:323/900 train_loss:0.4057 lr:0.0003000000 time/step:116.42s
[2025-09-12 06:40:53,684] - step:324/900 train_loss:0.4223 lr:0.0003000000 time/step:117.62s
[2025-09-12 06:42:50,547] - step:325/900 train_loss:0.4205 lr:0.0003000000 time/step:116.85s
[2025-09-12 06:44:46,896] - step:326/900 train_loss:0.4172 lr:0.0003000000 time/step:116.34s
[2025-09-12 06:46:45,176] - step:327/900 train_loss:0.4186 lr:0.0003000000 time/step:118.27s
[2025-09-12 06:48:42,119] - step:328/900 train_loss:0.4294 lr:0.0003000000 time/step:116.93s
[2025-09-12 06:50:38,781] - step:329/900 train_loss:0.4072 lr:0.0003000000 time/step:116.66s
[2025-09-12 06:52:36,425] - step:330/900 train_loss:0.4248 lr:0.0003000000 time/step:117.63s
[2025-09-12 06:54:33,431] - step:331/900 train_loss:0.4141 lr:0.0003000000 time/step:117.00s
[2025-09-12 06:56:30,074] - step:332/900 train_loss:0.4124 lr:0.0003000000 time/step:116.64s
[2025-09-12 06:58:26,556] - step:333/900 train_loss:0.4281 lr:0.0003000000 time/step:116.47s
[2025-09-12 07:00:23,620] - step:334/900 train_loss:0.4141 lr:0.0003000000 time/step:117.06s
[2025-09-12 07:02:21,404] - step:335/900 train_loss:0.4197 lr:0.0003000000 time/step:117.77s
[2025-09-12 07:04:17,967] - step:336/900 train_loss:0.4356 lr:0.0003000000 time/step:116.56s
[2025-09-12 07:06:16,192] - step:337/900 train_loss:0.3934 lr:0.0003000000 time/step:118.22s
[2025-09-12 07:08:12,291] - step:338/900 train_loss:0.3917 lr:0.0003000000 time/step:116.09s
[2025-09-12 07:10:08,910] - step:339/900 train_loss:0.4353 lr:0.0003000000 time/step:116.61s
[2025-09-12 07:12:06,665] - step:340/900 train_loss:0.4537 lr:0.0003000000 time/step:117.74s
[2025-09-12 07:14:03,621] - step:341/900 train_loss:0.4146 lr:0.0003000000 time/step:116.95s
[2025-09-12 07:16:00,835] - step:342/900 train_loss:0.4194 lr:0.0003000000 time/step:117.20s
[2025-09-12 07:17:57,387] - step:343/900 train_loss:0.4117 lr:0.0003000000 time/step:116.54s
[2025-09-12 07:19:53,951] - step:344/900 train_loss:0.3925 lr:0.0003000000 time/step:116.56s
[2025-09-12 07:21:50,959] - step:345/900 train_loss:0.4268 lr:0.0003000000 time/step:117.00s
[2025-09-12 07:23:49,546] - step:346/900 train_loss:0.4113 lr:0.0003000000 time/step:118.58s
[2025-09-12 07:25:46,639] - step:347/900 train_loss:0.4211 lr:0.0003000000 time/step:117.08s
[2025-09-12 07:27:43,350] - step:348/900 train_loss:0.4183 lr:0.0003000000 time/step:116.70s
[2025-09-12 07:29:39,127] - step:349/900 train_loss:0.4313 lr:0.0003000000 time/step:115.77s
[2025-09-12 07:31:35,852] - step:350/900 train_loss:0.3881 lr:0.0003000000 time/step:116.71s
[2025-09-12 07:33:34,104] - step:351/900 train_loss:0.4243 lr:0.0003000000 time/step:118.24s
[2025-09-12 07:35:31,118] - step:352/900 train_loss:0.4273 lr:0.0003000000 time/step:117.00s
[2025-09-12 07:37:28,208] - step:353/900 train_loss:0.3925 lr:0.0003000000 time/step:117.06s
[2025-09-12 07:39:25,351] - step:354/900 train_loss:0.4223 lr:0.0003000000 time/step:117.14s
[2025-09-12 07:41:21,430] - step:355/900 train_loss:0.3996 lr:0.0003000000 time/step:116.07s
[2025-09-12 07:43:18,880] - step:356/900 train_loss:0.4095 lr:0.0003000000 time/step:117.45s
[2025-09-12 07:45:16,716] - step:357/900 train_loss:0.4204 lr:0.0003000000 time/step:117.83s
[2025-09-12 07:47:14,287] - step:358/900 train_loss:0.4157 lr:0.0003000000 time/step:117.56s
[2025-09-12 07:49:11,022] - step:359/900 train_loss:0.4179 lr:0.0003000000 time/step:116.72s
[2025-09-12 07:51:08,126] - step:360/900 train_loss:0.4490 lr:0.0003000000 time/step:117.10s
[2025-09-12 07:53:04,336] - step:361/900 train_loss:0.4100 lr:0.0003000000 time/step:116.20s
[2025-09-12 07:55:00,814] - step:362/900 train_loss:0.4050 lr:0.0003000000 time/step:116.47s
[2025-09-12 07:56:58,814] - step:363/900 train_loss:0.4299 lr:0.0003000000 time/step:117.99s
[2025-09-12 07:58:55,677] - step:364/900 train_loss:0.3970 lr:0.0003000000 time/step:116.85s
[2025-09-12 08:00:53,062] - step:365/900 train_loss:0.4180 lr:0.0003000000 time/step:117.38s
[2025-09-12 08:02:49,522] - step:366/900 train_loss:0.4307 lr:0.0003000000 time/step:116.45s
[2025-09-12 08:04:45,597] - step:367/900 train_loss:0.4335 lr:0.0003000000 time/step:116.07s
[2025-09-12 08:06:43,333] - step:368/900 train_loss:0.3967 lr:0.0003000000 time/step:117.73s
[2025-09-12 08:08:40,432] - step:369/900 train_loss:0.4226 lr:0.0003000000 time/step:117.09s
[2025-09-12 08:10:38,337] - step:370/900 train_loss:0.4086 lr:0.0003000000 time/step:117.90s
[2025-09-12 08:12:35,283] - step:371/900 train_loss:0.3949 lr:0.0003000000 time/step:116.93s
[2025-09-12 08:14:31,782] - step:372/900 train_loss:0.4219 lr:0.0003000000 time/step:116.49s
[2025-09-12 08:16:29,230] - step:373/900 train_loss:0.4088 lr:0.0003000000 time/step:117.44s
[2025-09-12 08:18:26,952] - step:374/900 train_loss:0.4184 lr:0.0003000000 time/step:117.71s
[2025-09-12 08:20:23,596] - step:375/900 train_loss:0.4110 lr:0.0003000000 time/step:116.64s
[2025-09-12 08:22:20,047] - step:376/900 train_loss:0.4305 lr:0.0003000000 time/step:116.44s
[2025-09-12 08:24:16,398] - step:377/900 train_loss:0.4143 lr:0.0003000000 time/step:116.35s
[2025-09-12 08:26:13,665] - step:378/900 train_loss:0.4139 lr:0.0003000000 time/step:117.26s
[2025-09-12 08:28:09,796] - step:379/900 train_loss:0.4060 lr:0.0003000000 time/step:116.13s
[2025-09-12 08:30:07,613] - step:380/900 train_loss:0.3921 lr:0.0003000000 time/step:117.81s
[2025-09-12 08:32:04,597] - step:381/900 train_loss:0.4239 lr:0.0003000000 time/step:116.97s
[2025-09-12 08:34:01,394] - step:382/900 train_loss:0.4041 lr:0.0003000000 time/step:116.79s
[2025-09-12 08:35:58,263] - step:383/900 train_loss:0.4115 lr:0.0003000000 time/step:116.86s
[2025-09-12 08:37:54,649] - step:384/900 train_loss:0.4216 lr:0.0003000000 time/step:116.38s
[2025-09-12 08:39:51,866] - step:385/900 train_loss:0.4057 lr:0.0003000000 time/step:117.21s
[2025-09-12 08:41:49,473] - step:386/900 train_loss:0.4021 lr:0.0003000000 time/step:117.60s
[2025-09-12 08:43:46,456] - step:387/900 train_loss:0.4235 lr:0.0003000000 time/step:116.98s
[2025-09-12 08:45:42,939] - step:388/900 train_loss:0.4309 lr:0.0003000000 time/step:116.48s
[2025-09-12 08:47:40,164] - step:389/900 train_loss:0.3930 lr:0.0003000000 time/step:117.22s
[2025-09-12 08:49:36,386] - step:390/900 train_loss:0.4063 lr:0.0003000000 time/step:116.22s
[2025-09-12 08:51:33,830] - step:391/900 train_loss:0.4034 lr:0.0003000000 time/step:117.43s
[2025-09-12 08:53:30,812] - step:392/900 train_loss:0.4071 lr:0.0003000000 time/step:116.97s
[2025-09-12 08:55:28,574] - step:393/900 train_loss:0.4296 lr:0.0003000000 time/step:117.75s
[2025-09-12 08:57:25,899] - step:394/900 train_loss:0.4171 lr:0.0003000000 time/step:117.31s
[2025-09-12 08:59:22,463] - step:395/900 train_loss:0.4167 lr:0.0003000000 time/step:116.56s
[2025-09-12 09:01:19,086] - step:396/900 train_loss:0.4119 lr:0.0003000000 time/step:116.62s
[2025-09-12 09:03:16,267] - step:397/900 train_loss:0.4057 lr:0.0003000000 time/step:117.17s
[2025-09-12 09:05:13,175] - step:398/900 train_loss:0.4064 lr:0.0003000000 time/step:116.90s
[2025-09-12 09:07:10,958] - step:399/900 train_loss:0.3913 lr:0.0003000000 time/step:117.77s
[2025-09-12 09:09:08,523] - step:400/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@400.pt...
[2025-09-12 09:09:08,524] - step:400/900 train_loss:0.4028 lr:0.0003000000 time/step:116.93s
[2025-09-12 09:11:04,902] - step:401/900 train_loss:0.3889 lr:0.0003000000 time/step:116.38s
[2025-09-12 09:13:01,467] - step:402/900 train_loss:0.4192 lr:0.0003000000 time/step:116.55s
[2025-09-12 09:14:58,472] - step:403/900 train_loss:0.4211 lr:0.0003000000 time/step:117.00s
[2025-09-12 09:16:55,036] - step:404/900 train_loss:0.4354 lr:0.0003000000 time/step:116.56s
[2025-09-12 09:18:52,741] - step:405/900 train_loss:0.4290 lr:0.0003000000 time/step:117.69s
[2025-09-12 09:20:49,701] - step:406/900 train_loss:0.4290 lr:0.0003000000 time/step:116.95s
[2025-09-12 09:22:46,403] - step:407/900 train_loss:0.4257 lr:0.0003000000 time/step:116.69s
[2025-09-12 09:24:43,048] - step:408/900 train_loss:0.4252 lr:0.0003000000 time/step:116.64s
[2025-09-12 09:26:39,532] - step:409/900 train_loss:0.3992 lr:0.0003000000 time/step:116.48s
[2025-09-12 09:28:37,397] - step:410/900 train_loss:0.4191 lr:0.0003000000 time/step:117.86s
[2025-09-12 09:30:33,697] - step:411/900 train_loss:0.3892 lr:0.0003000000 time/step:116.29s
[2025-09-12 09:32:30,021] - step:412/900 train_loss:0.3843 lr:0.0003000000 time/step:116.32s
[2025-09-12 09:34:27,365] - step:413/900 train_loss:0.4010 lr:0.0003000000 time/step:117.34s
[2025-09-12 09:36:24,146] - step:414/900 train_loss:0.4190 lr:0.0003000000 time/step:116.77s
[2025-09-12 09:38:20,888] - step:415/900 train_loss:0.4182 lr:0.0003000000 time/step:116.73s
[2025-09-12 09:40:17,896] - step:416/900 train_loss:0.4236 lr:0.0003000000 time/step:117.00s
[2025-09-12 09:42:14,418] - step:417/900 train_loss:0.4016 lr:0.0003000000 time/step:116.51s
[2025-09-12 09:44:11,142] - step:418/900 train_loss:0.4054 lr:0.0003000000 time/step:116.72s
[2025-09-12 09:46:07,906] - step:419/900 train_loss:0.4162 lr:0.0003000000 time/step:116.75s
[2025-09-12 09:48:05,609] - step:420/900 train_loss:0.3856 lr:0.0003000000 time/step:117.70s
[2025-09-12 09:50:02,634] - step:421/900 train_loss:0.3832 lr:0.0003000000 time/step:117.02s
[2025-09-12 09:51:59,099] - step:422/900 train_loss:0.4000 lr:0.0003000000 time/step:116.45s
[2025-09-12 09:53:56,083] - step:423/900 train_loss:0.4182 lr:0.0003000000 time/step:116.98s
[2025-09-12 09:55:53,683] - step:424/900 train_loss:0.4064 lr:0.0003000000 time/step:117.60s
[2025-09-12 09:57:49,838] - step:425/900 train_loss:0.4186 lr:0.0003000000 time/step:116.14s
[2025-09-12 09:59:47,210] - step:426/900 train_loss:0.4251 lr:0.0003000000 time/step:117.36s
[2025-09-12 10:01:43,887] - step:427/900 train_loss:0.3975 lr:0.0003000000 time/step:116.67s
[2025-09-12 10:03:40,560] - step:428/900 train_loss:0.4212 lr:0.0003000000 time/step:116.66s
[2025-09-12 10:05:37,859] - step:429/900 train_loss:0.4118 lr:0.0003000000 time/step:117.29s
[2025-09-12 10:07:35,749] - step:430/900 train_loss:0.3981 lr:0.0003000000 time/step:117.88s
[2025-09-12 10:09:32,291] - step:431/900 train_loss:0.4237 lr:0.0003000000 time/step:116.53s
[2025-09-12 10:11:29,229] - step:432/900 train_loss:0.3926 lr:0.0003000000 time/step:116.93s
[2025-09-12 10:13:26,136] - step:433/900 train_loss:0.4208 lr:0.0003000000 time/step:116.90s
[2025-09-12 10:15:22,577] - step:434/900 train_loss:0.4102 lr:0.0003000000 time/step:116.44s
[2025-09-12 10:17:19,961] - step:435/900 train_loss:0.4373 lr:0.0003000000 time/step:117.38s
[2025-09-12 10:19:18,170] - step:436/900 train_loss:0.4159 lr:0.0003000000 time/step:118.20s
[2025-09-12 10:21:13,810] - step:437/900 train_loss:0.4083 lr:0.0003000000 time/step:115.63s
[2025-09-12 10:23:10,450] - step:438/900 train_loss:0.4361 lr:0.0003000000 time/step:116.63s
[2025-09-12 10:25:07,257] - step:439/900 train_loss:0.4152 lr:0.0003000000 time/step:116.80s
[2025-09-12 10:27:04,621] - step:440/900 train_loss:0.4100 lr:0.0003000000 time/step:117.36s
[2025-09-12 10:29:01,561] - step:441/900 train_loss:0.4003 lr:0.0003000000 time/step:116.93s
[2025-09-12 10:30:58,928] - step:442/900 train_loss:0.4296 lr:0.0003000000 time/step:117.36s
[2025-09-12 10:32:54,885] - step:443/900 train_loss:0.4175 lr:0.0003000000 time/step:115.95s
[2025-09-12 10:34:51,250] - step:444/900 train_loss:0.4220 lr:0.0003000000 time/step:116.36s
[2025-09-12 10:36:48,671] - step:445/900 train_loss:0.4361 lr:0.0003000000 time/step:117.42s
[2025-09-12 10:38:46,902] - step:446/900 train_loss:0.4034 lr:0.0003000000 time/step:118.22s
[2025-09-12 10:40:44,143] - step:447/900 train_loss:0.4121 lr:0.0003000000 time/step:117.22s
[2025-09-12 10:42:40,558] - step:448/900 train_loss:0.4247 lr:0.0003000000 time/step:116.40s
[2025-09-12 10:44:37,203] - step:449/900 train_loss:0.4502 lr:0.0003000000 time/step:116.64s
[2025-09-12 10:46:34,074] - step:450/900 train_loss:0.4202 lr:0.0003000000 time/step:116.87s
[2025-09-12 10:48:32,574] - step:451/900 train_loss:0.4115 lr:0.0003000000 time/step:118.50s
[2025-09-12 10:50:30,519] - step:452/900 train_loss:0.4416 lr:0.0003000000 time/step:117.93s
[2025-09-12 10:52:27,400] - step:453/900 train_loss:0.4589 lr:0.0003000000 time/step:116.87s
[2025-09-12 10:54:23,502] - step:454/900 train_loss:0.4104 lr:0.0003000000 time/step:116.09s
[2025-09-12 10:56:20,043] - step:455/900 train_loss:0.4428 lr:0.0003000000 time/step:116.54s
[2025-09-12 10:58:18,649] - step:456/900 train_loss:0.3869 lr:0.0003000000 time/step:118.60s
[2025-09-12 11:00:16,434] - step:457/900 train_loss:0.3896 lr:0.0003000000 time/step:117.77s
[2025-09-12 11:02:12,853] - step:458/900 train_loss:0.4199 lr:0.0003000000 time/step:116.41s
[2025-09-12 11:04:09,871] - step:459/900 train_loss:0.4109 lr:0.0003000000 time/step:117.00s
[2025-09-12 11:06:05,943] - step:460/900 train_loss:0.4113 lr:0.0003000000 time/step:116.07s
[2025-09-12 11:08:02,527] - step:461/900 train_loss:0.3895 lr:0.0003000000 time/step:116.58s
[2025-09-12 11:10:00,790] - step:462/900 train_loss:0.4033 lr:0.0003000000 time/step:118.26s
[2025-09-12 11:11:58,115] - step:463/900 train_loss:0.4269 lr:0.0003000000 time/step:117.32s
[2025-09-12 11:13:54,593] - step:464/900 train_loss:0.4080 lr:0.0003000000 time/step:116.46s
[2025-09-12 11:15:51,480] - step:465/900 train_loss:0.4208 lr:0.0003000000 time/step:116.88s
[2025-09-12 11:17:48,283] - step:466/900 train_loss:0.4146 lr:0.0003000000 time/step:116.80s
[2025-09-12 11:19:44,666] - step:467/900 train_loss:0.4178 lr:0.0003000000 time/step:116.38s
[2025-09-12 11:21:43,091] - step:468/900 train_loss:0.4065 lr:0.0003000000 time/step:118.42s
[2025-09-12 11:23:40,099] - step:469/900 train_loss:0.4158 lr:0.0003000000 time/step:117.00s
[2025-09-12 11:25:36,537] - step:470/900 train_loss:0.3969 lr:0.0003000000 time/step:116.43s
[2025-09-12 11:27:34,080] - step:471/900 train_loss:0.4355 lr:0.0003000000 time/step:117.54s
[2025-09-12 11:29:30,162] - step:472/900 train_loss:0.3901 lr:0.0003000000 time/step:116.08s
[2025-09-12 11:31:28,047] - step:473/900 train_loss:0.4142 lr:0.0003000000 time/step:117.88s
[2025-09-12 11:33:24,570] - step:474/900 train_loss:0.4396 lr:0.0003000000 time/step:116.51s
[2025-09-12 11:35:21,454] - step:475/900 train_loss:0.3944 lr:0.0003000000 time/step:116.88s
[2025-09-12 11:37:18,778] - step:476/900 train_loss:0.4112 lr:0.0003000000 time/step:117.32s
[2025-09-12 11:39:15,275] - step:477/900 train_loss:0.4239 lr:0.0003000000 time/step:116.49s
[2025-09-12 11:41:11,285] - step:478/900 train_loss:0.4200 lr:0.0003000000 time/step:116.01s
[2025-09-12 11:43:08,711] - step:479/900 train_loss:0.4177 lr:0.0003000000 time/step:117.41s
[2025-09-12 11:45:05,127] - step:480/900 train_loss:0.3939 lr:0.0003000000 time/step:116.41s
[2025-09-12 11:47:02,193] - step:481/900 train_loss:0.4138 lr:0.0003000000 time/step:117.06s
[2025-09-12 11:48:59,561] - step:482/900 train_loss:0.4252 lr:0.0003000000 time/step:117.36s
[2025-09-12 11:50:55,554] - step:483/900 train_loss:0.4048 lr:0.0003000000 time/step:115.99s
[2025-09-12 11:52:52,805] - step:484/900 train_loss:0.4000 lr:0.0003000000 time/step:117.24s
[2025-09-12 11:54:49,667] - step:485/900 train_loss:0.4216 lr:0.0003000000 time/step:116.85s
[2025-09-12 11:56:46,072] - step:486/900 train_loss:0.4095 lr:0.0003000000 time/step:116.40s
[2025-09-12 11:58:43,074] - step:487/900 train_loss:0.4027 lr:0.0003000000 time/step:117.00s
[2025-09-12 12:00:40,979] - step:488/900 train_loss:0.4245 lr:0.0003000000 time/step:117.90s
[2025-09-12 12:02:38,064] - step:489/900 train_loss:0.3942 lr:0.0003000000 time/step:117.08s
[2025-09-12 12:04:34,804] - step:490/900 train_loss:0.4239 lr:0.0003000000 time/step:116.72s
[2025-09-12 12:06:31,269] - step:491/900 train_loss:0.3853 lr:0.0003000000 time/step:116.46s
[2025-09-12 12:08:28,111] - step:492/900 train_loss:0.4141 lr:0.0003000000 time/step:116.84s
[2025-09-12 12:10:24,954] - step:493/900 train_loss:0.4139 lr:0.0003000000 time/step:116.84s
[2025-09-12 12:12:22,937] - step:494/900 train_loss:0.4166 lr:0.0003000000 time/step:117.98s
[2025-09-12 12:14:20,061] - step:495/900 train_loss:0.3974 lr:0.0003000000 time/step:117.11s
[2025-09-12 12:16:16,526] - step:496/900 train_loss:0.4149 lr:0.0003000000 time/step:116.46s
[2025-09-12 12:18:13,009] - step:497/900 train_loss:0.4181 lr:0.0003000000 time/step:116.48s
[2025-09-12 12:20:09,790] - step:498/900 train_loss:0.4166 lr:0.0003000000 time/step:116.78s
[2025-09-12 12:22:06,615] - step:499/900 train_loss:0.4216 lr:0.0003000000 time/step:116.82s
[2025-09-12 12:24:05,337] - step:500/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@500.pt...
[2025-09-12 12:24:05,343] - step:500/900 train_loss:0.4161 lr:0.0003000000 time/step:118.13s
[2025-09-12 12:26:01,342] - step:501/900 train_loss:0.4010 lr:0.0003000000 time/step:116.00s
[2025-09-12 12:27:58,345] - step:502/900 train_loss:0.4042 lr:0.0003000000 time/step:116.99s
[2025-09-12 12:29:54,389] - step:503/900 train_loss:0.4216 lr:0.0003000000 time/step:116.04s
[2025-09-12 12:31:51,252] - step:504/900 train_loss:0.4127 lr:0.0003000000 time/step:116.86s
[2025-09-12 12:33:49,558] - step:505/900 train_loss:0.4019 lr:0.0003000000 time/step:118.29s
[2025-09-12 12:35:46,199] - step:506/900 train_loss:0.4076 lr:0.0003000000 time/step:116.64s
[2025-09-12 12:37:42,246] - step:507/900 train_loss:0.4207 lr:0.0003000000 time/step:116.04s
[2025-09-12 12:39:39,229] - step:508/900 train_loss:0.4258 lr:0.0003000000 time/step:116.98s
[2025-09-12 12:41:35,709] - step:509/900 train_loss:0.3826 lr:0.0003000000 time/step:116.48s
[2025-09-12 12:43:32,441] - step:510/900 train_loss:0.4092 lr:0.0003000000 time/step:116.72s
[2025-09-12 12:45:30,539] - step:511/900 train_loss:0.3954 lr:0.0003000000 time/step:118.09s
[2025-09-12 12:47:27,041] - step:512/900 train_loss:0.4335 lr:0.0003000000 time/step:116.49s
[2025-09-12 12:49:23,522] - step:513/900 train_loss:0.4216 lr:0.0003000000 time/step:116.47s
[2025-09-12 12:51:20,467] - step:514/900 train_loss:0.3952 lr:0.0003000000 time/step:116.94s
[2025-09-12 12:53:17,452] - step:515/900 train_loss:0.4052 lr:0.0003000000 time/step:116.98s
[2025-09-12 12:55:14,098] - step:516/900 train_loss:0.4145 lr:0.0003000000 time/step:116.64s
[2025-09-12 12:57:11,620] - step:517/900 train_loss:0.4292 lr:0.0003000000 time/step:117.51s
[2025-09-12 12:59:09,139] - step:518/900 train_loss:0.4204 lr:0.0003000000 time/step:117.51s
[2025-09-12 13:01:05,186] - step:519/900 train_loss:0.3932 lr:0.0003000000 time/step:116.04s
[2025-09-12 13:03:01,731] - step:520/900 train_loss:0.4226 lr:0.0003000000 time/step:116.54s
[2025-09-12 13:04:59,398] - step:521/900 train_loss:0.4080 lr:0.0003000000 time/step:117.65s
[2025-09-12 13:06:56,876] - step:522/900 train_loss:0.4079 lr:0.0003000000 time/step:117.47s
[2025-09-12 13:08:53,784] - step:523/900 train_loss:0.4375 lr:0.0003000000 time/step:116.90s
[2025-09-12 13:11:18,031] - step:524/900 train_loss:0.3876 lr:0.0003000000 time/step:144.24s
[2025-09-12 13:13:14,894] - step:525/900 train_loss:0.4133 lr:0.0003000000 time/step:116.82s
[2025-09-12 13:15:16,203] - step:526/900 train_loss:0.3961 lr:0.0003000000 time/step:118.95s
[2025-09-12 13:17:12,922] - step:527/900 train_loss:0.3895 lr:0.0003000000 time/step:116.71s
[2025-09-12 13:19:09,906] - step:528/900 train_loss:0.4204 lr:0.0003000000 time/step:116.98s
[2025-09-12 13:21:08,032] - step:529/900 train_loss:0.4078 lr:0.0003000000 time/step:118.12s
[2025-09-12 13:23:04,450] - step:530/900 train_loss:0.3973 lr:0.0003000000 time/step:116.41s
[2025-09-12 13:25:02,156] - step:531/900 train_loss:0.3875 lr:0.0003000000 time/step:117.69s
[2025-09-12 13:26:58,851] - step:532/900 train_loss:0.3979 lr:0.0003000000 time/step:116.69s
[2025-09-12 13:28:55,552] - step:533/900 train_loss:0.4210 lr:0.0003000000 time/step:116.69s
[2025-09-12 13:30:52,352] - step:534/900 train_loss:0.4016 lr:0.0003000000 time/step:116.80s
[2025-09-12 13:32:50,584] - step:535/900 train_loss:0.3971 lr:0.0003000000 time/step:118.23s
[2025-09-12 13:34:47,330] - step:536/900 train_loss:0.4167 lr:0.0003000000 time/step:116.73s
[2025-09-12 13:36:44,747] - step:537/900 train_loss:0.4366 lr:0.0003000000 time/step:117.39s
[2025-09-12 13:38:42,456] - step:538/900 train_loss:0.4267 lr:0.0003000000 time/step:117.71s
[2025-09-12 13:40:38,661] - step:539/900 train_loss:0.4092 lr:0.0003000000 time/step:116.20s
[2025-09-12 13:42:38,305] - step:540/900 train_loss:0.4273 lr:0.0003000000 time/step:119.62s
[2025-09-12 13:44:37,524] - step:541/900 train_loss:0.4157 lr:0.0003000000 time/step:119.17s
[2025-09-12 13:46:33,425] - step:542/900 train_loss:0.4237 lr:0.0003000000 time/step:115.89s
[2025-09-12 13:48:30,101] - step:543/900 train_loss:0.4052 lr:0.0003000000 time/step:116.67s
[2025-09-12 13:50:27,196] - step:544/900 train_loss:0.4260 lr:0.0003000000 time/step:117.09s
[2025-09-12 13:52:24,079] - step:545/900 train_loss:0.4021 lr:0.0003000000 time/step:116.88s
[2025-09-12 13:54:21,661] - step:546/900 train_loss:0.3897 lr:0.0003000000 time/step:117.57s
[2025-09-12 13:56:19,479] - step:547/900 train_loss:0.4029 lr:0.0003000000 time/step:117.81s
[2025-09-12 13:58:15,488] - step:548/900 train_loss:0.4107 lr:0.0003000000 time/step:116.00s
[2025-09-12 14:00:11,893] - step:549/900 train_loss:0.4159 lr:0.0003000000 time/step:116.40s
[2025-09-12 14:02:08,916] - step:550/900 train_loss:0.4075 lr:0.0003000000 time/step:117.01s
[2025-09-12 14:04:06,359] - step:551/900 train_loss:0.3932 lr:0.0003000000 time/step:117.43s
[2025-09-12 14:06:02,862] - step:552/900 train_loss:0.4110 lr:0.0003000000 time/step:116.49s
[2025-09-12 14:08:00,226] - step:553/900 train_loss:0.4250 lr:0.0003000000 time/step:117.36s
[2025-09-12 14:09:56,780] - step:554/900 train_loss:0.3990 lr:0.0003000000 time/step:116.54s
[2025-09-12 14:11:53,353] - step:555/900 train_loss:0.4041 lr:0.0003000000 time/step:116.56s
[2025-09-12 14:13:50,235] - step:556/900 train_loss:0.4062 lr:0.0003000000 time/step:116.87s
[2025-09-12 14:15:47,160] - step:557/900 train_loss:0.4144 lr:0.0003000000 time/step:116.92s
[2025-09-12 14:17:44,967] - step:558/900 train_loss:0.4032 lr:0.0003000000 time/step:117.80s
[2025-09-12 14:19:40,685] - step:559/900 train_loss:0.4082 lr:0.0003000000 time/step:115.71s
[2025-09-12 14:21:37,889] - step:560/900 train_loss:0.4140 lr:0.0003000000 time/step:117.20s
[2025-09-12 14:23:34,834] - step:561/900 train_loss:0.4284 lr:0.0003000000 time/step:116.94s
[2025-09-12 14:25:31,517] - step:562/900 train_loss:0.4096 lr:0.0003000000 time/step:116.67s
[2025-09-12 14:27:29,793] - step:563/900 train_loss:0.4017 lr:0.0003000000 time/step:118.26s
[2025-09-12 14:29:26,683] - step:564/900 train_loss:0.4014 lr:0.0003000000 time/step:116.88s
[2025-09-12 14:31:22,468] - step:565/900 train_loss:0.4061 lr:0.0003000000 time/step:115.78s
[2025-09-12 14:33:19,190] - step:566/900 train_loss:0.4188 lr:0.0003000000 time/step:116.72s
[2025-09-12 14:35:16,130] - step:567/900 train_loss:0.4305 lr:0.0003000000 time/step:116.93s
[2025-09-12 14:37:13,373] - step:568/900 train_loss:0.3922 lr:0.0003000000 time/step:117.24s
[2025-09-12 14:39:10,305] - step:569/900 train_loss:0.4190 lr:0.0003000000 time/step:116.92s
[2025-09-12 14:41:07,121] - step:570/900 train_loss:0.4047 lr:0.0003000000 time/step:116.81s
[2025-09-12 14:43:03,948] - step:571/900 train_loss:0.4152 lr:0.0003000000 time/step:116.82s
[2025-09-12 14:45:00,151] - step:572/900 train_loss:0.3946 lr:0.0003000000 time/step:116.19s
[2025-09-12 14:46:57,634] - step:573/900 train_loss:0.4138 lr:0.0003000000 time/step:117.48s
[2025-09-12 14:48:55,022] - step:574/900 train_loss:0.4231 lr:0.0003000000 time/step:117.37s
[2025-09-12 14:50:50,877] - step:575/900 train_loss:0.3978 lr:0.0003000000 time/step:115.85s
[2025-09-12 14:52:49,128] - step:576/900 train_loss:0.4169 lr:0.0003000000 time/step:118.25s
[2025-09-12 14:54:45,289] - step:577/900 train_loss:0.3971 lr:0.0003000000 time/step:116.15s
[2025-09-12 14:56:41,851] - step:578/900 train_loss:0.4058 lr:0.0003000000 time/step:116.56s
[2025-09-12 14:58:38,779] - step:579/900 train_loss:0.4105 lr:0.0003000000 time/step:116.92s
[2025-09-12 15:00:35,657] - step:580/900 train_loss:0.4145 lr:0.0003000000 time/step:116.87s
[2025-09-12 15:02:33,021] - step:581/900 train_loss:0.4067 lr:0.0003000000 time/step:117.36s
[2025-09-12 15:04:29,564] - step:582/900 train_loss:0.4209 lr:0.0003000000 time/step:116.53s
[2025-09-12 15:06:26,089] - step:583/900 train_loss:0.4106 lr:0.0003000000 time/step:116.52s
[2025-09-12 15:08:22,953] - step:584/900 train_loss:0.4220 lr:0.0003000000 time/step:116.86s
[2025-09-12 15:10:19,376] - step:585/900 train_loss:0.4001 lr:0.0003000000 time/step:116.41s
[2025-09-12 15:12:16,440] - step:586/900 train_loss:0.3963 lr:0.0003000000 time/step:117.06s
[2025-09-12 15:14:14,343] - step:587/900 train_loss:0.4118 lr:0.0003000000 time/step:117.89s
[2025-09-12 15:16:10,568] - step:588/900 train_loss:0.4285 lr:0.0003000000 time/step:116.22s
[2025-09-12 15:18:06,609] - step:589/900 train_loss:0.4177 lr:0.0003000000 time/step:116.04s
[2025-09-12 15:20:03,934] - step:590/900 train_loss:0.4256 lr:0.0003000000 time/step:117.32s
[2025-09-12 15:22:00,505] - step:591/900 train_loss:0.4258 lr:0.0003000000 time/step:116.57s
[2025-09-12 15:23:57,739] - step:592/900 train_loss:0.4031 lr:0.0003000000 time/step:117.19s
[2025-09-12 15:25:55,502] - step:593/900 train_loss:0.3975 lr:0.0003000000 time/step:117.76s
[2025-09-12 15:27:51,604] - step:594/900 train_loss:0.4098 lr:0.0003000000 time/step:116.10s
[2025-09-12 15:29:48,152] - step:595/900 train_loss:0.4044 lr:0.0003000000 time/step:116.54s
[2025-09-12 15:31:45,056] - step:596/900 train_loss:0.4394 lr:0.0003000000 time/step:116.89s
[2025-09-12 15:33:42,598] - step:597/900 train_loss:0.4166 lr:0.0003000000 time/step:117.54s
[2025-09-12 15:35:38,903] - step:598/900 train_loss:0.3857 lr:0.0003000000 time/step:116.29s
[2025-09-12 15:37:35,947] - step:599/900 train_loss:0.3944 lr:0.0003000000 time/step:117.04s
[2025-09-12 15:39:32,999] - step:600/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@600.pt...
[2025-09-12 15:39:33,008] - step:600/900 train_loss:0.4121 lr:0.0003000000 time/step:116.45s
[2025-09-12 15:41:29,871] - step:601/900 train_loss:0.4055 lr:0.0003000000 time/step:116.86s
[2025-09-12 15:43:27,496] - step:602/900 train_loss:0.4131 lr:0.0003000000 time/step:117.62s
[2025-09-12 15:45:24,775] - step:603/900 train_loss:0.4117 lr:0.0003000000 time/step:117.27s
[2025-09-12 15:47:21,843] - step:604/900 train_loss:0.4073 lr:0.0003000000 time/step:117.06s
[2025-09-12 15:49:19,207] - step:605/900 train_loss:0.3994 lr:0.0003000000 time/step:117.35s
[2025-09-12 15:51:15,705] - step:606/900 train_loss:0.4006 lr:0.0003000000 time/step:116.49s
[2025-09-12 15:53:12,651] - step:607/900 train_loss:0.4087 lr:0.0003000000 time/step:116.94s
[2025-09-12 15:55:10,370] - step:608/900 train_loss:0.4194 lr:0.0003000000 time/step:117.71s
[2025-09-12 15:57:07,183] - step:609/900 train_loss:0.4059 lr:0.0003000000 time/step:116.80s
[2025-09-12 15:59:03,945] - step:610/900 train_loss:0.3960 lr:0.0003000000 time/step:116.75s
[2025-09-12 16:01:01,845] - step:611/900 train_loss:0.4203 lr:0.0003000000 time/step:117.89s
[2025-09-12 16:02:57,870] - step:612/900 train_loss:0.4208 lr:0.0003000000 time/step:116.02s
[2025-09-12 16:04:55,209] - step:613/900 train_loss:0.4205 lr:0.0003000000 time/step:117.33s
[2025-09-12 16:06:52,213] - step:614/900 train_loss:0.4023 lr:0.0003000000 time/step:117.00s
[2025-09-12 16:08:49,316] - step:615/900 train_loss:0.4011 lr:0.0003000000 time/step:117.09s
[2025-09-12 16:10:47,341] - step:616/900 train_loss:0.3898 lr:0.0003000000 time/step:118.02s
[2025-09-12 16:12:44,306] - step:617/900 train_loss:0.4223 lr:0.0003000000 time/step:116.96s
[2025-09-12 16:14:41,166] - step:618/900 train_loss:0.4022 lr:0.0003000000 time/step:116.85s
[2025-09-12 16:16:38,068] - step:619/900 train_loss:0.4259 lr:0.0003000000 time/step:116.89s
[2025-09-12 16:18:35,272] - step:620/900 train_loss:0.4129 lr:0.0003000000 time/step:117.20s
[2025-09-12 16:20:32,436] - step:621/900 train_loss:0.4122 lr:0.0003000000 time/step:117.13s
[2025-09-12 16:22:30,553] - step:622/900 train_loss:0.4185 lr:0.0003000000 time/step:118.10s
[2025-09-12 16:24:27,881] - step:623/900 train_loss:0.3991 lr:0.0003000000 time/step:117.28s
[2025-09-12 16:26:24,425] - step:624/900 train_loss:0.4208 lr:0.0003000000 time/step:116.53s
[2025-09-12 16:28:21,471] - step:625/900 train_loss:0.4276 lr:0.0003000000 time/step:117.04s
[2025-09-12 16:30:19,129] - step:626/900 train_loss:0.4259 lr:0.0003000000 time/step:117.64s
[2025-09-12 16:32:19,616] - step:627/900 train_loss:0.3848 lr:0.0003000000 time/step:120.47s
[2025-09-12 16:34:17,638] - step:628/900 train_loss:0.4005 lr:0.0003000000 time/step:118.02s
[2025-09-12 16:36:14,359] - step:629/900 train_loss:0.3988 lr:0.0003000000 time/step:116.71s
[2025-09-12 16:38:11,222] - step:630/900 train_loss:0.4181 lr:0.0003000000 time/step:116.86s
[2025-09-12 16:40:08,509] - step:631/900 train_loss:0.4042 lr:0.0003000000 time/step:117.28s
[2025-09-12 16:42:06,712] - step:632/900 train_loss:0.4010 lr:0.0003000000 time/step:118.19s
[2025-09-12 16:44:03,814] - step:633/900 train_loss:0.4108 lr:0.0003000000 time/step:117.10s
[2025-09-12 16:46:01,576] - step:634/900 train_loss:0.4218 lr:0.0003000000 time/step:117.65s
[2025-09-12 16:47:57,601] - step:635/900 train_loss:0.4339 lr:0.0003000000 time/step:116.02s
[2025-09-12 16:49:54,473] - step:636/900 train_loss:0.4252 lr:0.0003000000 time/step:116.86s
[2025-09-12 16:51:52,707] - step:637/900 train_loss:0.3961 lr:0.0003000000 time/step:118.19s
[2025-09-12 16:53:50,406] - step:638/900 train_loss:0.4049 lr:0.0003000000 time/step:117.69s
[2025-09-12 16:55:48,233] - step:639/900 train_loss:0.4217 lr:0.0003000000 time/step:117.81s
[2025-09-12 16:57:44,596] - step:640/900 train_loss:0.4046 lr:0.0003000000 time/step:116.35s
[2025-09-12 16:59:40,200] - step:641/900 train_loss:0.4136 lr:0.0003000000 time/step:115.60s
[2025-09-12 17:01:37,286] - step:642/900 train_loss:0.4027 lr:0.0003000000 time/step:117.08s
[2025-09-12 17:03:35,226] - step:643/900 train_loss:0.3820 lr:0.0003000000 time/step:117.93s
[2025-09-12 17:05:33,570] - step:644/900 train_loss:0.4089 lr:0.0003000000 time/step:118.33s
[2025-09-12 17:07:30,395] - step:645/900 train_loss:0.3874 lr:0.0003000000 time/step:116.82s
[2025-09-12 17:09:27,297] - step:646/900 train_loss:0.4146 lr:0.0003000000 time/step:116.90s
[2025-09-12 17:11:23,362] - step:647/900 train_loss:0.3988 lr:0.0003000000 time/step:116.06s
[2025-09-12 17:13:20,787] - step:648/900 train_loss:0.4128 lr:0.0003000000 time/step:117.42s
[2025-09-12 17:15:18,588] - step:649/900 train_loss:0.4332 lr:0.0003000000 time/step:117.79s
[2025-09-12 17:17:16,062] - step:650/900 train_loss:0.4214 lr:0.0003000000 time/step:117.47s
[2025-09-12 17:19:12,730] - step:651/900 train_loss:0.4074 lr:0.0003000000 time/step:116.66s
[2025-09-12 17:21:09,550] - step:652/900 train_loss:0.4025 lr:0.0003000000 time/step:116.81s
[2025-09-12 17:23:05,702] - step:653/900 train_loss:0.4008 lr:0.0003000000 time/step:116.15s
[2025-09-12 17:25:03,925] - step:654/900 train_loss:0.4060 lr:0.0003000000 time/step:118.18s
[2025-09-12 17:27:02,401] - step:655/900 train_loss:0.3931 lr:0.0003000000 time/step:118.47s
[2025-09-12 17:28:59,392] - step:656/900 train_loss:0.3985 lr:0.0003000000 time/step:116.97s
[2025-09-12 17:30:56,335] - step:657/900 train_loss:0.4319 lr:0.0003000000 time/step:116.93s
[2025-09-12 17:32:52,897] - step:658/900 train_loss:0.4200 lr:0.0003000000 time/step:116.56s
[2025-09-12 17:34:50,643] - step:659/900 train_loss:0.3811 lr:0.0003000000 time/step:117.73s
[2025-09-12 17:36:47,661] - step:660/900 train_loss:0.3960 lr:0.0003000000 time/step:117.00s
[2025-09-12 17:38:45,367] - step:661/900 train_loss:0.3810 lr:0.0003000000 time/step:117.70s
[2025-09-12 17:40:42,471] - step:662/900 train_loss:0.3948 lr:0.0003000000 time/step:117.10s
[2025-09-12 17:42:39,354] - step:663/900 train_loss:0.4221 lr:0.0003000000 time/step:116.86s
[2025-09-12 17:44:37,177] - step:664/900 train_loss:0.4021 lr:0.0003000000 time/step:117.82s
[2025-09-12 17:46:33,621] - step:665/900 train_loss:0.4521 lr:0.0003000000 time/step:116.43s
[2025-09-12 17:48:31,225] - step:666/900 train_loss:0.4265 lr:0.0003000000 time/step:117.60s
[2025-09-12 17:50:28,126] - step:667/900 train_loss:0.4109 lr:0.0003000000 time/step:116.89s
[2025-09-12 17:52:25,032] - step:668/900 train_loss:0.4247 lr:0.0003000000 time/step:116.90s
[2025-09-12 17:54:22,433] - step:669/900 train_loss:0.4024 lr:0.0003000000 time/step:117.40s
[2025-09-12 17:56:19,263] - step:670/900 train_loss:0.4238 lr:0.0003000000 time/step:116.81s
[2025-09-12 17:58:15,840] - step:671/900 train_loss:0.4240 lr:0.0003000000 time/step:116.57s
[2025-09-12 18:00:13,196] - step:672/900 train_loss:0.4079 lr:0.0003000000 time/step:117.35s
[2025-09-12 18:02:09,946] - step:673/900 train_loss:0.4152 lr:0.0003000000 time/step:116.74s
[2025-09-12 18:04:08,272] - step:674/900 train_loss:0.4386 lr:0.0003000000 time/step:118.32s
[2025-09-12 18:06:05,695] - step:675/900 train_loss:0.3944 lr:0.0003000000 time/step:117.41s
[2025-09-12 18:08:01,761] - step:676/900 train_loss:0.3997 lr:0.0003000000 time/step:116.05s
[2025-09-12 18:09:59,340] - step:677/900 train_loss:0.4081 lr:0.0003000000 time/step:117.57s
[2025-09-12 18:11:56,223] - step:678/900 train_loss:0.4326 lr:0.0003000000 time/step:116.88s
[2025-09-12 18:13:53,528] - step:679/900 train_loss:0.4058 lr:0.0003000000 time/step:117.30s
[2025-09-12 18:15:51,604] - step:680/900 train_loss:0.4257 lr:0.0003000000 time/step:118.06s
[2025-09-12 18:17:48,495] - step:681/900 train_loss:0.4226 lr:0.0003000000 time/step:116.88s
[2025-09-12 18:19:44,618] - step:682/900 train_loss:0.3978 lr:0.0003000000 time/step:116.12s
[2025-09-12 18:21:41,760] - step:683/900 train_loss:0.4064 lr:0.0003000000 time/step:117.14s
[2025-09-12 18:23:38,665] - step:684/900 train_loss:0.3959 lr:0.0003000000 time/step:116.90s
[2025-09-12 18:25:36,029] - step:685/900 train_loss:0.4136 lr:0.0003000000 time/step:117.35s
[2025-09-12 18:27:33,774] - step:686/900 train_loss:0.4058 lr:0.0003000000 time/step:117.62s
[2025-09-12 18:29:30,658] - step:687/900 train_loss:0.4132 lr:0.0003000000 time/step:116.88s
[2025-09-12 18:31:27,420] - step:688/900 train_loss:0.4048 lr:0.0003000000 time/step:116.76s
[2025-09-12 18:33:24,361] - step:689/900 train_loss:0.4023 lr:0.0003000000 time/step:116.94s
[2025-09-12 18:35:21,754] - step:690/900 train_loss:0.3715 lr:0.0003000000 time/step:117.38s
[2025-09-12 18:37:19,552] - step:691/900 train_loss:0.4017 lr:0.0003000000 time/step:117.78s
[2025-09-12 18:39:16,412] - step:692/900 train_loss:0.4232 lr:0.0003000000 time/step:116.85s
[2025-09-12 18:41:13,974] - step:693/900 train_loss:0.4196 lr:0.0003000000 time/step:117.55s
[2025-09-12 18:43:10,197] - step:694/900 train_loss:0.4010 lr:0.0003000000 time/step:116.22s
[2025-09-12 18:45:07,263] - step:695/900 train_loss:0.3904 lr:0.0003000000 time/step:117.06s
[2025-09-12 18:47:05,813] - step:696/900 train_loss:0.4152 lr:0.0003000000 time/step:118.53s
[2025-09-12 18:49:02,863] - step:697/900 train_loss:0.4064 lr:0.0003000000 time/step:117.04s
[2025-09-12 18:50:59,812] - step:698/900 train_loss:0.3980 lr:0.0003000000 time/step:116.94s
[2025-09-12 18:52:57,370] - step:699/900 train_loss:0.3884 lr:0.0003000000 time/step:117.55s
[2025-09-12 18:54:54,648] - step:700/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@700.pt...
[2025-09-12 18:54:54,648] - step:700/900 train_loss:0.3973 lr:0.0003000000 time/step:116.56s
[2025-09-12 18:56:52,677] - step:701/900 train_loss:0.4030 lr:0.0003000000 time/step:118.01s
[2025-09-12 18:58:49,207] - step:702/900 train_loss:0.3937 lr:0.0003000000 time/step:116.52s
[2025-09-12 19:00:46,170] - step:703/900 train_loss:0.4356 lr:0.0003000000 time/step:116.96s
[2025-09-12 19:02:43,288] - step:704/900 train_loss:0.4294 lr:0.0003000000 time/step:117.11s
[2025-09-12 19:04:40,157] - step:705/900 train_loss:0.4150 lr:0.0003000000 time/step:116.86s
[2025-09-12 19:06:38,620] - step:706/900 train_loss:0.4153 lr:0.0003000000 time/step:118.45s
[2025-09-12 19:08:35,564] - step:707/900 train_loss:0.3966 lr:0.0003000000 time/step:116.92s
[2025-09-12 19:10:32,548] - step:708/900 train_loss:0.4221 lr:0.0003000000 time/step:116.98s
[2025-09-12 19:12:29,132] - step:709/900 train_loss:0.3952 lr:0.0003000000 time/step:116.58s
[2025-09-12 19:14:26,936] - step:710/900 train_loss:0.3849 lr:0.0003000000 time/step:117.80s
[2025-09-12 19:16:24,310] - step:711/900 train_loss:0.4114 lr:0.0003000000 time/step:117.36s
[2025-09-12 19:18:20,956] - step:712/900 train_loss:0.4173 lr:0.0003000000 time/step:116.64s
[2025-09-12 19:20:18,010] - step:713/900 train_loss:0.3898 lr:0.0003000000 time/step:117.05s
[2025-09-12 19:22:14,781] - step:714/900 train_loss:0.4088 lr:0.0003000000 time/step:116.76s
[2025-09-12 19:24:11,349] - step:715/900 train_loss:0.3975 lr:0.0003000000 time/step:116.56s
[2025-09-12 19:26:09,929] - step:716/900 train_loss:0.4089 lr:0.0003000000 time/step:118.57s
[2025-09-12 19:28:06,399] - step:717/900 train_loss:0.3964 lr:0.0003000000 time/step:116.46s
[2025-09-12 19:30:02,907] - step:718/900 train_loss:0.4063 lr:0.0003000000 time/step:116.49s
[2025-09-12 19:31:59,817] - step:719/900 train_loss:0.3934 lr:0.0003000000 time/step:116.90s
[2025-09-12 19:33:56,770] - step:720/900 train_loss:0.3953 lr:0.0003000000 time/step:116.95s
[2025-09-12 19:35:54,685] - step:721/900 train_loss:0.4275 lr:0.0003000000 time/step:117.90s
[2025-09-12 19:37:52,051] - step:722/900 train_loss:0.4074 lr:0.0003000000 time/step:117.36s
[2025-09-12 19:39:48,232] - step:723/900 train_loss:0.4163 lr:0.0003000000 time/step:116.18s
[2025-09-12 19:41:45,737] - step:724/900 train_loss:0.4015 lr:0.0003000000 time/step:117.50s
[2025-09-12 19:43:42,903] - step:725/900 train_loss:0.4202 lr:0.0003000000 time/step:117.16s
[2025-09-12 19:45:40,142] - step:726/900 train_loss:0.4291 lr:0.0003000000 time/step:117.23s
[2025-09-12 19:47:38,767] - step:727/900 train_loss:0.4219 lr:0.0003000000 time/step:118.52s
[2025-09-12 19:49:35,311] - step:728/900 train_loss:0.4267 lr:0.0003000000 time/step:116.54s
[2025-09-12 19:51:31,352] - step:729/900 train_loss:0.4008 lr:0.0003000000 time/step:116.03s
[2025-09-12 19:53:29,152] - step:730/900 train_loss:0.4191 lr:0.0003000000 time/step:117.79s
[2025-09-12 19:55:25,960] - step:731/900 train_loss:0.4093 lr:0.0003000000 time/step:116.80s
[2025-09-12 19:57:23,584] - step:732/900 train_loss:0.4230 lr:0.0003000000 time/step:117.61s
[2025-09-12 19:59:20,804] - step:733/900 train_loss:0.4213 lr:0.0003000000 time/step:117.21s
[2025-09-12 20:01:17,340] - step:734/900 train_loss:0.4071 lr:0.0003000000 time/step:116.53s
[2025-09-12 20:03:14,230] - step:735/900 train_loss:0.3944 lr:0.0003000000 time/step:116.88s
[2025-09-12 20:05:11,216] - step:736/900 train_loss:0.3971 lr:0.0003000000 time/step:116.98s
[2025-09-12 20:07:08,901] - step:737/900 train_loss:0.4144 lr:0.0003000000 time/step:117.67s
[2025-09-12 20:09:06,344] - step:738/900 train_loss:0.4349 lr:0.0003000000 time/step:117.44s
[2025-09-12 20:11:03,286] - step:739/900 train_loss:0.3967 lr:0.0003000000 time/step:116.94s
[2025-09-12 20:12:59,810] - step:740/900 train_loss:0.4104 lr:0.0003000000 time/step:116.52s
[2025-09-12 20:14:56,833] - step:741/900 train_loss:0.4195 lr:0.0003000000 time/step:117.01s
[2025-09-12 20:16:53,836] - step:742/900 train_loss:0.4083 lr:0.0003000000 time/step:116.99s
[2025-09-12 20:18:51,981] - step:743/900 train_loss:0.4021 lr:0.0003000000 time/step:118.14s
[2025-09-12 20:20:48,901] - step:744/900 train_loss:0.4182 lr:0.0003000000 time/step:116.91s
[2025-09-12 20:22:46,747] - step:745/900 train_loss:0.3946 lr:0.0003000000 time/step:117.84s
[2025-09-12 20:24:42,792] - step:746/900 train_loss:0.3826 lr:0.0003000000 time/step:116.03s
[2025-09-12 20:26:39,772] - step:747/900 train_loss:0.4267 lr:0.0003000000 time/step:116.97s
[2025-09-12 20:28:37,994] - step:748/900 train_loss:0.3935 lr:0.0003000000 time/step:118.21s
[2025-09-12 20:30:34,939] - step:749/900 train_loss:0.3979 lr:0.0003000000 time/step:116.93s
[2025-09-12 20:32:32,443] - step:750/900 train_loss:0.4253 lr:0.0003000000 time/step:117.50s
[2025-09-12 20:34:29,466] - step:751/900 train_loss:0.4006 lr:0.0003000000 time/step:117.01s
[2025-09-12 20:36:25,608] - step:752/900 train_loss:0.4219 lr:0.0003000000 time/step:116.13s
[2025-09-12 20:38:22,452] - step:753/900 train_loss:0.3919 lr:0.0003000000 time/step:116.84s
[2025-09-12 20:40:21,076] - step:754/900 train_loss:0.4138 lr:0.0003000000 time/step:118.62s
[2025-09-12 20:42:18,879] - step:755/900 train_loss:0.4144 lr:0.0003000000 time/step:117.79s
[2025-09-12 20:44:15,840] - step:756/900 train_loss:0.4077 lr:0.0003000000 time/step:116.95s
[2025-09-12 20:46:12,684] - step:757/900 train_loss:0.4420 lr:0.0003000000 time/step:116.84s
[2025-09-12 20:48:08,449] - step:758/900 train_loss:0.4310 lr:0.0003000000 time/step:115.75s
[2025-09-12 20:50:06,514] - step:759/900 train_loss:0.4193 lr:0.0003000000 time/step:118.06s
[2025-09-12 20:52:03,393] - step:760/900 train_loss:0.4097 lr:0.0003000000 time/step:116.87s
[2025-09-12 20:54:01,558] - step:761/900 train_loss:0.4206 lr:0.0003000000 time/step:118.16s
[2025-09-12 20:55:58,603] - step:762/900 train_loss:0.4123 lr:0.0003000000 time/step:117.04s
[2025-09-12 20:57:55,067] - step:763/900 train_loss:0.3960 lr:0.0003000000 time/step:116.45s
[2025-09-12 20:59:51,936] - step:764/900 train_loss:0.4299 lr:0.0003000000 time/step:116.85s
[2025-09-12 21:01:50,033] - step:765/900 train_loss:0.4122 lr:0.0003000000 time/step:118.09s
[2025-09-12 21:03:47,856] - step:766/900 train_loss:0.3942 lr:0.0003000000 time/step:117.82s
[2025-09-12 21:05:44,878] - step:767/900 train_loss:0.3948 lr:0.0003000000 time/step:117.01s
[2025-09-12 21:07:41,799] - step:768/900 train_loss:0.3943 lr:0.0003000000 time/step:116.91s
[2025-09-12 21:09:38,205] - step:769/900 train_loss:0.4122 lr:0.0003000000 time/step:116.40s
[2025-09-12 21:11:35,911] - step:770/900 train_loss:0.4029 lr:0.0003000000 time/step:117.70s
[2025-09-12 21:13:33,673] - step:771/900 train_loss:0.3994 lr:0.0003000000 time/step:117.75s
[2025-09-12 21:15:30,614] - step:772/900 train_loss:0.4263 lr:0.0003000000 time/step:116.93s
[2025-09-12 21:17:27,398] - step:773/900 train_loss:0.4199 lr:0.0003000000 time/step:116.77s
[2025-09-12 21:19:24,243] - step:774/900 train_loss:0.4126 lr:0.0003000000 time/step:116.84s
[2025-09-12 21:21:21,644] - step:775/900 train_loss:0.3885 lr:0.0003000000 time/step:117.39s
[2025-09-12 21:23:18,489] - step:776/900 train_loss:0.4123 lr:0.0003000000 time/step:116.84s
[2025-09-12 21:25:16,373] - step:777/900 train_loss:0.3887 lr:0.0003000000 time/step:117.88s
[2025-09-12 21:27:13,296] - step:778/900 train_loss:0.4256 lr:0.0003000000 time/step:116.91s
|
| 813 |
+
[2025-09-12 21:29:10,200] - step:779/900 train_loss:0.4090 lr:0.0003000000 time/step:116.90s
|
| 814 |
+
[2025-09-12 21:31:07,409] - step:780/900 train_loss:0.3895 lr:0.0003000000 time/step:117.20s
|
| 815 |
+
[2025-09-12 21:33:04,490] - step:781/900 train_loss:0.4134 lr:0.0003000000 time/step:117.07s
|
| 816 |
+
[2025-09-12 21:35:01,686] - step:782/900 train_loss:0.4317 lr:0.0003000000 time/step:117.19s
|
| 817 |
+
[2025-09-12 21:36:58,773] - step:783/900 train_loss:0.4093 lr:0.0003000000 time/step:117.07s
|
| 818 |
+
[2025-09-12 21:38:55,697] - step:784/900 train_loss:0.4052 lr:0.0003000000 time/step:116.92s
|
| 819 |
+
[2025-09-12 21:40:52,704] - step:785/900 train_loss:0.4158 lr:0.0003000000 time/step:117.00s
|
| 820 |
+
[2025-09-12 21:42:51,059] - step:786/900 train_loss:0.3933 lr:0.0003000000 time/step:118.35s
|
| 821 |
+
[2025-09-12 21:44:47,908] - step:787/900 train_loss:0.4167 lr:0.0003000000 time/step:116.84s
|
| 822 |
+
[2025-09-12 21:46:44,911] - step:788/900 train_loss:0.3970 lr:0.0003000000 time/step:116.99s
|
| 823 |
+
[2025-09-12 21:48:41,985] - step:789/900 train_loss:0.3789 lr:0.0003000000 time/step:117.06s
|
| 824 |
+
[2025-09-12 21:50:38,911] - step:790/900 train_loss:0.4033 lr:0.0003000000 time/step:116.92s
|
| 825 |
+
[2025-09-12 21:52:36,518] - step:791/900 train_loss:0.3703 lr:0.0003000000 time/step:117.60s
|
| 826 |
+
[2025-09-12 21:54:35,924] - step:792/900 train_loss:0.3987 lr:0.0003000000 time/step:119.40s
|
| 827 |
+
[2025-09-12 21:56:32,089] - step:793/900 train_loss:0.4103 lr:0.0003000000 time/step:116.16s
|
| 828 |
+
[2025-09-12 21:58:29,152] - step:794/900 train_loss:0.4121 lr:0.0003000000 time/step:117.05s
|
| 829 |
+
[2025-09-12 22:00:26,076] - step:795/900 train_loss:0.3756 lr:0.0003000000 time/step:116.92s
|
| 830 |
+
[2025-09-12 22:02:23,114] - step:796/900 train_loss:0.4195 lr:0.0003000000 time/step:117.03s
|
| 831 |
+
[2025-09-12 22:04:21,556] - step:797/900 train_loss:0.3852 lr:0.0003000000 time/step:118.43s
|
| 832 |
+
[2025-09-12 22:06:19,445] - step:798/900 train_loss:0.4343 lr:0.0003000000 time/step:117.88s
|
| 833 |
+
[2025-09-12 22:08:15,683] - step:799/900 train_loss:0.4024 lr:0.0003000000 time/step:116.22s
|
| 834 |
+
[2025-09-12 22:10:13,431] - step:800/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@800.pt...
|
| 835 |
+
[2025-09-12 22:10:13,432] - step:800/900 train_loss:0.4081 lr:0.0003000000 time/step:117.14s
|
| 836 |
+
[2025-09-12 22:12:09,931] - step:801/900 train_loss:0.4091 lr:0.0003000000 time/step:116.49s
|
| 837 |
+
[2025-09-12 22:14:08,180] - step:802/900 train_loss:0.4188 lr:0.0003000000 time/step:118.24s
|
| 838 |
+
[2025-09-12 22:16:05,937] - step:803/900 train_loss:0.4227 lr:0.0003000000 time/step:117.74s
|
| 839 |
+
[2025-09-12 22:18:02,679] - step:804/900 train_loss:0.3994 lr:0.0003000000 time/step:116.73s
|
| 840 |
+
[2025-09-12 22:19:59,511] - step:805/900 train_loss:0.3885 lr:0.0003000000 time/step:116.82s
|
| 841 |
+
[2025-09-12 22:21:56,602] - step:806/900 train_loss:0.3937 lr:0.0003000000 time/step:117.08s
|
| 842 |
+
[2025-09-12 22:23:54,831] - step:807/900 train_loss:0.4143 lr:0.0003000000 time/step:118.22s
|
| 843 |
+
[2025-09-12 22:25:52,068] - step:808/900 train_loss:0.4324 lr:0.0003000000 time/step:117.23s
|
| 844 |
+
[2025-09-12 22:27:49,499] - step:809/900 train_loss:0.3988 lr:0.0003000000 time/step:117.42s
|
| 845 |
+
[2025-09-12 22:29:45,897] - step:810/900 train_loss:0.4016 lr:0.0003000000 time/step:116.39s
|
| 846 |
+
[2025-09-12 22:31:42,993] - step:811/900 train_loss:0.4106 lr:0.0003000000 time/step:117.08s
|
| 847 |
+
[2025-09-12 22:33:41,172] - step:812/900 train_loss:0.4097 lr:0.0003000000 time/step:118.17s
|
| 848 |
+
[2025-09-12 22:35:38,349] - step:813/900 train_loss:0.3838 lr:0.0003000000 time/step:117.17s
|
| 849 |
+
[2025-09-12 22:37:36,103] - step:814/900 train_loss:0.3802 lr:0.0003000000 time/step:117.74s
|
| 850 |
+
[2025-09-12 22:39:33,507] - step:815/900 train_loss:0.4195 lr:0.0003000000 time/step:117.40s
|
| 851 |
+
[2025-09-12 22:41:29,750] - step:816/900 train_loss:0.4333 lr:0.0003000000 time/step:116.23s
|
| 852 |
+
[2025-09-12 22:43:26,622] - step:817/900 train_loss:0.4108 lr:0.0003000000 time/step:116.87s
|
| 853 |
+
[2025-09-12 22:45:25,127] - step:818/900 train_loss:0.3866 lr:0.0003000000 time/step:118.49s
|
| 854 |
+
[2025-09-12 22:47:22,168] - step:819/900 train_loss:0.4197 lr:0.0003000000 time/step:117.03s
|
| 855 |
+
[2025-09-12 22:49:19,672] - step:820/900 train_loss:0.3791 lr:0.0003000000 time/step:117.50s
|
| 856 |
+
[2025-09-12 22:51:17,438] - step:821/900 train_loss:0.4053 lr:0.0003000000 time/step:117.76s
|
| 857 |
+
[2025-09-12 22:53:13,613] - step:822/900 train_loss:0.4096 lr:0.0003000000 time/step:116.16s
|
| 858 |
+
[2025-09-12 22:55:11,085] - step:823/900 train_loss:0.4086 lr:0.0003000000 time/step:117.46s
|
| 859 |
+
[2025-09-12 22:57:08,006] - step:824/900 train_loss:0.4028 lr:0.0003000000 time/step:116.90s
|
| 860 |
+
[2025-09-12 22:59:05,729] - step:825/900 train_loss:0.3960 lr:0.0003000000 time/step:117.72s
|
| 861 |
+
[2025-09-12 23:01:03,331] - step:826/900 train_loss:0.4060 lr:0.0003000000 time/step:117.59s
|
| 862 |
+
[2025-09-12 23:03:00,051] - step:827/900 train_loss:0.4147 lr:0.0003000000 time/step:116.71s
|
| 863 |
+
[2025-09-12 23:04:56,347] - step:828/900 train_loss:0.4173 lr:0.0003000000 time/step:116.28s
|
| 864 |
+
[2025-09-12 23:06:53,382] - step:829/900 train_loss:0.4136 lr:0.0003000000 time/step:117.02s
|
| 865 |
+
[2025-09-12 23:08:50,925] - step:830/900 train_loss:0.4135 lr:0.0003000000 time/step:117.53s
|
| 866 |
+
[2025-09-12 23:10:48,709] - step:831/900 train_loss:0.3960 lr:0.0003000000 time/step:117.78s
|
| 867 |
+
[2025-09-12 23:12:45,852] - step:832/900 train_loss:0.3999 lr:0.0003000000 time/step:117.13s
|
| 868 |
+
[2025-09-12 23:14:43,195] - step:833/900 train_loss:0.4046 lr:0.0003000000 time/step:117.33s
|
| 869 |
+
[2025-09-12 23:16:39,299] - step:834/900 train_loss:0.4188 lr:0.0003000000 time/step:116.10s
|
| 870 |
+
[2025-09-12 23:18:36,142] - step:835/900 train_loss:0.3957 lr:0.0003000000 time/step:116.83s
|
| 871 |
+
[2025-09-12 23:20:34,486] - step:836/900 train_loss:0.4188 lr:0.0003000000 time/step:118.34s
|
| 872 |
+
[2025-09-12 23:22:31,489] - step:837/900 train_loss:0.3849 lr:0.0003000000 time/step:116.99s
|
| 873 |
+
[2025-09-12 23:24:28,392] - step:838/900 train_loss:0.4255 lr:0.0003000000 time/step:116.90s
|
| 874 |
+
[2025-09-12 23:26:24,998] - step:839/900 train_loss:0.4019 lr:0.0003000000 time/step:116.59s
|
| 875 |
+
[2025-09-12 23:28:21,798] - step:840/900 train_loss:0.4149 lr:0.0003000000 time/step:116.78s
|
| 876 |
+
[2025-09-12 23:30:20,342] - step:841/900 train_loss:0.3937 lr:0.0003000000 time/step:118.54s
|
| 877 |
+
[2025-09-12 23:32:17,286] - step:842/900 train_loss:0.3996 lr:0.0003000000 time/step:116.94s
|
| 878 |
+
[2025-09-12 23:34:14,169] - step:843/900 train_loss:0.3911 lr:0.0003000000 time/step:116.88s
|
| 879 |
+
[2025-09-12 23:36:11,513] - step:844/900 train_loss:0.4199 lr:0.0003000000 time/step:117.34s
|
| 880 |
+
[2025-09-12 23:38:07,515] - step:845/900 train_loss:0.3990 lr:0.0003000000 time/step:115.99s
|
| 881 |
+
[2025-09-12 23:40:05,382] - step:846/900 train_loss:0.4059 lr:0.0003000000 time/step:117.86s
|
| 882 |
+
[2025-09-12 23:42:03,341] - step:847/900 train_loss:0.4217 lr:0.0003000000 time/step:117.95s
|
| 883 |
+
[2025-09-12 23:44:00,267] - step:848/900 train_loss:0.4059 lr:0.0003000000 time/step:116.92s
|
| 884 |
+
[2025-09-12 23:45:57,550] - step:849/900 train_loss:0.4140 lr:0.0003000000 time/step:117.28s
|
| 885 |
+
[2025-09-12 23:47:54,492] - step:850/900 train_loss:0.3920 lr:0.0003000000 time/step:116.93s
|
| 886 |
+
[2025-09-12 23:49:50,997] - step:851/900 train_loss:0.4194 lr:0.0003000000 time/step:116.50s
|
| 887 |
+
[2025-09-12 23:51:48,718] - step:852/900 train_loss:0.3914 lr:0.0003000000 time/step:117.71s
|
| 888 |
+
[2025-09-12 23:53:45,683] - step:853/900 train_loss:0.4012 lr:0.0003000000 time/step:116.96s
|
| 889 |
+
[2025-09-12 23:55:43,182] - step:854/900 train_loss:0.4198 lr:0.0003000000 time/step:117.47s
|
| 890 |
+
[2025-09-12 23:57:40,227] - step:855/900 train_loss:0.4059 lr:0.0003000000 time/step:117.03s
|
| 891 |
+
[2025-09-12 23:59:37,792] - step:856/900 train_loss:0.4026 lr:0.0003000000 time/step:117.56s
|
| 892 |
+
[2025-09-13 00:01:34,695] - step:857/900 train_loss:0.4171 lr:0.0003000000 time/step:116.89s
|
| 893 |
+
[2025-09-13 00:03:32,341] - step:858/900 train_loss:0.4017 lr:0.0003000000 time/step:117.64s
|
| 894 |
+
[2025-09-13 00:05:29,421] - step:859/900 train_loss:0.4011 lr:0.0003000000 time/step:117.07s
|
| 895 |
+
[2025-09-13 00:07:26,749] - step:860/900 train_loss:0.3910 lr:0.0003000000 time/step:117.32s
|
| 896 |
+
[2025-09-13 00:09:23,608] - step:861/900 train_loss:0.4093 lr:0.0003000000 time/step:116.85s
|
| 897 |
+
[2025-09-13 00:11:21,037] - step:862/900 train_loss:0.4295 lr:0.0003000000 time/step:117.42s
|
| 898 |
+
[2025-09-13 00:13:17,816] - step:863/900 train_loss:0.4025 lr:0.0003000000 time/step:116.77s
|
| 899 |
+
[2025-09-13 00:15:14,919] - step:864/900 train_loss:0.3978 lr:0.0003000000 time/step:117.10s
|
| 900 |
+
[2025-09-13 00:17:12,309] - step:865/900 train_loss:0.3941 lr:0.0003000000 time/step:117.38s
|
| 901 |
+
[2025-09-13 00:19:09,330] - step:866/900 train_loss:0.4150 lr:0.0003000000 time/step:117.01s
|
| 902 |
+
[2025-09-13 00:21:06,411] - step:867/900 train_loss:0.4101 lr:0.0003000000 time/step:117.01s
|
| 903 |
+
[2025-09-13 00:23:03,516] - step:868/900 train_loss:0.4156 lr:0.0003000000 time/step:117.10s
|
| 904 |
+
[2025-09-13 00:25:00,493] - step:869/900 train_loss:0.4128 lr:0.0003000000 time/step:116.97s
|
| 905 |
+
[2025-09-13 00:26:57,821] - step:870/900 train_loss:0.4182 lr:0.0003000000 time/step:117.31s
|
| 906 |
+
[2025-09-13 00:28:54,768] - step:871/900 train_loss:0.3940 lr:0.0003000000 time/step:116.93s
|
| 907 |
+
[2025-09-13 00:30:51,704] - step:872/900 train_loss:0.4091 lr:0.0003000000 time/step:116.93s
|
| 908 |
+
[2025-09-13 00:32:48,692] - step:873/900 train_loss:0.4066 lr:0.0003000000 time/step:116.98s
|
| 909 |
+
[2025-09-13 00:34:47,091] - step:874/900 train_loss:0.4061 lr:0.0003000000 time/step:118.39s
|
| 910 |
+
[2025-09-13 00:36:44,116] - step:875/900 train_loss:0.3712 lr:0.0003000000 time/step:117.01s
|
| 911 |
+
[2025-09-13 00:38:41,019] - step:876/900 train_loss:0.4040 lr:0.0003000000 time/step:116.89s
|
| 912 |
+
[2025-09-13 00:40:38,506] - step:877/900 train_loss:0.3807 lr:0.0003000000 time/step:117.48s
|
| 913 |
+
[2025-09-13 00:42:35,384] - step:878/900 train_loss:0.4103 lr:0.0003000000 time/step:116.87s
|
| 914 |
+
[2025-09-13 00:44:33,175] - step:879/900 train_loss:0.4001 lr:0.0003000000 time/step:117.79s
|
| 915 |
+
[2025-09-13 00:46:29,986] - step:880/900 train_loss:0.3966 lr:0.0003000000 time/step:116.79s
|
| 916 |
+
[2025-09-13 00:48:27,354] - step:881/900 train_loss:0.4188 lr:0.0003000000 time/step:117.29s
|
| 917 |
+
[2025-09-13 00:50:24,406] - step:882/900 train_loss:0.4164 lr:0.0003000000 time/step:117.05s
|
| 918 |
+
[2025-09-13 00:52:22,291] - step:883/900 train_loss:0.3936 lr:0.0003000000 time/step:117.88s
|
| 919 |
+
[2025-09-13 00:54:20,651] - step:884/900 train_loss:0.4148 lr:0.0003000000 time/step:118.35s
|
| 920 |
+
[2025-09-13 00:56:17,788] - step:885/900 train_loss:0.4173 lr:0.0003000000 time/step:117.13s
|
| 921 |
+
[2025-09-13 00:58:14,279] - step:886/900 train_loss:0.4260 lr:0.0003000000 time/step:116.46s
|
| 922 |
+
[2025-09-13 01:00:11,090] - step:887/900 train_loss:0.4037 lr:0.0003000000 time/step:116.80s
|
| 923 |
+
[2025-09-13 01:02:08,948] - step:888/900 train_loss:0.4117 lr:0.0003000000 time/step:117.85s
|
| 924 |
+
[2025-09-13 01:04:07,249] - step:889/900 train_loss:0.4068 lr:0.0003000000 time/step:118.29s
|
| 925 |
+
[2025-09-13 01:06:04,130] - step:890/900 train_loss:0.4187 lr:0.0003000000 time/step:116.87s
|
| 926 |
+
[2025-09-13 01:08:01,508] - step:891/900 train_loss:0.4159 lr:0.0003000000 time/step:117.36s
|
| 927 |
+
[2025-09-13 01:09:57,620] - step:892/900 train_loss:0.3978 lr:0.0003000000 time/step:116.10s
|
| 928 |
+
[2025-09-13 01:11:55,493] - step:893/900 train_loss:0.3925 lr:0.0003000000 time/step:117.86s
|
| 929 |
+
[2025-09-13 01:13:52,516] - step:894/900 train_loss:0.3845 lr:0.0003000000 time/step:117.01s
|
| 930 |
+
[2025-09-13 01:15:50,321] - step:895/900 train_loss:0.4062 lr:0.0003000000 time/step:117.80s
|
| 931 |
+
[2025-09-13 01:17:47,232] - step:896/900 train_loss:0.3879 lr:0.0003000000 time/step:116.90s
|
| 932 |
+
[2025-09-13 01:19:44,630] - step:897/900 train_loss:0.4272 lr:0.0003000000 time/step:117.39s
|
| 933 |
+
[2025-09-13 01:21:41,559] - step:898/900 train_loss:0.4121 lr:0.0003000000 time/step:116.92s
|
| 934 |
+
[2025-09-13 01:23:39,154] - step:899/900 train_loss:0.4079 lr:0.0003000000 time/step:117.59s
|
| 935 |
+
[2025-09-13 01:25:37,577] - step:900/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@900.pt...
|
| 936 |
+
[2025-09-13 01:25:37,578] - step:900/900 train_loss:0.3995 lr:0.0003000000 time/step:117.81s
|
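Note: the training lines above all follow one fixed format ([timestamp] - step:i/900 train_loss:x lr:y time/step:zs), with occasional "Saved model to ..." lines at the 100-step checkpoints. A minimal Python sketch for pulling the series back out of nanogpt.log; parse_log is a hypothetical helper for working with this upload, not part of the repository:

import re

# Each training line looks like:
#   [2025-09-13 01:25:37,578] - step:900/900 train_loss:0.3995 lr:0.0003000000 time/step:117.81s
LINE_RE = re.compile(
    r"\[(?P<ts>[^\]]+)\] - step:(?P<step>\d+)/\d+ "
    r"train_loss:(?P<loss>[\d.]+) lr:(?P<lr>[\d.]+) time/step:(?P<secs>[\d.]+)s"
)

def parse_log(path: str):
    """Yield (step, train_loss, seconds_per_step) tuples from a nanogpt.log file."""
    with open(path) as f:
        for line in f:
            m = LINE_RE.search(line)
            if m:  # "Saved model to ..." checkpoint lines don't match and are skipped
                yield int(m["step"]), float(m["loss"]), float(m["secs"])

if __name__ == "__main__":
    rows = list(parse_log("nanogpt.log"))
    mean_secs = sum(s for *_, s in rows) / len(rows)
    print(f"{len(rows)} steps, mean time/step: {mean_secs:.2f}s")  # ~117s here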
wandb/run-20250911_200644-y9v5i9gr/files/requirements.txt
ADDED
@@ -0,0 +1,125 @@
parso==0.8.4
pydantic_core==2.27.2
charset-normalizer==3.4.1
xxhash==3.5.0
PyYAML==6.0.2
transformers==4.49.0
idna==3.10
nvidia-cudnn-cu12==9.1.0.70
numpy==2.2.3
hydra-core==1.3.2
Pygments==2.19.1
rich==14.0.0
nvidia-cusolver-cu12==11.6.1.9
urllib3==2.3.0
nvidia-cusparselt-cu12==0.6.2
contourpy==1.3.1
cycler==0.12.1
decorator==5.2.1
psutil==7.0.0
aiohttp==3.11.13
einops==0.8.1
nvidia-cuda-runtime-cu12==12.4.127
exceptiongroup==1.2.2
stack-data==0.6.3
setproctitle==1.3.5
fsspec==2024.12.0
tueplots==0.2.0
pexpect==4.9.0
gitdb==4.0.12
fonttools==4.56.0
ipython==8.35.0
huggingface-hub==0.29.2
filelock==3.17.0
torchvision==0.21.0+cu124
platformdirs==4.3.6
peft==0.15.1
nvidia-cuda-nvrtc-cu12==12.4.127
wandb==0.19.8
click==8.1.8
mpmath==1.3.0
Jinja2==3.1.6
scipy==1.14.1
markdown-it-py==3.0.0
matplotlib-inline==0.1.7
wheel==0.45.1
setuptools==75.8.2
tqdm==4.67.1
antlr4-python3-runtime==4.9.3
deepspeed==0.16.7
omegaconf==2.3.0
torchaudio==2.6.0+cu124
aiosignal==1.3.2
accelerate==1.6.0
py-cpuinfo==9.0.0
pyparsing==3.2.1
ninja==1.11.1.4
pandas==2.2.3
six==1.17.0
wcwidth==0.2.13
safetensors==0.5.3
attrs==25.1.0
python-dateutil==2.9.0.post0
nvidia-cufft-cu12==11.2.1.3
multiprocess==0.70.16
seaborn==0.13.2
networkx==3.4.2
regex==2024.11.6
nvidia-nvtx-cu12==12.4.127
tokenizers==0.21.0
datasets==3.3.2
nvidia-curand-cu12==10.3.5.147
nvidia-nvjitlink-cu12==12.4.127
MarkupSafe==3.0.2
triton==3.1.0
pip==25.0.1
jedi==0.19.2
nvidia-cublas-cu12==12.4.5.8
iniconfig==2.0.0
pluggy==1.5.0
pure_eval==0.2.3
docker-pycreds==0.4.0
libcirkit==0.2.1
mdurl==0.1.2
annotated-types==0.7.0
sentry-sdk==2.22.0
executing==2.2.0
pydantic==2.10.6
opt_einsum==3.4.0
pytz==2025.1
nvidia-cuda-cupti-cu12==12.4.127
protobuf==5.29.3
requests==2.32.3
tomli==2.2.1
matplotlib==3.10.1
hjson==3.1.0
frozenlist==1.5.0
pillow==11.1.0
GitPython==3.1.44
typing_extensions==4.12.2
pyarrow==19.0.1
propcache==0.3.0
prompt_toolkit==3.0.51
torch==2.6.0+cu124
async-timeout==5.0.1
bitsandbytes==0.45.5
trl==0.16.1
ptyprocess==0.7.0
dill==0.3.8
pytest==8.3.5
nvidia-nccl-cu12==2.21.5
sympy==1.13.1
flash_attn==2.7.4.post1
certifi==2025.1.31
nvidia-cusparse-cu12==12.3.1.170
tzdata==2025.1
aiohappyeyeballs==2.5.0
msgpack==1.1.0
traitlets==5.14.3
multidict==6.1.0
packaging==24.2
kiwisolver==1.4.8
smmap==5.0.2
asttokens==3.0.0
yarl==1.18.3
graphviz==0.20.3
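Note: the exact-version pins above capture the run's environment (pip install -r requirements.txt should reproduce it, given a matching Python and CUDA). A small sketch, stdlib only, that checks an active interpreter against these pins; check_pins is illustrative and not shipped with the run, and package-name normalization (e.g. pure_eval vs pure-eval) is glossed over:

from importlib.metadata import PackageNotFoundError, version

def check_pins(requirements_path: str) -> None:
    """Report packages that are missing or differ from the pinned versions."""
    with open(requirements_path) as f:
        for line in f:
            line = line.strip()
            if not line or "==" not in line:
                continue
            name, pinned = line.split("==", 1)
            try:
                installed = version(name)
            except PackageNotFoundError:
                print(f"MISSING  {name}=={pinned}")
                continue
            if installed != pinned:
                print(f"MISMATCH {name}: pinned {pinned}, installed {installed}")

check_pins("wandb/run-20250911_200644-y9v5i9gr/files/requirements.txt")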
wandb/run-20250911_200644-y9v5i9gr/files/wandb-metadata.json
ADDED
@@ -0,0 +1,114 @@
{
  "os": "Linux-6.8.0-57-generic-x86_64-with-glibc2.39",
  "python": "CPython 3.10.16",
  "startedAt": "2025-09-11T19:06:44.948798Z",
  "args": [
    "data=tulu3-evabyte-packed",
    "training=tulu3-evabyte-1epoch",
    "lm=evabyte",
    "model=mtp",
    "adaptor=none",
    "mt_head=linear-evabyte",
    "circuit=cp",
    "circuit.n_token=8",
    "circuit.n_component=8",
    "training.device_batch_size=1",
    "data.vocab_size=320",
    "model.model.beta=0",
    "model.model.gamma=0.9",
    "data.val_bin=null",
    "training.learning_rate=0.0003",
    "training.expname=lr-3e-4-no-lora-cp-n-8-r-8"
  ],
  "program": "-m mtp.train",
  "git": {
    "remote": "git@github.com:PiotrNawrot/nanoGPT.git",
    "commit": "26cfb78beb2138c5995ff5a43c8f8e1cc44652fd"
  },
  "email": "agrivas@inf.ed.ac.uk",
  "root": "/disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16",
  "host": "scotia01.inf.ed.ac.uk",
  "executable": "/home/agrivas/nanoGPT/.venv/bin/python3",
  "cpu_count": 24,
  "cpu_count_logical": 48,
  "gpu": "NVIDIA L40S",
  "gpu_count": 4,
  "disk": {
    "/": {
      "total": "184643391488",
      "used": "37109506048"
    }
  },
  "memory": {
    "total": "540522938368"
  },
  "cpu": {
    "count": 24,
    "countLogical": 48
  },
  "gpu_nvidia": [
    {
      "name": "NVIDIA L40S",
      "memoryTotal": "48305799168",
      "cudaCores": 18176,
      "architecture": "Ada"
    },
    {
      "name": "NVIDIA L40S",
      "memoryTotal": "48305799168",
      "cudaCores": 18176,
      "architecture": "Ada"
    },
    {
      "name": "NVIDIA L40S",
      "memoryTotal": "48305799168",
      "cudaCores": 18176,
      "architecture": "Ada"
    },
    {
      "name": "NVIDIA L40S",
      "memoryTotal": "48305799168",
      "cudaCores": 18176,
      "architecture": "Ada"
    }
  ],
  "slurm": {
    "cluster_name": "landoniacluster",
    "conf": "/etc/slurm/slurm.conf",
    "cpus_on_node": "16",
    "cpus_per_gpu": "4",
    "gpus_on_node": "4",
    "gtids": "0",
    "job_account": "research-staff",
    "job_cpus_per_node": "16",
    "job_end_time": "1757962299",
    "job_gid": "10000",
    "job_gpus": "0,1,2,3",
    "job_id": "2085792",
    "job_name": "slurm.sh",
    "job_nodelist": "scotia01",
    "job_num_nodes": "1",
    "job_partition": "PGR-Standard",
    "job_qos": "normal",
    "job_start_time": "1757616699",
    "job_uid": "1782564",
    "job_user": "agrivas",
    "jobid": "2085792",
    "localid": "0",
    "mem_per_node": "64000",
    "nnodes": "1",
    "nodeid": "0",
    "nodelist": "scotia01",
    "nprocs": "1",
    "ntasks": "1",
    "prio_process": "0",
    "procid": "0",
    "submit_dir": "/home/agrivas",
    "submit_host": "hastings.inf.ed.ac.uk",
    "task_pid": "2707112",
    "tasks_per_node": "1",
    "topology_addr": "scotia01",
    "topology_addr_pattern": "node"
  },
  "cudaVersion": "12.8"
}
wandb/run-20250911_200644-y9v5i9gr/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{"train/ce_loss_at_7":0.6471807956695557,"_step":899,"_runtime":105532.670379318,"train/loss":0.3994542360305786,"_timestamp":1.7577231375784273e+09,"train/ce_loss_at_1":0.18259316682815552,"train/ce_loss_at_3":0.31228378415107727,"global_step":900,"train/ce_loss_at_5":0.45094943046569824,"train/ce_loss_at_4":0.3768407106399536,"_wandb":{"runtime":105533},"train/ce_loss_at_6":0.5358694791793823,"train/ce_loss_at_2":0.25190725922584534,"train/ce_loss_at_8":0.7982796430587769}
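Note: the summary's train/loss (0.39945) is numerically consistent with a gamma-weighted average of train/ce_loss_at_1..8 using model.model.gamma=0.9 from the run config. A Python sketch verifying the arithmetic; the weighting scheme is inferred from the logged numbers, not quoted from the training code:

# train/ce_loss_at_1 .. train/ce_loss_at_8 from wandb-summary.json above
ce = [0.18259316682815552, 0.25190725922584534, 0.31228378415107727,
      0.3768407106399536, 0.45094943046569824, 0.5358694791793823,
      0.6471807956695557, 0.7982796430587769]
gamma = 0.9  # model.model.gamma in the run config
weights = [gamma ** k for k in range(len(ce))]  # 0.9**0 for position 1, etc.
loss = sum(w * l for w, l in zip(weights, ce)) / sum(weights)
print(loss)  # ~0.399454, matching train/loss = 0.3994542360305786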
wandb/run-20250911_200644-y9v5i9gr/logs/debug-internal.log
ADDED
@@ -0,0 +1,52 @@
{"time":"2025-09-11T20:06:44.955449103+01:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug-core.log"}
{"time":"2025-09-11T20:06:45.176117844+01:00","level":"INFO","msg":"created new stream","id":"y9v5i9gr"}
{"time":"2025-09-11T20:06:45.176201537+01:00","level":"INFO","msg":"stream: started","id":"y9v5i9gr"}
{"time":"2025-09-11T20:06:45.176254637+01:00","level":"INFO","msg":"writer: Do: started","stream_id":"y9v5i9gr"}
{"time":"2025-09-11T20:06:45.176292219+01:00","level":"INFO","msg":"handler: started","stream_id":"y9v5i9gr"}
{"time":"2025-09-11T20:06:45.176341928+01:00","level":"INFO","msg":"sender: started","stream_id":"y9v5i9gr"}
{"time":"2025-09-11T20:06:45.680069036+01:00","level":"INFO","msg":"Starting system monitor"}
{"time":"2025-09-11T20:19:16.313200337+01:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/circuit-mtp/mtp/y9v5i9gr/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
{"time":"2025-09-12T00:53:29.590652615+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
{"time":"2025-09-12T01:30:18.032795292+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T01:30:50.327057066+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-12T01:31:25.000022545+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-12T01:32:03.267256543+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T02:41:34.535497308+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-12T02:42:24.914157379+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
{"time":"2025-09-12T02:42:57.41051518+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T10:36:51.38167595+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-12T10:38:06.370172425+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-12T10:38:38.465480726+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T10:39:07.484991796+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
{"time":"2025-09-12T10:39:41.575653023+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
{"time":"2025-09-12T20:16:55.628544216+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T20:21:25.750812333+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-12T22:43:55.97454382+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T00:52:11.684482933+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T00:54:54.045134291+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
{"time":"2025-09-13T00:55:26.197593179+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:05:42.010380611+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:06:14.056932921+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:06:48.575121732+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:07:28.074495024+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:09:42.005493483+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:10:14.454893184+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:10:49.419226595+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:11:21.445954263+01:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/circuit-mtp/mtp/y9v5i9gr/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
{"time":"2025-09-13T01:11:57.007348427+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:13:57.010172043+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:14:29.220923193+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:15:27.013535251+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:15:59.276998526+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:16:33.628210655+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:17:42.016257241+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
{"time":"2025-09-13T01:18:14.389776393+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:19:42.02019871+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
{"time":"2025-09-13T01:25:38.927499163+01:00","level":"INFO","msg":"stream: closing","id":"y9v5i9gr"}
{"time":"2025-09-13T01:25:38.930059685+01:00","level":"INFO","msg":"Stopping system monitor"}
{"time":"2025-09-13T01:25:38.990179981+01:00","level":"INFO","msg":"Stopped system monitor"}
{"time":"2025-09-13T01:25:39.717455712+01:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
{"time":"2025-09-13T01:25:39.93171592+01:00","level":"INFO","msg":"handler: closed","stream_id":"y9v5i9gr"}
{"time":"2025-09-13T01:25:39.931829957+01:00","level":"INFO","msg":"writer: Close: closed","stream_id":"y9v5i9gr"}
{"time":"2025-09-13T01:25:39.932853619+01:00","level":"INFO","msg":"sender: closed","stream_id":"y9v5i9gr"}
{"time":"2025-09-13T01:25:39.932961632+01:00","level":"INFO","msg":"stream: closed","id":"y9v5i9gr"}
wandb/run-20250911_200644-y9v5i9gr/logs/debug.log
ADDED
@@ -0,0 +1,23 @@
2025-09-11 20:06:44,916 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Configure stats pid to 2716293
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from /home/agrivas/.config/wandb/settings
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/settings
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from environment variables
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug.log
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug-internal.log
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():761] calling init triggers
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
config: {'compile': True, 'device': 'cuda', 'from_checkpoint': None, 'name': 'nanogpt', 'training': {'random_seed': 13, 'batch_size': 256, 'device_batch_size': 1, 'sequence_length': 8192, 'num_iterations': 900, 'learning_rate': 0.0003, 'use_scheduler': False, 'save_model': True, 'save_optimizer': True, 'save_model_every': 100, 'val_loss_every': 100, 'val_tokens': 4194304, 'expname': 'lr-3e-4-no-lora-cp-n-8-r-8'}, 'model': {'name': 'mtp', 'beta': 0.0, 'gamma': 1, 'kl_algorithm': 'full', 'kl_type': 'forward', 'model': {'_target_': 'mtp.models.mtp.MultiTokenLM', 'lm': '${lm.model}', 'circuit': '${circuit.model}', 'mt_head_kwargs': '${mt_head.hyperparameters}', 'init_from_lm_head': True, 'kl_type': '${model.kl_type}', 'kl_algorithm': '${model.kl_algorithm}', 'beta': 0, 'gamma': 0.9}}, 'circuit': {'name': 'cp', 'n_token': 8, 'n_component': 8, 'model': {'_target_': 'mtp.models.circuits.CircuitModel', 'vocab_size': 320, 'n_token': 8, 'n_component': 8, 'kind': 'cp'}}, 'mt_head': {'name': 'linear-evabyte', 'hyperparameters': {'type': 'evabyte', 'n_embd': 4096, 'transformer_n_head': 32, 'transformer_n_layer': 0, 'expander_type': 'linear', 'expander_n_layer': 1, 'freeze_vocab_unembedding': False, 'share_sum_weights': False, 'contextual_hmm_weights': True, 'init_hmm_identity': True}}, 'adaptor': {'name': 'none', 'hyperparameters': None}, 'lm': {'name': 'evabyte', 'n_embd': 4096, 'n_head': 32, 'model': {'_target_': 'mtp.models.lm.LM', 'lm': None, 'encoder_only': True, 'from_checkpoint': None, 'from_huggingface': 'EvaByte/EvaByte-SFT', 'adaptor_kwargs': None, 'ref_enc': 'model', 'ref_head': 'lm_head', 'freeze': True}}, 'data': {'name': 'tulu3-evabyte', 'train_bin': 'agrv/tulu-v3-sft-evabyte-packed-seq-len-8192', 'val_bin': None, 'vocab_size': 320}, 'generate': {'speculative': False}, '_wandb': {}}
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():784] starting backend
2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():788] sending inform_init request
2025-09-11 20:06:44,948 INFO MainThread:2716293 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2025-09-11 20:06:44,948 INFO MainThread:2716293 [wandb_init.py:init():798] backend started and connected
2025-09-11 20:06:44,953 INFO MainThread:2716293 [wandb_init.py:init():891] updated telemetry
2025-09-11 20:06:44,961 INFO MainThread:2716293 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
2025-09-11 20:06:45,675 INFO MainThread:2716293 [wandb_init.py:init():990] starting run threads in backend
2025-09-11 20:06:46,525 INFO MainThread:2716293 [wandb_run.py:_console_start():2375] atexit reg
2025-09-11 20:06:46,526 INFO MainThread:2716293 [wandb_run.py:_redirect():2227] redirect: wrap_raw
2025-09-11 20:06:46,533 INFO MainThread:2716293 [wandb_run.py:_redirect():2292] Wrapping output streams.
2025-09-11 20:06:46,533 INFO MainThread:2716293 [wandb_run.py:_redirect():2315] Redirects installed.
2025-09-11 20:06:46,549 INFO MainThread:2716293 [wandb_init.py:init():1032] run started, returning control to user process
2025-09-13 01:25:38,827 INFO MsgRouterThr:2716293 [mailbox.py:close():129] Closing mailbox, abandoning 1 handles.
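Note: from the config dump above ('batch_size': 256, 'sequence_length': 8192, 'num_iterations': 900) and the ~117 s/step observed in nanogpt.log, rough throughput arithmetic in Python (illustrative only; it also matches the summary's _runtime of ~105,533 s):

batch_size, seq_len, steps = 256, 8192, 900
tokens_per_step = batch_size * seq_len          # 2,097,152 tokens per optimizer step
secs_per_step = 117.0                           # typical time/step from the log
print(tokens_per_step / secs_per_step)          # ~17.9k tokens/s
print(steps * tokens_per_step / 1e9)            # ~1.89B tokens over the full run
print(steps * secs_per_step / 3600)             # ~29.3 hours, consistent with _runtime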
wandb/run-20250911_200644-y9v5i9gr/run-y9v5i9gr.wandb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:049e97c6350380a1f5f79cdc0c647b0d0e33cacb013dc81b82c059a2c5672f20
size 14919748
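Note: the .wandb run file is stored via Git LFS, so the repository holds only this pointer (the blob's SHA-256 "oid" and its size in bytes). A Python sketch that verifies a downloaded copy against the pointer:

import hashlib
import os

path = "wandb/run-20250911_200644-y9v5i9gr/run-y9v5i9gr.wandb"
expected_oid = "049e97c6350380a1f5f79cdc0c647b0d0e33cacb013dc81b82c059a2c5672f20"
expected_size = 14919748

assert os.path.getsize(path) == expected_size, "size differs from LFS pointer"
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == expected_oid, "sha256 differs from LFS pointer"
print("file matches the LFS pointer")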