agrv committed on
Commit f3c6863 · verified · 1 Parent(s): e648114

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250911_200644-y9v5i9gr/run-y9v5i9gr.wandb filter=lfs diff=lfs merge=lfs -text
.hydra/config.yaml ADDED
@@ -0,0 +1,81 @@
+compile: true
+device: cuda
+from_checkpoint: null
+name: nanogpt
+training:
+  random_seed: 13
+  batch_size: 256
+  device_batch_size: 1
+  sequence_length: 8192
+  num_iterations: 900
+  learning_rate: 0.0003
+  use_scheduler: false
+  save_model: true
+  save_optimizer: true
+  save_model_every: 100
+  val_loss_every: 100
+  val_tokens: 4194304
+  expname: lr-3e-4-no-lora-cp-n-8-r-8
+model:
+  name: mtp
+  beta: 0.0
+  gamma: 1
+  kl_algorithm: full
+  kl_type: forward
+  model:
+    _target_: mtp.models.mtp.MultiTokenLM
+    lm: ${lm.model}
+    circuit: ${circuit.model}
+    mt_head_kwargs: ${mt_head.hyperparameters}
+    init_from_lm_head: true
+    kl_type: ${model.kl_type}
+    kl_algorithm: ${model.kl_algorithm}
+    beta: 0
+    gamma: 0.9
+circuit:
+  name: cp
+  n_token: 8
+  n_component: 8
+  model:
+    _target_: mtp.models.circuits.CircuitModel
+    vocab_size: ${data.vocab_size}
+    n_token: ${circuit.n_token}
+    n_component: ${circuit.n_component}
+    kind: cp
+mt_head:
+  name: linear-evabyte
+  hyperparameters:
+    type: evabyte
+    n_embd: ${lm.n_embd}
+    transformer_n_head: ${lm.n_head}
+    transformer_n_layer: 0
+    expander_type: linear
+    expander_n_layer: 1
+    freeze_vocab_unembedding: false
+    share_sum_weights: false
+    contextual_hmm_weights: true
+    init_hmm_identity: true
+adaptor:
+  name: none
+  hyperparameters: null
+lm:
+  name: evabyte
+  n_embd: 4096
+  n_head: 32
+  model:
+    _target_: mtp.models.lm.LM
+    lm: null
+    encoder_only: true
+    from_checkpoint: null
+    from_huggingface: EvaByte/EvaByte-SFT
+    adaptor_kwargs: ${adaptor.hyperparameters}
+    ref_enc: model
+    ref_head: lm_head
+    freeze: true
+data:
+  name: tulu3-evabyte
+  train_bin: agrv/tulu-v3-sft-evabyte-packed-seq-len-8192
+  val_bin: null
+  vocab_size: 320
+generate:
+  speculative: false
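
The composed config above is plain OmegaConf YAML, so it can be inspected outside of a Hydra run. A minimal sketch, assuming the file has been downloaded locally (the `${...}` interpolations resolve against the config itself):

    from omegaconf import OmegaConf

    # Load the composed config that Hydra wrote into the run's output directory.
    cfg = OmegaConf.load(".hydra/config.yaml")

    print(cfg.training.learning_rate)        # 0.0003
    print(cfg.model.model["_target_"])       # mtp.models.mtp.MultiTokenLM

    # Interpolations such as ${data.vocab_size} resolve against the same file:
    resolved = OmegaConf.to_container(cfg, resolve=True)
    print(resolved["circuit"]["model"]["vocab_size"])  # 320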
.hydra/hydra.yaml ADDED
@@ -0,0 +1,178 @@
+hydra:
+  run:
+    dir: ./logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+
+      Use --hydra-help to view Hydra specific help
+
+      '
+    template: '${hydra.help.header}
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (group=option)
+
+
+      $APP_CONFIG_GROUPS
+
+
+      == Config ==
+
+      Override anything in the config (foo.bar=value)
+
+
+      $CONFIG
+
+
+      ${hydra.help.footer}
+
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+
+      See https://hydra.cc for more info.
+
+
+      == Flags ==
+
+      $FLAGS_HELP
+
+
+      == Configuration groups ==
+
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+
+
+      $HYDRA_CONFIG_GROUPS
+
+
+      Use ''--cfg hydra'' to Show the Hydra config.
+
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task:
+    - data=tulu3-evabyte-packed
+    - training=tulu3-evabyte-1epoch
+    - lm=evabyte
+    - model=mtp
+    - adaptor=none
+    - mt_head=linear-evabyte
+    - circuit=cp
+    - circuit.n_token=8
+    - circuit.n_component=8
+    - training.device_batch_size=1
+    - data.vocab_size=320
+    - model.model.beta=0
+    - model.model.gamma=0.9
+    - data.val_bin=null
+    - training.learning_rate=0.0003
+    - training.expname=lr-3e-4-no-lora-cp-n-8-r-8
+  job:
+    name: ${name}
+    chdir: true
+    override_dirname: adaptor=none,circuit.n_component=8,circuit.n_token=8,circuit=cp,data.val_bin=null,data.vocab_size=320,data=tulu3-evabyte-packed,lm=evabyte,model.model.beta=0,model.model.gamma=0.9,model=mtp,mt_head=linear-evabyte,training.device_batch_size=1,training.expname=lr-3e-4-no-lora-cp-n-8-r-8,training.learning_rate=0.0003,training=tulu3-evabyte-1epoch
+    id: ???
+    num: ???
+    config_name: config
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.3'
+    cwd: /disk/scratch/agrivas/nanoGPT
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /disk/scratch/agrivas/nanoGPT/configs
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16
+    choices:
+      generate: default
+      data: tulu3-evabyte-packed
+      lm: evabyte
+      adaptor: none
+      mt_head: linear-evabyte
+      circuit: cp
+      model: mtp
+      training: tulu3-evabyte-1epoch
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false
.hydra/overrides.yaml ADDED
@@ -0,0 +1,16 @@
+- data=tulu3-evabyte-packed
+- training=tulu3-evabyte-1epoch
+- lm=evabyte
+- model=mtp
+- adaptor=none
+- mt_head=linear-evabyte
+- circuit=cp
+- circuit.n_token=8
+- circuit.n_component=8
+- training.device_batch_size=1
+- data.vocab_size=320
+- model.model.beta=0
+- model.model.gamma=0.9
+- data.val_bin=null
+- training.learning_rate=0.0003
+- training.expname=lr-3e-4-no-lora-cp-n-8-r-8
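
These overrides are exactly what Hydra needs to re-compose the run's config without launching a run. A sketch using the compose API, assuming the repo's configs/ directory (listed under config_sources in hydra.yaml above) is available relative to the working directory:

    from hydra import compose, initialize

    # Re-create the composed config from the recorded overrides.
    with initialize(version_base="1.3", config_path="configs"):
        cfg = compose(
            config_name="config",
            overrides=[
                "data=tulu3-evabyte-packed",
                "training=tulu3-evabyte-1epoch",
                "lm=evabyte",
                "model=mtp",
                "adaptor=none",
                "mt_head=linear-evabyte",
                "circuit=cp",
                "circuit.n_token=8",
                "circuit.n_component=8",
                "training.device_batch_size=1",
                "data.vocab_size=320",
                "model.model.beta=0",
                "model.model.gamma=0.9",
                "data.val_bin=null",
                "training.learning_rate=0.0003",
                "training.expname=lr-3e-4-no-lora-cp-n-8-r-8",
            ],
        )
    print(cfg.training.expname)  # lr-3e-4-no-lora-cp-n-8-r-8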
config.yaml ADDED
@@ -0,0 +1,83 @@
+compile: true
+device: cuda
+from_checkpoint: null
+name: nanogpt
+training:
+  random_seed: 13
+  batch_size: 256
+  device_batch_size: 1
+  sequence_length: 8192
+  num_iterations: 900
+  learning_rate: 0.0003
+  use_scheduler: false
+  save_model: true
+  save_optimizer: true
+  save_model_every: 100
+  val_loss_every: 100
+  val_tokens: 4194304
+  expname: lr-3e-4-no-lora-cp-n-8-r-8
+model:
+  name: mtp
+  beta: 0.0
+  gamma: 1
+  kl_algorithm: full
+  kl_type: forward
+  model:
+    _target_: mtp.models.mtp.MultiTokenLM
+    lm: ${lm.model}
+    circuit: ${circuit.model}
+    mt_head_kwargs: ${mt_head.hyperparameters}
+    init_from_lm_head: true
+    kl_type: ${model.kl_type}
+    kl_algorithm: ${model.kl_algorithm}
+    beta: 0
+    gamma: 0.9
+circuit:
+  name: cp
+  n_token: 8
+  n_component: 8
+  model:
+    _target_: mtp.models.circuits.CircuitModel
+    vocab_size: 320
+    n_token: 8
+    n_component: 8
+    kind: cp
+mt_head:
+  name: linear-evabyte
+  hyperparameters:
+    type: evabyte
+    n_embd: 4096
+    transformer_n_head: 32
+    transformer_n_layer: 0
+    expander_type: linear
+    expander_n_layer: 1
+    freeze_vocab_unembedding: false
+    share_sum_weights: false
+    contextual_hmm_weights: true
+    init_hmm_identity: true
+adaptor:
+  name: none
+  hyperparameters: null
+lm:
+  name: evabyte
+  n_embd: 4096
+  n_head: 32
+  model:
+    _target_: mtp.models.lm.LM
+    lm: null
+    encoder_only: true
+    from_checkpoint: null
+    from_huggingface: EvaByte/EvaByte-SFT
+    adaptor_kwargs: null
+    ref_enc: model
+    ref_head: lm_head
+    freeze: true
+data:
+  name: tulu3-evabyte
+  train_bin: agrv/tulu-v3-sft-evabyte-packed-seq-len-8192
+  val_bin: null
+  vocab_size: 320
+generate:
+  speculative: false
+expname: lr-3e-4-no-lora-cp-n-8-r-8
+wandb_run_id: y9v5i9gr
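
One detail worth noting in this config: batch_size: 256 with device_batch_size: 1 implies heavy gradient accumulation in a nanoGPT-style loop. A rough sketch of the implied token budget; the reading of the two batch fields is an assumption, and the training script defines the exact semantics:

    # Assumed semantics: batch_size is the global batch in sequences,
    # device_batch_size is the per-forward micro-batch.
    batch_size = 256
    device_batch_size = 1
    sequence_length = 8192
    num_iterations = 900

    grad_accum_steps = batch_size // device_batch_size   # 256 micro-steps per update
    tokens_per_step = batch_size * sequence_length       # 2,097,152 tokens
    total_tokens = tokens_per_step * num_iterations      # ~1.9B tokens over the run
    print(grad_accum_steps, tokens_per_step, total_tokens)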
model@0.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49b32f3432aedda946c39f670a3092b4ba8e1afcf352a260eb91a75f760c00cf
+size 167886916
model@300.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94a448ea9a2443052525acd82e48458ada80b6e74225312c21193404445233db
+size 503564433
model@600.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfdccabb110a7f91c95f4f688b9ae8d557017f0b664c64d1abca769d9eedca70
+size 503564433
model@900.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fd7891bdf8e6767e17d7276d3f0b7eca933e4f66740662f9d3cef90eb6f10a2
+size 503564433
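
The four model@*.pt entries above are Git LFS pointer files; the weights themselves live in the Hub's LFS storage. A minimal sketch for fetching one with huggingface_hub; the repo_id below is a placeholder, since the commit page does not record the repository name:

    import torch
    from huggingface_hub import hf_hub_download

    # Placeholder repo_id -- substitute the repository this commit belongs to.
    ckpt_path = hf_hub_download(repo_id="agrv/REPO_NAME", filename="model@900.pt")

    # save_optimizer: true suggests the file may bundle more than raw weights;
    # the exact structure is defined by the training script.
    state = torch.load(ckpt_path, map_location="cpu")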
nanogpt.log ADDED
@@ -0,0 +1,916 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-09-11 20:06:35,177] - Setting up model... compile=True...
2
+ [2025-09-11 20:06:46,551] - Saving config and checkpoints to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16...
3
+ [2025-09-11 20:06:46,551] - Save model: True...
4
+ [2025-09-11 20:06:46,552] - Save optimizer: True...
5
+ [2025-09-11 20:06:46,558] - Training on agrv/tulu-v3-sft-evabyte-packed-seq-len-8192...
6
+ [2025-09-11 20:07:21,844] - Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
7
+ [2025-09-11 20:07:23,771] - step:0/900 Saving model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@0.pt...
8
+ [2025-09-11 20:10:21,957] - step:1/900 train_loss:0.5686 lr:0.0003000000 time/step:177.94s
9
+ [2025-09-11 20:12:19,200] - step:2/900 train_loss:0.5480 lr:0.0003000000 time/step:117.24s
10
+ [2025-09-11 20:14:16,539] - step:3/900 train_loss:0.5220 lr:0.0003000000 time/step:117.34s
11
+ [2025-09-11 20:16:13,861] - step:4/900 train_loss:0.5383 lr:0.0003000000 time/step:117.32s
12
+ [2025-09-11 20:18:10,435] - step:5/900 train_loss:0.5371 lr:0.0003000000 time/step:116.57s
13
+ [2025-09-11 20:20:08,627] - step:6/900 train_loss:0.5227 lr:0.0003000000 time/step:118.19s
14
+ [2025-09-11 20:22:06,149] - step:7/900 train_loss:0.5128 lr:0.0003000000 time/step:117.51s
15
+ [2025-09-11 20:24:03,890] - step:8/900 train_loss:0.5420 lr:0.0003000000 time/step:117.74s
16
+ [2025-09-11 20:26:01,252] - step:9/900 train_loss:0.5426 lr:0.0003000000 time/step:117.36s
17
+ [2025-09-11 20:27:59,602] - step:10/900 train_loss:0.5236 lr:0.0003000000 time/step:118.34s
18
+ [2025-09-11 20:29:56,227] - step:11/900 train_loss:0.4860 lr:0.0003000000 time/step:116.61s
19
+ [2025-09-11 20:31:54,042] - step:12/900 train_loss:0.5105 lr:0.0003000000 time/step:117.81s
20
+ [2025-09-11 20:33:51,406] - step:13/900 train_loss:0.4993 lr:0.0003000000 time/step:117.36s
21
+ [2025-09-11 20:35:48,717] - step:14/900 train_loss:0.4925 lr:0.0003000000 time/step:117.31s
22
+ [2025-09-11 20:37:47,558] - step:15/900 train_loss:0.5207 lr:0.0003000000 time/step:118.83s
23
+ [2025-09-11 20:39:45,850] - step:16/900 train_loss:0.4827 lr:0.0003000000 time/step:118.28s
24
+ [2025-09-11 20:41:42,738] - step:17/900 train_loss:0.5033 lr:0.0003000000 time/step:116.88s
25
+ [2025-09-11 20:43:39,898] - step:18/900 train_loss:0.5082 lr:0.0003000000 time/step:117.15s
26
+ [2025-09-11 20:45:37,029] - step:19/900 train_loss:0.4910 lr:0.0003000000 time/step:117.13s
27
+ [2025-09-11 20:47:34,571] - step:20/900 train_loss:0.5006 lr:0.0003000000 time/step:117.54s
28
+ [2025-09-11 20:49:32,312] - step:21/900 train_loss:0.4936 lr:0.0003000000 time/step:117.73s
29
+ [2025-09-11 20:51:29,213] - step:22/900 train_loss:0.4941 lr:0.0003000000 time/step:116.90s
30
+ [2025-09-11 20:53:26,056] - step:23/900 train_loss:0.5131 lr:0.0003000000 time/step:116.83s
31
+ [2025-09-11 20:55:22,982] - step:24/900 train_loss:0.4826 lr:0.0003000000 time/step:116.92s
32
+ [2025-09-11 20:57:20,427] - step:25/900 train_loss:0.4913 lr:0.0003000000 time/step:117.44s
33
+ [2025-09-11 20:59:18,626] - step:26/900 train_loss:0.4607 lr:0.0003000000 time/step:118.18s
34
+ [2025-09-11 21:01:15,710] - step:27/900 train_loss:0.4908 lr:0.0003000000 time/step:117.08s
35
+ [2025-09-11 21:03:12,633] - step:28/900 train_loss:0.4910 lr:0.0003000000 time/step:116.91s
36
+ [2025-09-11 21:05:09,636] - step:29/900 train_loss:0.4657 lr:0.0003000000 time/step:117.00s
37
+ [2025-09-11 21:07:06,700] - step:30/900 train_loss:0.4594 lr:0.0003000000 time/step:117.06s
38
+ [2025-09-11 21:09:04,683] - step:31/900 train_loss:0.4755 lr:0.0003000000 time/step:117.97s
39
+ [2025-09-11 21:11:01,763] - step:32/900 train_loss:0.4541 lr:0.0003000000 time/step:117.08s
40
+ [2025-09-11 21:12:59,791] - step:33/900 train_loss:0.4807 lr:0.0003000000 time/step:118.02s
41
+ [2025-09-11 21:14:55,836] - step:34/900 train_loss:0.4870 lr:0.0003000000 time/step:116.03s
42
+ [2025-09-11 21:16:52,899] - step:35/900 train_loss:0.4625 lr:0.0003000000 time/step:117.06s
43
+ [2025-09-11 21:18:51,003] - step:36/900 train_loss:0.4791 lr:0.0003000000 time/step:118.09s
44
+ [2025-09-11 21:20:48,545] - step:37/900 train_loss:0.4473 lr:0.0003000000 time/step:117.53s
45
+ [2025-09-11 21:22:45,589] - step:38/900 train_loss:0.4752 lr:0.0003000000 time/step:117.04s
46
+ [2025-09-11 21:24:43,273] - step:39/900 train_loss:0.4637 lr:0.0003000000 time/step:117.68s
47
+ [2025-09-11 21:26:39,295] - step:40/900 train_loss:0.4792 lr:0.0003000000 time/step:116.01s
48
+ [2025-09-11 21:28:36,435] - step:41/900 train_loss:0.4486 lr:0.0003000000 time/step:117.13s
49
+ [2025-09-11 21:30:33,920] - step:42/900 train_loss:0.4401 lr:0.0003000000 time/step:117.48s
50
+ [2025-09-11 21:32:30,825] - step:43/900 train_loss:0.4647 lr:0.0003000000 time/step:116.90s
51
+ [2025-09-11 21:34:28,329] - step:44/900 train_loss:0.4925 lr:0.0003000000 time/step:117.50s
52
+ [2025-09-11 21:36:25,926] - step:45/900 train_loss:0.4660 lr:0.0003000000 time/step:117.59s
53
+ [2025-09-11 21:38:22,375] - step:46/900 train_loss:0.4459 lr:0.0003000000 time/step:116.44s
54
+ [2025-09-11 21:40:19,319] - step:47/900 train_loss:0.4487 lr:0.0003000000 time/step:116.93s
55
+ [2025-09-11 21:42:17,801] - step:48/900 train_loss:0.4378 lr:0.0003000000 time/step:118.48s
56
+ [2025-09-11 21:44:15,250] - step:49/900 train_loss:0.4623 lr:0.0003000000 time/step:117.44s
57
+ [2025-09-11 21:46:12,028] - step:50/900 train_loss:0.4788 lr:0.0003000000 time/step:116.77s
58
+ [2025-09-11 21:48:08,924] - step:51/900 train_loss:0.4612 lr:0.0003000000 time/step:116.89s
59
+ [2025-09-11 21:50:05,277] - step:52/900 train_loss:0.4670 lr:0.0003000000 time/step:116.34s
60
+ [2025-09-11 21:52:03,579] - step:53/900 train_loss:0.4948 lr:0.0003000000 time/step:118.20s
61
+ [2025-09-11 21:54:00,439] - step:54/900 train_loss:0.4474 lr:0.0003000000 time/step:116.86s
62
+ [2025-09-11 21:55:57,226] - step:55/900 train_loss:0.4696 lr:0.0003000000 time/step:116.78s
63
+ [2025-09-11 21:57:54,070] - step:56/900 train_loss:0.4636 lr:0.0003000000 time/step:116.84s
64
+ [2025-09-11 21:59:51,015] - step:57/900 train_loss:0.4567 lr:0.0003000000 time/step:116.93s
65
+ [2025-09-11 22:01:48,416] - step:58/900 train_loss:0.4600 lr:0.0003000000 time/step:117.40s
66
+ [2025-09-11 22:03:46,720] - step:59/900 train_loss:0.4678 lr:0.0003000000 time/step:118.30s
67
+ [2025-09-11 22:05:43,544] - step:60/900 train_loss:0.4619 lr:0.0003000000 time/step:116.82s
68
+ [2025-09-11 22:07:40,424] - step:61/900 train_loss:0.4553 lr:0.0003000000 time/step:116.87s
69
+ [2025-09-11 22:09:37,873] - step:62/900 train_loss:0.4719 lr:0.0003000000 time/step:117.43s
70
+ [2025-09-11 22:11:34,969] - step:63/900 train_loss:0.4582 lr:0.0003000000 time/step:117.09s
71
+ [2025-09-11 22:13:31,914] - step:64/900 train_loss:0.4430 lr:0.0003000000 time/step:116.94s
72
+ [2025-09-11 22:15:28,799] - step:65/900 train_loss:0.4268 lr:0.0003000000 time/step:116.88s
73
+ [2025-09-11 22:17:25,704] - step:66/900 train_loss:0.4669 lr:0.0003000000 time/step:116.90s
74
+ [2025-09-11 22:19:22,827] - step:67/900 train_loss:0.4380 lr:0.0003000000 time/step:117.11s
75
+ [2025-09-11 22:21:20,150] - step:68/900 train_loss:0.4785 lr:0.0003000000 time/step:117.32s
76
+ [2025-09-11 22:23:16,126] - step:69/900 train_loss:0.4678 lr:0.0003000000 time/step:115.97s
77
+ [2025-09-11 22:25:13,659] - step:70/900 train_loss:0.4456 lr:0.0003000000 time/step:117.53s
78
+ [2025-09-11 22:27:10,581] - step:71/900 train_loss:0.4403 lr:0.0003000000 time/step:116.91s
79
+ [2025-09-11 22:29:07,930] - step:72/900 train_loss:0.4318 lr:0.0003000000 time/step:117.34s
80
+ [2025-09-11 22:31:05,566] - step:73/900 train_loss:0.4546 lr:0.0003000000 time/step:117.63s
81
+ [2025-09-11 22:33:02,531] - step:74/900 train_loss:0.4860 lr:0.0003000000 time/step:116.96s
82
+ [2025-09-11 22:34:59,254] - step:75/900 train_loss:0.4499 lr:0.0003000000 time/step:116.72s
83
+ [2025-09-11 22:36:57,138] - step:76/900 train_loss:0.4490 lr:0.0003000000 time/step:117.88s
84
+ [2025-09-11 22:38:54,164] - step:77/900 train_loss:0.4490 lr:0.0003000000 time/step:117.02s
85
+ [2025-09-11 22:40:51,448] - step:78/900 train_loss:0.4455 lr:0.0003000000 time/step:117.27s
86
+ [2025-09-11 22:42:48,430] - step:79/900 train_loss:0.4274 lr:0.0003000000 time/step:116.98s
87
+ [2025-09-11 22:44:45,934] - step:80/900 train_loss:0.4519 lr:0.0003000000 time/step:117.50s
88
+ [2025-09-11 22:46:42,798] - step:81/900 train_loss:0.4429 lr:0.0003000000 time/step:116.85s
89
+ [2025-09-11 22:48:39,720] - step:82/900 train_loss:0.4436 lr:0.0003000000 time/step:116.92s
90
+ [2025-09-11 22:50:37,164] - step:83/900 train_loss:0.4713 lr:0.0003000000 time/step:117.43s
91
+ [2025-09-11 22:52:33,983] - step:84/900 train_loss:0.4399 lr:0.0003000000 time/step:116.82s
92
+ [2025-09-11 22:54:31,605] - step:85/900 train_loss:0.4343 lr:0.0003000000 time/step:117.62s
93
+ [2025-09-11 22:56:29,383] - step:86/900 train_loss:0.4587 lr:0.0003000000 time/step:117.77s
94
+ [2025-09-11 22:58:26,338] - step:87/900 train_loss:0.4550 lr:0.0003000000 time/step:116.95s
95
+ [2025-09-11 23:00:23,614] - step:88/900 train_loss:0.4437 lr:0.0003000000 time/step:117.26s
96
+ [2025-09-11 23:02:20,358] - step:89/900 train_loss:0.4575 lr:0.0003000000 time/step:116.74s
97
+ [2025-09-11 23:04:17,289] - step:90/900 train_loss:0.4361 lr:0.0003000000 time/step:116.93s
98
+ [2025-09-11 23:06:15,307] - step:91/900 train_loss:0.4259 lr:0.0003000000 time/step:118.02s
99
+ [2025-09-11 23:08:12,562] - step:92/900 train_loss:0.4340 lr:0.0003000000 time/step:117.25s
100
+ [2025-09-11 23:10:10,001] - step:93/900 train_loss:0.4424 lr:0.0003000000 time/step:117.43s
101
+ [2025-09-11 23:12:07,171] - step:94/900 train_loss:0.4240 lr:0.0003000000 time/step:117.16s
102
+ [2025-09-11 23:14:05,158] - step:95/900 train_loss:0.4425 lr:0.0003000000 time/step:117.99s
103
+ [2025-09-11 23:16:02,641] - step:96/900 train_loss:0.4575 lr:0.0003000000 time/step:117.48s
104
+ [2025-09-11 23:17:59,591] - step:97/900 train_loss:0.4435 lr:0.0003000000 time/step:116.94s
105
+ [2025-09-11 23:19:55,399] - step:98/900 train_loss:0.4466 lr:0.0003000000 time/step:115.80s
106
+ [2025-09-11 23:21:53,531] - step:99/900 train_loss:0.4469 lr:0.0003000000 time/step:118.12s
107
+ [2025-09-11 23:23:52,424] - step:100/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@100.pt...
108
+ [2025-09-11 23:23:52,425] - step:100/900 train_loss:0.4467 lr:0.0003000000 time/step:118.25s
109
+ [2025-09-11 23:25:49,555] - step:101/900 train_loss:0.4462 lr:0.0003000000 time/step:117.13s
110
+ [2025-09-11 23:27:46,681] - step:102/900 train_loss:0.4479 lr:0.0003000000 time/step:117.12s
111
+ [2025-09-11 23:29:43,985] - step:103/900 train_loss:0.4212 lr:0.0003000000 time/step:117.30s
112
+ [2025-09-11 23:31:40,749] - step:104/900 train_loss:0.4385 lr:0.0003000000 time/step:116.76s
113
+ [2025-09-11 23:33:37,611] - step:105/900 train_loss:0.4490 lr:0.0003000000 time/step:116.86s
114
+ [2025-09-11 23:35:34,674] - step:106/900 train_loss:0.4537 lr:0.0003000000 time/step:117.06s
115
+ [2025-09-11 23:37:32,277] - step:107/900 train_loss:0.4278 lr:0.0003000000 time/step:117.60s
116
+ [2025-09-11 23:39:29,569] - step:108/900 train_loss:0.4413 lr:0.0003000000 time/step:117.28s
117
+ [2025-09-11 23:41:26,965] - step:109/900 train_loss:0.4219 lr:0.0003000000 time/step:117.39s
118
+ [2025-09-11 23:43:23,608] - step:110/900 train_loss:0.4455 lr:0.0003000000 time/step:116.64s
119
+ [2025-09-11 23:45:20,608] - step:111/900 train_loss:0.4581 lr:0.0003000000 time/step:117.00s
120
+ [2025-09-11 23:47:17,496] - step:112/900 train_loss:0.4501 lr:0.0003000000 time/step:116.89s
121
+ [2025-09-11 23:49:15,179] - step:113/900 train_loss:0.4332 lr:0.0003000000 time/step:117.66s
122
+ [2025-09-11 23:51:13,020] - step:114/900 train_loss:0.4311 lr:0.0003000000 time/step:117.83s
123
+ [2025-09-11 23:53:10,646] - step:115/900 train_loss:0.4449 lr:0.0003000000 time/step:117.62s
124
+ [2025-09-11 23:55:06,688] - step:116/900 train_loss:0.4424 lr:0.0003000000 time/step:116.04s
125
+ [2025-09-11 23:57:03,652] - step:117/900 train_loss:0.4392 lr:0.0003000000 time/step:116.96s
126
+ [2025-09-11 23:59:01,394] - step:118/900 train_loss:0.4246 lr:0.0003000000 time/step:117.74s
127
+ [2025-09-12 00:00:58,798] - step:119/900 train_loss:0.4339 lr:0.0003000000 time/step:117.39s
128
+ [2025-09-12 00:02:56,142] - step:120/900 train_loss:0.4064 lr:0.0003000000 time/step:117.33s
129
+ [2025-09-12 00:04:53,044] - step:121/900 train_loss:0.4421 lr:0.0003000000 time/step:116.90s
130
+ [2025-09-12 00:06:49,048] - step:122/900 train_loss:0.4306 lr:0.0003000000 time/step:116.00s
131
+ [2025-09-12 00:08:46,671] - step:123/900 train_loss:0.4163 lr:0.0003000000 time/step:117.62s
132
+ [2025-09-12 00:10:44,735] - step:124/900 train_loss:0.4428 lr:0.0003000000 time/step:118.05s
133
+ [2025-09-12 00:12:42,019] - step:125/900 train_loss:0.4188 lr:0.0003000000 time/step:117.27s
134
+ [2025-09-12 00:14:38,901] - step:126/900 train_loss:0.4226 lr:0.0003000000 time/step:116.88s
135
+ [2025-09-12 00:16:35,356] - step:127/900 train_loss:0.4379 lr:0.0003000000 time/step:116.45s
136
+ [2025-09-12 00:18:31,808] - step:128/900 train_loss:0.4475 lr:0.0003000000 time/step:116.45s
137
+ [2025-09-12 00:20:31,092] - step:129/900 train_loss:0.4579 lr:0.0003000000 time/step:119.27s
138
+ [2025-09-12 00:22:28,417] - step:130/900 train_loss:0.4504 lr:0.0003000000 time/step:117.31s
139
+ [2025-09-12 00:24:25,417] - step:131/900 train_loss:0.4345 lr:0.0003000000 time/step:116.99s
140
+ [2025-09-12 00:26:22,282] - step:132/900 train_loss:0.4567 lr:0.0003000000 time/step:116.86s
141
+ [2025-09-12 00:28:18,304] - step:133/900 train_loss:0.4396 lr:0.0003000000 time/step:116.02s
142
+ [2025-09-12 00:30:15,628] - step:134/900 train_loss:0.4440 lr:0.0003000000 time/step:117.32s
143
+ [2025-09-12 00:32:13,051] - step:135/900 train_loss:0.4384 lr:0.0003000000 time/step:117.42s
144
+ [2025-09-12 00:34:10,336] - step:136/900 train_loss:0.4276 lr:0.0003000000 time/step:117.28s
145
+ [2025-09-12 00:36:07,098] - step:137/900 train_loss:0.4424 lr:0.0003000000 time/step:116.76s
146
+ [2025-09-12 00:38:03,861] - step:138/900 train_loss:0.4288 lr:0.0003000000 time/step:116.76s
147
+ [2025-09-12 00:40:00,304] - step:139/900 train_loss:0.4333 lr:0.0003000000 time/step:116.43s
148
+ [2025-09-12 00:41:57,928] - step:140/900 train_loss:0.4347 lr:0.0003000000 time/step:117.62s
149
+ [2025-09-12 00:43:56,252] - step:141/900 train_loss:0.4515 lr:0.0003000000 time/step:118.32s
150
+ [2025-09-12 00:45:53,156] - step:142/900 train_loss:0.4531 lr:0.0003000000 time/step:116.90s
151
+ [2025-09-12 00:47:50,037] - step:143/900 train_loss:0.4426 lr:0.0003000000 time/step:116.88s
152
+ [2025-09-12 00:49:46,863] - step:144/900 train_loss:0.4100 lr:0.0003000000 time/step:116.81s
153
+ [2025-09-12 00:51:42,986] - step:145/900 train_loss:0.4185 lr:0.0003000000 time/step:116.12s
154
+ [2025-09-12 00:53:40,748] - step:146/900 train_loss:0.4556 lr:0.0003000000 time/step:117.75s
155
+ [2025-09-12 00:55:38,614] - step:147/900 train_loss:0.4580 lr:0.0003000000 time/step:117.86s
156
+ [2025-09-12 00:57:35,395] - step:148/900 train_loss:0.4432 lr:0.0003000000 time/step:116.77s
157
+ [2025-09-12 00:59:32,300] - step:149/900 train_loss:0.4260 lr:0.0003000000 time/step:116.90s
158
+ [2025-09-12 01:01:29,963] - step:150/900 train_loss:0.4369 lr:0.0003000000 time/step:117.65s
159
+ [2025-09-12 01:03:26,107] - step:151/900 train_loss:0.4121 lr:0.0003000000 time/step:116.14s
160
+ [2025-09-12 01:05:23,232] - step:152/900 train_loss:0.4488 lr:0.0003000000 time/step:117.12s
161
+ [2025-09-12 01:07:21,054] - step:153/900 train_loss:0.4290 lr:0.0003000000 time/step:117.82s
162
+ [2025-09-12 01:09:17,934] - step:154/900 train_loss:0.4126 lr:0.0003000000 time/step:116.88s
163
+ [2025-09-12 01:11:15,437] - step:155/900 train_loss:0.4201 lr:0.0003000000 time/step:117.49s
164
+ [2025-09-12 01:13:12,295] - step:156/900 train_loss:0.4294 lr:0.0003000000 time/step:116.85s
165
+ [2025-09-12 01:15:08,687] - step:157/900 train_loss:0.4340 lr:0.0003000000 time/step:116.38s
166
+ [2025-09-12 01:17:05,708] - step:158/900 train_loss:0.4543 lr:0.0003000000 time/step:117.01s
167
+ [2025-09-12 01:19:03,353] - step:159/900 train_loss:0.4211 lr:0.0003000000 time/step:117.64s
168
+ [2025-09-12 01:21:00,871] - step:160/900 train_loss:0.4400 lr:0.0003000000 time/step:117.51s
169
+ [2025-09-12 01:22:57,738] - step:161/900 train_loss:0.4259 lr:0.0003000000 time/step:116.86s
170
+ [2025-09-12 01:24:55,051] - step:162/900 train_loss:0.4150 lr:0.0003000000 time/step:117.31s
171
+ [2025-09-12 01:26:51,147] - step:163/900 train_loss:0.4168 lr:0.0003000000 time/step:116.09s
172
+ [2025-09-12 01:28:48,833] - step:164/900 train_loss:0.4024 lr:0.0003000000 time/step:117.68s
173
+ [2025-09-12 01:30:46,610] - step:165/900 train_loss:0.4476 lr:0.0003000000 time/step:117.77s
174
+ [2025-09-12 01:32:43,517] - step:166/900 train_loss:0.4241 lr:0.0003000000 time/step:116.90s
175
+ [2025-09-12 01:34:41,001] - step:167/900 train_loss:0.4268 lr:0.0003000000 time/step:117.48s
176
+ [2025-09-12 01:36:37,582] - step:168/900 train_loss:0.3846 lr:0.0003000000 time/step:116.57s
177
+ [2025-09-12 01:38:34,908] - step:169/900 train_loss:0.4199 lr:0.0003000000 time/step:117.32s
178
+ [2025-09-12 01:40:33,014] - step:170/900 train_loss:0.4037 lr:0.0003000000 time/step:118.09s
179
+ [2025-09-12 01:42:29,854] - step:171/900 train_loss:0.4579 lr:0.0003000000 time/step:116.84s
180
+ [2025-09-12 01:44:27,350] - step:172/900 train_loss:0.4435 lr:0.0003000000 time/step:117.48s
181
+ [2025-09-12 01:46:24,704] - step:173/900 train_loss:0.4139 lr:0.0003000000 time/step:117.34s
182
+ [2025-09-12 01:48:21,009] - step:174/900 train_loss:0.4308 lr:0.0003000000 time/step:116.30s
183
+ [2025-09-12 01:50:19,086] - step:175/900 train_loss:0.4156 lr:0.0003000000 time/step:118.06s
184
+ [2025-09-12 01:52:16,506] - step:176/900 train_loss:0.4204 lr:0.0003000000 time/step:117.41s
185
+ [2025-09-12 01:54:14,395] - step:177/900 train_loss:0.4211 lr:0.0003000000 time/step:117.87s
186
+ [2025-09-12 01:56:11,781] - step:178/900 train_loss:0.4399 lr:0.0003000000 time/step:117.38s
187
+ [2025-09-12 01:58:09,165] - step:179/900 train_loss:0.4327 lr:0.0003000000 time/step:117.38s
188
+ [2025-09-12 02:00:05,670] - step:180/900 train_loss:0.4362 lr:0.0003000000 time/step:116.49s
189
+ [2025-09-12 02:02:03,683] - step:181/900 train_loss:0.4204 lr:0.0003000000 time/step:118.01s
190
+ [2025-09-12 02:04:01,525] - step:182/900 train_loss:0.4528 lr:0.0003000000 time/step:117.84s
191
+ [2025-09-12 02:05:59,256] - step:183/900 train_loss:0.4115 lr:0.0003000000 time/step:117.72s
192
+ [2025-09-12 02:07:56,456] - step:184/900 train_loss:0.4527 lr:0.0003000000 time/step:117.20s
193
+ [2025-09-12 02:09:53,692] - step:185/900 train_loss:0.4378 lr:0.0003000000 time/step:117.23s
194
+ [2025-09-12 02:11:50,835] - step:186/900 train_loss:0.4322 lr:0.0003000000 time/step:117.14s
195
+ [2025-09-12 02:13:49,249] - step:187/900 train_loss:0.4503 lr:0.0003000000 time/step:118.41s
196
+ [2025-09-12 02:15:46,708] - step:188/900 train_loss:0.4137 lr:0.0003000000 time/step:117.45s
197
+ [2025-09-12 02:17:44,588] - step:189/900 train_loss:0.4373 lr:0.0003000000 time/step:117.87s
198
+ [2025-09-12 02:19:41,640] - step:190/900 train_loss:0.4390 lr:0.0003000000 time/step:117.04s
199
+ [2025-09-12 02:21:38,674] - step:191/900 train_loss:0.4540 lr:0.0003000000 time/step:117.02s
200
+ [2025-09-12 02:23:35,317] - step:192/900 train_loss:0.4401 lr:0.0003000000 time/step:116.64s
201
+ [2025-09-12 02:25:32,403] - step:193/900 train_loss:0.4325 lr:0.0003000000 time/step:117.08s
202
+ [2025-09-12 02:27:29,545] - step:194/900 train_loss:0.4249 lr:0.0003000000 time/step:117.13s
203
+ [2025-09-12 02:29:26,648] - step:195/900 train_loss:0.4074 lr:0.0003000000 time/step:117.09s
204
+ [2025-09-12 02:31:23,432] - step:196/900 train_loss:0.4212 lr:0.0003000000 time/step:116.77s
205
+ [2025-09-12 02:33:21,256] - step:197/900 train_loss:0.4408 lr:0.0003000000 time/step:117.82s
206
+ [2025-09-12 02:35:18,019] - step:198/900 train_loss:0.4229 lr:0.0003000000 time/step:116.76s
207
+ [2025-09-12 02:37:15,403] - step:199/900 train_loss:0.4517 lr:0.0003000000 time/step:117.38s
208
+ [2025-09-12 02:39:13,125] - step:200/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@200.pt...
209
+ [2025-09-12 02:39:13,129] - step:200/900 train_loss:0.4149 lr:0.0003000000 time/step:117.11s
210
+ [2025-09-12 02:41:09,907] - step:201/900 train_loss:0.4258 lr:0.0003000000 time/step:116.76s
211
+ [2025-09-12 02:43:06,972] - step:202/900 train_loss:0.4207 lr:0.0003000000 time/step:117.06s
212
+ [2025-09-12 02:45:03,575] - step:203/900 train_loss:0.4432 lr:0.0003000000 time/step:116.60s
213
+ [2025-09-12 02:47:00,062] - step:204/900 train_loss:0.4072 lr:0.0003000000 time/step:116.48s
214
+ [2025-09-12 02:48:57,543] - step:205/900 train_loss:0.4404 lr:0.0003000000 time/step:117.47s
215
+ [2025-09-12 02:50:54,165] - step:206/900 train_loss:0.4151 lr:0.0003000000 time/step:116.61s
216
+ [2025-09-12 02:52:50,609] - step:207/900 train_loss:0.4256 lr:0.0003000000 time/step:116.44s
217
+ [2025-09-12 02:54:48,171] - step:208/900 train_loss:0.4200 lr:0.0003000000 time/step:117.56s
218
+ [2025-09-12 02:56:44,244] - step:209/900 train_loss:0.4159 lr:0.0003000000 time/step:116.06s
219
+ [2025-09-12 02:58:41,740] - step:210/900 train_loss:0.4080 lr:0.0003000000 time/step:117.49s
220
+ [2025-09-12 03:00:38,362] - step:211/900 train_loss:0.4394 lr:0.0003000000 time/step:116.61s
221
+ [2025-09-12 03:02:34,864] - step:212/900 train_loss:0.4461 lr:0.0003000000 time/step:116.49s
222
+ [2025-09-12 03:04:32,289] - step:213/900 train_loss:0.4310 lr:0.0003000000 time/step:117.42s
223
+ [2025-09-12 03:06:29,834] - step:214/900 train_loss:0.4458 lr:0.0003000000 time/step:117.53s
224
+ [2025-09-12 03:08:26,395] - step:215/900 train_loss:0.4322 lr:0.0003000000 time/step:116.56s
225
+ [2025-09-12 03:10:23,441] - step:216/900 train_loss:0.3979 lr:0.0003000000 time/step:117.03s
226
+ [2025-09-12 03:12:19,963] - step:217/900 train_loss:0.4011 lr:0.0003000000 time/step:116.51s
227
+ [2025-09-12 03:14:17,627] - step:218/900 train_loss:0.4372 lr:0.0003000000 time/step:117.66s
228
+ [2025-09-12 03:16:15,332] - step:219/900 train_loss:0.4281 lr:0.0003000000 time/step:117.70s
229
+ [2025-09-12 03:18:11,833] - step:220/900 train_loss:0.4330 lr:0.0003000000 time/step:116.49s
230
+ [2025-09-12 03:20:08,497] - step:221/900 train_loss:0.4534 lr:0.0003000000 time/step:116.65s
231
+ [2025-09-12 03:22:05,021] - step:222/900 train_loss:0.4076 lr:0.0003000000 time/step:116.52s
232
+ [2025-09-12 03:24:01,826] - step:223/900 train_loss:0.4211 lr:0.0003000000 time/step:116.79s
233
+ [2025-09-12 03:25:58,807] - step:224/900 train_loss:0.4075 lr:0.0003000000 time/step:116.98s
234
+ [2025-09-12 03:27:56,427] - step:225/900 train_loss:0.3977 lr:0.0003000000 time/step:117.61s
235
+ [2025-09-12 03:29:53,271] - step:226/900 train_loss:0.4331 lr:0.0003000000 time/step:116.84s
236
+ [2025-09-12 03:31:48,818] - step:227/900 train_loss:0.4424 lr:0.0003000000 time/step:115.53s
237
+ [2025-09-12 03:33:46,260] - step:228/900 train_loss:0.4265 lr:0.0003000000 time/step:117.44s
238
+ [2025-09-12 03:35:42,726] - step:229/900 train_loss:0.4018 lr:0.0003000000 time/step:116.46s
239
+ [2025-09-12 03:37:39,927] - step:230/900 train_loss:0.4277 lr:0.0003000000 time/step:117.20s
240
+ [2025-09-12 03:39:37,253] - step:231/900 train_loss:0.4229 lr:0.0003000000 time/step:117.32s
241
+ [2025-09-12 03:41:34,210] - step:232/900 train_loss:0.4231 lr:0.0003000000 time/step:116.94s
242
+ [2025-09-12 03:43:30,497] - step:233/900 train_loss:0.4125 lr:0.0003000000 time/step:116.28s
243
+ [2025-09-12 03:45:27,022] - step:234/900 train_loss:0.4181 lr:0.0003000000 time/step:116.52s
244
+ [2025-09-12 03:47:23,505] - step:235/900 train_loss:0.4364 lr:0.0003000000 time/step:116.48s
245
+ [2025-09-12 03:49:21,967] - step:236/900 train_loss:0.4135 lr:0.0003000000 time/step:118.46s
246
+ [2025-09-12 03:51:18,413] - step:237/900 train_loss:0.4139 lr:0.0003000000 time/step:116.43s
247
+ [2025-09-12 03:53:14,453] - step:238/900 train_loss:0.4341 lr:0.0003000000 time/step:116.03s
248
+ [2025-09-12 03:55:11,117] - step:239/900 train_loss:0.4174 lr:0.0003000000 time/step:116.66s
249
+ [2025-09-12 03:57:08,642] - step:240/900 train_loss:0.4449 lr:0.0003000000 time/step:117.52s
250
+ [2025-09-12 03:59:06,595] - step:241/900 train_loss:0.4303 lr:0.0003000000 time/step:117.95s
251
+ [2025-09-12 04:01:02,667] - step:242/900 train_loss:0.4350 lr:0.0003000000 time/step:116.06s
252
+ [2025-09-12 04:02:58,652] - step:243/900 train_loss:0.4332 lr:0.0003000000 time/step:115.97s
253
+ [2025-09-12 04:04:55,158] - step:244/900 train_loss:0.4170 lr:0.0003000000 time/step:116.50s
254
+ [2025-09-12 04:06:52,523] - step:245/900 train_loss:0.4325 lr:0.0003000000 time/step:117.35s
255
+ [2025-09-12 04:08:49,506] - step:246/900 train_loss:0.4140 lr:0.0003000000 time/step:116.98s
256
+ [2025-09-12 04:10:46,625] - step:247/900 train_loss:0.4244 lr:0.0003000000 time/step:117.10s
257
+ [2025-09-12 04:12:43,060] - step:248/900 train_loss:0.4435 lr:0.0003000000 time/step:116.43s
258
+ [2025-09-12 04:14:39,932] - step:249/900 train_loss:0.4188 lr:0.0003000000 time/step:116.87s
259
+ [2025-09-12 04:16:36,428] - step:250/900 train_loss:0.4138 lr:0.0003000000 time/step:116.49s
260
+ [2025-09-12 04:18:34,283] - step:251/900 train_loss:0.4045 lr:0.0003000000 time/step:117.84s
261
+ [2025-09-12 04:20:32,264] - step:252/900 train_loss:0.4128 lr:0.0003000000 time/step:117.96s
262
+ [2025-09-12 04:22:28,905] - step:253/900 train_loss:0.4352 lr:0.0003000000 time/step:116.63s
263
+ [2025-09-12 04:24:25,744] - step:254/900 train_loss:0.4090 lr:0.0003000000 time/step:116.83s
264
+ [2025-09-12 04:26:22,527] - step:255/900 train_loss:0.4125 lr:0.0003000000 time/step:116.78s
265
+ [2025-09-12 04:28:18,535] - step:256/900 train_loss:0.3974 lr:0.0003000000 time/step:116.00s
266
+ [2025-09-12 04:30:16,548] - step:257/900 train_loss:0.4056 lr:0.0003000000 time/step:118.00s
267
+ [2025-09-12 04:32:14,016] - step:258/900 train_loss:0.4158 lr:0.0003000000 time/step:117.45s
268
+ [2025-09-12 04:34:10,993] - step:259/900 train_loss:0.4080 lr:0.0003000000 time/step:116.97s
269
+ [2025-09-12 04:36:07,637] - step:260/900 train_loss:0.4217 lr:0.0003000000 time/step:116.64s
270
+ [2025-09-12 04:38:05,072] - step:261/900 train_loss:0.4157 lr:0.0003000000 time/step:117.43s
271
+ [2025-09-12 04:40:01,843] - step:262/900 train_loss:0.4139 lr:0.0003000000 time/step:116.76s
272
+ [2025-09-12 04:41:58,873] - step:263/900 train_loss:0.4401 lr:0.0003000000 time/step:117.01s
273
+ [2025-09-12 04:43:56,795] - step:264/900 train_loss:0.4272 lr:0.0003000000 time/step:117.92s
274
+ [2025-09-12 04:45:53,571] - step:265/900 train_loss:0.4228 lr:0.0003000000 time/step:116.76s
275
+ [2025-09-12 04:47:50,269] - step:266/900 train_loss:0.4242 lr:0.0003000000 time/step:116.69s
276
+ [2025-09-12 04:49:47,027] - step:267/900 train_loss:0.4361 lr:0.0003000000 time/step:116.75s
277
+ [2025-09-12 04:51:43,112] - step:268/900 train_loss:0.4224 lr:0.0003000000 time/step:116.07s
278
+ [2025-09-12 04:53:41,046] - step:269/900 train_loss:0.4076 lr:0.0003000000 time/step:117.92s
279
+ [2025-09-12 04:55:37,470] - step:270/900 train_loss:0.4172 lr:0.0003000000 time/step:116.42s
280
+ [2025-09-12 04:57:33,853] - step:271/900 train_loss:0.4219 lr:0.0003000000 time/step:116.38s
281
+ [2025-09-12 04:59:30,265] - step:272/900 train_loss:0.4281 lr:0.0003000000 time/step:116.41s
282
+ [2025-09-12 05:01:26,500] - step:273/900 train_loss:0.4105 lr:0.0003000000 time/step:116.22s
283
+ [2025-09-12 05:03:24,415] - step:274/900 train_loss:0.4247 lr:0.0003000000 time/step:117.91s
284
+ [2025-09-12 05:05:21,825] - step:275/900 train_loss:0.4172 lr:0.0003000000 time/step:117.40s
285
+ [2025-09-12 05:07:18,643] - step:276/900 train_loss:0.4281 lr:0.0003000000 time/step:116.81s
286
+ [2025-09-12 05:09:15,889] - step:277/900 train_loss:0.4140 lr:0.0003000000 time/step:117.23s
287
+ [2025-09-12 05:11:13,080] - step:278/900 train_loss:0.4459 lr:0.0003000000 time/step:117.18s
288
+ [2025-09-12 05:13:09,433] - step:279/900 train_loss:0.4128 lr:0.0003000000 time/step:116.35s
289
+ [2025-09-12 05:15:07,057] - step:280/900 train_loss:0.4171 lr:0.0003000000 time/step:117.62s
290
+ [2025-09-12 05:17:03,780] - step:281/900 train_loss:0.4083 lr:0.0003000000 time/step:116.71s
291
+ [2025-09-12 05:19:00,703] - step:282/900 train_loss:0.4214 lr:0.0003000000 time/step:116.92s
292
+ [2025-09-12 05:20:57,932] - step:283/900 train_loss:0.4072 lr:0.0003000000 time/step:117.19s
293
+ [2025-09-12 05:22:54,350] - step:284/900 train_loss:0.4471 lr:0.0003000000 time/step:116.39s
294
+ [2025-09-12 05:24:50,794] - step:285/900 train_loss:0.3946 lr:0.0003000000 time/step:116.44s
295
+ [2025-09-12 05:26:47,657] - step:286/900 train_loss:0.4510 lr:0.0003000000 time/step:116.86s
296
+ [2025-09-12 05:28:43,717] - step:287/900 train_loss:0.4409 lr:0.0003000000 time/step:116.05s
297
+ [2025-09-12 05:30:40,741] - step:288/900 train_loss:0.3887 lr:0.0003000000 time/step:117.01s
298
+ [2025-09-12 05:32:38,986] - step:289/900 train_loss:0.4207 lr:0.0003000000 time/step:118.24s
299
+ [2025-09-12 05:34:35,229] - step:290/900 train_loss:0.4018 lr:0.0003000000 time/step:116.24s
300
+ [2025-09-12 05:36:31,796] - step:291/900 train_loss:0.4233 lr:0.0003000000 time/step:116.56s
301
+ [2025-09-12 05:38:28,659] - step:292/900 train_loss:0.4223 lr:0.0003000000 time/step:116.86s
302
+ [2025-09-12 05:40:25,842] - step:293/900 train_loss:0.4412 lr:0.0003000000 time/step:117.18s
303
+ [2025-09-12 05:42:22,767] - step:294/900 train_loss:0.3965 lr:0.0003000000 time/step:116.91s
304
+ [2025-09-12 05:44:20,588] - step:295/900 train_loss:0.4155 lr:0.0003000000 time/step:117.81s
305
+ [2025-09-12 05:46:17,250] - step:296/900 train_loss:0.4051 lr:0.0003000000 time/step:116.66s
306
+ [2025-09-12 05:48:13,495] - step:297/900 train_loss:0.4186 lr:0.0003000000 time/step:116.24s
307
+ [2025-09-12 05:50:10,418] - step:298/900 train_loss:0.4280 lr:0.0003000000 time/step:116.91s
308
+ [2025-09-12 05:52:07,903] - step:299/900 train_loss:0.4225 lr:0.0003000000 time/step:117.46s
309
+ [2025-09-12 05:54:04,575] - step:300/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@300.pt...
310
+ [2025-09-12 05:54:04,576] - step:300/900 train_loss:0.4086 lr:0.0003000000 time/step:116.07s
311
+ [2025-09-12 05:56:02,285] - step:301/900 train_loss:0.4136 lr:0.0003000000 time/step:117.71s
312
+ [2025-09-12 05:58:00,171] - step:302/900 train_loss:0.4114 lr:0.0003000000 time/step:117.88s
313
+ [2025-09-12 05:59:55,795] - step:303/900 train_loss:0.4200 lr:0.0003000000 time/step:115.62s
314
+ [2025-09-12 06:01:52,652] - step:304/900 train_loss:0.4085 lr:0.0003000000 time/step:116.84s
315
+ [2025-09-12 06:03:49,824] - step:305/900 train_loss:0.4311 lr:0.0003000000 time/step:117.16s
316
+ [2025-09-12 06:05:46,285] - step:306/900 train_loss:0.4367 lr:0.0003000000 time/step:116.45s
317
+ [2025-09-12 06:07:44,209] - step:307/900 train_loss:0.4345 lr:0.0003000000 time/step:117.92s
318
+ [2025-09-12 06:09:41,568] - step:308/900 train_loss:0.4016 lr:0.0003000000 time/step:117.35s
319
+ [2025-09-12 06:11:38,074] - step:309/900 train_loss:0.4102 lr:0.0003000000 time/step:116.49s
320
+ [2025-09-12 06:13:34,857] - step:310/900 train_loss:0.4332 lr:0.0003000000 time/step:116.77s
321
+ [2025-09-12 06:15:32,302] - step:311/900 train_loss:0.4186 lr:0.0003000000 time/step:117.43s
322
+ [2025-09-12 06:17:29,124] - step:312/900 train_loss:0.4371 lr:0.0003000000 time/step:116.82s
323
+ [2025-09-12 06:19:26,289] - step:313/900 train_loss:0.4130 lr:0.0003000000 time/step:117.16s
324
+ [2025-09-12 06:21:22,830] - step:314/900 train_loss:0.4031 lr:0.0003000000 time/step:116.53s
325
+ [2025-09-12 06:23:19,454] - step:315/900 train_loss:0.4286 lr:0.0003000000 time/step:116.62s
326
+ [2025-09-12 06:25:17,324] - step:316/900 train_loss:0.4007 lr:0.0003000000 time/step:117.86s
327
+ [2025-09-12 06:27:14,242] - step:317/900 train_loss:0.4114 lr:0.0003000000 time/step:116.91s
328
+ [2025-09-12 06:29:11,325] - step:318/900 train_loss:0.4251 lr:0.0003000000 time/step:117.08s
329
+ [2025-09-12 06:31:08,368] - step:319/900 train_loss:0.4448 lr:0.0003000000 time/step:117.03s
330
+ [2025-09-12 06:33:04,509] - step:320/900 train_loss:0.4103 lr:0.0003000000 time/step:116.14s
331
+ [2025-09-12 06:35:02,658] - step:321/900 train_loss:0.4142 lr:0.0003000000 time/step:118.14s
332
+ [2025-09-12 06:36:59,639] - step:322/900 train_loss:0.3985 lr:0.0003000000 time/step:116.97s
333
+ [2025-09-12 06:38:56,063] - step:323/900 train_loss:0.4057 lr:0.0003000000 time/step:116.42s
334
+ [2025-09-12 06:40:53,684] - step:324/900 train_loss:0.4223 lr:0.0003000000 time/step:117.62s
335
+ [2025-09-12 06:42:50,547] - step:325/900 train_loss:0.4205 lr:0.0003000000 time/step:116.85s
336
+ [2025-09-12 06:44:46,896] - step:326/900 train_loss:0.4172 lr:0.0003000000 time/step:116.34s
337
+ [2025-09-12 06:46:45,176] - step:327/900 train_loss:0.4186 lr:0.0003000000 time/step:118.27s
338
+ [2025-09-12 06:48:42,119] - step:328/900 train_loss:0.4294 lr:0.0003000000 time/step:116.93s
339
+ [2025-09-12 06:50:38,781] - step:329/900 train_loss:0.4072 lr:0.0003000000 time/step:116.66s
340
+ [2025-09-12 06:52:36,425] - step:330/900 train_loss:0.4248 lr:0.0003000000 time/step:117.63s
341
+ [2025-09-12 06:54:33,431] - step:331/900 train_loss:0.4141 lr:0.0003000000 time/step:117.00s
342
+ [2025-09-12 06:56:30,074] - step:332/900 train_loss:0.4124 lr:0.0003000000 time/step:116.64s
343
+ [2025-09-12 06:58:26,556] - step:333/900 train_loss:0.4281 lr:0.0003000000 time/step:116.47s
344
+ [2025-09-12 07:00:23,620] - step:334/900 train_loss:0.4141 lr:0.0003000000 time/step:117.06s
345
+ [2025-09-12 07:02:21,404] - step:335/900 train_loss:0.4197 lr:0.0003000000 time/step:117.77s
346
+ [2025-09-12 07:04:17,967] - step:336/900 train_loss:0.4356 lr:0.0003000000 time/step:116.56s
347
+ [2025-09-12 07:06:16,192] - step:337/900 train_loss:0.3934 lr:0.0003000000 time/step:118.22s
348
+ [2025-09-12 07:08:12,291] - step:338/900 train_loss:0.3917 lr:0.0003000000 time/step:116.09s
349
+ [2025-09-12 07:10:08,910] - step:339/900 train_loss:0.4353 lr:0.0003000000 time/step:116.61s
350
+ [2025-09-12 07:12:06,665] - step:340/900 train_loss:0.4537 lr:0.0003000000 time/step:117.74s
351
+ [2025-09-12 07:14:03,621] - step:341/900 train_loss:0.4146 lr:0.0003000000 time/step:116.95s
352
+ [2025-09-12 07:16:00,835] - step:342/900 train_loss:0.4194 lr:0.0003000000 time/step:117.20s
353
+ [2025-09-12 07:17:57,387] - step:343/900 train_loss:0.4117 lr:0.0003000000 time/step:116.54s
354
+ [2025-09-12 07:19:53,951] - step:344/900 train_loss:0.3925 lr:0.0003000000 time/step:116.56s
355
+ [2025-09-12 07:21:50,959] - step:345/900 train_loss:0.4268 lr:0.0003000000 time/step:117.00s
356
+ [2025-09-12 07:23:49,546] - step:346/900 train_loss:0.4113 lr:0.0003000000 time/step:118.58s
357
+ [2025-09-12 07:25:46,639] - step:347/900 train_loss:0.4211 lr:0.0003000000 time/step:117.08s
358
+ [2025-09-12 07:27:43,350] - step:348/900 train_loss:0.4183 lr:0.0003000000 time/step:116.70s
359
+ [2025-09-12 07:29:39,127] - step:349/900 train_loss:0.4313 lr:0.0003000000 time/step:115.77s
360
+ [2025-09-12 07:31:35,852] - step:350/900 train_loss:0.3881 lr:0.0003000000 time/step:116.71s
361
+ [2025-09-12 07:33:34,104] - step:351/900 train_loss:0.4243 lr:0.0003000000 time/step:118.24s
362
+ [2025-09-12 07:35:31,118] - step:352/900 train_loss:0.4273 lr:0.0003000000 time/step:117.00s
363
+ [2025-09-12 07:37:28,208] - step:353/900 train_loss:0.3925 lr:0.0003000000 time/step:117.06s
364
+ [2025-09-12 07:39:25,351] - step:354/900 train_loss:0.4223 lr:0.0003000000 time/step:117.14s
365
+ [2025-09-12 07:41:21,430] - step:355/900 train_loss:0.3996 lr:0.0003000000 time/step:116.07s
366
+ [2025-09-12 07:43:18,880] - step:356/900 train_loss:0.4095 lr:0.0003000000 time/step:117.45s
367
+ [2025-09-12 07:45:16,716] - step:357/900 train_loss:0.4204 lr:0.0003000000 time/step:117.83s
368
+ [2025-09-12 07:47:14,287] - step:358/900 train_loss:0.4157 lr:0.0003000000 time/step:117.56s
369
+ [2025-09-12 07:49:11,022] - step:359/900 train_loss:0.4179 lr:0.0003000000 time/step:116.72s
370
+ [2025-09-12 07:51:08,126] - step:360/900 train_loss:0.4490 lr:0.0003000000 time/step:117.10s
371
+ [2025-09-12 07:53:04,336] - step:361/900 train_loss:0.4100 lr:0.0003000000 time/step:116.20s
372
+ [2025-09-12 07:55:00,814] - step:362/900 train_loss:0.4050 lr:0.0003000000 time/step:116.47s
373
+ [2025-09-12 07:56:58,814] - step:363/900 train_loss:0.4299 lr:0.0003000000 time/step:117.99s
374
+ [2025-09-12 07:58:55,677] - step:364/900 train_loss:0.3970 lr:0.0003000000 time/step:116.85s
375
+ [2025-09-12 08:00:53,062] - step:365/900 train_loss:0.4180 lr:0.0003000000 time/step:117.38s
376
+ [2025-09-12 08:02:49,522] - step:366/900 train_loss:0.4307 lr:0.0003000000 time/step:116.45s
377
+ [2025-09-12 08:04:45,597] - step:367/900 train_loss:0.4335 lr:0.0003000000 time/step:116.07s
378
+ [2025-09-12 08:06:43,333] - step:368/900 train_loss:0.3967 lr:0.0003000000 time/step:117.73s
379
+ [2025-09-12 08:08:40,432] - step:369/900 train_loss:0.4226 lr:0.0003000000 time/step:117.09s
380
+ [2025-09-12 08:10:38,337] - step:370/900 train_loss:0.4086 lr:0.0003000000 time/step:117.90s
381
+ [2025-09-12 08:12:35,283] - step:371/900 train_loss:0.3949 lr:0.0003000000 time/step:116.93s
382
+ [2025-09-12 08:14:31,782] - step:372/900 train_loss:0.4219 lr:0.0003000000 time/step:116.49s
383
+ [2025-09-12 08:16:29,230] - step:373/900 train_loss:0.4088 lr:0.0003000000 time/step:117.44s
384
+ [2025-09-12 08:18:26,952] - step:374/900 train_loss:0.4184 lr:0.0003000000 time/step:117.71s
385
+ [2025-09-12 08:20:23,596] - step:375/900 train_loss:0.4110 lr:0.0003000000 time/step:116.64s
386
+ [2025-09-12 08:22:20,047] - step:376/900 train_loss:0.4305 lr:0.0003000000 time/step:116.44s
387
+ [2025-09-12 08:24:16,398] - step:377/900 train_loss:0.4143 lr:0.0003000000 time/step:116.35s
388
+ [2025-09-12 08:26:13,665] - step:378/900 train_loss:0.4139 lr:0.0003000000 time/step:117.26s
389
+ [2025-09-12 08:28:09,796] - step:379/900 train_loss:0.4060 lr:0.0003000000 time/step:116.13s
390
+ [2025-09-12 08:30:07,613] - step:380/900 train_loss:0.3921 lr:0.0003000000 time/step:117.81s
391
+ [2025-09-12 08:32:04,597] - step:381/900 train_loss:0.4239 lr:0.0003000000 time/step:116.97s
392
+ [2025-09-12 08:34:01,394] - step:382/900 train_loss:0.4041 lr:0.0003000000 time/step:116.79s
393
+ [2025-09-12 08:35:58,263] - step:383/900 train_loss:0.4115 lr:0.0003000000 time/step:116.86s
394
+ [2025-09-12 08:37:54,649] - step:384/900 train_loss:0.4216 lr:0.0003000000 time/step:116.38s
395
+ [2025-09-12 08:39:51,866] - step:385/900 train_loss:0.4057 lr:0.0003000000 time/step:117.21s
396
+ [2025-09-12 08:41:49,473] - step:386/900 train_loss:0.4021 lr:0.0003000000 time/step:117.60s
397
+ [2025-09-12 08:43:46,456] - step:387/900 train_loss:0.4235 lr:0.0003000000 time/step:116.98s
398
+ [2025-09-12 08:45:42,939] - step:388/900 train_loss:0.4309 lr:0.0003000000 time/step:116.48s
399
+ [2025-09-12 08:47:40,164] - step:389/900 train_loss:0.3930 lr:0.0003000000 time/step:117.22s
400
+ [2025-09-12 08:49:36,386] - step:390/900 train_loss:0.4063 lr:0.0003000000 time/step:116.22s
401
+ [2025-09-12 08:51:33,830] - step:391/900 train_loss:0.4034 lr:0.0003000000 time/step:117.43s
402
+ [2025-09-12 08:53:30,812] - step:392/900 train_loss:0.4071 lr:0.0003000000 time/step:116.97s
403
+ [2025-09-12 08:55:28,574] - step:393/900 train_loss:0.4296 lr:0.0003000000 time/step:117.75s
404
+ [2025-09-12 08:57:25,899] - step:394/900 train_loss:0.4171 lr:0.0003000000 time/step:117.31s
405
+ [2025-09-12 08:59:22,463] - step:395/900 train_loss:0.4167 lr:0.0003000000 time/step:116.56s
406
+ [2025-09-12 09:01:19,086] - step:396/900 train_loss:0.4119 lr:0.0003000000 time/step:116.62s
407
+ [2025-09-12 09:03:16,267] - step:397/900 train_loss:0.4057 lr:0.0003000000 time/step:117.17s
408
+ [2025-09-12 09:05:13,175] - step:398/900 train_loss:0.4064 lr:0.0003000000 time/step:116.90s
409
+ [2025-09-12 09:07:10,958] - step:399/900 train_loss:0.3913 lr:0.0003000000 time/step:117.77s
410
+ [2025-09-12 09:09:08,523] - step:400/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@400.pt...
411
+ [2025-09-12 09:09:08,524] - step:400/900 train_loss:0.4028 lr:0.0003000000 time/step:116.93s
412
+ [2025-09-12 09:11:04,902] - step:401/900 train_loss:0.3889 lr:0.0003000000 time/step:116.38s
413
+ [2025-09-12 09:13:01,467] - step:402/900 train_loss:0.4192 lr:0.0003000000 time/step:116.55s
414
+ [2025-09-12 09:14:58,472] - step:403/900 train_loss:0.4211 lr:0.0003000000 time/step:117.00s
415
+ [2025-09-12 09:16:55,036] - step:404/900 train_loss:0.4354 lr:0.0003000000 time/step:116.56s
416
+ [2025-09-12 09:18:52,741] - step:405/900 train_loss:0.4290 lr:0.0003000000 time/step:117.69s
417
+ [2025-09-12 09:20:49,701] - step:406/900 train_loss:0.4290 lr:0.0003000000 time/step:116.95s
418
+ [2025-09-12 09:22:46,403] - step:407/900 train_loss:0.4257 lr:0.0003000000 time/step:116.69s
419
+ [2025-09-12 09:24:43,048] - step:408/900 train_loss:0.4252 lr:0.0003000000 time/step:116.64s
420
+ [2025-09-12 09:26:39,532] - step:409/900 train_loss:0.3992 lr:0.0003000000 time/step:116.48s
421
+ [2025-09-12 09:28:37,397] - step:410/900 train_loss:0.4191 lr:0.0003000000 time/step:117.86s
422
+ [2025-09-12 09:30:33,697] - step:411/900 train_loss:0.3892 lr:0.0003000000 time/step:116.29s
423
+ [2025-09-12 09:32:30,021] - step:412/900 train_loss:0.3843 lr:0.0003000000 time/step:116.32s
424
+ [2025-09-12 09:34:27,365] - step:413/900 train_loss:0.4010 lr:0.0003000000 time/step:117.34s
425
+ [2025-09-12 09:36:24,146] - step:414/900 train_loss:0.4190 lr:0.0003000000 time/step:116.77s
426
+ [2025-09-12 09:38:20,888] - step:415/900 train_loss:0.4182 lr:0.0003000000 time/step:116.73s
427
+ [2025-09-12 09:40:17,896] - step:416/900 train_loss:0.4236 lr:0.0003000000 time/step:117.00s
428
+ [2025-09-12 09:42:14,418] - step:417/900 train_loss:0.4016 lr:0.0003000000 time/step:116.51s
429
+ [2025-09-12 09:44:11,142] - step:418/900 train_loss:0.4054 lr:0.0003000000 time/step:116.72s
430
+ [2025-09-12 09:46:07,906] - step:419/900 train_loss:0.4162 lr:0.0003000000 time/step:116.75s
431
+ [2025-09-12 09:48:05,609] - step:420/900 train_loss:0.3856 lr:0.0003000000 time/step:117.70s
432
+ [2025-09-12 09:50:02,634] - step:421/900 train_loss:0.3832 lr:0.0003000000 time/step:117.02s
433
+ [2025-09-12 09:51:59,099] - step:422/900 train_loss:0.4000 lr:0.0003000000 time/step:116.45s
434
+ [2025-09-12 09:53:56,083] - step:423/900 train_loss:0.4182 lr:0.0003000000 time/step:116.98s
435
+ [2025-09-12 09:55:53,683] - step:424/900 train_loss:0.4064 lr:0.0003000000 time/step:117.60s
436
+ [2025-09-12 09:57:49,838] - step:425/900 train_loss:0.4186 lr:0.0003000000 time/step:116.14s
437
+ [2025-09-12 09:59:47,210] - step:426/900 train_loss:0.4251 lr:0.0003000000 time/step:117.36s
438
+ [2025-09-12 10:01:43,887] - step:427/900 train_loss:0.3975 lr:0.0003000000 time/step:116.67s
439
+ [2025-09-12 10:03:40,560] - step:428/900 train_loss:0.4212 lr:0.0003000000 time/step:116.66s
440
+ [2025-09-12 10:05:37,859] - step:429/900 train_loss:0.4118 lr:0.0003000000 time/step:117.29s
441
+ [2025-09-12 10:07:35,749] - step:430/900 train_loss:0.3981 lr:0.0003000000 time/step:117.88s
442
+ [2025-09-12 10:09:32,291] - step:431/900 train_loss:0.4237 lr:0.0003000000 time/step:116.53s
443
+ [2025-09-12 10:11:29,229] - step:432/900 train_loss:0.3926 lr:0.0003000000 time/step:116.93s
444
+ [2025-09-12 10:13:26,136] - step:433/900 train_loss:0.4208 lr:0.0003000000 time/step:116.90s
445
+ [2025-09-12 10:15:22,577] - step:434/900 train_loss:0.4102 lr:0.0003000000 time/step:116.44s
446
+ [2025-09-12 10:17:19,961] - step:435/900 train_loss:0.4373 lr:0.0003000000 time/step:117.38s
+ [2025-09-12 10:19:18,170] - step:436/900 train_loss:0.4159 lr:0.0003000000 time/step:118.20s
+ [2025-09-12 10:21:13,810] - step:437/900 train_loss:0.4083 lr:0.0003000000 time/step:115.63s
+ [2025-09-12 10:23:10,450] - step:438/900 train_loss:0.4361 lr:0.0003000000 time/step:116.63s
+ [2025-09-12 10:25:07,257] - step:439/900 train_loss:0.4152 lr:0.0003000000 time/step:116.80s
+ [2025-09-12 10:27:04,621] - step:440/900 train_loss:0.4100 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 10:29:01,561] - step:441/900 train_loss:0.4003 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 10:30:58,928] - step:442/900 train_loss:0.4296 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 10:32:54,885] - step:443/900 train_loss:0.4175 lr:0.0003000000 time/step:115.95s
+ [2025-09-12 10:34:51,250] - step:444/900 train_loss:0.4220 lr:0.0003000000 time/step:116.36s
+ [2025-09-12 10:36:48,671] - step:445/900 train_loss:0.4361 lr:0.0003000000 time/step:117.42s
+ [2025-09-12 10:38:46,902] - step:446/900 train_loss:0.4034 lr:0.0003000000 time/step:118.22s
+ [2025-09-12 10:40:44,143] - step:447/900 train_loss:0.4121 lr:0.0003000000 time/step:117.22s
+ [2025-09-12 10:42:40,558] - step:448/900 train_loss:0.4247 lr:0.0003000000 time/step:116.40s
+ [2025-09-12 10:44:37,203] - step:449/900 train_loss:0.4502 lr:0.0003000000 time/step:116.64s
+ [2025-09-12 10:46:34,074] - step:450/900 train_loss:0.4202 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 10:48:32,574] - step:451/900 train_loss:0.4115 lr:0.0003000000 time/step:118.50s
+ [2025-09-12 10:50:30,519] - step:452/900 train_loss:0.4416 lr:0.0003000000 time/step:117.93s
+ [2025-09-12 10:52:27,400] - step:453/900 train_loss:0.4589 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 10:54:23,502] - step:454/900 train_loss:0.4104 lr:0.0003000000 time/step:116.09s
+ [2025-09-12 10:56:20,043] - step:455/900 train_loss:0.4428 lr:0.0003000000 time/step:116.54s
+ [2025-09-12 10:58:18,649] - step:456/900 train_loss:0.3869 lr:0.0003000000 time/step:118.60s
+ [2025-09-12 11:00:16,434] - step:457/900 train_loss:0.3896 lr:0.0003000000 time/step:117.77s
+ [2025-09-12 11:02:12,853] - step:458/900 train_loss:0.4199 lr:0.0003000000 time/step:116.41s
+ [2025-09-12 11:04:09,871] - step:459/900 train_loss:0.4109 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 11:06:05,943] - step:460/900 train_loss:0.4113 lr:0.0003000000 time/step:116.07s
+ [2025-09-12 11:08:02,527] - step:461/900 train_loss:0.3895 lr:0.0003000000 time/step:116.58s
+ [2025-09-12 11:10:00,790] - step:462/900 train_loss:0.4033 lr:0.0003000000 time/step:118.26s
+ [2025-09-12 11:11:58,115] - step:463/900 train_loss:0.4269 lr:0.0003000000 time/step:117.32s
+ [2025-09-12 11:13:54,593] - step:464/900 train_loss:0.4080 lr:0.0003000000 time/step:116.46s
+ [2025-09-12 11:15:51,480] - step:465/900 train_loss:0.4208 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 11:17:48,283] - step:466/900 train_loss:0.4146 lr:0.0003000000 time/step:116.80s
+ [2025-09-12 11:19:44,666] - step:467/900 train_loss:0.4178 lr:0.0003000000 time/step:116.38s
+ [2025-09-12 11:21:43,091] - step:468/900 train_loss:0.4065 lr:0.0003000000 time/step:118.42s
+ [2025-09-12 11:23:40,099] - step:469/900 train_loss:0.4158 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 11:25:36,537] - step:470/900 train_loss:0.3969 lr:0.0003000000 time/step:116.43s
+ [2025-09-12 11:27:34,080] - step:471/900 train_loss:0.4355 lr:0.0003000000 time/step:117.54s
+ [2025-09-12 11:29:30,162] - step:472/900 train_loss:0.3901 lr:0.0003000000 time/step:116.08s
+ [2025-09-12 11:31:28,047] - step:473/900 train_loss:0.4142 lr:0.0003000000 time/step:117.88s
+ [2025-09-12 11:33:24,570] - step:474/900 train_loss:0.4396 lr:0.0003000000 time/step:116.51s
+ [2025-09-12 11:35:21,454] - step:475/900 train_loss:0.3944 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 11:37:18,778] - step:476/900 train_loss:0.4112 lr:0.0003000000 time/step:117.32s
+ [2025-09-12 11:39:15,275] - step:477/900 train_loss:0.4239 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 11:41:11,285] - step:478/900 train_loss:0.4200 lr:0.0003000000 time/step:116.01s
+ [2025-09-12 11:43:08,711] - step:479/900 train_loss:0.4177 lr:0.0003000000 time/step:117.41s
+ [2025-09-12 11:45:05,127] - step:480/900 train_loss:0.3939 lr:0.0003000000 time/step:116.41s
+ [2025-09-12 11:47:02,193] - step:481/900 train_loss:0.4138 lr:0.0003000000 time/step:117.06s
+ [2025-09-12 11:48:59,561] - step:482/900 train_loss:0.4252 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 11:50:55,554] - step:483/900 train_loss:0.4048 lr:0.0003000000 time/step:115.99s
+ [2025-09-12 11:52:52,805] - step:484/900 train_loss:0.4000 lr:0.0003000000 time/step:117.24s
+ [2025-09-12 11:54:49,667] - step:485/900 train_loss:0.4216 lr:0.0003000000 time/step:116.85s
+ [2025-09-12 11:56:46,072] - step:486/900 train_loss:0.4095 lr:0.0003000000 time/step:116.40s
+ [2025-09-12 11:58:43,074] - step:487/900 train_loss:0.4027 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 12:00:40,979] - step:488/900 train_loss:0.4245 lr:0.0003000000 time/step:117.90s
+ [2025-09-12 12:02:38,064] - step:489/900 train_loss:0.3942 lr:0.0003000000 time/step:117.08s
+ [2025-09-12 12:04:34,804] - step:490/900 train_loss:0.4239 lr:0.0003000000 time/step:116.72s
+ [2025-09-12 12:06:31,269] - step:491/900 train_loss:0.3853 lr:0.0003000000 time/step:116.46s
+ [2025-09-12 12:08:28,111] - step:492/900 train_loss:0.4141 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 12:10:24,954] - step:493/900 train_loss:0.4139 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 12:12:22,937] - step:494/900 train_loss:0.4166 lr:0.0003000000 time/step:117.98s
+ [2025-09-12 12:14:20,061] - step:495/900 train_loss:0.3974 lr:0.0003000000 time/step:117.11s
+ [2025-09-12 12:16:16,526] - step:496/900 train_loss:0.4149 lr:0.0003000000 time/step:116.46s
+ [2025-09-12 12:18:13,009] - step:497/900 train_loss:0.4181 lr:0.0003000000 time/step:116.48s
+ [2025-09-12 12:20:09,790] - step:498/900 train_loss:0.4166 lr:0.0003000000 time/step:116.78s
+ [2025-09-12 12:22:06,615] - step:499/900 train_loss:0.4216 lr:0.0003000000 time/step:116.82s
+ [2025-09-12 12:24:05,337] - step:500/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@500.pt...
+ [2025-09-12 12:24:05,343] - step:500/900 train_loss:0.4161 lr:0.0003000000 time/step:118.13s
+ [2025-09-12 12:26:01,342] - step:501/900 train_loss:0.4010 lr:0.0003000000 time/step:116.00s
+ [2025-09-12 12:27:58,345] - step:502/900 train_loss:0.4042 lr:0.0003000000 time/step:116.99s
+ [2025-09-12 12:29:54,389] - step:503/900 train_loss:0.4216 lr:0.0003000000 time/step:116.04s
+ [2025-09-12 12:31:51,252] - step:504/900 train_loss:0.4127 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 12:33:49,558] - step:505/900 train_loss:0.4019 lr:0.0003000000 time/step:118.29s
+ [2025-09-12 12:35:46,199] - step:506/900 train_loss:0.4076 lr:0.0003000000 time/step:116.64s
+ [2025-09-12 12:37:42,246] - step:507/900 train_loss:0.4207 lr:0.0003000000 time/step:116.04s
+ [2025-09-12 12:39:39,229] - step:508/900 train_loss:0.4258 lr:0.0003000000 time/step:116.98s
+ [2025-09-12 12:41:35,709] - step:509/900 train_loss:0.3826 lr:0.0003000000 time/step:116.48s
+ [2025-09-12 12:43:32,441] - step:510/900 train_loss:0.4092 lr:0.0003000000 time/step:116.72s
+ [2025-09-12 12:45:30,539] - step:511/900 train_loss:0.3954 lr:0.0003000000 time/step:118.09s
+ [2025-09-12 12:47:27,041] - step:512/900 train_loss:0.4335 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 12:49:23,522] - step:513/900 train_loss:0.4216 lr:0.0003000000 time/step:116.47s
+ [2025-09-12 12:51:20,467] - step:514/900 train_loss:0.3952 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 12:53:17,452] - step:515/900 train_loss:0.4052 lr:0.0003000000 time/step:116.98s
+ [2025-09-12 12:55:14,098] - step:516/900 train_loss:0.4145 lr:0.0003000000 time/step:116.64s
+ [2025-09-12 12:57:11,620] - step:517/900 train_loss:0.4292 lr:0.0003000000 time/step:117.51s
+ [2025-09-12 12:59:09,139] - step:518/900 train_loss:0.4204 lr:0.0003000000 time/step:117.51s
+ [2025-09-12 13:01:05,186] - step:519/900 train_loss:0.3932 lr:0.0003000000 time/step:116.04s
+ [2025-09-12 13:03:01,731] - step:520/900 train_loss:0.4226 lr:0.0003000000 time/step:116.54s
+ [2025-09-12 13:04:59,398] - step:521/900 train_loss:0.4080 lr:0.0003000000 time/step:117.65s
+ [2025-09-12 13:06:56,876] - step:522/900 train_loss:0.4079 lr:0.0003000000 time/step:117.47s
+ [2025-09-12 13:08:53,784] - step:523/900 train_loss:0.4375 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 13:11:18,031] - step:524/900 train_loss:0.3876 lr:0.0003000000 time/step:144.24s
+ [2025-09-12 13:13:14,894] - step:525/900 train_loss:0.4133 lr:0.0003000000 time/step:116.82s
+ [2025-09-12 13:15:16,203] - step:526/900 train_loss:0.3961 lr:0.0003000000 time/step:118.95s
+ [2025-09-12 13:17:12,922] - step:527/900 train_loss:0.3895 lr:0.0003000000 time/step:116.71s
+ [2025-09-12 13:19:09,906] - step:528/900 train_loss:0.4204 lr:0.0003000000 time/step:116.98s
+ [2025-09-12 13:21:08,032] - step:529/900 train_loss:0.4078 lr:0.0003000000 time/step:118.12s
+ [2025-09-12 13:23:04,450] - step:530/900 train_loss:0.3973 lr:0.0003000000 time/step:116.41s
+ [2025-09-12 13:25:02,156] - step:531/900 train_loss:0.3875 lr:0.0003000000 time/step:117.69s
+ [2025-09-12 13:26:58,851] - step:532/900 train_loss:0.3979 lr:0.0003000000 time/step:116.69s
+ [2025-09-12 13:28:55,552] - step:533/900 train_loss:0.4210 lr:0.0003000000 time/step:116.69s
+ [2025-09-12 13:30:52,352] - step:534/900 train_loss:0.4016 lr:0.0003000000 time/step:116.80s
+ [2025-09-12 13:32:50,584] - step:535/900 train_loss:0.3971 lr:0.0003000000 time/step:118.23s
+ [2025-09-12 13:34:47,330] - step:536/900 train_loss:0.4167 lr:0.0003000000 time/step:116.73s
+ [2025-09-12 13:36:44,747] - step:537/900 train_loss:0.4366 lr:0.0003000000 time/step:117.39s
+ [2025-09-12 13:38:42,456] - step:538/900 train_loss:0.4267 lr:0.0003000000 time/step:117.71s
+ [2025-09-12 13:40:38,661] - step:539/900 train_loss:0.4092 lr:0.0003000000 time/step:116.20s
+ [2025-09-12 13:42:38,305] - step:540/900 train_loss:0.4273 lr:0.0003000000 time/step:119.62s
+ [2025-09-12 13:44:37,524] - step:541/900 train_loss:0.4157 lr:0.0003000000 time/step:119.17s
+ [2025-09-12 13:46:33,425] - step:542/900 train_loss:0.4237 lr:0.0003000000 time/step:115.89s
+ [2025-09-12 13:48:30,101] - step:543/900 train_loss:0.4052 lr:0.0003000000 time/step:116.67s
+ [2025-09-12 13:50:27,196] - step:544/900 train_loss:0.4260 lr:0.0003000000 time/step:117.09s
+ [2025-09-12 13:52:24,079] - step:545/900 train_loss:0.4021 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 13:54:21,661] - step:546/900 train_loss:0.3897 lr:0.0003000000 time/step:117.57s
+ [2025-09-12 13:56:19,479] - step:547/900 train_loss:0.4029 lr:0.0003000000 time/step:117.81s
+ [2025-09-12 13:58:15,488] - step:548/900 train_loss:0.4107 lr:0.0003000000 time/step:116.00s
+ [2025-09-12 14:00:11,893] - step:549/900 train_loss:0.4159 lr:0.0003000000 time/step:116.40s
+ [2025-09-12 14:02:08,916] - step:550/900 train_loss:0.4075 lr:0.0003000000 time/step:117.01s
+ [2025-09-12 14:04:06,359] - step:551/900 train_loss:0.3932 lr:0.0003000000 time/step:117.43s
+ [2025-09-12 14:06:02,862] - step:552/900 train_loss:0.4110 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 14:08:00,226] - step:553/900 train_loss:0.4250 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 14:09:56,780] - step:554/900 train_loss:0.3990 lr:0.0003000000 time/step:116.54s
+ [2025-09-12 14:11:53,353] - step:555/900 train_loss:0.4041 lr:0.0003000000 time/step:116.56s
+ [2025-09-12 14:13:50,235] - step:556/900 train_loss:0.4062 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 14:15:47,160] - step:557/900 train_loss:0.4144 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 14:17:44,967] - step:558/900 train_loss:0.4032 lr:0.0003000000 time/step:117.80s
+ [2025-09-12 14:19:40,685] - step:559/900 train_loss:0.4082 lr:0.0003000000 time/step:115.71s
+ [2025-09-12 14:21:37,889] - step:560/900 train_loss:0.4140 lr:0.0003000000 time/step:117.20s
+ [2025-09-12 14:23:34,834] - step:561/900 train_loss:0.4284 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 14:25:31,517] - step:562/900 train_loss:0.4096 lr:0.0003000000 time/step:116.67s
+ [2025-09-12 14:27:29,793] - step:563/900 train_loss:0.4017 lr:0.0003000000 time/step:118.26s
+ [2025-09-12 14:29:26,683] - step:564/900 train_loss:0.4014 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 14:31:22,468] - step:565/900 train_loss:0.4061 lr:0.0003000000 time/step:115.78s
+ [2025-09-12 14:33:19,190] - step:566/900 train_loss:0.4188 lr:0.0003000000 time/step:116.72s
+ [2025-09-12 14:35:16,130] - step:567/900 train_loss:0.4305 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 14:37:13,373] - step:568/900 train_loss:0.3922 lr:0.0003000000 time/step:117.24s
+ [2025-09-12 14:39:10,305] - step:569/900 train_loss:0.4190 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 14:41:07,121] - step:570/900 train_loss:0.4047 lr:0.0003000000 time/step:116.81s
+ [2025-09-12 14:43:03,948] - step:571/900 train_loss:0.4152 lr:0.0003000000 time/step:116.82s
+ [2025-09-12 14:45:00,151] - step:572/900 train_loss:0.3946 lr:0.0003000000 time/step:116.19s
+ [2025-09-12 14:46:57,634] - step:573/900 train_loss:0.4138 lr:0.0003000000 time/step:117.48s
+ [2025-09-12 14:48:55,022] - step:574/900 train_loss:0.4231 lr:0.0003000000 time/step:117.37s
+ [2025-09-12 14:50:50,877] - step:575/900 train_loss:0.3978 lr:0.0003000000 time/step:115.85s
+ [2025-09-12 14:52:49,128] - step:576/900 train_loss:0.4169 lr:0.0003000000 time/step:118.25s
+ [2025-09-12 14:54:45,289] - step:577/900 train_loss:0.3971 lr:0.0003000000 time/step:116.15s
+ [2025-09-12 14:56:41,851] - step:578/900 train_loss:0.4058 lr:0.0003000000 time/step:116.56s
+ [2025-09-12 14:58:38,779] - step:579/900 train_loss:0.4105 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 15:00:35,657] - step:580/900 train_loss:0.4145 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 15:02:33,021] - step:581/900 train_loss:0.4067 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 15:04:29,564] - step:582/900 train_loss:0.4209 lr:0.0003000000 time/step:116.53s
+ [2025-09-12 15:06:26,089] - step:583/900 train_loss:0.4106 lr:0.0003000000 time/step:116.52s
+ [2025-09-12 15:08:22,953] - step:584/900 train_loss:0.4220 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 15:10:19,376] - step:585/900 train_loss:0.4001 lr:0.0003000000 time/step:116.41s
+ [2025-09-12 15:12:16,440] - step:586/900 train_loss:0.3963 lr:0.0003000000 time/step:117.06s
+ [2025-09-12 15:14:14,343] - step:587/900 train_loss:0.4118 lr:0.0003000000 time/step:117.89s
+ [2025-09-12 15:16:10,568] - step:588/900 train_loss:0.4285 lr:0.0003000000 time/step:116.22s
+ [2025-09-12 15:18:06,609] - step:589/900 train_loss:0.4177 lr:0.0003000000 time/step:116.04s
+ [2025-09-12 15:20:03,934] - step:590/900 train_loss:0.4256 lr:0.0003000000 time/step:117.32s
+ [2025-09-12 15:22:00,505] - step:591/900 train_loss:0.4258 lr:0.0003000000 time/step:116.57s
+ [2025-09-12 15:23:57,739] - step:592/900 train_loss:0.4031 lr:0.0003000000 time/step:117.19s
+ [2025-09-12 15:25:55,502] - step:593/900 train_loss:0.3975 lr:0.0003000000 time/step:117.76s
+ [2025-09-12 15:27:51,604] - step:594/900 train_loss:0.4098 lr:0.0003000000 time/step:116.10s
+ [2025-09-12 15:29:48,152] - step:595/900 train_loss:0.4044 lr:0.0003000000 time/step:116.54s
+ [2025-09-12 15:31:45,056] - step:596/900 train_loss:0.4394 lr:0.0003000000 time/step:116.89s
+ [2025-09-12 15:33:42,598] - step:597/900 train_loss:0.4166 lr:0.0003000000 time/step:117.54s
+ [2025-09-12 15:35:38,903] - step:598/900 train_loss:0.3857 lr:0.0003000000 time/step:116.29s
+ [2025-09-12 15:37:35,947] - step:599/900 train_loss:0.3944 lr:0.0003000000 time/step:117.04s
+ [2025-09-12 15:39:32,999] - step:600/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@600.pt...
+ [2025-09-12 15:39:33,008] - step:600/900 train_loss:0.4121 lr:0.0003000000 time/step:116.45s
+ [2025-09-12 15:41:29,871] - step:601/900 train_loss:0.4055 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 15:43:27,496] - step:602/900 train_loss:0.4131 lr:0.0003000000 time/step:117.62s
+ [2025-09-12 15:45:24,775] - step:603/900 train_loss:0.4117 lr:0.0003000000 time/step:117.27s
+ [2025-09-12 15:47:21,843] - step:604/900 train_loss:0.4073 lr:0.0003000000 time/step:117.06s
+ [2025-09-12 15:49:19,207] - step:605/900 train_loss:0.3994 lr:0.0003000000 time/step:117.35s
+ [2025-09-12 15:51:15,705] - step:606/900 train_loss:0.4006 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 15:53:12,651] - step:607/900 train_loss:0.4087 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 15:55:10,370] - step:608/900 train_loss:0.4194 lr:0.0003000000 time/step:117.71s
+ [2025-09-12 15:57:07,183] - step:609/900 train_loss:0.4059 lr:0.0003000000 time/step:116.80s
+ [2025-09-12 15:59:03,945] - step:610/900 train_loss:0.3960 lr:0.0003000000 time/step:116.75s
+ [2025-09-12 16:01:01,845] - step:611/900 train_loss:0.4203 lr:0.0003000000 time/step:117.89s
+ [2025-09-12 16:02:57,870] - step:612/900 train_loss:0.4208 lr:0.0003000000 time/step:116.02s
+ [2025-09-12 16:04:55,209] - step:613/900 train_loss:0.4205 lr:0.0003000000 time/step:117.33s
+ [2025-09-12 16:06:52,213] - step:614/900 train_loss:0.4023 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 16:08:49,316] - step:615/900 train_loss:0.4011 lr:0.0003000000 time/step:117.09s
+ [2025-09-12 16:10:47,341] - step:616/900 train_loss:0.3898 lr:0.0003000000 time/step:118.02s
+ [2025-09-12 16:12:44,306] - step:617/900 train_loss:0.4223 lr:0.0003000000 time/step:116.96s
+ [2025-09-12 16:14:41,166] - step:618/900 train_loss:0.4022 lr:0.0003000000 time/step:116.85s
+ [2025-09-12 16:16:38,068] - step:619/900 train_loss:0.4259 lr:0.0003000000 time/step:116.89s
+ [2025-09-12 16:18:35,272] - step:620/900 train_loss:0.4129 lr:0.0003000000 time/step:117.20s
+ [2025-09-12 16:20:32,436] - step:621/900 train_loss:0.4122 lr:0.0003000000 time/step:117.13s
+ [2025-09-12 16:22:30,553] - step:622/900 train_loss:0.4185 lr:0.0003000000 time/step:118.10s
+ [2025-09-12 16:24:27,881] - step:623/900 train_loss:0.3991 lr:0.0003000000 time/step:117.28s
+ [2025-09-12 16:26:24,425] - step:624/900 train_loss:0.4208 lr:0.0003000000 time/step:116.53s
+ [2025-09-12 16:28:21,471] - step:625/900 train_loss:0.4276 lr:0.0003000000 time/step:117.04s
+ [2025-09-12 16:30:19,129] - step:626/900 train_loss:0.4259 lr:0.0003000000 time/step:117.64s
+ [2025-09-12 16:32:19,616] - step:627/900 train_loss:0.3848 lr:0.0003000000 time/step:120.47s
+ [2025-09-12 16:34:17,638] - step:628/900 train_loss:0.4005 lr:0.0003000000 time/step:118.02s
+ [2025-09-12 16:36:14,359] - step:629/900 train_loss:0.3988 lr:0.0003000000 time/step:116.71s
+ [2025-09-12 16:38:11,222] - step:630/900 train_loss:0.4181 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 16:40:08,509] - step:631/900 train_loss:0.4042 lr:0.0003000000 time/step:117.28s
+ [2025-09-12 16:42:06,712] - step:632/900 train_loss:0.4010 lr:0.0003000000 time/step:118.19s
+ [2025-09-12 16:44:03,814] - step:633/900 train_loss:0.4108 lr:0.0003000000 time/step:117.10s
+ [2025-09-12 16:46:01,576] - step:634/900 train_loss:0.4218 lr:0.0003000000 time/step:117.65s
+ [2025-09-12 16:47:57,601] - step:635/900 train_loss:0.4339 lr:0.0003000000 time/step:116.02s
+ [2025-09-12 16:49:54,473] - step:636/900 train_loss:0.4252 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 16:51:52,707] - step:637/900 train_loss:0.3961 lr:0.0003000000 time/step:118.19s
+ [2025-09-12 16:53:50,406] - step:638/900 train_loss:0.4049 lr:0.0003000000 time/step:117.69s
+ [2025-09-12 16:55:48,233] - step:639/900 train_loss:0.4217 lr:0.0003000000 time/step:117.81s
+ [2025-09-12 16:57:44,596] - step:640/900 train_loss:0.4046 lr:0.0003000000 time/step:116.35s
+ [2025-09-12 16:59:40,200] - step:641/900 train_loss:0.4136 lr:0.0003000000 time/step:115.60s
+ [2025-09-12 17:01:37,286] - step:642/900 train_loss:0.4027 lr:0.0003000000 time/step:117.08s
+ [2025-09-12 17:03:35,226] - step:643/900 train_loss:0.3820 lr:0.0003000000 time/step:117.93s
+ [2025-09-12 17:05:33,570] - step:644/900 train_loss:0.4089 lr:0.0003000000 time/step:118.33s
+ [2025-09-12 17:07:30,395] - step:645/900 train_loss:0.3874 lr:0.0003000000 time/step:116.82s
+ [2025-09-12 17:09:27,297] - step:646/900 train_loss:0.4146 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 17:11:23,362] - step:647/900 train_loss:0.3988 lr:0.0003000000 time/step:116.06s
+ [2025-09-12 17:13:20,787] - step:648/900 train_loss:0.4128 lr:0.0003000000 time/step:117.42s
+ [2025-09-12 17:15:18,588] - step:649/900 train_loss:0.4332 lr:0.0003000000 time/step:117.79s
+ [2025-09-12 17:17:16,062] - step:650/900 train_loss:0.4214 lr:0.0003000000 time/step:117.47s
+ [2025-09-12 17:19:12,730] - step:651/900 train_loss:0.4074 lr:0.0003000000 time/step:116.66s
+ [2025-09-12 17:21:09,550] - step:652/900 train_loss:0.4025 lr:0.0003000000 time/step:116.81s
+ [2025-09-12 17:23:05,702] - step:653/900 train_loss:0.4008 lr:0.0003000000 time/step:116.15s
+ [2025-09-12 17:25:03,925] - step:654/900 train_loss:0.4060 lr:0.0003000000 time/step:118.18s
+ [2025-09-12 17:27:02,401] - step:655/900 train_loss:0.3931 lr:0.0003000000 time/step:118.47s
+ [2025-09-12 17:28:59,392] - step:656/900 train_loss:0.3985 lr:0.0003000000 time/step:116.97s
+ [2025-09-12 17:30:56,335] - step:657/900 train_loss:0.4319 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 17:32:52,897] - step:658/900 train_loss:0.4200 lr:0.0003000000 time/step:116.56s
+ [2025-09-12 17:34:50,643] - step:659/900 train_loss:0.3811 lr:0.0003000000 time/step:117.73s
+ [2025-09-12 17:36:47,661] - step:660/900 train_loss:0.3960 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 17:38:45,367] - step:661/900 train_loss:0.3810 lr:0.0003000000 time/step:117.70s
+ [2025-09-12 17:40:42,471] - step:662/900 train_loss:0.3948 lr:0.0003000000 time/step:117.10s
+ [2025-09-12 17:42:39,354] - step:663/900 train_loss:0.4221 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 17:44:37,177] - step:664/900 train_loss:0.4021 lr:0.0003000000 time/step:117.82s
+ [2025-09-12 17:46:33,621] - step:665/900 train_loss:0.4521 lr:0.0003000000 time/step:116.43s
+ [2025-09-12 17:48:31,225] - step:666/900 train_loss:0.4265 lr:0.0003000000 time/step:117.60s
+ [2025-09-12 17:50:28,126] - step:667/900 train_loss:0.4109 lr:0.0003000000 time/step:116.89s
+ [2025-09-12 17:52:25,032] - step:668/900 train_loss:0.4247 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 17:54:22,433] - step:669/900 train_loss:0.4024 lr:0.0003000000 time/step:117.40s
+ [2025-09-12 17:56:19,263] - step:670/900 train_loss:0.4238 lr:0.0003000000 time/step:116.81s
+ [2025-09-12 17:58:15,840] - step:671/900 train_loss:0.4240 lr:0.0003000000 time/step:116.57s
+ [2025-09-12 18:00:13,196] - step:672/900 train_loss:0.4079 lr:0.0003000000 time/step:117.35s
+ [2025-09-12 18:02:09,946] - step:673/900 train_loss:0.4152 lr:0.0003000000 time/step:116.74s
+ [2025-09-12 18:04:08,272] - step:674/900 train_loss:0.4386 lr:0.0003000000 time/step:118.32s
+ [2025-09-12 18:06:05,695] - step:675/900 train_loss:0.3944 lr:0.0003000000 time/step:117.41s
+ [2025-09-12 18:08:01,761] - step:676/900 train_loss:0.3997 lr:0.0003000000 time/step:116.05s
+ [2025-09-12 18:09:59,340] - step:677/900 train_loss:0.4081 lr:0.0003000000 time/step:117.57s
+ [2025-09-12 18:11:56,223] - step:678/900 train_loss:0.4326 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 18:13:53,528] - step:679/900 train_loss:0.4058 lr:0.0003000000 time/step:117.30s
+ [2025-09-12 18:15:51,604] - step:680/900 train_loss:0.4257 lr:0.0003000000 time/step:118.06s
+ [2025-09-12 18:17:48,495] - step:681/900 train_loss:0.4226 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 18:19:44,618] - step:682/900 train_loss:0.3978 lr:0.0003000000 time/step:116.12s
+ [2025-09-12 18:21:41,760] - step:683/900 train_loss:0.4064 lr:0.0003000000 time/step:117.14s
+ [2025-09-12 18:23:38,665] - step:684/900 train_loss:0.3959 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 18:25:36,029] - step:685/900 train_loss:0.4136 lr:0.0003000000 time/step:117.35s
+ [2025-09-12 18:27:33,774] - step:686/900 train_loss:0.4058 lr:0.0003000000 time/step:117.62s
+ [2025-09-12 18:29:30,658] - step:687/900 train_loss:0.4132 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 18:31:27,420] - step:688/900 train_loss:0.4048 lr:0.0003000000 time/step:116.76s
+ [2025-09-12 18:33:24,361] - step:689/900 train_loss:0.4023 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 18:35:21,754] - step:690/900 train_loss:0.3715 lr:0.0003000000 time/step:117.38s
+ [2025-09-12 18:37:19,552] - step:691/900 train_loss:0.4017 lr:0.0003000000 time/step:117.78s
+ [2025-09-12 18:39:16,412] - step:692/900 train_loss:0.4232 lr:0.0003000000 time/step:116.85s
+ [2025-09-12 18:41:13,974] - step:693/900 train_loss:0.4196 lr:0.0003000000 time/step:117.55s
+ [2025-09-12 18:43:10,197] - step:694/900 train_loss:0.4010 lr:0.0003000000 time/step:116.22s
+ [2025-09-12 18:45:07,263] - step:695/900 train_loss:0.3904 lr:0.0003000000 time/step:117.06s
+ [2025-09-12 18:47:05,813] - step:696/900 train_loss:0.4152 lr:0.0003000000 time/step:118.53s
+ [2025-09-12 18:49:02,863] - step:697/900 train_loss:0.4064 lr:0.0003000000 time/step:117.04s
+ [2025-09-12 18:50:59,812] - step:698/900 train_loss:0.3980 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 18:52:57,370] - step:699/900 train_loss:0.3884 lr:0.0003000000 time/step:117.55s
+ [2025-09-12 18:54:54,648] - step:700/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@700.pt...
+ [2025-09-12 18:54:54,648] - step:700/900 train_loss:0.3973 lr:0.0003000000 time/step:116.56s
+ [2025-09-12 18:56:52,677] - step:701/900 train_loss:0.4030 lr:0.0003000000 time/step:118.01s
+ [2025-09-12 18:58:49,207] - step:702/900 train_loss:0.3937 lr:0.0003000000 time/step:116.52s
+ [2025-09-12 19:00:46,170] - step:703/900 train_loss:0.4356 lr:0.0003000000 time/step:116.96s
+ [2025-09-12 19:02:43,288] - step:704/900 train_loss:0.4294 lr:0.0003000000 time/step:117.11s
+ [2025-09-12 19:04:40,157] - step:705/900 train_loss:0.4150 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 19:06:38,620] - step:706/900 train_loss:0.4153 lr:0.0003000000 time/step:118.45s
+ [2025-09-12 19:08:35,564] - step:707/900 train_loss:0.3966 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 19:10:32,548] - step:708/900 train_loss:0.4221 lr:0.0003000000 time/step:116.98s
+ [2025-09-12 19:12:29,132] - step:709/900 train_loss:0.3952 lr:0.0003000000 time/step:116.58s
+ [2025-09-12 19:14:26,936] - step:710/900 train_loss:0.3849 lr:0.0003000000 time/step:117.80s
+ [2025-09-12 19:16:24,310] - step:711/900 train_loss:0.4114 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 19:18:20,956] - step:712/900 train_loss:0.4173 lr:0.0003000000 time/step:116.64s
+ [2025-09-12 19:20:18,010] - step:713/900 train_loss:0.3898 lr:0.0003000000 time/step:117.05s
+ [2025-09-12 19:22:14,781] - step:714/900 train_loss:0.4088 lr:0.0003000000 time/step:116.76s
+ [2025-09-12 19:24:11,349] - step:715/900 train_loss:0.3975 lr:0.0003000000 time/step:116.56s
+ [2025-09-12 19:26:09,929] - step:716/900 train_loss:0.4089 lr:0.0003000000 time/step:118.57s
+ [2025-09-12 19:28:06,399] - step:717/900 train_loss:0.3964 lr:0.0003000000 time/step:116.46s
+ [2025-09-12 19:30:02,907] - step:718/900 train_loss:0.4063 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 19:31:59,817] - step:719/900 train_loss:0.3934 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 19:33:56,770] - step:720/900 train_loss:0.3953 lr:0.0003000000 time/step:116.95s
+ [2025-09-12 19:35:54,685] - step:721/900 train_loss:0.4275 lr:0.0003000000 time/step:117.90s
+ [2025-09-12 19:37:52,051] - step:722/900 train_loss:0.4074 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 19:39:48,232] - step:723/900 train_loss:0.4163 lr:0.0003000000 time/step:116.18s
+ [2025-09-12 19:41:45,737] - step:724/900 train_loss:0.4015 lr:0.0003000000 time/step:117.50s
+ [2025-09-12 19:43:42,903] - step:725/900 train_loss:0.4202 lr:0.0003000000 time/step:117.16s
+ [2025-09-12 19:45:40,142] - step:726/900 train_loss:0.4291 lr:0.0003000000 time/step:117.23s
+ [2025-09-12 19:47:38,767] - step:727/900 train_loss:0.4219 lr:0.0003000000 time/step:118.52s
+ [2025-09-12 19:49:35,311] - step:728/900 train_loss:0.4267 lr:0.0003000000 time/step:116.54s
+ [2025-09-12 19:51:31,352] - step:729/900 train_loss:0.4008 lr:0.0003000000 time/step:116.03s
+ [2025-09-12 19:53:29,152] - step:730/900 train_loss:0.4191 lr:0.0003000000 time/step:117.79s
+ [2025-09-12 19:55:25,960] - step:731/900 train_loss:0.4093 lr:0.0003000000 time/step:116.80s
+ [2025-09-12 19:57:23,584] - step:732/900 train_loss:0.4230 lr:0.0003000000 time/step:117.61s
+ [2025-09-12 19:59:20,804] - step:733/900 train_loss:0.4213 lr:0.0003000000 time/step:117.21s
+ [2025-09-12 20:01:17,340] - step:734/900 train_loss:0.4071 lr:0.0003000000 time/step:116.53s
+ [2025-09-12 20:03:14,230] - step:735/900 train_loss:0.3944 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 20:05:11,216] - step:736/900 train_loss:0.3971 lr:0.0003000000 time/step:116.98s
+ [2025-09-12 20:07:08,901] - step:737/900 train_loss:0.4144 lr:0.0003000000 time/step:117.67s
+ [2025-09-12 20:09:06,344] - step:738/900 train_loss:0.4349 lr:0.0003000000 time/step:117.44s
+ [2025-09-12 20:11:03,286] - step:739/900 train_loss:0.3967 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 20:12:59,810] - step:740/900 train_loss:0.4104 lr:0.0003000000 time/step:116.52s
+ [2025-09-12 20:14:56,833] - step:741/900 train_loss:0.4195 lr:0.0003000000 time/step:117.01s
+ [2025-09-12 20:16:53,836] - step:742/900 train_loss:0.4083 lr:0.0003000000 time/step:116.99s
+ [2025-09-12 20:18:51,981] - step:743/900 train_loss:0.4021 lr:0.0003000000 time/step:118.14s
+ [2025-09-12 20:20:48,901] - step:744/900 train_loss:0.4182 lr:0.0003000000 time/step:116.91s
+ [2025-09-12 20:22:46,747] - step:745/900 train_loss:0.3946 lr:0.0003000000 time/step:117.84s
+ [2025-09-12 20:24:42,792] - step:746/900 train_loss:0.3826 lr:0.0003000000 time/step:116.03s
+ [2025-09-12 20:26:39,772] - step:747/900 train_loss:0.4267 lr:0.0003000000 time/step:116.97s
+ [2025-09-12 20:28:37,994] - step:748/900 train_loss:0.3935 lr:0.0003000000 time/step:118.21s
+ [2025-09-12 20:30:34,939] - step:749/900 train_loss:0.3979 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 20:32:32,443] - step:750/900 train_loss:0.4253 lr:0.0003000000 time/step:117.50s
+ [2025-09-12 20:34:29,466] - step:751/900 train_loss:0.4006 lr:0.0003000000 time/step:117.01s
+ [2025-09-12 20:36:25,608] - step:752/900 train_loss:0.4219 lr:0.0003000000 time/step:116.13s
+ [2025-09-12 20:38:22,452] - step:753/900 train_loss:0.3919 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 20:40:21,076] - step:754/900 train_loss:0.4138 lr:0.0003000000 time/step:118.62s
+ [2025-09-12 20:42:18,879] - step:755/900 train_loss:0.4144 lr:0.0003000000 time/step:117.79s
+ [2025-09-12 20:44:15,840] - step:756/900 train_loss:0.4077 lr:0.0003000000 time/step:116.95s
+ [2025-09-12 20:46:12,684] - step:757/900 train_loss:0.4420 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 20:48:08,449] - step:758/900 train_loss:0.4310 lr:0.0003000000 time/step:115.75s
+ [2025-09-12 20:50:06,514] - step:759/900 train_loss:0.4193 lr:0.0003000000 time/step:118.06s
+ [2025-09-12 20:52:03,393] - step:760/900 train_loss:0.4097 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 20:54:01,558] - step:761/900 train_loss:0.4206 lr:0.0003000000 time/step:118.16s
+ [2025-09-12 20:55:58,603] - step:762/900 train_loss:0.4123 lr:0.0003000000 time/step:117.04s
+ [2025-09-12 20:57:55,067] - step:763/900 train_loss:0.3960 lr:0.0003000000 time/step:116.45s
+ [2025-09-12 20:59:51,936] - step:764/900 train_loss:0.4299 lr:0.0003000000 time/step:116.85s
+ [2025-09-12 21:01:50,033] - step:765/900 train_loss:0.4122 lr:0.0003000000 time/step:118.09s
+ [2025-09-12 21:03:47,856] - step:766/900 train_loss:0.3942 lr:0.0003000000 time/step:117.82s
+ [2025-09-12 21:05:44,878] - step:767/900 train_loss:0.3948 lr:0.0003000000 time/step:117.01s
+ [2025-09-12 21:07:41,799] - step:768/900 train_loss:0.3943 lr:0.0003000000 time/step:116.91s
+ [2025-09-12 21:09:38,205] - step:769/900 train_loss:0.4122 lr:0.0003000000 time/step:116.40s
+ [2025-09-12 21:11:35,911] - step:770/900 train_loss:0.4029 lr:0.0003000000 time/step:117.70s
+ [2025-09-12 21:13:33,673] - step:771/900 train_loss:0.3994 lr:0.0003000000 time/step:117.75s
+ [2025-09-12 21:15:30,614] - step:772/900 train_loss:0.4263 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 21:17:27,398] - step:773/900 train_loss:0.4199 lr:0.0003000000 time/step:116.77s
+ [2025-09-12 21:19:24,243] - step:774/900 train_loss:0.4126 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 21:21:21,644] - step:775/900 train_loss:0.3885 lr:0.0003000000 time/step:117.39s
+ [2025-09-12 21:23:18,489] - step:776/900 train_loss:0.4123 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 21:25:16,373] - step:777/900 train_loss:0.3887 lr:0.0003000000 time/step:117.88s
+ [2025-09-12 21:27:13,296] - step:778/900 train_loss:0.4256 lr:0.0003000000 time/step:116.91s
+ [2025-09-12 21:29:10,200] - step:779/900 train_loss:0.4090 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 21:31:07,409] - step:780/900 train_loss:0.3895 lr:0.0003000000 time/step:117.20s
+ [2025-09-12 21:33:04,490] - step:781/900 train_loss:0.4134 lr:0.0003000000 time/step:117.07s
+ [2025-09-12 21:35:01,686] - step:782/900 train_loss:0.4317 lr:0.0003000000 time/step:117.19s
+ [2025-09-12 21:36:58,773] - step:783/900 train_loss:0.4093 lr:0.0003000000 time/step:117.07s
+ [2025-09-12 21:38:55,697] - step:784/900 train_loss:0.4052 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 21:40:52,704] - step:785/900 train_loss:0.4158 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 21:42:51,059] - step:786/900 train_loss:0.3933 lr:0.0003000000 time/step:118.35s
+ [2025-09-12 21:44:47,908] - step:787/900 train_loss:0.4167 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 21:46:44,911] - step:788/900 train_loss:0.3970 lr:0.0003000000 time/step:116.99s
+ [2025-09-12 21:48:41,985] - step:789/900 train_loss:0.3789 lr:0.0003000000 time/step:117.06s
+ [2025-09-12 21:50:38,911] - step:790/900 train_loss:0.4033 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 21:52:36,518] - step:791/900 train_loss:0.3703 lr:0.0003000000 time/step:117.60s
+ [2025-09-12 21:54:35,924] - step:792/900 train_loss:0.3987 lr:0.0003000000 time/step:119.40s
+ [2025-09-12 21:56:32,089] - step:793/900 train_loss:0.4103 lr:0.0003000000 time/step:116.16s
+ [2025-09-12 21:58:29,152] - step:794/900 train_loss:0.4121 lr:0.0003000000 time/step:117.05s
+ [2025-09-12 22:00:26,076] - step:795/900 train_loss:0.3756 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 22:02:23,114] - step:796/900 train_loss:0.4195 lr:0.0003000000 time/step:117.03s
+ [2025-09-12 22:04:21,556] - step:797/900 train_loss:0.3852 lr:0.0003000000 time/step:118.43s
+ [2025-09-12 22:06:19,445] - step:798/900 train_loss:0.4343 lr:0.0003000000 time/step:117.88s
+ [2025-09-12 22:08:15,683] - step:799/900 train_loss:0.4024 lr:0.0003000000 time/step:116.22s
+ [2025-09-12 22:10:13,431] - step:800/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@800.pt...
+ [2025-09-12 22:10:13,432] - step:800/900 train_loss:0.4081 lr:0.0003000000 time/step:117.14s
+ [2025-09-12 22:12:09,931] - step:801/900 train_loss:0.4091 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 22:14:08,180] - step:802/900 train_loss:0.4188 lr:0.0003000000 time/step:118.24s
+ [2025-09-12 22:16:05,937] - step:803/900 train_loss:0.4227 lr:0.0003000000 time/step:117.74s
+ [2025-09-12 22:18:02,679] - step:804/900 train_loss:0.3994 lr:0.0003000000 time/step:116.73s
+ [2025-09-12 22:19:59,511] - step:805/900 train_loss:0.3885 lr:0.0003000000 time/step:116.82s
+ [2025-09-12 22:21:56,602] - step:806/900 train_loss:0.3937 lr:0.0003000000 time/step:117.08s
+ [2025-09-12 22:23:54,831] - step:807/900 train_loss:0.4143 lr:0.0003000000 time/step:118.22s
+ [2025-09-12 22:25:52,068] - step:808/900 train_loss:0.4324 lr:0.0003000000 time/step:117.23s
+ [2025-09-12 22:27:49,499] - step:809/900 train_loss:0.3988 lr:0.0003000000 time/step:117.42s
+ [2025-09-12 22:29:45,897] - step:810/900 train_loss:0.4016 lr:0.0003000000 time/step:116.39s
+ [2025-09-12 22:31:42,993] - step:811/900 train_loss:0.4106 lr:0.0003000000 time/step:117.08s
+ [2025-09-12 22:33:41,172] - step:812/900 train_loss:0.4097 lr:0.0003000000 time/step:118.17s
+ [2025-09-12 22:35:38,349] - step:813/900 train_loss:0.3838 lr:0.0003000000 time/step:117.17s
+ [2025-09-12 22:37:36,103] - step:814/900 train_loss:0.3802 lr:0.0003000000 time/step:117.74s
+ [2025-09-12 22:39:33,507] - step:815/900 train_loss:0.4195 lr:0.0003000000 time/step:117.40s
+ [2025-09-12 22:41:29,750] - step:816/900 train_loss:0.4333 lr:0.0003000000 time/step:116.23s
+ [2025-09-12 22:43:26,622] - step:817/900 train_loss:0.4108 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 22:45:25,127] - step:818/900 train_loss:0.3866 lr:0.0003000000 time/step:118.49s
+ [2025-09-12 22:47:22,168] - step:819/900 train_loss:0.4197 lr:0.0003000000 time/step:117.03s
+ [2025-09-12 22:49:19,672] - step:820/900 train_loss:0.3791 lr:0.0003000000 time/step:117.50s
+ [2025-09-12 22:51:17,438] - step:821/900 train_loss:0.4053 lr:0.0003000000 time/step:117.76s
+ [2025-09-12 22:53:13,613] - step:822/900 train_loss:0.4096 lr:0.0003000000 time/step:116.16s
+ [2025-09-12 22:55:11,085] - step:823/900 train_loss:0.4086 lr:0.0003000000 time/step:117.46s
+ [2025-09-12 22:57:08,006] - step:824/900 train_loss:0.4028 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 22:59:05,729] - step:825/900 train_loss:0.3960 lr:0.0003000000 time/step:117.72s
+ [2025-09-12 23:01:03,331] - step:826/900 train_loss:0.4060 lr:0.0003000000 time/step:117.59s
+ [2025-09-12 23:03:00,051] - step:827/900 train_loss:0.4147 lr:0.0003000000 time/step:116.71s
+ [2025-09-12 23:04:56,347] - step:828/900 train_loss:0.4173 lr:0.0003000000 time/step:116.28s
+ [2025-09-12 23:06:53,382] - step:829/900 train_loss:0.4136 lr:0.0003000000 time/step:117.02s
+ [2025-09-12 23:08:50,925] - step:830/900 train_loss:0.4135 lr:0.0003000000 time/step:117.53s
+ [2025-09-12 23:10:48,709] - step:831/900 train_loss:0.3960 lr:0.0003000000 time/step:117.78s
+ [2025-09-12 23:12:45,852] - step:832/900 train_loss:0.3999 lr:0.0003000000 time/step:117.13s
+ [2025-09-12 23:14:43,195] - step:833/900 train_loss:0.4046 lr:0.0003000000 time/step:117.33s
+ [2025-09-12 23:16:39,299] - step:834/900 train_loss:0.4188 lr:0.0003000000 time/step:116.10s
+ [2025-09-12 23:18:36,142] - step:835/900 train_loss:0.3957 lr:0.0003000000 time/step:116.83s
+ [2025-09-12 23:20:34,486] - step:836/900 train_loss:0.4188 lr:0.0003000000 time/step:118.34s
+ [2025-09-12 23:22:31,489] - step:837/900 train_loss:0.3849 lr:0.0003000000 time/step:116.99s
+ [2025-09-12 23:24:28,392] - step:838/900 train_loss:0.4255 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 23:26:24,998] - step:839/900 train_loss:0.4019 lr:0.0003000000 time/step:116.59s
+ [2025-09-12 23:28:21,798] - step:840/900 train_loss:0.4149 lr:0.0003000000 time/step:116.78s
+ [2025-09-12 23:30:20,342] - step:841/900 train_loss:0.3937 lr:0.0003000000 time/step:118.54s
+ [2025-09-12 23:32:17,286] - step:842/900 train_loss:0.3996 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 23:34:14,169] - step:843/900 train_loss:0.3911 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 23:36:11,513] - step:844/900 train_loss:0.4199 lr:0.0003000000 time/step:117.34s
+ [2025-09-12 23:38:07,515] - step:845/900 train_loss:0.3990 lr:0.0003000000 time/step:115.99s
+ [2025-09-12 23:40:05,382] - step:846/900 train_loss:0.4059 lr:0.0003000000 time/step:117.86s
+ [2025-09-12 23:42:03,341] - step:847/900 train_loss:0.4217 lr:0.0003000000 time/step:117.95s
+ [2025-09-12 23:44:00,267] - step:848/900 train_loss:0.4059 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 23:45:57,550] - step:849/900 train_loss:0.4140 lr:0.0003000000 time/step:117.28s
+ [2025-09-12 23:47:54,492] - step:850/900 train_loss:0.3920 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 23:49:50,997] - step:851/900 train_loss:0.4194 lr:0.0003000000 time/step:116.50s
+ [2025-09-12 23:51:48,718] - step:852/900 train_loss:0.3914 lr:0.0003000000 time/step:117.71s
+ [2025-09-12 23:53:45,683] - step:853/900 train_loss:0.4012 lr:0.0003000000 time/step:116.96s
+ [2025-09-12 23:55:43,182] - step:854/900 train_loss:0.4198 lr:0.0003000000 time/step:117.47s
+ [2025-09-12 23:57:40,227] - step:855/900 train_loss:0.4059 lr:0.0003000000 time/step:117.03s
+ [2025-09-12 23:59:37,792] - step:856/900 train_loss:0.4026 lr:0.0003000000 time/step:117.56s
+ [2025-09-13 00:01:34,695] - step:857/900 train_loss:0.4171 lr:0.0003000000 time/step:116.89s
+ [2025-09-13 00:03:32,341] - step:858/900 train_loss:0.4017 lr:0.0003000000 time/step:117.64s
+ [2025-09-13 00:05:29,421] - step:859/900 train_loss:0.4011 lr:0.0003000000 time/step:117.07s
+ [2025-09-13 00:07:26,749] - step:860/900 train_loss:0.3910 lr:0.0003000000 time/step:117.32s
+ [2025-09-13 00:09:23,608] - step:861/900 train_loss:0.4093 lr:0.0003000000 time/step:116.85s
+ [2025-09-13 00:11:21,037] - step:862/900 train_loss:0.4295 lr:0.0003000000 time/step:117.42s
+ [2025-09-13 00:13:17,816] - step:863/900 train_loss:0.4025 lr:0.0003000000 time/step:116.77s
+ [2025-09-13 00:15:14,919] - step:864/900 train_loss:0.3978 lr:0.0003000000 time/step:117.10s
+ [2025-09-13 00:17:12,309] - step:865/900 train_loss:0.3941 lr:0.0003000000 time/step:117.38s
+ [2025-09-13 00:19:09,330] - step:866/900 train_loss:0.4150 lr:0.0003000000 time/step:117.01s
+ [2025-09-13 00:21:06,411] - step:867/900 train_loss:0.4101 lr:0.0003000000 time/step:117.01s
+ [2025-09-13 00:23:03,516] - step:868/900 train_loss:0.4156 lr:0.0003000000 time/step:117.10s
+ [2025-09-13 00:25:00,493] - step:869/900 train_loss:0.4128 lr:0.0003000000 time/step:116.97s
+ [2025-09-13 00:26:57,821] - step:870/900 train_loss:0.4182 lr:0.0003000000 time/step:117.31s
+ [2025-09-13 00:28:54,768] - step:871/900 train_loss:0.3940 lr:0.0003000000 time/step:116.93s
+ [2025-09-13 00:30:51,704] - step:872/900 train_loss:0.4091 lr:0.0003000000 time/step:116.93s
+ [2025-09-13 00:32:48,692] - step:873/900 train_loss:0.4066 lr:0.0003000000 time/step:116.98s
+ [2025-09-13 00:34:47,091] - step:874/900 train_loss:0.4061 lr:0.0003000000 time/step:118.39s
+ [2025-09-13 00:36:44,116] - step:875/900 train_loss:0.3712 lr:0.0003000000 time/step:117.01s
+ [2025-09-13 00:38:41,019] - step:876/900 train_loss:0.4040 lr:0.0003000000 time/step:116.89s
+ [2025-09-13 00:40:38,506] - step:877/900 train_loss:0.3807 lr:0.0003000000 time/step:117.48s
+ [2025-09-13 00:42:35,384] - step:878/900 train_loss:0.4103 lr:0.0003000000 time/step:116.87s
+ [2025-09-13 00:44:33,175] - step:879/900 train_loss:0.4001 lr:0.0003000000 time/step:117.79s
+ [2025-09-13 00:46:29,986] - step:880/900 train_loss:0.3966 lr:0.0003000000 time/step:116.79s
+ [2025-09-13 00:48:27,354] - step:881/900 train_loss:0.4188 lr:0.0003000000 time/step:117.29s
+ [2025-09-13 00:50:24,406] - step:882/900 train_loss:0.4164 lr:0.0003000000 time/step:117.05s
+ [2025-09-13 00:52:22,291] - step:883/900 train_loss:0.3936 lr:0.0003000000 time/step:117.88s
+ [2025-09-13 00:54:20,651] - step:884/900 train_loss:0.4148 lr:0.0003000000 time/step:118.35s
+ [2025-09-13 00:56:17,788] - step:885/900 train_loss:0.4173 lr:0.0003000000 time/step:117.13s
+ [2025-09-13 00:58:14,279] - step:886/900 train_loss:0.4260 lr:0.0003000000 time/step:116.46s
+ [2025-09-13 01:00:11,090] - step:887/900 train_loss:0.4037 lr:0.0003000000 time/step:116.80s
+ [2025-09-13 01:02:08,948] - step:888/900 train_loss:0.4117 lr:0.0003000000 time/step:117.85s
+ [2025-09-13 01:04:07,249] - step:889/900 train_loss:0.4068 lr:0.0003000000 time/step:118.29s
+ [2025-09-13 01:06:04,130] - step:890/900 train_loss:0.4187 lr:0.0003000000 time/step:116.87s
+ [2025-09-13 01:08:01,508] - step:891/900 train_loss:0.4159 lr:0.0003000000 time/step:117.36s
+ [2025-09-13 01:09:57,620] - step:892/900 train_loss:0.3978 lr:0.0003000000 time/step:116.10s
+ [2025-09-13 01:11:55,493] - step:893/900 train_loss:0.3925 lr:0.0003000000 time/step:117.86s
+ [2025-09-13 01:13:52,516] - step:894/900 train_loss:0.3845 lr:0.0003000000 time/step:117.01s
+ [2025-09-13 01:15:50,321] - step:895/900 train_loss:0.4062 lr:0.0003000000 time/step:117.80s
+ [2025-09-13 01:17:47,232] - step:896/900 train_loss:0.3879 lr:0.0003000000 time/step:116.90s
+ [2025-09-13 01:19:44,630] - step:897/900 train_loss:0.4272 lr:0.0003000000 time/step:117.39s
+ [2025-09-13 01:21:41,559] - step:898/900 train_loss:0.4121 lr:0.0003000000 time/step:116.92s
+ [2025-09-13 01:23:39,154] - step:899/900 train_loss:0.4079 lr:0.0003000000 time/step:117.59s
+ [2025-09-13 01:25:37,577] - step:900/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@900.pt...
+ [2025-09-13 01:25:37,578] - step:900/900 train_loss:0.3995 lr:0.0003000000 time/step:117.81s
wandb/debug-internal.log ADDED
@@ -0,0 +1,52 @@
+ {"time":"2025-09-11T20:06:44.955449103+01:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug-core.log"}
+ {"time":"2025-09-11T20:06:45.176117844+01:00","level":"INFO","msg":"created new stream","id":"y9v5i9gr"}
+ {"time":"2025-09-11T20:06:45.176201537+01:00","level":"INFO","msg":"stream: started","id":"y9v5i9gr"}
+ {"time":"2025-09-11T20:06:45.176254637+01:00","level":"INFO","msg":"writer: Do: started","stream_id":"y9v5i9gr"}
+ {"time":"2025-09-11T20:06:45.176292219+01:00","level":"INFO","msg":"handler: started","stream_id":"y9v5i9gr"}
+ {"time":"2025-09-11T20:06:45.176341928+01:00","level":"INFO","msg":"sender: started","stream_id":"y9v5i9gr"}
+ {"time":"2025-09-11T20:06:45.680069036+01:00","level":"INFO","msg":"Starting system monitor"}
+ {"time":"2025-09-11T20:19:16.313200337+01:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/circuit-mtp/mtp/y9v5i9gr/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+ {"time":"2025-09-12T00:53:29.590652615+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
+ {"time":"2025-09-12T01:30:18.032795292+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-12T01:30:50.327057066+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-12T01:31:25.000022545+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-12T01:32:03.267256543+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-12T02:41:34.535497308+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-12T02:42:24.914157379+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
+ {"time":"2025-09-12T02:42:57.41051518+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-12T10:36:51.38167595+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-12T10:38:06.370172425+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-12T10:38:38.465480726+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-12T10:39:07.484991796+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
+ {"time":"2025-09-12T10:39:41.575653023+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
+ {"time":"2025-09-12T20:16:55.628544216+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-12T20:21:25.750812333+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-12T22:43:55.97454382+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-13T00:52:11.684482933+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-13T00:54:54.045134291+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
+ {"time":"2025-09-13T00:55:26.197593179+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-13T01:05:42.010380611+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-13T01:06:14.056932921+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-13T01:06:48.575121732+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-13T01:07:28.074495024+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-13T01:09:42.005493483+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-13T01:10:14.454893184+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-13T01:10:49.419226595+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-13T01:11:21.445954263+01:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/circuit-mtp/mtp/y9v5i9gr/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+ {"time":"2025-09-13T01:11:57.007348427+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-13T01:13:57.010172043+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-13T01:14:29.220923193+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-13T01:15:27.013535251+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-13T01:15:59.276998526+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-13T01:16:33.628210655+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-13T01:17:42.016257241+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
+ {"time":"2025-09-13T01:18:14.389776393+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-13T01:19:42.02019871+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
+ {"time":"2025-09-13T01:25:38.927499163+01:00","level":"INFO","msg":"stream: closing","id":"y9v5i9gr"}
+ {"time":"2025-09-13T01:25:38.930059685+01:00","level":"INFO","msg":"Stopping system monitor"}
+ {"time":"2025-09-13T01:25:38.990179981+01:00","level":"INFO","msg":"Stopped system monitor"}
+ {"time":"2025-09-13T01:25:39.717455712+01:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+ {"time":"2025-09-13T01:25:39.93171592+01:00","level":"INFO","msg":"handler: closed","stream_id":"y9v5i9gr"}
+ {"time":"2025-09-13T01:25:39.931829957+01:00","level":"INFO","msg":"writer: Close: closed","stream_id":"y9v5i9gr"}
+ {"time":"2025-09-13T01:25:39.932853619+01:00","level":"INFO","msg":"sender: closed","stream_id":"y9v5i9gr"}
+ {"time":"2025-09-13T01:25:39.932961632+01:00","level":"INFO","msg":"stream: closed","id":"y9v5i9gr"}
wandb/debug.log ADDED
@@ -0,0 +1,23 @@
1
+ 2025-09-11 20:06:44,916 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Configure stats pid to 2716293
3
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from /home/agrivas/.config/wandb/settings
4
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/settings
5
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug.log
7
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug-internal.log
8
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():761] calling init triggers
9
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'compile': True, 'device': 'cuda', 'from_checkpoint': None, 'name': 'nanogpt', 'training': {'random_seed': 13, 'batch_size': 256, 'device_batch_size': 1, 'sequence_length': 8192, 'num_iterations': 900, 'learning_rate': 0.0003, 'use_scheduler': False, 'save_model': True, 'save_optimizer': True, 'save_model_every': 100, 'val_loss_every': 100, 'val_tokens': 4194304, 'expname': 'lr-3e-4-no-lora-cp-n-8-r-8'}, 'model': {'name': 'mtp', 'beta': 0.0, 'gamma': 1, 'kl_algorithm': 'full', 'kl_type': 'forward', 'model': {'_target_': 'mtp.models.mtp.MultiTokenLM', 'lm': '${lm.model}', 'circuit': '${circuit.model}', 'mt_head_kwargs': '${mt_head.hyperparameters}', 'init_from_lm_head': True, 'kl_type': '${model.kl_type}', 'kl_algorithm': '${model.kl_algorithm}', 'beta': 0, 'gamma': 0.9}}, 'circuit': {'name': 'cp', 'n_token': 8, 'n_component': 8, 'model': {'_target_': 'mtp.models.circuits.CircuitModel', 'vocab_size': 320, 'n_token': 8, 'n_component': 8, 'kind': 'cp'}}, 'mt_head': {'name': 'linear-evabyte', 'hyperparameters': {'type': 'evabyte', 'n_embd': 4096, 'transformer_n_head': 32, 'transformer_n_layer': 0, 'expander_type': 'linear', 'expander_n_layer': 1, 'freeze_vocab_unembedding': False, 'share_sum_weights': False, 'contextual_hmm_weights': True, 'init_hmm_identity': True}}, 'adaptor': {'name': 'none', 'hyperparameters': None}, 'lm': {'name': 'evabyte', 'n_embd': 4096, 'n_head': 32, 'model': {'_target_': 'mtp.models.lm.LM', 'lm': None, 'encoder_only': True, 'from_checkpoint': None, 'from_huggingface': 'EvaByte/EvaByte-SFT', 'adaptor_kwargs': None, 'ref_enc': 'model', 'ref_head': 'lm_head', 'freeze': True}}, 'data': {'name': 'tulu3-evabyte', 'train_bin': 'agrv/tulu-v3-sft-evabyte-packed-seq-len-8192', 'val_bin': None, 'vocab_size': 320}, 'generate': {'speculative': False}, '_wandb': {}}
11
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():784] starting backend
12
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-09-11 20:06:44,948 INFO MainThread:2716293 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-09-11 20:06:44,948 INFO MainThread:2716293 [wandb_init.py:init():798] backend started and connected
15
+ 2025-09-11 20:06:44,953 INFO MainThread:2716293 [wandb_init.py:init():891] updated telemetry
16
+ 2025-09-11 20:06:44,961 INFO MainThread:2716293 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-09-11 20:06:45,675 INFO MainThread:2716293 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-09-11 20:06:46,525 INFO MainThread:2716293 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-09-11 20:06:46,526 INFO MainThread:2716293 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-09-11 20:06:46,533 INFO MainThread:2716293 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-09-11 20:06:46,533 INFO MainThread:2716293 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-09-11 20:06:46,549 INFO MainThread:2716293 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-09-13 01:25:38,827 INFO MsgRouterThr:2716293 [mailbox.py:close():129] Closing mailbox, abandoning 1 handles.
wandb/run-20250911_200644-y9v5i9gr/files/config.yaml ADDED
@@ -0,0 +1,195 @@
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.8
4
+ m:
5
+ - "1": train/ce_loss_at_4
6
+ "5": 2
7
+ "6":
8
+ - 1
9
+ - 3
10
+ "7": []
11
+ - "1": global_step
12
+ "5": 2
13
+ "6":
14
+ - 1
15
+ - 3
16
+ "7": []
17
+ - "1": train/ce_loss_at_7
18
+ "5": 2
19
+ "6":
20
+ - 1
21
+ - 3
22
+ "7": []
23
+ - "1": train/loss
24
+ "5": 2
25
+ "6":
26
+ - 1
27
+ - 3
28
+ "7": []
29
+ - "1": train/ce_loss_at_5
30
+ "5": 2
31
+ "6":
32
+ - 1
33
+ - 3
34
+ "7": []
35
+ - "1": train/ce_loss_at_2
36
+ "5": 2
37
+ "6":
38
+ - 1
39
+ - 3
40
+ "7": []
41
+ - "1": train/ce_loss_at_6
42
+ "5": 2
43
+ "6":
44
+ - 1
45
+ - 3
46
+ "7": []
47
+ - "1": train/ce_loss_at_8
48
+ "5": 2
49
+ "6":
50
+ - 1
51
+ - 3
52
+ "7": []
53
+ - "1": train/ce_loss_at_1
54
+ "5": 2
55
+ "6":
56
+ - 1
57
+ - 3
58
+ "7": []
59
+ - "1": train/ce_loss_at_3
60
+ "5": 2
61
+ "6":
62
+ - 1
63
+ - 3
64
+ "7": []
65
+ python_version: 3.10.16
66
+ t:
67
+ "1":
68
+ - 1
69
+ - 11
70
+ - 41
71
+ - 49
72
+ - 50
73
+ - 51
74
+ - 55
75
+ - 71
76
+ - 84
77
+ - 98
78
+ "2":
79
+ - 1
80
+ - 11
81
+ - 41
82
+ - 49
83
+ - 50
84
+ - 51
85
+ - 55
86
+ - 71
87
+ - 84
88
+ - 98
89
+ "3":
90
+ - 7
91
+ - 13
92
+ - 15
93
+ - 16
94
+ - 23
95
+ - 55
96
+ "4": 3.10.16
97
+ "5": 0.19.8
98
+ "6": 4.49.0
99
+ "8":
100
+ - 5
101
+ "12": 0.19.8
102
+ "13": linux-x86_64
103
+ adaptor:
104
+ value:
105
+ hyperparameters: null
106
+ name: none
107
+ circuit:
108
+ value:
109
+ model:
110
+ _target_: mtp.models.circuits.CircuitModel
111
+ kind: cp
112
+ n_component: 8
113
+ n_token: 8
114
+ vocab_size: 320
115
+ n_component: 8
116
+ n_token: 8
117
+ name: cp
118
+ compile:
119
+ value: true
120
+ data:
121
+ value:
122
+ name: tulu3-evabyte
123
+ train_bin: agrv/tulu-v3-sft-evabyte-packed-seq-len-8192
124
+ val_bin: null
125
+ vocab_size: 320
126
+ device:
127
+ value: cuda
128
+ from_checkpoint:
129
+ value: null
130
+ generate:
131
+ value:
132
+ speculative: false
133
+ lm:
134
+ value:
135
+ model:
136
+ _target_: mtp.models.lm.LM
137
+ adaptor_kwargs: null
138
+ encoder_only: true
139
+ freeze: true
140
+ from_checkpoint: null
141
+ from_huggingface: EvaByte/EvaByte-SFT
142
+ lm: null
143
+ ref_enc: model
144
+ ref_head: lm_head
145
+ n_embd: 4096
146
+ n_head: 32
147
+ name: evabyte
148
+ model:
149
+ value:
150
+ beta: 0
151
+ gamma: 1
152
+ kl_algorithm: full
153
+ kl_type: forward
154
+ model:
155
+ _target_: mtp.models.mtp.MultiTokenLM
156
+ beta: 0
157
+ circuit: ${circuit.model}
158
+ gamma: 0.9
159
+ init_from_lm_head: true
160
+ kl_algorithm: ${model.kl_algorithm}
161
+ kl_type: ${model.kl_type}
162
+ lm: ${lm.model}
163
+ mt_head_kwargs: ${mt_head.hyperparameters}
164
+ name: mtp
165
+ mt_head:
166
+ value:
167
+ hyperparameters:
168
+ contextual_hmm_weights: true
169
+ expander_n_layer: 1
170
+ expander_type: linear
171
+ freeze_vocab_unembedding: false
172
+ init_hmm_identity: true
173
+ n_embd: 4096
174
+ share_sum_weights: false
175
+ transformer_n_head: 32
176
+ transformer_n_layer: 0
177
+ type: evabyte
178
+ name: linear-evabyte
179
+ name:
180
+ value: nanogpt
181
+ training:
182
+ value:
183
+ batch_size: 256
184
+ device_batch_size: 1
185
+ expname: lr-3e-4-no-lora-cp-n-8-r-8
186
+ learning_rate: 0.0003
187
+ num_iterations: 900
188
+ random_seed: 13
189
+ save_model: true
190
+ save_model_every: 100
191
+ save_optimizer: true
192
+ sequence_length: 8192
193
+ use_scheduler: false
194
+ val_loss_every: 100
195
+ val_tokens: 4194304
wandb/run-20250911_200644-y9v5i9gr/files/output.log ADDED
@@ -0,0 +1,936 @@
1
+ [2025-09-11 20:06:46,551] - Saving config and checkpoints to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16...
2
+ [2025-09-11 20:06:46,551] - Save model: True...
3
+ [2025-09-11 20:06:46,552] - Save optimizer: True...
4
+ [2025-09-11 20:06:46,558] - Training on agrv/tulu-v3-sft-evabyte-packed-seq-len-8192...
5
+ Generating train split: 100%|██████████| 233628/233628 [00:06<00:00, 36437.72 examples/s]
6
+ Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
7
+ [2025-09-11 20:07:21,844] - Setting num_proc from 20 back to 1 for the valid split to disable multiprocessing as it only contains one shard.
8
+ Generating valid split: 100%|██████████| 2360/2360 [00:00<00:00, 5736.16 examples/s]
9
+ [2025-09-11 20:07:23,771] - step:0/900 Saving model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@0.pt...
10
+ /home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:87: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
11
+ return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
12
+ [rank0]:W0911 20:08:06.374000 2716293 torch/_dynamo/exc.py:304] [7/0] Backend compiler failed with a fake tensor exception at
13
+ [rank0]:W0911 20:08:06.374000 2716293 torch/_dynamo/exc.py:304] [7/0] File "/disk/scratch/agrivas/nanoGPT/mtp/utils/packing.py", line 39, in torch_dynamo_resume_in_packed_targets_to_target_windows_at_34
14
+ [rank0]:W0911 20:08:06.374000 2716293 torch/_dynamo/exc.py:304] [7/0] return torch.concat(parts, dim=0).reshape(B, S, n)
15
+ [rank0]:W0911 20:08:06.374000 2716293 torch/_dynamo/exc.py:304] [7/0] Adding a graph break.
16
+ [rank0]:W0911 20:08:06.437000 2716293 torch/_dynamo/exc.py:304] [7/0_1] Backend compiler failed with a fake tensor exception at
17
+ [rank0]:W0911 20:08:06.437000 2716293 torch/_dynamo/exc.py:304] [7/0_1] File "/disk/scratch/agrivas/nanoGPT/mtp/utils/packing.py", line 39, in torch_dynamo_resume_in_packed_targets_to_target_windows_at_34
18
+ [rank0]:W0911 20:08:06.437000 2716293 torch/_dynamo/exc.py:304] [7/0_1] return torch.concat(parts, dim=0).reshape(B, S, n)
19
+ [rank0]:W0911 20:08:06.437000 2716293 torch/_dynamo/exc.py:304] [7/0_1] Adding a graph break.
20
+ /home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:87: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
21
+ return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
22
+ /home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:87: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
23
+ return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
24
+ /home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:87: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
25
+ return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
26
+ /home/agrivas/nanoGPT/.venv/lib/python3.10/site-packages/datasets/formatting/torch_formatter.py:87: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
27
+ return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
28
+ [2025-09-11 20:10:21,957] - step:1/900 train_loss:0.5686 lr:0.0003000000 time/step:177.94s
29
+ [2025-09-11 20:12:19,200] - step:2/900 train_loss:0.5480 lr:0.0003000000 time/step:117.24s
30
+ [2025-09-11 20:14:16,539] - step:3/900 train_loss:0.5220 lr:0.0003000000 time/step:117.34s
31
+ [2025-09-11 20:16:13,861] - step:4/900 train_loss:0.5383 lr:0.0003000000 time/step:117.32s
32
+ [2025-09-11 20:18:10,435] - step:5/900 train_loss:0.5371 lr:0.0003000000 time/step:116.57s
33
+ [2025-09-11 20:20:08,627] - step:6/900 train_loss:0.5227 lr:0.0003000000 time/step:118.19s
34
+ [2025-09-11 20:22:06,149] - step:7/900 train_loss:0.5128 lr:0.0003000000 time/step:117.51s
35
+ [2025-09-11 20:24:03,890] - step:8/900 train_loss:0.5420 lr:0.0003000000 time/step:117.74s
36
+ [2025-09-11 20:26:01,252] - step:9/900 train_loss:0.5426 lr:0.0003000000 time/step:117.36s
37
+ [2025-09-11 20:27:59,602] - step:10/900 train_loss:0.5236 lr:0.0003000000 time/step:118.34s
38
+ [2025-09-11 20:29:56,227] - step:11/900 train_loss:0.4860 lr:0.0003000000 time/step:116.61s
39
+ [2025-09-11 20:31:54,042] - step:12/900 train_loss:0.5105 lr:0.0003000000 time/step:117.81s
40
+ [2025-09-11 20:33:51,406] - step:13/900 train_loss:0.4993 lr:0.0003000000 time/step:117.36s
41
+ [2025-09-11 20:35:48,717] - step:14/900 train_loss:0.4925 lr:0.0003000000 time/step:117.31s
42
+ [2025-09-11 20:37:47,558] - step:15/900 train_loss:0.5207 lr:0.0003000000 time/step:118.83s
43
+ [2025-09-11 20:39:45,850] - step:16/900 train_loss:0.4827 lr:0.0003000000 time/step:118.28s
44
+ [2025-09-11 20:41:42,738] - step:17/900 train_loss:0.5033 lr:0.0003000000 time/step:116.88s
45
+ [2025-09-11 20:43:39,898] - step:18/900 train_loss:0.5082 lr:0.0003000000 time/step:117.15s
46
+ [2025-09-11 20:45:37,029] - step:19/900 train_loss:0.4910 lr:0.0003000000 time/step:117.13s
47
+ [2025-09-11 20:47:34,571] - step:20/900 train_loss:0.5006 lr:0.0003000000 time/step:117.54s
48
+ [2025-09-11 20:49:32,312] - step:21/900 train_loss:0.4936 lr:0.0003000000 time/step:117.73s
49
+ [2025-09-11 20:51:29,213] - step:22/900 train_loss:0.4941 lr:0.0003000000 time/step:116.90s
50
+ [2025-09-11 20:53:26,056] - step:23/900 train_loss:0.5131 lr:0.0003000000 time/step:116.83s
51
+ [2025-09-11 20:55:22,982] - step:24/900 train_loss:0.4826 lr:0.0003000000 time/step:116.92s
52
+ [2025-09-11 20:57:20,427] - step:25/900 train_loss:0.4913 lr:0.0003000000 time/step:117.44s
53
+ [2025-09-11 20:59:18,626] - step:26/900 train_loss:0.4607 lr:0.0003000000 time/step:118.18s
54
+ [2025-09-11 21:01:15,710] - step:27/900 train_loss:0.4908 lr:0.0003000000 time/step:117.08s
55
+ [2025-09-11 21:03:12,633] - step:28/900 train_loss:0.4910 lr:0.0003000000 time/step:116.91s
56
+ [2025-09-11 21:05:09,636] - step:29/900 train_loss:0.4657 lr:0.0003000000 time/step:117.00s
57
+ [2025-09-11 21:07:06,700] - step:30/900 train_loss:0.4594 lr:0.0003000000 time/step:117.06s
58
+ [2025-09-11 21:09:04,683] - step:31/900 train_loss:0.4755 lr:0.0003000000 time/step:117.97s
59
+ [2025-09-11 21:11:01,763] - step:32/900 train_loss:0.4541 lr:0.0003000000 time/step:117.08s
60
+ [2025-09-11 21:12:59,791] - step:33/900 train_loss:0.4807 lr:0.0003000000 time/step:118.02s
61
+ [2025-09-11 21:14:55,836] - step:34/900 train_loss:0.4870 lr:0.0003000000 time/step:116.03s
62
+ [2025-09-11 21:16:52,899] - step:35/900 train_loss:0.4625 lr:0.0003000000 time/step:117.06s
63
+ [2025-09-11 21:18:51,003] - step:36/900 train_loss:0.4791 lr:0.0003000000 time/step:118.09s
64
+ [2025-09-11 21:20:48,545] - step:37/900 train_loss:0.4473 lr:0.0003000000 time/step:117.53s
65
+ [2025-09-11 21:22:45,589] - step:38/900 train_loss:0.4752 lr:0.0003000000 time/step:117.04s
66
+ [2025-09-11 21:24:43,273] - step:39/900 train_loss:0.4637 lr:0.0003000000 time/step:117.68s
67
+ [2025-09-11 21:26:39,295] - step:40/900 train_loss:0.4792 lr:0.0003000000 time/step:116.01s
68
+ [2025-09-11 21:28:36,435] - step:41/900 train_loss:0.4486 lr:0.0003000000 time/step:117.13s
69
+ [2025-09-11 21:30:33,920] - step:42/900 train_loss:0.4401 lr:0.0003000000 time/step:117.48s
70
+ [2025-09-11 21:32:30,825] - step:43/900 train_loss:0.4647 lr:0.0003000000 time/step:116.90s
71
+ [2025-09-11 21:34:28,329] - step:44/900 train_loss:0.4925 lr:0.0003000000 time/step:117.50s
72
+ [2025-09-11 21:36:25,926] - step:45/900 train_loss:0.4660 lr:0.0003000000 time/step:117.59s
73
+ [2025-09-11 21:38:22,375] - step:46/900 train_loss:0.4459 lr:0.0003000000 time/step:116.44s
74
+ [2025-09-11 21:40:19,319] - step:47/900 train_loss:0.4487 lr:0.0003000000 time/step:116.93s
75
+ [2025-09-11 21:42:17,801] - step:48/900 train_loss:0.4378 lr:0.0003000000 time/step:118.48s
76
+ [2025-09-11 21:44:15,250] - step:49/900 train_loss:0.4623 lr:0.0003000000 time/step:117.44s
77
+ [2025-09-11 21:46:12,028] - step:50/900 train_loss:0.4788 lr:0.0003000000 time/step:116.77s
78
+ [2025-09-11 21:48:08,924] - step:51/900 train_loss:0.4612 lr:0.0003000000 time/step:116.89s
79
+ [2025-09-11 21:50:05,277] - step:52/900 train_loss:0.4670 lr:0.0003000000 time/step:116.34s
80
+ [2025-09-11 21:52:03,579] - step:53/900 train_loss:0.4948 lr:0.0003000000 time/step:118.20s
81
+ [2025-09-11 21:54:00,439] - step:54/900 train_loss:0.4474 lr:0.0003000000 time/step:116.86s
82
+ [2025-09-11 21:55:57,226] - step:55/900 train_loss:0.4696 lr:0.0003000000 time/step:116.78s
83
+ [2025-09-11 21:57:54,070] - step:56/900 train_loss:0.4636 lr:0.0003000000 time/step:116.84s
84
+ [2025-09-11 21:59:51,015] - step:57/900 train_loss:0.4567 lr:0.0003000000 time/step:116.93s
85
+ [2025-09-11 22:01:48,416] - step:58/900 train_loss:0.4600 lr:0.0003000000 time/step:117.40s
86
+ [2025-09-11 22:03:46,720] - step:59/900 train_loss:0.4678 lr:0.0003000000 time/step:118.30s
87
+ [2025-09-11 22:05:43,544] - step:60/900 train_loss:0.4619 lr:0.0003000000 time/step:116.82s
88
+ [2025-09-11 22:07:40,424] - step:61/900 train_loss:0.4553 lr:0.0003000000 time/step:116.87s
89
+ [2025-09-11 22:09:37,873] - step:62/900 train_loss:0.4719 lr:0.0003000000 time/step:117.43s
90
+ [2025-09-11 22:11:34,969] - step:63/900 train_loss:0.4582 lr:0.0003000000 time/step:117.09s
91
+ [2025-09-11 22:13:31,914] - step:64/900 train_loss:0.4430 lr:0.0003000000 time/step:116.94s
92
+ [2025-09-11 22:15:28,799] - step:65/900 train_loss:0.4268 lr:0.0003000000 time/step:116.88s
93
+ [2025-09-11 22:17:25,704] - step:66/900 train_loss:0.4669 lr:0.0003000000 time/step:116.90s
94
+ [2025-09-11 22:19:22,827] - step:67/900 train_loss:0.4380 lr:0.0003000000 time/step:117.11s
95
+ [2025-09-11 22:21:20,150] - step:68/900 train_loss:0.4785 lr:0.0003000000 time/step:117.32s
96
+ [2025-09-11 22:23:16,126] - step:69/900 train_loss:0.4678 lr:0.0003000000 time/step:115.97s
97
+ [2025-09-11 22:25:13,659] - step:70/900 train_loss:0.4456 lr:0.0003000000 time/step:117.53s
98
+ [2025-09-11 22:27:10,581] - step:71/900 train_loss:0.4403 lr:0.0003000000 time/step:116.91s
99
+ [2025-09-11 22:29:07,930] - step:72/900 train_loss:0.4318 lr:0.0003000000 time/step:117.34s
100
+ [2025-09-11 22:31:05,566] - step:73/900 train_loss:0.4546 lr:0.0003000000 time/step:117.63s
101
+ [2025-09-11 22:33:02,531] - step:74/900 train_loss:0.4860 lr:0.0003000000 time/step:116.96s
102
+ [2025-09-11 22:34:59,254] - step:75/900 train_loss:0.4499 lr:0.0003000000 time/step:116.72s
103
+ [2025-09-11 22:36:57,138] - step:76/900 train_loss:0.4490 lr:0.0003000000 time/step:117.88s
104
+ [2025-09-11 22:38:54,164] - step:77/900 train_loss:0.4490 lr:0.0003000000 time/step:117.02s
105
+ [2025-09-11 22:40:51,448] - step:78/900 train_loss:0.4455 lr:0.0003000000 time/step:117.27s
106
+ [2025-09-11 22:42:48,430] - step:79/900 train_loss:0.4274 lr:0.0003000000 time/step:116.98s
107
+ [2025-09-11 22:44:45,934] - step:80/900 train_loss:0.4519 lr:0.0003000000 time/step:117.50s
108
+ [2025-09-11 22:46:42,798] - step:81/900 train_loss:0.4429 lr:0.0003000000 time/step:116.85s
109
+ [2025-09-11 22:48:39,720] - step:82/900 train_loss:0.4436 lr:0.0003000000 time/step:116.92s
110
+ [2025-09-11 22:50:37,164] - step:83/900 train_loss:0.4713 lr:0.0003000000 time/step:117.43s
111
+ [2025-09-11 22:52:33,983] - step:84/900 train_loss:0.4399 lr:0.0003000000 time/step:116.82s
112
+ [2025-09-11 22:54:31,605] - step:85/900 train_loss:0.4343 lr:0.0003000000 time/step:117.62s
113
+ [2025-09-11 22:56:29,383] - step:86/900 train_loss:0.4587 lr:0.0003000000 time/step:117.77s
114
+ [2025-09-11 22:58:26,338] - step:87/900 train_loss:0.4550 lr:0.0003000000 time/step:116.95s
115
+ [2025-09-11 23:00:23,614] - step:88/900 train_loss:0.4437 lr:0.0003000000 time/step:117.26s
116
+ [2025-09-11 23:02:20,358] - step:89/900 train_loss:0.4575 lr:0.0003000000 time/step:116.74s
117
+ [2025-09-11 23:04:17,289] - step:90/900 train_loss:0.4361 lr:0.0003000000 time/step:116.93s
118
+ [2025-09-11 23:06:15,307] - step:91/900 train_loss:0.4259 lr:0.0003000000 time/step:118.02s
119
+ [2025-09-11 23:08:12,562] - step:92/900 train_loss:0.4340 lr:0.0003000000 time/step:117.25s
120
+ [2025-09-11 23:10:10,001] - step:93/900 train_loss:0.4424 lr:0.0003000000 time/step:117.43s
121
+ [2025-09-11 23:12:07,171] - step:94/900 train_loss:0.4240 lr:0.0003000000 time/step:117.16s
122
+ [2025-09-11 23:14:05,158] - step:95/900 train_loss:0.4425 lr:0.0003000000 time/step:117.99s
123
+ [2025-09-11 23:16:02,641] - step:96/900 train_loss:0.4575 lr:0.0003000000 time/step:117.48s
124
+ [2025-09-11 23:17:59,591] - step:97/900 train_loss:0.4435 lr:0.0003000000 time/step:116.94s
125
+ [2025-09-11 23:19:55,399] - step:98/900 train_loss:0.4466 lr:0.0003000000 time/step:115.80s
126
+ [2025-09-11 23:21:53,531] - step:99/900 train_loss:0.4469 lr:0.0003000000 time/step:118.12s
127
+ [2025-09-11 23:23:52,424] - step:100/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@100.pt...
128
+ [2025-09-11 23:23:52,425] - step:100/900 train_loss:0.4467 lr:0.0003000000 time/step:118.25s
129
+ [2025-09-11 23:25:49,555] - step:101/900 train_loss:0.4462 lr:0.0003000000 time/step:117.13s
130
+ [2025-09-11 23:27:46,681] - step:102/900 train_loss:0.4479 lr:0.0003000000 time/step:117.12s
131
+ [2025-09-11 23:29:43,985] - step:103/900 train_loss:0.4212 lr:0.0003000000 time/step:117.30s
132
+ [2025-09-11 23:31:40,749] - step:104/900 train_loss:0.4385 lr:0.0003000000 time/step:116.76s
133
+ [2025-09-11 23:33:37,611] - step:105/900 train_loss:0.4490 lr:0.0003000000 time/step:116.86s
134
+ [2025-09-11 23:35:34,674] - step:106/900 train_loss:0.4537 lr:0.0003000000 time/step:117.06s
135
+ [2025-09-11 23:37:32,277] - step:107/900 train_loss:0.4278 lr:0.0003000000 time/step:117.60s
136
+ [2025-09-11 23:39:29,569] - step:108/900 train_loss:0.4413 lr:0.0003000000 time/step:117.28s
137
+ [2025-09-11 23:41:26,965] - step:109/900 train_loss:0.4219 lr:0.0003000000 time/step:117.39s
138
+ [2025-09-11 23:43:23,608] - step:110/900 train_loss:0.4455 lr:0.0003000000 time/step:116.64s
139
+ [2025-09-11 23:45:20,608] - step:111/900 train_loss:0.4581 lr:0.0003000000 time/step:117.00s
140
+ [2025-09-11 23:47:17,496] - step:112/900 train_loss:0.4501 lr:0.0003000000 time/step:116.89s
141
+ [2025-09-11 23:49:15,179] - step:113/900 train_loss:0.4332 lr:0.0003000000 time/step:117.66s
142
+ [2025-09-11 23:51:13,020] - step:114/900 train_loss:0.4311 lr:0.0003000000 time/step:117.83s
143
+ [2025-09-11 23:53:10,646] - step:115/900 train_loss:0.4449 lr:0.0003000000 time/step:117.62s
144
+ [2025-09-11 23:55:06,688] - step:116/900 train_loss:0.4424 lr:0.0003000000 time/step:116.04s
145
+ [2025-09-11 23:57:03,652] - step:117/900 train_loss:0.4392 lr:0.0003000000 time/step:116.96s
146
+ [2025-09-11 23:59:01,394] - step:118/900 train_loss:0.4246 lr:0.0003000000 time/step:117.74s
147
+ [2025-09-12 00:00:58,798] - step:119/900 train_loss:0.4339 lr:0.0003000000 time/step:117.39s
148
+ [2025-09-12 00:02:56,142] - step:120/900 train_loss:0.4064 lr:0.0003000000 time/step:117.33s
149
+ [2025-09-12 00:04:53,044] - step:121/900 train_loss:0.4421 lr:0.0003000000 time/step:116.90s
150
+ [2025-09-12 00:06:49,048] - step:122/900 train_loss:0.4306 lr:0.0003000000 time/step:116.00s
151
+ [2025-09-12 00:08:46,671] - step:123/900 train_loss:0.4163 lr:0.0003000000 time/step:117.62s
152
+ [2025-09-12 00:10:44,735] - step:124/900 train_loss:0.4428 lr:0.0003000000 time/step:118.05s
153
+ [2025-09-12 00:12:42,019] - step:125/900 train_loss:0.4188 lr:0.0003000000 time/step:117.27s
154
+ [2025-09-12 00:14:38,901] - step:126/900 train_loss:0.4226 lr:0.0003000000 time/step:116.88s
155
+ [2025-09-12 00:16:35,356] - step:127/900 train_loss:0.4379 lr:0.0003000000 time/step:116.45s
156
+ [2025-09-12 00:18:31,808] - step:128/900 train_loss:0.4475 lr:0.0003000000 time/step:116.45s
157
+ [2025-09-12 00:20:31,092] - step:129/900 train_loss:0.4579 lr:0.0003000000 time/step:119.27s
158
+ [2025-09-12 00:22:28,417] - step:130/900 train_loss:0.4504 lr:0.0003000000 time/step:117.31s
159
+ [2025-09-12 00:24:25,417] - step:131/900 train_loss:0.4345 lr:0.0003000000 time/step:116.99s
160
+ [2025-09-12 00:26:22,282] - step:132/900 train_loss:0.4567 lr:0.0003000000 time/step:116.86s
161
+ [2025-09-12 00:28:18,304] - step:133/900 train_loss:0.4396 lr:0.0003000000 time/step:116.02s
162
+ [2025-09-12 00:30:15,628] - step:134/900 train_loss:0.4440 lr:0.0003000000 time/step:117.32s
163
+ [2025-09-12 00:32:13,051] - step:135/900 train_loss:0.4384 lr:0.0003000000 time/step:117.42s
164
+ [2025-09-12 00:34:10,336] - step:136/900 train_loss:0.4276 lr:0.0003000000 time/step:117.28s
165
+ [2025-09-12 00:36:07,098] - step:137/900 train_loss:0.4424 lr:0.0003000000 time/step:116.76s
166
+ [2025-09-12 00:38:03,861] - step:138/900 train_loss:0.4288 lr:0.0003000000 time/step:116.76s
167
+ [2025-09-12 00:40:00,304] - step:139/900 train_loss:0.4333 lr:0.0003000000 time/step:116.43s
168
+ [2025-09-12 00:41:57,928] - step:140/900 train_loss:0.4347 lr:0.0003000000 time/step:117.62s
169
+ [2025-09-12 00:43:56,252] - step:141/900 train_loss:0.4515 lr:0.0003000000 time/step:118.32s
170
+ [2025-09-12 00:45:53,156] - step:142/900 train_loss:0.4531 lr:0.0003000000 time/step:116.90s
171
+ [2025-09-12 00:47:50,037] - step:143/900 train_loss:0.4426 lr:0.0003000000 time/step:116.88s
172
+ [2025-09-12 00:49:46,863] - step:144/900 train_loss:0.4100 lr:0.0003000000 time/step:116.81s
173
+ [2025-09-12 00:51:42,986] - step:145/900 train_loss:0.4185 lr:0.0003000000 time/step:116.12s
174
+ [2025-09-12 00:53:40,748] - step:146/900 train_loss:0.4556 lr:0.0003000000 time/step:117.75s
175
+ [2025-09-12 00:55:38,614] - step:147/900 train_loss:0.4580 lr:0.0003000000 time/step:117.86s
176
+ [2025-09-12 00:57:35,395] - step:148/900 train_loss:0.4432 lr:0.0003000000 time/step:116.77s
177
+ [2025-09-12 00:59:32,300] - step:149/900 train_loss:0.4260 lr:0.0003000000 time/step:116.90s
178
+ [2025-09-12 01:01:29,963] - step:150/900 train_loss:0.4369 lr:0.0003000000 time/step:117.65s
179
+ [2025-09-12 01:03:26,107] - step:151/900 train_loss:0.4121 lr:0.0003000000 time/step:116.14s
180
+ [2025-09-12 01:05:23,232] - step:152/900 train_loss:0.4488 lr:0.0003000000 time/step:117.12s
181
+ [2025-09-12 01:07:21,054] - step:153/900 train_loss:0.4290 lr:0.0003000000 time/step:117.82s
182
+ [2025-09-12 01:09:17,934] - step:154/900 train_loss:0.4126 lr:0.0003000000 time/step:116.88s
183
+ [2025-09-12 01:11:15,437] - step:155/900 train_loss:0.4201 lr:0.0003000000 time/step:117.49s
184
+ [2025-09-12 01:13:12,295] - step:156/900 train_loss:0.4294 lr:0.0003000000 time/step:116.85s
185
+ [2025-09-12 01:15:08,687] - step:157/900 train_loss:0.4340 lr:0.0003000000 time/step:116.38s
186
+ [2025-09-12 01:17:05,708] - step:158/900 train_loss:0.4543 lr:0.0003000000 time/step:117.01s
187
+ [2025-09-12 01:19:03,353] - step:159/900 train_loss:0.4211 lr:0.0003000000 time/step:117.64s
188
+ [2025-09-12 01:21:00,871] - step:160/900 train_loss:0.4400 lr:0.0003000000 time/step:117.51s
189
+ [2025-09-12 01:22:57,738] - step:161/900 train_loss:0.4259 lr:0.0003000000 time/step:116.86s
190
+ [2025-09-12 01:24:55,051] - step:162/900 train_loss:0.4150 lr:0.0003000000 time/step:117.31s
191
+ [2025-09-12 01:26:51,147] - step:163/900 train_loss:0.4168 lr:0.0003000000 time/step:116.09s
192
+ [2025-09-12 01:28:48,833] - step:164/900 train_loss:0.4024 lr:0.0003000000 time/step:117.68s
193
+ [2025-09-12 01:30:46,610] - step:165/900 train_loss:0.4476 lr:0.0003000000 time/step:117.77s
194
+ [2025-09-12 01:32:43,517] - step:166/900 train_loss:0.4241 lr:0.0003000000 time/step:116.90s
195
+ [2025-09-12 01:34:41,001] - step:167/900 train_loss:0.4268 lr:0.0003000000 time/step:117.48s
196
+ [2025-09-12 01:36:37,582] - step:168/900 train_loss:0.3846 lr:0.0003000000 time/step:116.57s
197
+ [2025-09-12 01:38:34,908] - step:169/900 train_loss:0.4199 lr:0.0003000000 time/step:117.32s
198
+ [2025-09-12 01:40:33,014] - step:170/900 train_loss:0.4037 lr:0.0003000000 time/step:118.09s
199
+ [2025-09-12 01:42:29,854] - step:171/900 train_loss:0.4579 lr:0.0003000000 time/step:116.84s
200
+ [2025-09-12 01:44:27,350] - step:172/900 train_loss:0.4435 lr:0.0003000000 time/step:117.48s
201
+ [2025-09-12 01:46:24,704] - step:173/900 train_loss:0.4139 lr:0.0003000000 time/step:117.34s
202
+ [2025-09-12 01:48:21,009] - step:174/900 train_loss:0.4308 lr:0.0003000000 time/step:116.30s
203
+ [2025-09-12 01:50:19,086] - step:175/900 train_loss:0.4156 lr:0.0003000000 time/step:118.06s
204
+ [2025-09-12 01:52:16,506] - step:176/900 train_loss:0.4204 lr:0.0003000000 time/step:117.41s
205
+ [2025-09-12 01:54:14,395] - step:177/900 train_loss:0.4211 lr:0.0003000000 time/step:117.87s
206
+ [2025-09-12 01:56:11,781] - step:178/900 train_loss:0.4399 lr:0.0003000000 time/step:117.38s
207
+ [2025-09-12 01:58:09,165] - step:179/900 train_loss:0.4327 lr:0.0003000000 time/step:117.38s
208
+ [2025-09-12 02:00:05,670] - step:180/900 train_loss:0.4362 lr:0.0003000000 time/step:116.49s
209
+ [2025-09-12 02:02:03,683] - step:181/900 train_loss:0.4204 lr:0.0003000000 time/step:118.01s
210
+ [2025-09-12 02:04:01,525] - step:182/900 train_loss:0.4528 lr:0.0003000000 time/step:117.84s
211
+ [2025-09-12 02:05:59,256] - step:183/900 train_loss:0.4115 lr:0.0003000000 time/step:117.72s
212
+ [2025-09-12 02:07:56,456] - step:184/900 train_loss:0.4527 lr:0.0003000000 time/step:117.20s
213
+ [2025-09-12 02:09:53,692] - step:185/900 train_loss:0.4378 lr:0.0003000000 time/step:117.23s
214
+ [2025-09-12 02:11:50,835] - step:186/900 train_loss:0.4322 lr:0.0003000000 time/step:117.14s
215
+ [2025-09-12 02:13:49,249] - step:187/900 train_loss:0.4503 lr:0.0003000000 time/step:118.41s
216
+ [2025-09-12 02:15:46,708] - step:188/900 train_loss:0.4137 lr:0.0003000000 time/step:117.45s
217
+ [2025-09-12 02:17:44,588] - step:189/900 train_loss:0.4373 lr:0.0003000000 time/step:117.87s
218
+ [2025-09-12 02:19:41,640] - step:190/900 train_loss:0.4390 lr:0.0003000000 time/step:117.04s
219
+ [2025-09-12 02:21:38,674] - step:191/900 train_loss:0.4540 lr:0.0003000000 time/step:117.02s
220
+ [2025-09-12 02:23:35,317] - step:192/900 train_loss:0.4401 lr:0.0003000000 time/step:116.64s
221
+ [2025-09-12 02:25:32,403] - step:193/900 train_loss:0.4325 lr:0.0003000000 time/step:117.08s
222
+ [2025-09-12 02:27:29,545] - step:194/900 train_loss:0.4249 lr:0.0003000000 time/step:117.13s
223
+ [2025-09-12 02:29:26,648] - step:195/900 train_loss:0.4074 lr:0.0003000000 time/step:117.09s
224
+ [2025-09-12 02:31:23,432] - step:196/900 train_loss:0.4212 lr:0.0003000000 time/step:116.77s
225
+ [2025-09-12 02:33:21,256] - step:197/900 train_loss:0.4408 lr:0.0003000000 time/step:117.82s
226
+ [2025-09-12 02:35:18,019] - step:198/900 train_loss:0.4229 lr:0.0003000000 time/step:116.76s
227
+ [2025-09-12 02:37:15,403] - step:199/900 train_loss:0.4517 lr:0.0003000000 time/step:117.38s
228
+ [2025-09-12 02:39:13,125] - step:200/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@200.pt...
229
+ [2025-09-12 02:39:13,129] - step:200/900 train_loss:0.4149 lr:0.0003000000 time/step:117.11s
230
+ [2025-09-12 02:41:09,907] - step:201/900 train_loss:0.4258 lr:0.0003000000 time/step:116.76s
231
+ [2025-09-12 02:43:06,972] - step:202/900 train_loss:0.4207 lr:0.0003000000 time/step:117.06s
232
+ [2025-09-12 02:45:03,575] - step:203/900 train_loss:0.4432 lr:0.0003000000 time/step:116.60s
233
+ [2025-09-12 02:47:00,062] - step:204/900 train_loss:0.4072 lr:0.0003000000 time/step:116.48s
234
+ [2025-09-12 02:48:57,543] - step:205/900 train_loss:0.4404 lr:0.0003000000 time/step:117.47s
235
+ [2025-09-12 02:50:54,165] - step:206/900 train_loss:0.4151 lr:0.0003000000 time/step:116.61s
236
+ [2025-09-12 02:52:50,609] - step:207/900 train_loss:0.4256 lr:0.0003000000 time/step:116.44s
237
+ [2025-09-12 02:54:48,171] - step:208/900 train_loss:0.4200 lr:0.0003000000 time/step:117.56s
238
+ [2025-09-12 02:56:44,244] - step:209/900 train_loss:0.4159 lr:0.0003000000 time/step:116.06s
239
+ [2025-09-12 02:58:41,740] - step:210/900 train_loss:0.4080 lr:0.0003000000 time/step:117.49s
240
+ [2025-09-12 03:00:38,362] - step:211/900 train_loss:0.4394 lr:0.0003000000 time/step:116.61s
241
+ [2025-09-12 03:02:34,864] - step:212/900 train_loss:0.4461 lr:0.0003000000 time/step:116.49s
242
+ [2025-09-12 03:04:32,289] - step:213/900 train_loss:0.4310 lr:0.0003000000 time/step:117.42s
243
+ [2025-09-12 03:06:29,834] - step:214/900 train_loss:0.4458 lr:0.0003000000 time/step:117.53s
244
+ [2025-09-12 03:08:26,395] - step:215/900 train_loss:0.4322 lr:0.0003000000 time/step:116.56s
245
+ [2025-09-12 03:10:23,441] - step:216/900 train_loss:0.3979 lr:0.0003000000 time/step:117.03s
246
+ [2025-09-12 03:12:19,963] - step:217/900 train_loss:0.4011 lr:0.0003000000 time/step:116.51s
247
+ [2025-09-12 03:14:17,627] - step:218/900 train_loss:0.4372 lr:0.0003000000 time/step:117.66s
248
+ [2025-09-12 03:16:15,332] - step:219/900 train_loss:0.4281 lr:0.0003000000 time/step:117.70s
249
+ [2025-09-12 03:18:11,833] - step:220/900 train_loss:0.4330 lr:0.0003000000 time/step:116.49s
250
+ [2025-09-12 03:20:08,497] - step:221/900 train_loss:0.4534 lr:0.0003000000 time/step:116.65s
251
+ [2025-09-12 03:22:05,021] - step:222/900 train_loss:0.4076 lr:0.0003000000 time/step:116.52s
252
+ [2025-09-12 03:24:01,826] - step:223/900 train_loss:0.4211 lr:0.0003000000 time/step:116.79s
253
+ [2025-09-12 03:25:58,807] - step:224/900 train_loss:0.4075 lr:0.0003000000 time/step:116.98s
254
+ [2025-09-12 03:27:56,427] - step:225/900 train_loss:0.3977 lr:0.0003000000 time/step:117.61s
255
+ [2025-09-12 03:29:53,271] - step:226/900 train_loss:0.4331 lr:0.0003000000 time/step:116.84s
256
+ [2025-09-12 03:31:48,818] - step:227/900 train_loss:0.4424 lr:0.0003000000 time/step:115.53s
257
+ [2025-09-12 03:33:46,260] - step:228/900 train_loss:0.4265 lr:0.0003000000 time/step:117.44s
258
+ [2025-09-12 03:35:42,726] - step:229/900 train_loss:0.4018 lr:0.0003000000 time/step:116.46s
259
+ [2025-09-12 03:37:39,927] - step:230/900 train_loss:0.4277 lr:0.0003000000 time/step:117.20s
260
+ [2025-09-12 03:39:37,253] - step:231/900 train_loss:0.4229 lr:0.0003000000 time/step:117.32s
261
+ [2025-09-12 03:41:34,210] - step:232/900 train_loss:0.4231 lr:0.0003000000 time/step:116.94s
262
+ [2025-09-12 03:43:30,497] - step:233/900 train_loss:0.4125 lr:0.0003000000 time/step:116.28s
263
+ [2025-09-12 03:45:27,022] - step:234/900 train_loss:0.4181 lr:0.0003000000 time/step:116.52s
264
+ [2025-09-12 03:47:23,505] - step:235/900 train_loss:0.4364 lr:0.0003000000 time/step:116.48s
265
+ [2025-09-12 03:49:21,967] - step:236/900 train_loss:0.4135 lr:0.0003000000 time/step:118.46s
266
+ [2025-09-12 03:51:18,413] - step:237/900 train_loss:0.4139 lr:0.0003000000 time/step:116.43s
267
+ [2025-09-12 03:53:14,453] - step:238/900 train_loss:0.4341 lr:0.0003000000 time/step:116.03s
268
+ [2025-09-12 03:55:11,117] - step:239/900 train_loss:0.4174 lr:0.0003000000 time/step:116.66s
269
+ [2025-09-12 03:57:08,642] - step:240/900 train_loss:0.4449 lr:0.0003000000 time/step:117.52s
270
+ [2025-09-12 03:59:06,595] - step:241/900 train_loss:0.4303 lr:0.0003000000 time/step:117.95s
271
+ [2025-09-12 04:01:02,667] - step:242/900 train_loss:0.4350 lr:0.0003000000 time/step:116.06s
272
+ [2025-09-12 04:02:58,652] - step:243/900 train_loss:0.4332 lr:0.0003000000 time/step:115.97s
273
+ [2025-09-12 04:04:55,158] - step:244/900 train_loss:0.4170 lr:0.0003000000 time/step:116.50s
274
+ [2025-09-12 04:06:52,523] - step:245/900 train_loss:0.4325 lr:0.0003000000 time/step:117.35s
275
+ [2025-09-12 04:08:49,506] - step:246/900 train_loss:0.4140 lr:0.0003000000 time/step:116.98s
276
+ [2025-09-12 04:10:46,625] - step:247/900 train_loss:0.4244 lr:0.0003000000 time/step:117.10s
277
+ [2025-09-12 04:12:43,060] - step:248/900 train_loss:0.4435 lr:0.0003000000 time/step:116.43s
278
+ [2025-09-12 04:14:39,932] - step:249/900 train_loss:0.4188 lr:0.0003000000 time/step:116.87s
279
+ [2025-09-12 04:16:36,428] - step:250/900 train_loss:0.4138 lr:0.0003000000 time/step:116.49s
280
+ [2025-09-12 04:18:34,283] - step:251/900 train_loss:0.4045 lr:0.0003000000 time/step:117.84s
281
+ [2025-09-12 04:20:32,264] - step:252/900 train_loss:0.4128 lr:0.0003000000 time/step:117.96s
282
+ [2025-09-12 04:22:28,905] - step:253/900 train_loss:0.4352 lr:0.0003000000 time/step:116.63s
283
+ [2025-09-12 04:24:25,744] - step:254/900 train_loss:0.4090 lr:0.0003000000 time/step:116.83s
284
+ [2025-09-12 04:26:22,527] - step:255/900 train_loss:0.4125 lr:0.0003000000 time/step:116.78s
285
+ [2025-09-12 04:28:18,535] - step:256/900 train_loss:0.3974 lr:0.0003000000 time/step:116.00s
286
+ [2025-09-12 04:30:16,548] - step:257/900 train_loss:0.4056 lr:0.0003000000 time/step:118.00s
287
+ [2025-09-12 04:32:14,016] - step:258/900 train_loss:0.4158 lr:0.0003000000 time/step:117.45s
288
+ [2025-09-12 04:34:10,993] - step:259/900 train_loss:0.4080 lr:0.0003000000 time/step:116.97s
289
+ [2025-09-12 04:36:07,637] - step:260/900 train_loss:0.4217 lr:0.0003000000 time/step:116.64s
290
+ [2025-09-12 04:38:05,072] - step:261/900 train_loss:0.4157 lr:0.0003000000 time/step:117.43s
291
+ [2025-09-12 04:40:01,843] - step:262/900 train_loss:0.4139 lr:0.0003000000 time/step:116.76s
292
+ [2025-09-12 04:41:58,873] - step:263/900 train_loss:0.4401 lr:0.0003000000 time/step:117.01s
293
+ [2025-09-12 04:43:56,795] - step:264/900 train_loss:0.4272 lr:0.0003000000 time/step:117.92s
294
+ [2025-09-12 04:45:53,571] - step:265/900 train_loss:0.4228 lr:0.0003000000 time/step:116.76s
295
+ [2025-09-12 04:47:50,269] - step:266/900 train_loss:0.4242 lr:0.0003000000 time/step:116.69s
296
+ [2025-09-12 04:49:47,027] - step:267/900 train_loss:0.4361 lr:0.0003000000 time/step:116.75s
297
+ [2025-09-12 04:51:43,112] - step:268/900 train_loss:0.4224 lr:0.0003000000 time/step:116.07s
298
+ [2025-09-12 04:53:41,046] - step:269/900 train_loss:0.4076 lr:0.0003000000 time/step:117.92s
299
+ [2025-09-12 04:55:37,470] - step:270/900 train_loss:0.4172 lr:0.0003000000 time/step:116.42s
300
+ [2025-09-12 04:57:33,853] - step:271/900 train_loss:0.4219 lr:0.0003000000 time/step:116.38s
301
+ [2025-09-12 04:59:30,265] - step:272/900 train_loss:0.4281 lr:0.0003000000 time/step:116.41s
302
+ [2025-09-12 05:01:26,500] - step:273/900 train_loss:0.4105 lr:0.0003000000 time/step:116.22s
303
+ [2025-09-12 05:03:24,415] - step:274/900 train_loss:0.4247 lr:0.0003000000 time/step:117.91s
304
+ [2025-09-12 05:05:21,825] - step:275/900 train_loss:0.4172 lr:0.0003000000 time/step:117.40s
305
+ [2025-09-12 05:07:18,643] - step:276/900 train_loss:0.4281 lr:0.0003000000 time/step:116.81s
306
+ [2025-09-12 05:09:15,889] - step:277/900 train_loss:0.4140 lr:0.0003000000 time/step:117.23s
307
+ [2025-09-12 05:11:13,080] - step:278/900 train_loss:0.4459 lr:0.0003000000 time/step:117.18s
308
+ [2025-09-12 05:13:09,433] - step:279/900 train_loss:0.4128 lr:0.0003000000 time/step:116.35s
309
+ [2025-09-12 05:15:07,057] - step:280/900 train_loss:0.4171 lr:0.0003000000 time/step:117.62s
310
+ [2025-09-12 05:17:03,780] - step:281/900 train_loss:0.4083 lr:0.0003000000 time/step:116.71s
311
+ [2025-09-12 05:19:00,703] - step:282/900 train_loss:0.4214 lr:0.0003000000 time/step:116.92s
312
+ [2025-09-12 05:20:57,932] - step:283/900 train_loss:0.4072 lr:0.0003000000 time/step:117.19s
313
+ [2025-09-12 05:22:54,350] - step:284/900 train_loss:0.4471 lr:0.0003000000 time/step:116.39s
314
+ [2025-09-12 05:24:50,794] - step:285/900 train_loss:0.3946 lr:0.0003000000 time/step:116.44s
315
+ [2025-09-12 05:26:47,657] - step:286/900 train_loss:0.4510 lr:0.0003000000 time/step:116.86s
316
+ [2025-09-12 05:28:43,717] - step:287/900 train_loss:0.4409 lr:0.0003000000 time/step:116.05s
317
+ [2025-09-12 05:30:40,741] - step:288/900 train_loss:0.3887 lr:0.0003000000 time/step:117.01s
318
+ [2025-09-12 05:32:38,986] - step:289/900 train_loss:0.4207 lr:0.0003000000 time/step:118.24s
319
+ [2025-09-12 05:34:35,229] - step:290/900 train_loss:0.4018 lr:0.0003000000 time/step:116.24s
320
+ [2025-09-12 05:36:31,796] - step:291/900 train_loss:0.4233 lr:0.0003000000 time/step:116.56s
321
+ [2025-09-12 05:38:28,659] - step:292/900 train_loss:0.4223 lr:0.0003000000 time/step:116.86s
322
+ [2025-09-12 05:40:25,842] - step:293/900 train_loss:0.4412 lr:0.0003000000 time/step:117.18s
323
+ [2025-09-12 05:42:22,767] - step:294/900 train_loss:0.3965 lr:0.0003000000 time/step:116.91s
324
+ [2025-09-12 05:44:20,588] - step:295/900 train_loss:0.4155 lr:0.0003000000 time/step:117.81s
325
+ [2025-09-12 05:46:17,250] - step:296/900 train_loss:0.4051 lr:0.0003000000 time/step:116.66s
326
+ [2025-09-12 05:48:13,495] - step:297/900 train_loss:0.4186 lr:0.0003000000 time/step:116.24s
327
+ [2025-09-12 05:50:10,418] - step:298/900 train_loss:0.4280 lr:0.0003000000 time/step:116.91s
328
+ [2025-09-12 05:52:07,903] - step:299/900 train_loss:0.4225 lr:0.0003000000 time/step:117.46s
329
+ [2025-09-12 05:54:04,575] - step:300/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@300.pt...
330
+ [2025-09-12 05:54:04,576] - step:300/900 train_loss:0.4086 lr:0.0003000000 time/step:116.07s
331
+ [2025-09-12 05:56:02,285] - step:301/900 train_loss:0.4136 lr:0.0003000000 time/step:117.71s
332
+ [2025-09-12 05:58:00,171] - step:302/900 train_loss:0.4114 lr:0.0003000000 time/step:117.88s
333
+ [2025-09-12 05:59:55,795] - step:303/900 train_loss:0.4200 lr:0.0003000000 time/step:115.62s
334
+ [2025-09-12 06:01:52,652] - step:304/900 train_loss:0.4085 lr:0.0003000000 time/step:116.84s
335
+ [2025-09-12 06:03:49,824] - step:305/900 train_loss:0.4311 lr:0.0003000000 time/step:117.16s
336
+ [2025-09-12 06:05:46,285] - step:306/900 train_loss:0.4367 lr:0.0003000000 time/step:116.45s
337
+ [2025-09-12 06:07:44,209] - step:307/900 train_loss:0.4345 lr:0.0003000000 time/step:117.92s
338
+ [2025-09-12 06:09:41,568] - step:308/900 train_loss:0.4016 lr:0.0003000000 time/step:117.35s
339
+ [2025-09-12 06:11:38,074] - step:309/900 train_loss:0.4102 lr:0.0003000000 time/step:116.49s
340
+ [2025-09-12 06:13:34,857] - step:310/900 train_loss:0.4332 lr:0.0003000000 time/step:116.77s
341
+ [2025-09-12 06:15:32,302] - step:311/900 train_loss:0.4186 lr:0.0003000000 time/step:117.43s
342
+ [2025-09-12 06:17:29,124] - step:312/900 train_loss:0.4371 lr:0.0003000000 time/step:116.82s
343
+ [2025-09-12 06:19:26,289] - step:313/900 train_loss:0.4130 lr:0.0003000000 time/step:117.16s
344
+ [2025-09-12 06:21:22,830] - step:314/900 train_loss:0.4031 lr:0.0003000000 time/step:116.53s
345
+ [2025-09-12 06:23:19,454] - step:315/900 train_loss:0.4286 lr:0.0003000000 time/step:116.62s
346
+ [2025-09-12 06:25:17,324] - step:316/900 train_loss:0.4007 lr:0.0003000000 time/step:117.86s
347
+ [2025-09-12 06:27:14,242] - step:317/900 train_loss:0.4114 lr:0.0003000000 time/step:116.91s
348
+ [2025-09-12 06:29:11,325] - step:318/900 train_loss:0.4251 lr:0.0003000000 time/step:117.08s
349
+ [2025-09-12 06:31:08,368] - step:319/900 train_loss:0.4448 lr:0.0003000000 time/step:117.03s
350
+ [2025-09-12 06:33:04,509] - step:320/900 train_loss:0.4103 lr:0.0003000000 time/step:116.14s
351
+ [2025-09-12 06:35:02,658] - step:321/900 train_loss:0.4142 lr:0.0003000000 time/step:118.14s
352
+ [2025-09-12 06:36:59,639] - step:322/900 train_loss:0.3985 lr:0.0003000000 time/step:116.97s
353
+ [2025-09-12 06:38:56,063] - step:323/900 train_loss:0.4057 lr:0.0003000000 time/step:116.42s
354
+ [2025-09-12 06:40:53,684] - step:324/900 train_loss:0.4223 lr:0.0003000000 time/step:117.62s
355
+ [2025-09-12 06:42:50,547] - step:325/900 train_loss:0.4205 lr:0.0003000000 time/step:116.85s
356
+ [2025-09-12 06:44:46,896] - step:326/900 train_loss:0.4172 lr:0.0003000000 time/step:116.34s
357
+ [2025-09-12 06:46:45,176] - step:327/900 train_loss:0.4186 lr:0.0003000000 time/step:118.27s
358
+ [2025-09-12 06:48:42,119] - step:328/900 train_loss:0.4294 lr:0.0003000000 time/step:116.93s
359
+ [2025-09-12 06:50:38,781] - step:329/900 train_loss:0.4072 lr:0.0003000000 time/step:116.66s
360
+ [2025-09-12 06:52:36,425] - step:330/900 train_loss:0.4248 lr:0.0003000000 time/step:117.63s
361
+ [2025-09-12 06:54:33,431] - step:331/900 train_loss:0.4141 lr:0.0003000000 time/step:117.00s
362
+ [2025-09-12 06:56:30,074] - step:332/900 train_loss:0.4124 lr:0.0003000000 time/step:116.64s
363
+ [2025-09-12 06:58:26,556] - step:333/900 train_loss:0.4281 lr:0.0003000000 time/step:116.47s
364
+ [2025-09-12 07:00:23,620] - step:334/900 train_loss:0.4141 lr:0.0003000000 time/step:117.06s
365
+ [2025-09-12 07:02:21,404] - step:335/900 train_loss:0.4197 lr:0.0003000000 time/step:117.77s
366
+ [2025-09-12 07:04:17,967] - step:336/900 train_loss:0.4356 lr:0.0003000000 time/step:116.56s
367
+ [2025-09-12 07:06:16,192] - step:337/900 train_loss:0.3934 lr:0.0003000000 time/step:118.22s
368
+ [2025-09-12 07:08:12,291] - step:338/900 train_loss:0.3917 lr:0.0003000000 time/step:116.09s
369
+ [2025-09-12 07:10:08,910] - step:339/900 train_loss:0.4353 lr:0.0003000000 time/step:116.61s
370
+ [2025-09-12 07:12:06,665] - step:340/900 train_loss:0.4537 lr:0.0003000000 time/step:117.74s
371
+ [2025-09-12 07:14:03,621] - step:341/900 train_loss:0.4146 lr:0.0003000000 time/step:116.95s
372
+ [2025-09-12 07:16:00,835] - step:342/900 train_loss:0.4194 lr:0.0003000000 time/step:117.20s
373
+ [2025-09-12 07:17:57,387] - step:343/900 train_loss:0.4117 lr:0.0003000000 time/step:116.54s
374
+ [2025-09-12 07:19:53,951] - step:344/900 train_loss:0.3925 lr:0.0003000000 time/step:116.56s
375
+ [2025-09-12 07:21:50,959] - step:345/900 train_loss:0.4268 lr:0.0003000000 time/step:117.00s
376
+ [2025-09-12 07:23:49,546] - step:346/900 train_loss:0.4113 lr:0.0003000000 time/step:118.58s
377
+ [2025-09-12 07:25:46,639] - step:347/900 train_loss:0.4211 lr:0.0003000000 time/step:117.08s
378
+ [2025-09-12 07:27:43,350] - step:348/900 train_loss:0.4183 lr:0.0003000000 time/step:116.70s
379
+ [2025-09-12 07:29:39,127] - step:349/900 train_loss:0.4313 lr:0.0003000000 time/step:115.77s
380
+ [2025-09-12 07:31:35,852] - step:350/900 train_loss:0.3881 lr:0.0003000000 time/step:116.71s
381
+ [2025-09-12 07:33:34,104] - step:351/900 train_loss:0.4243 lr:0.0003000000 time/step:118.24s
382
+ [2025-09-12 07:35:31,118] - step:352/900 train_loss:0.4273 lr:0.0003000000 time/step:117.00s
383
+ [2025-09-12 07:37:28,208] - step:353/900 train_loss:0.3925 lr:0.0003000000 time/step:117.06s
384
+ [2025-09-12 07:39:25,351] - step:354/900 train_loss:0.4223 lr:0.0003000000 time/step:117.14s
385
+ [2025-09-12 07:41:21,430] - step:355/900 train_loss:0.3996 lr:0.0003000000 time/step:116.07s
386
+ [2025-09-12 07:43:18,880] - step:356/900 train_loss:0.4095 lr:0.0003000000 time/step:117.45s
387
+ [2025-09-12 07:45:16,716] - step:357/900 train_loss:0.4204 lr:0.0003000000 time/step:117.83s
388
+ [2025-09-12 07:47:14,287] - step:358/900 train_loss:0.4157 lr:0.0003000000 time/step:117.56s
389
+ [2025-09-12 07:49:11,022] - step:359/900 train_loss:0.4179 lr:0.0003000000 time/step:116.72s
390
+ [2025-09-12 07:51:08,126] - step:360/900 train_loss:0.4490 lr:0.0003000000 time/step:117.10s
391
+ [2025-09-12 07:53:04,336] - step:361/900 train_loss:0.4100 lr:0.0003000000 time/step:116.20s
392
+ [2025-09-12 07:55:00,814] - step:362/900 train_loss:0.4050 lr:0.0003000000 time/step:116.47s
393
+ [2025-09-12 07:56:58,814] - step:363/900 train_loss:0.4299 lr:0.0003000000 time/step:117.99s
394
+ [2025-09-12 07:58:55,677] - step:364/900 train_loss:0.3970 lr:0.0003000000 time/step:116.85s
395
+ [2025-09-12 08:00:53,062] - step:365/900 train_loss:0.4180 lr:0.0003000000 time/step:117.38s
396
+ [2025-09-12 08:02:49,522] - step:366/900 train_loss:0.4307 lr:0.0003000000 time/step:116.45s
397
+ [2025-09-12 08:04:45,597] - step:367/900 train_loss:0.4335 lr:0.0003000000 time/step:116.07s
398
+ [2025-09-12 08:06:43,333] - step:368/900 train_loss:0.3967 lr:0.0003000000 time/step:117.73s
399
+ [2025-09-12 08:08:40,432] - step:369/900 train_loss:0.4226 lr:0.0003000000 time/step:117.09s
400
+ [2025-09-12 08:10:38,337] - step:370/900 train_loss:0.4086 lr:0.0003000000 time/step:117.90s
401
+ [2025-09-12 08:12:35,283] - step:371/900 train_loss:0.3949 lr:0.0003000000 time/step:116.93s
402
+ [2025-09-12 08:14:31,782] - step:372/900 train_loss:0.4219 lr:0.0003000000 time/step:116.49s
403
+ [2025-09-12 08:16:29,230] - step:373/900 train_loss:0.4088 lr:0.0003000000 time/step:117.44s
404
+ [2025-09-12 08:18:26,952] - step:374/900 train_loss:0.4184 lr:0.0003000000 time/step:117.71s
405
+ [2025-09-12 08:20:23,596] - step:375/900 train_loss:0.4110 lr:0.0003000000 time/step:116.64s
406
+ [2025-09-12 08:22:20,047] - step:376/900 train_loss:0.4305 lr:0.0003000000 time/step:116.44s
407
+ [2025-09-12 08:24:16,398] - step:377/900 train_loss:0.4143 lr:0.0003000000 time/step:116.35s
408
+ [2025-09-12 08:26:13,665] - step:378/900 train_loss:0.4139 lr:0.0003000000 time/step:117.26s
409
+ [2025-09-12 08:28:09,796] - step:379/900 train_loss:0.4060 lr:0.0003000000 time/step:116.13s
410
+ [2025-09-12 08:30:07,613] - step:380/900 train_loss:0.3921 lr:0.0003000000 time/step:117.81s
411
+ [2025-09-12 08:32:04,597] - step:381/900 train_loss:0.4239 lr:0.0003000000 time/step:116.97s
412
+ [2025-09-12 08:34:01,394] - step:382/900 train_loss:0.4041 lr:0.0003000000 time/step:116.79s
413
+ [2025-09-12 08:35:58,263] - step:383/900 train_loss:0.4115 lr:0.0003000000 time/step:116.86s
414
+ [2025-09-12 08:37:54,649] - step:384/900 train_loss:0.4216 lr:0.0003000000 time/step:116.38s
415
+ [2025-09-12 08:39:51,866] - step:385/900 train_loss:0.4057 lr:0.0003000000 time/step:117.21s
416
+ [2025-09-12 08:41:49,473] - step:386/900 train_loss:0.4021 lr:0.0003000000 time/step:117.60s
417
+ [2025-09-12 08:43:46,456] - step:387/900 train_loss:0.4235 lr:0.0003000000 time/step:116.98s
418
+ [2025-09-12 08:45:42,939] - step:388/900 train_loss:0.4309 lr:0.0003000000 time/step:116.48s
419
+ [2025-09-12 08:47:40,164] - step:389/900 train_loss:0.3930 lr:0.0003000000 time/step:117.22s
420
+ [2025-09-12 08:49:36,386] - step:390/900 train_loss:0.4063 lr:0.0003000000 time/step:116.22s
421
+ [2025-09-12 08:51:33,830] - step:391/900 train_loss:0.4034 lr:0.0003000000 time/step:117.43s
422
+ [2025-09-12 08:53:30,812] - step:392/900 train_loss:0.4071 lr:0.0003000000 time/step:116.97s
423
+ [2025-09-12 08:55:28,574] - step:393/900 train_loss:0.4296 lr:0.0003000000 time/step:117.75s
424
+ [2025-09-12 08:57:25,899] - step:394/900 train_loss:0.4171 lr:0.0003000000 time/step:117.31s
425
+ [2025-09-12 08:59:22,463] - step:395/900 train_loss:0.4167 lr:0.0003000000 time/step:116.56s
426
+ [2025-09-12 09:01:19,086] - step:396/900 train_loss:0.4119 lr:0.0003000000 time/step:116.62s
427
+ [2025-09-12 09:03:16,267] - step:397/900 train_loss:0.4057 lr:0.0003000000 time/step:117.17s
428
+ [2025-09-12 09:05:13,175] - step:398/900 train_loss:0.4064 lr:0.0003000000 time/step:116.90s
429
+ [2025-09-12 09:07:10,958] - step:399/900 train_loss:0.3913 lr:0.0003000000 time/step:117.77s
430
+ [2025-09-12 09:09:08,523] - step:400/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@400.pt...
431
+ [2025-09-12 09:09:08,524] - step:400/900 train_loss:0.4028 lr:0.0003000000 time/step:116.93s
432
+ [2025-09-12 09:11:04,902] - step:401/900 train_loss:0.3889 lr:0.0003000000 time/step:116.38s
433
+ [2025-09-12 09:13:01,467] - step:402/900 train_loss:0.4192 lr:0.0003000000 time/step:116.55s
434
+ [2025-09-12 09:14:58,472] - step:403/900 train_loss:0.4211 lr:0.0003000000 time/step:117.00s
435
+ [2025-09-12 09:16:55,036] - step:404/900 train_loss:0.4354 lr:0.0003000000 time/step:116.56s
436
+ [2025-09-12 09:18:52,741] - step:405/900 train_loss:0.4290 lr:0.0003000000 time/step:117.69s
437
+ [2025-09-12 09:20:49,701] - step:406/900 train_loss:0.4290 lr:0.0003000000 time/step:116.95s
438
+ [2025-09-12 09:22:46,403] - step:407/900 train_loss:0.4257 lr:0.0003000000 time/step:116.69s
439
+ [2025-09-12 09:24:43,048] - step:408/900 train_loss:0.4252 lr:0.0003000000 time/step:116.64s
440
+ [2025-09-12 09:26:39,532] - step:409/900 train_loss:0.3992 lr:0.0003000000 time/step:116.48s
441
+ [2025-09-12 09:28:37,397] - step:410/900 train_loss:0.4191 lr:0.0003000000 time/step:117.86s
442
+ [2025-09-12 09:30:33,697] - step:411/900 train_loss:0.3892 lr:0.0003000000 time/step:116.29s
443
+ [2025-09-12 09:32:30,021] - step:412/900 train_loss:0.3843 lr:0.0003000000 time/step:116.32s
444
+ [2025-09-12 09:34:27,365] - step:413/900 train_loss:0.4010 lr:0.0003000000 time/step:117.34s
445
+ [2025-09-12 09:36:24,146] - step:414/900 train_loss:0.4190 lr:0.0003000000 time/step:116.77s
446
+ [2025-09-12 09:38:20,888] - step:415/900 train_loss:0.4182 lr:0.0003000000 time/step:116.73s
447
+ [2025-09-12 09:40:17,896] - step:416/900 train_loss:0.4236 lr:0.0003000000 time/step:117.00s
448
+ [2025-09-12 09:42:14,418] - step:417/900 train_loss:0.4016 lr:0.0003000000 time/step:116.51s
449
+ [2025-09-12 09:44:11,142] - step:418/900 train_loss:0.4054 lr:0.0003000000 time/step:116.72s
450
+ [2025-09-12 09:46:07,906] - step:419/900 train_loss:0.4162 lr:0.0003000000 time/step:116.75s
451
+ [2025-09-12 09:48:05,609] - step:420/900 train_loss:0.3856 lr:0.0003000000 time/step:117.70s
452
+ [2025-09-12 09:50:02,634] - step:421/900 train_loss:0.3832 lr:0.0003000000 time/step:117.02s
453
+ [2025-09-12 09:51:59,099] - step:422/900 train_loss:0.4000 lr:0.0003000000 time/step:116.45s
454
+ [2025-09-12 09:53:56,083] - step:423/900 train_loss:0.4182 lr:0.0003000000 time/step:116.98s
455
+ [2025-09-12 09:55:53,683] - step:424/900 train_loss:0.4064 lr:0.0003000000 time/step:117.60s
+ [2025-09-12 09:57:49,838] - step:425/900 train_loss:0.4186 lr:0.0003000000 time/step:116.14s
+ [2025-09-12 09:59:47,210] - step:426/900 train_loss:0.4251 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 10:01:43,887] - step:427/900 train_loss:0.3975 lr:0.0003000000 time/step:116.67s
+ [2025-09-12 10:03:40,560] - step:428/900 train_loss:0.4212 lr:0.0003000000 time/step:116.66s
+ [2025-09-12 10:05:37,859] - step:429/900 train_loss:0.4118 lr:0.0003000000 time/step:117.29s
+ [2025-09-12 10:07:35,749] - step:430/900 train_loss:0.3981 lr:0.0003000000 time/step:117.88s
+ [2025-09-12 10:09:32,291] - step:431/900 train_loss:0.4237 lr:0.0003000000 time/step:116.53s
+ [2025-09-12 10:11:29,229] - step:432/900 train_loss:0.3926 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 10:13:26,136] - step:433/900 train_loss:0.4208 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 10:15:22,577] - step:434/900 train_loss:0.4102 lr:0.0003000000 time/step:116.44s
+ [2025-09-12 10:17:19,961] - step:435/900 train_loss:0.4373 lr:0.0003000000 time/step:117.38s
+ [2025-09-12 10:19:18,170] - step:436/900 train_loss:0.4159 lr:0.0003000000 time/step:118.20s
+ [2025-09-12 10:21:13,810] - step:437/900 train_loss:0.4083 lr:0.0003000000 time/step:115.63s
+ [2025-09-12 10:23:10,450] - step:438/900 train_loss:0.4361 lr:0.0003000000 time/step:116.63s
+ [2025-09-12 10:25:07,257] - step:439/900 train_loss:0.4152 lr:0.0003000000 time/step:116.80s
+ [2025-09-12 10:27:04,621] - step:440/900 train_loss:0.4100 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 10:29:01,561] - step:441/900 train_loss:0.4003 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 10:30:58,928] - step:442/900 train_loss:0.4296 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 10:32:54,885] - step:443/900 train_loss:0.4175 lr:0.0003000000 time/step:115.95s
+ [2025-09-12 10:34:51,250] - step:444/900 train_loss:0.4220 lr:0.0003000000 time/step:116.36s
+ [2025-09-12 10:36:48,671] - step:445/900 train_loss:0.4361 lr:0.0003000000 time/step:117.42s
+ [2025-09-12 10:38:46,902] - step:446/900 train_loss:0.4034 lr:0.0003000000 time/step:118.22s
+ [2025-09-12 10:40:44,143] - step:447/900 train_loss:0.4121 lr:0.0003000000 time/step:117.22s
+ [2025-09-12 10:42:40,558] - step:448/900 train_loss:0.4247 lr:0.0003000000 time/step:116.40s
+ [2025-09-12 10:44:37,203] - step:449/900 train_loss:0.4502 lr:0.0003000000 time/step:116.64s
+ [2025-09-12 10:46:34,074] - step:450/900 train_loss:0.4202 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 10:48:32,574] - step:451/900 train_loss:0.4115 lr:0.0003000000 time/step:118.50s
+ [2025-09-12 10:50:30,519] - step:452/900 train_loss:0.4416 lr:0.0003000000 time/step:117.93s
+ [2025-09-12 10:52:27,400] - step:453/900 train_loss:0.4589 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 10:54:23,502] - step:454/900 train_loss:0.4104 lr:0.0003000000 time/step:116.09s
+ [2025-09-12 10:56:20,043] - step:455/900 train_loss:0.4428 lr:0.0003000000 time/step:116.54s
+ [2025-09-12 10:58:18,649] - step:456/900 train_loss:0.3869 lr:0.0003000000 time/step:118.60s
+ [2025-09-12 11:00:16,434] - step:457/900 train_loss:0.3896 lr:0.0003000000 time/step:117.77s
+ [2025-09-12 11:02:12,853] - step:458/900 train_loss:0.4199 lr:0.0003000000 time/step:116.41s
+ [2025-09-12 11:04:09,871] - step:459/900 train_loss:0.4109 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 11:06:05,943] - step:460/900 train_loss:0.4113 lr:0.0003000000 time/step:116.07s
+ [2025-09-12 11:08:02,527] - step:461/900 train_loss:0.3895 lr:0.0003000000 time/step:116.58s
+ [2025-09-12 11:10:00,790] - step:462/900 train_loss:0.4033 lr:0.0003000000 time/step:118.26s
+ [2025-09-12 11:11:58,115] - step:463/900 train_loss:0.4269 lr:0.0003000000 time/step:117.32s
+ [2025-09-12 11:13:54,593] - step:464/900 train_loss:0.4080 lr:0.0003000000 time/step:116.46s
+ [2025-09-12 11:15:51,480] - step:465/900 train_loss:0.4208 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 11:17:48,283] - step:466/900 train_loss:0.4146 lr:0.0003000000 time/step:116.80s
+ [2025-09-12 11:19:44,666] - step:467/900 train_loss:0.4178 lr:0.0003000000 time/step:116.38s
+ [2025-09-12 11:21:43,091] - step:468/900 train_loss:0.4065 lr:0.0003000000 time/step:118.42s
+ [2025-09-12 11:23:40,099] - step:469/900 train_loss:0.4158 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 11:25:36,537] - step:470/900 train_loss:0.3969 lr:0.0003000000 time/step:116.43s
+ [2025-09-12 11:27:34,080] - step:471/900 train_loss:0.4355 lr:0.0003000000 time/step:117.54s
+ [2025-09-12 11:29:30,162] - step:472/900 train_loss:0.3901 lr:0.0003000000 time/step:116.08s
+ [2025-09-12 11:31:28,047] - step:473/900 train_loss:0.4142 lr:0.0003000000 time/step:117.88s
+ [2025-09-12 11:33:24,570] - step:474/900 train_loss:0.4396 lr:0.0003000000 time/step:116.51s
+ [2025-09-12 11:35:21,454] - step:475/900 train_loss:0.3944 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 11:37:18,778] - step:476/900 train_loss:0.4112 lr:0.0003000000 time/step:117.32s
+ [2025-09-12 11:39:15,275] - step:477/900 train_loss:0.4239 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 11:41:11,285] - step:478/900 train_loss:0.4200 lr:0.0003000000 time/step:116.01s
+ [2025-09-12 11:43:08,711] - step:479/900 train_loss:0.4177 lr:0.0003000000 time/step:117.41s
+ [2025-09-12 11:45:05,127] - step:480/900 train_loss:0.3939 lr:0.0003000000 time/step:116.41s
+ [2025-09-12 11:47:02,193] - step:481/900 train_loss:0.4138 lr:0.0003000000 time/step:117.06s
+ [2025-09-12 11:48:59,561] - step:482/900 train_loss:0.4252 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 11:50:55,554] - step:483/900 train_loss:0.4048 lr:0.0003000000 time/step:115.99s
+ [2025-09-12 11:52:52,805] - step:484/900 train_loss:0.4000 lr:0.0003000000 time/step:117.24s
+ [2025-09-12 11:54:49,667] - step:485/900 train_loss:0.4216 lr:0.0003000000 time/step:116.85s
+ [2025-09-12 11:56:46,072] - step:486/900 train_loss:0.4095 lr:0.0003000000 time/step:116.40s
+ [2025-09-12 11:58:43,074] - step:487/900 train_loss:0.4027 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 12:00:40,979] - step:488/900 train_loss:0.4245 lr:0.0003000000 time/step:117.90s
+ [2025-09-12 12:02:38,064] - step:489/900 train_loss:0.3942 lr:0.0003000000 time/step:117.08s
+ [2025-09-12 12:04:34,804] - step:490/900 train_loss:0.4239 lr:0.0003000000 time/step:116.72s
+ [2025-09-12 12:06:31,269] - step:491/900 train_loss:0.3853 lr:0.0003000000 time/step:116.46s
+ [2025-09-12 12:08:28,111] - step:492/900 train_loss:0.4141 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 12:10:24,954] - step:493/900 train_loss:0.4139 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 12:12:22,937] - step:494/900 train_loss:0.4166 lr:0.0003000000 time/step:117.98s
+ [2025-09-12 12:14:20,061] - step:495/900 train_loss:0.3974 lr:0.0003000000 time/step:117.11s
+ [2025-09-12 12:16:16,526] - step:496/900 train_loss:0.4149 lr:0.0003000000 time/step:116.46s
+ [2025-09-12 12:18:13,009] - step:497/900 train_loss:0.4181 lr:0.0003000000 time/step:116.48s
+ [2025-09-12 12:20:09,790] - step:498/900 train_loss:0.4166 lr:0.0003000000 time/step:116.78s
+ [2025-09-12 12:22:06,615] - step:499/900 train_loss:0.4216 lr:0.0003000000 time/step:116.82s
+ [2025-09-12 12:24:05,337] - step:500/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@500.pt...
+ [2025-09-12 12:24:05,343] - step:500/900 train_loss:0.4161 lr:0.0003000000 time/step:118.13s
+ [2025-09-12 12:26:01,342] - step:501/900 train_loss:0.4010 lr:0.0003000000 time/step:116.00s
+ [2025-09-12 12:27:58,345] - step:502/900 train_loss:0.4042 lr:0.0003000000 time/step:116.99s
+ [2025-09-12 12:29:54,389] - step:503/900 train_loss:0.4216 lr:0.0003000000 time/step:116.04s
+ [2025-09-12 12:31:51,252] - step:504/900 train_loss:0.4127 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 12:33:49,558] - step:505/900 train_loss:0.4019 lr:0.0003000000 time/step:118.29s
+ [2025-09-12 12:35:46,199] - step:506/900 train_loss:0.4076 lr:0.0003000000 time/step:116.64s
+ [2025-09-12 12:37:42,246] - step:507/900 train_loss:0.4207 lr:0.0003000000 time/step:116.04s
+ [2025-09-12 12:39:39,229] - step:508/900 train_loss:0.4258 lr:0.0003000000 time/step:116.98s
+ [2025-09-12 12:41:35,709] - step:509/900 train_loss:0.3826 lr:0.0003000000 time/step:116.48s
+ [2025-09-12 12:43:32,441] - step:510/900 train_loss:0.4092 lr:0.0003000000 time/step:116.72s
+ [2025-09-12 12:45:30,539] - step:511/900 train_loss:0.3954 lr:0.0003000000 time/step:118.09s
+ [2025-09-12 12:47:27,041] - step:512/900 train_loss:0.4335 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 12:49:23,522] - step:513/900 train_loss:0.4216 lr:0.0003000000 time/step:116.47s
+ [2025-09-12 12:51:20,467] - step:514/900 train_loss:0.3952 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 12:53:17,452] - step:515/900 train_loss:0.4052 lr:0.0003000000 time/step:116.98s
+ [2025-09-12 12:55:14,098] - step:516/900 train_loss:0.4145 lr:0.0003000000 time/step:116.64s
+ [2025-09-12 12:57:11,620] - step:517/900 train_loss:0.4292 lr:0.0003000000 time/step:117.51s
+ [2025-09-12 12:59:09,139] - step:518/900 train_loss:0.4204 lr:0.0003000000 time/step:117.51s
+ [2025-09-12 13:01:05,186] - step:519/900 train_loss:0.3932 lr:0.0003000000 time/step:116.04s
+ [2025-09-12 13:03:01,731] - step:520/900 train_loss:0.4226 lr:0.0003000000 time/step:116.54s
+ [2025-09-12 13:04:59,398] - step:521/900 train_loss:0.4080 lr:0.0003000000 time/step:117.65s
+ [2025-09-12 13:06:56,876] - step:522/900 train_loss:0.4079 lr:0.0003000000 time/step:117.47s
+ [2025-09-12 13:08:53,784] - step:523/900 train_loss:0.4375 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 13:11:18,031] - step:524/900 train_loss:0.3876 lr:0.0003000000 time/step:144.24s
+ [2025-09-12 13:13:14,894] - step:525/900 train_loss:0.4133 lr:0.0003000000 time/step:116.82s
+ [2025-09-12 13:15:16,203] - step:526/900 train_loss:0.3961 lr:0.0003000000 time/step:118.95s
+ [2025-09-12 13:17:12,922] - step:527/900 train_loss:0.3895 lr:0.0003000000 time/step:116.71s
+ [2025-09-12 13:19:09,906] - step:528/900 train_loss:0.4204 lr:0.0003000000 time/step:116.98s
+ [2025-09-12 13:21:08,032] - step:529/900 train_loss:0.4078 lr:0.0003000000 time/step:118.12s
+ [2025-09-12 13:23:04,450] - step:530/900 train_loss:0.3973 lr:0.0003000000 time/step:116.41s
+ [2025-09-12 13:25:02,156] - step:531/900 train_loss:0.3875 lr:0.0003000000 time/step:117.69s
+ [2025-09-12 13:26:58,851] - step:532/900 train_loss:0.3979 lr:0.0003000000 time/step:116.69s
+ [2025-09-12 13:28:55,552] - step:533/900 train_loss:0.4210 lr:0.0003000000 time/step:116.69s
+ [2025-09-12 13:30:52,352] - step:534/900 train_loss:0.4016 lr:0.0003000000 time/step:116.80s
+ [2025-09-12 13:32:50,584] - step:535/900 train_loss:0.3971 lr:0.0003000000 time/step:118.23s
+ [2025-09-12 13:34:47,330] - step:536/900 train_loss:0.4167 lr:0.0003000000 time/step:116.73s
+ [2025-09-12 13:36:44,747] - step:537/900 train_loss:0.4366 lr:0.0003000000 time/step:117.39s
+ [2025-09-12 13:38:42,456] - step:538/900 train_loss:0.4267 lr:0.0003000000 time/step:117.71s
+ [2025-09-12 13:40:38,661] - step:539/900 train_loss:0.4092 lr:0.0003000000 time/step:116.20s
+ [2025-09-12 13:42:38,305] - step:540/900 train_loss:0.4273 lr:0.0003000000 time/step:119.62s
+ [2025-09-12 13:44:37,524] - step:541/900 train_loss:0.4157 lr:0.0003000000 time/step:119.17s
+ [2025-09-12 13:46:33,425] - step:542/900 train_loss:0.4237 lr:0.0003000000 time/step:115.89s
+ [2025-09-12 13:48:30,101] - step:543/900 train_loss:0.4052 lr:0.0003000000 time/step:116.67s
+ [2025-09-12 13:50:27,196] - step:544/900 train_loss:0.4260 lr:0.0003000000 time/step:117.09s
+ [2025-09-12 13:52:24,079] - step:545/900 train_loss:0.4021 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 13:54:21,661] - step:546/900 train_loss:0.3897 lr:0.0003000000 time/step:117.57s
+ [2025-09-12 13:56:19,479] - step:547/900 train_loss:0.4029 lr:0.0003000000 time/step:117.81s
+ [2025-09-12 13:58:15,488] - step:548/900 train_loss:0.4107 lr:0.0003000000 time/step:116.00s
+ [2025-09-12 14:00:11,893] - step:549/900 train_loss:0.4159 lr:0.0003000000 time/step:116.40s
+ [2025-09-12 14:02:08,916] - step:550/900 train_loss:0.4075 lr:0.0003000000 time/step:117.01s
+ [2025-09-12 14:04:06,359] - step:551/900 train_loss:0.3932 lr:0.0003000000 time/step:117.43s
+ [2025-09-12 14:06:02,862] - step:552/900 train_loss:0.4110 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 14:08:00,226] - step:553/900 train_loss:0.4250 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 14:09:56,780] - step:554/900 train_loss:0.3990 lr:0.0003000000 time/step:116.54s
+ [2025-09-12 14:11:53,353] - step:555/900 train_loss:0.4041 lr:0.0003000000 time/step:116.56s
+ [2025-09-12 14:13:50,235] - step:556/900 train_loss:0.4062 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 14:15:47,160] - step:557/900 train_loss:0.4144 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 14:17:44,967] - step:558/900 train_loss:0.4032 lr:0.0003000000 time/step:117.80s
+ [2025-09-12 14:19:40,685] - step:559/900 train_loss:0.4082 lr:0.0003000000 time/step:115.71s
+ [2025-09-12 14:21:37,889] - step:560/900 train_loss:0.4140 lr:0.0003000000 time/step:117.20s
+ [2025-09-12 14:23:34,834] - step:561/900 train_loss:0.4284 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 14:25:31,517] - step:562/900 train_loss:0.4096 lr:0.0003000000 time/step:116.67s
+ [2025-09-12 14:27:29,793] - step:563/900 train_loss:0.4017 lr:0.0003000000 time/step:118.26s
+ [2025-09-12 14:29:26,683] - step:564/900 train_loss:0.4014 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 14:31:22,468] - step:565/900 train_loss:0.4061 lr:0.0003000000 time/step:115.78s
+ [2025-09-12 14:33:19,190] - step:566/900 train_loss:0.4188 lr:0.0003000000 time/step:116.72s
+ [2025-09-12 14:35:16,130] - step:567/900 train_loss:0.4305 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 14:37:13,373] - step:568/900 train_loss:0.3922 lr:0.0003000000 time/step:117.24s
+ [2025-09-12 14:39:10,305] - step:569/900 train_loss:0.4190 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 14:41:07,121] - step:570/900 train_loss:0.4047 lr:0.0003000000 time/step:116.81s
+ [2025-09-12 14:43:03,948] - step:571/900 train_loss:0.4152 lr:0.0003000000 time/step:116.82s
+ [2025-09-12 14:45:00,151] - step:572/900 train_loss:0.3946 lr:0.0003000000 time/step:116.19s
+ [2025-09-12 14:46:57,634] - step:573/900 train_loss:0.4138 lr:0.0003000000 time/step:117.48s
+ [2025-09-12 14:48:55,022] - step:574/900 train_loss:0.4231 lr:0.0003000000 time/step:117.37s
+ [2025-09-12 14:50:50,877] - step:575/900 train_loss:0.3978 lr:0.0003000000 time/step:115.85s
+ [2025-09-12 14:52:49,128] - step:576/900 train_loss:0.4169 lr:0.0003000000 time/step:118.25s
+ [2025-09-12 14:54:45,289] - step:577/900 train_loss:0.3971 lr:0.0003000000 time/step:116.15s
+ [2025-09-12 14:56:41,851] - step:578/900 train_loss:0.4058 lr:0.0003000000 time/step:116.56s
+ [2025-09-12 14:58:38,779] - step:579/900 train_loss:0.4105 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 15:00:35,657] - step:580/900 train_loss:0.4145 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 15:02:33,021] - step:581/900 train_loss:0.4067 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 15:04:29,564] - step:582/900 train_loss:0.4209 lr:0.0003000000 time/step:116.53s
+ [2025-09-12 15:06:26,089] - step:583/900 train_loss:0.4106 lr:0.0003000000 time/step:116.52s
+ [2025-09-12 15:08:22,953] - step:584/900 train_loss:0.4220 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 15:10:19,376] - step:585/900 train_loss:0.4001 lr:0.0003000000 time/step:116.41s
+ [2025-09-12 15:12:16,440] - step:586/900 train_loss:0.3963 lr:0.0003000000 time/step:117.06s
+ [2025-09-12 15:14:14,343] - step:587/900 train_loss:0.4118 lr:0.0003000000 time/step:117.89s
+ [2025-09-12 15:16:10,568] - step:588/900 train_loss:0.4285 lr:0.0003000000 time/step:116.22s
+ [2025-09-12 15:18:06,609] - step:589/900 train_loss:0.4177 lr:0.0003000000 time/step:116.04s
+ [2025-09-12 15:20:03,934] - step:590/900 train_loss:0.4256 lr:0.0003000000 time/step:117.32s
+ [2025-09-12 15:22:00,505] - step:591/900 train_loss:0.4258 lr:0.0003000000 time/step:116.57s
+ [2025-09-12 15:23:57,739] - step:592/900 train_loss:0.4031 lr:0.0003000000 time/step:117.19s
+ [2025-09-12 15:25:55,502] - step:593/900 train_loss:0.3975 lr:0.0003000000 time/step:117.76s
+ [2025-09-12 15:27:51,604] - step:594/900 train_loss:0.4098 lr:0.0003000000 time/step:116.10s
+ [2025-09-12 15:29:48,152] - step:595/900 train_loss:0.4044 lr:0.0003000000 time/step:116.54s
+ [2025-09-12 15:31:45,056] - step:596/900 train_loss:0.4394 lr:0.0003000000 time/step:116.89s
+ [2025-09-12 15:33:42,598] - step:597/900 train_loss:0.4166 lr:0.0003000000 time/step:117.54s
+ [2025-09-12 15:35:38,903] - step:598/900 train_loss:0.3857 lr:0.0003000000 time/step:116.29s
+ [2025-09-12 15:37:35,947] - step:599/900 train_loss:0.3944 lr:0.0003000000 time/step:117.04s
+ [2025-09-12 15:39:32,999] - step:600/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@600.pt...
+ [2025-09-12 15:39:33,008] - step:600/900 train_loss:0.4121 lr:0.0003000000 time/step:116.45s
+ [2025-09-12 15:41:29,871] - step:601/900 train_loss:0.4055 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 15:43:27,496] - step:602/900 train_loss:0.4131 lr:0.0003000000 time/step:117.62s
+ [2025-09-12 15:45:24,775] - step:603/900 train_loss:0.4117 lr:0.0003000000 time/step:117.27s
+ [2025-09-12 15:47:21,843] - step:604/900 train_loss:0.4073 lr:0.0003000000 time/step:117.06s
+ [2025-09-12 15:49:19,207] - step:605/900 train_loss:0.3994 lr:0.0003000000 time/step:117.35s
+ [2025-09-12 15:51:15,705] - step:606/900 train_loss:0.4006 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 15:53:12,651] - step:607/900 train_loss:0.4087 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 15:55:10,370] - step:608/900 train_loss:0.4194 lr:0.0003000000 time/step:117.71s
+ [2025-09-12 15:57:07,183] - step:609/900 train_loss:0.4059 lr:0.0003000000 time/step:116.80s
+ [2025-09-12 15:59:03,945] - step:610/900 train_loss:0.3960 lr:0.0003000000 time/step:116.75s
+ [2025-09-12 16:01:01,845] - step:611/900 train_loss:0.4203 lr:0.0003000000 time/step:117.89s
+ [2025-09-12 16:02:57,870] - step:612/900 train_loss:0.4208 lr:0.0003000000 time/step:116.02s
+ [2025-09-12 16:04:55,209] - step:613/900 train_loss:0.4205 lr:0.0003000000 time/step:117.33s
+ [2025-09-12 16:06:52,213] - step:614/900 train_loss:0.4023 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 16:08:49,316] - step:615/900 train_loss:0.4011 lr:0.0003000000 time/step:117.09s
+ [2025-09-12 16:10:47,341] - step:616/900 train_loss:0.3898 lr:0.0003000000 time/step:118.02s
+ [2025-09-12 16:12:44,306] - step:617/900 train_loss:0.4223 lr:0.0003000000 time/step:116.96s
+ [2025-09-12 16:14:41,166] - step:618/900 train_loss:0.4022 lr:0.0003000000 time/step:116.85s
+ [2025-09-12 16:16:38,068] - step:619/900 train_loss:0.4259 lr:0.0003000000 time/step:116.89s
+ [2025-09-12 16:18:35,272] - step:620/900 train_loss:0.4129 lr:0.0003000000 time/step:117.20s
+ [2025-09-12 16:20:32,436] - step:621/900 train_loss:0.4122 lr:0.0003000000 time/step:117.13s
+ [2025-09-12 16:22:30,553] - step:622/900 train_loss:0.4185 lr:0.0003000000 time/step:118.10s
+ [2025-09-12 16:24:27,881] - step:623/900 train_loss:0.3991 lr:0.0003000000 time/step:117.28s
+ [2025-09-12 16:26:24,425] - step:624/900 train_loss:0.4208 lr:0.0003000000 time/step:116.53s
+ [2025-09-12 16:28:21,471] - step:625/900 train_loss:0.4276 lr:0.0003000000 time/step:117.04s
+ [2025-09-12 16:30:19,129] - step:626/900 train_loss:0.4259 lr:0.0003000000 time/step:117.64s
+ [2025-09-12 16:32:19,616] - step:627/900 train_loss:0.3848 lr:0.0003000000 time/step:120.47s
+ [2025-09-12 16:34:17,638] - step:628/900 train_loss:0.4005 lr:0.0003000000 time/step:118.02s
+ [2025-09-12 16:36:14,359] - step:629/900 train_loss:0.3988 lr:0.0003000000 time/step:116.71s
+ [2025-09-12 16:38:11,222] - step:630/900 train_loss:0.4181 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 16:40:08,509] - step:631/900 train_loss:0.4042 lr:0.0003000000 time/step:117.28s
+ [2025-09-12 16:42:06,712] - step:632/900 train_loss:0.4010 lr:0.0003000000 time/step:118.19s
+ [2025-09-12 16:44:03,814] - step:633/900 train_loss:0.4108 lr:0.0003000000 time/step:117.10s
+ [2025-09-12 16:46:01,576] - step:634/900 train_loss:0.4218 lr:0.0003000000 time/step:117.65s
+ [2025-09-12 16:47:57,601] - step:635/900 train_loss:0.4339 lr:0.0003000000 time/step:116.02s
+ [2025-09-12 16:49:54,473] - step:636/900 train_loss:0.4252 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 16:51:52,707] - step:637/900 train_loss:0.3961 lr:0.0003000000 time/step:118.19s
+ [2025-09-12 16:53:50,406] - step:638/900 train_loss:0.4049 lr:0.0003000000 time/step:117.69s
+ [2025-09-12 16:55:48,233] - step:639/900 train_loss:0.4217 lr:0.0003000000 time/step:117.81s
+ [2025-09-12 16:57:44,596] - step:640/900 train_loss:0.4046 lr:0.0003000000 time/step:116.35s
+ [2025-09-12 16:59:40,200] - step:641/900 train_loss:0.4136 lr:0.0003000000 time/step:115.60s
+ [2025-09-12 17:01:37,286] - step:642/900 train_loss:0.4027 lr:0.0003000000 time/step:117.08s
+ [2025-09-12 17:03:35,226] - step:643/900 train_loss:0.3820 lr:0.0003000000 time/step:117.93s
+ [2025-09-12 17:05:33,570] - step:644/900 train_loss:0.4089 lr:0.0003000000 time/step:118.33s
+ [2025-09-12 17:07:30,395] - step:645/900 train_loss:0.3874 lr:0.0003000000 time/step:116.82s
+ [2025-09-12 17:09:27,297] - step:646/900 train_loss:0.4146 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 17:11:23,362] - step:647/900 train_loss:0.3988 lr:0.0003000000 time/step:116.06s
+ [2025-09-12 17:13:20,787] - step:648/900 train_loss:0.4128 lr:0.0003000000 time/step:117.42s
+ [2025-09-12 17:15:18,588] - step:649/900 train_loss:0.4332 lr:0.0003000000 time/step:117.79s
+ [2025-09-12 17:17:16,062] - step:650/900 train_loss:0.4214 lr:0.0003000000 time/step:117.47s
+ [2025-09-12 17:19:12,730] - step:651/900 train_loss:0.4074 lr:0.0003000000 time/step:116.66s
+ [2025-09-12 17:21:09,550] - step:652/900 train_loss:0.4025 lr:0.0003000000 time/step:116.81s
+ [2025-09-12 17:23:05,702] - step:653/900 train_loss:0.4008 lr:0.0003000000 time/step:116.15s
+ [2025-09-12 17:25:03,925] - step:654/900 train_loss:0.4060 lr:0.0003000000 time/step:118.18s
+ [2025-09-12 17:27:02,401] - step:655/900 train_loss:0.3931 lr:0.0003000000 time/step:118.47s
+ [2025-09-12 17:28:59,392] - step:656/900 train_loss:0.3985 lr:0.0003000000 time/step:116.97s
+ [2025-09-12 17:30:56,335] - step:657/900 train_loss:0.4319 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 17:32:52,897] - step:658/900 train_loss:0.4200 lr:0.0003000000 time/step:116.56s
+ [2025-09-12 17:34:50,643] - step:659/900 train_loss:0.3811 lr:0.0003000000 time/step:117.73s
+ [2025-09-12 17:36:47,661] - step:660/900 train_loss:0.3960 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 17:38:45,367] - step:661/900 train_loss:0.3810 lr:0.0003000000 time/step:117.70s
+ [2025-09-12 17:40:42,471] - step:662/900 train_loss:0.3948 lr:0.0003000000 time/step:117.10s
+ [2025-09-12 17:42:39,354] - step:663/900 train_loss:0.4221 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 17:44:37,177] - step:664/900 train_loss:0.4021 lr:0.0003000000 time/step:117.82s
+ [2025-09-12 17:46:33,621] - step:665/900 train_loss:0.4521 lr:0.0003000000 time/step:116.43s
+ [2025-09-12 17:48:31,225] - step:666/900 train_loss:0.4265 lr:0.0003000000 time/step:117.60s
+ [2025-09-12 17:50:28,126] - step:667/900 train_loss:0.4109 lr:0.0003000000 time/step:116.89s
+ [2025-09-12 17:52:25,032] - step:668/900 train_loss:0.4247 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 17:54:22,433] - step:669/900 train_loss:0.4024 lr:0.0003000000 time/step:117.40s
+ [2025-09-12 17:56:19,263] - step:670/900 train_loss:0.4238 lr:0.0003000000 time/step:116.81s
+ [2025-09-12 17:58:15,840] - step:671/900 train_loss:0.4240 lr:0.0003000000 time/step:116.57s
+ [2025-09-12 18:00:13,196] - step:672/900 train_loss:0.4079 lr:0.0003000000 time/step:117.35s
+ [2025-09-12 18:02:09,946] - step:673/900 train_loss:0.4152 lr:0.0003000000 time/step:116.74s
+ [2025-09-12 18:04:08,272] - step:674/900 train_loss:0.4386 lr:0.0003000000 time/step:118.32s
+ [2025-09-12 18:06:05,695] - step:675/900 train_loss:0.3944 lr:0.0003000000 time/step:117.41s
+ [2025-09-12 18:08:01,761] - step:676/900 train_loss:0.3997 lr:0.0003000000 time/step:116.05s
+ [2025-09-12 18:09:59,340] - step:677/900 train_loss:0.4081 lr:0.0003000000 time/step:117.57s
+ [2025-09-12 18:11:56,223] - step:678/900 train_loss:0.4326 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 18:13:53,528] - step:679/900 train_loss:0.4058 lr:0.0003000000 time/step:117.30s
+ [2025-09-12 18:15:51,604] - step:680/900 train_loss:0.4257 lr:0.0003000000 time/step:118.06s
+ [2025-09-12 18:17:48,495] - step:681/900 train_loss:0.4226 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 18:19:44,618] - step:682/900 train_loss:0.3978 lr:0.0003000000 time/step:116.12s
+ [2025-09-12 18:21:41,760] - step:683/900 train_loss:0.4064 lr:0.0003000000 time/step:117.14s
+ [2025-09-12 18:23:38,665] - step:684/900 train_loss:0.3959 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 18:25:36,029] - step:685/900 train_loss:0.4136 lr:0.0003000000 time/step:117.35s
+ [2025-09-12 18:27:33,774] - step:686/900 train_loss:0.4058 lr:0.0003000000 time/step:117.62s
+ [2025-09-12 18:29:30,658] - step:687/900 train_loss:0.4132 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 18:31:27,420] - step:688/900 train_loss:0.4048 lr:0.0003000000 time/step:116.76s
+ [2025-09-12 18:33:24,361] - step:689/900 train_loss:0.4023 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 18:35:21,754] - step:690/900 train_loss:0.3715 lr:0.0003000000 time/step:117.38s
+ [2025-09-12 18:37:19,552] - step:691/900 train_loss:0.4017 lr:0.0003000000 time/step:117.78s
+ [2025-09-12 18:39:16,412] - step:692/900 train_loss:0.4232 lr:0.0003000000 time/step:116.85s
+ [2025-09-12 18:41:13,974] - step:693/900 train_loss:0.4196 lr:0.0003000000 time/step:117.55s
+ [2025-09-12 18:43:10,197] - step:694/900 train_loss:0.4010 lr:0.0003000000 time/step:116.22s
+ [2025-09-12 18:45:07,263] - step:695/900 train_loss:0.3904 lr:0.0003000000 time/step:117.06s
+ [2025-09-12 18:47:05,813] - step:696/900 train_loss:0.4152 lr:0.0003000000 time/step:118.53s
+ [2025-09-12 18:49:02,863] - step:697/900 train_loss:0.4064 lr:0.0003000000 time/step:117.04s
+ [2025-09-12 18:50:59,812] - step:698/900 train_loss:0.3980 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 18:52:57,370] - step:699/900 train_loss:0.3884 lr:0.0003000000 time/step:117.55s
+ [2025-09-12 18:54:54,648] - step:700/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@700.pt...
+ [2025-09-12 18:54:54,648] - step:700/900 train_loss:0.3973 lr:0.0003000000 time/step:116.56s
+ [2025-09-12 18:56:52,677] - step:701/900 train_loss:0.4030 lr:0.0003000000 time/step:118.01s
+ [2025-09-12 18:58:49,207] - step:702/900 train_loss:0.3937 lr:0.0003000000 time/step:116.52s
+ [2025-09-12 19:00:46,170] - step:703/900 train_loss:0.4356 lr:0.0003000000 time/step:116.96s
+ [2025-09-12 19:02:43,288] - step:704/900 train_loss:0.4294 lr:0.0003000000 time/step:117.11s
+ [2025-09-12 19:04:40,157] - step:705/900 train_loss:0.4150 lr:0.0003000000 time/step:116.86s
+ [2025-09-12 19:06:38,620] - step:706/900 train_loss:0.4153 lr:0.0003000000 time/step:118.45s
+ [2025-09-12 19:08:35,564] - step:707/900 train_loss:0.3966 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 19:10:32,548] - step:708/900 train_loss:0.4221 lr:0.0003000000 time/step:116.98s
+ [2025-09-12 19:12:29,132] - step:709/900 train_loss:0.3952 lr:0.0003000000 time/step:116.58s
+ [2025-09-12 19:14:26,936] - step:710/900 train_loss:0.3849 lr:0.0003000000 time/step:117.80s
+ [2025-09-12 19:16:24,310] - step:711/900 train_loss:0.4114 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 19:18:20,956] - step:712/900 train_loss:0.4173 lr:0.0003000000 time/step:116.64s
+ [2025-09-12 19:20:18,010] - step:713/900 train_loss:0.3898 lr:0.0003000000 time/step:117.05s
+ [2025-09-12 19:22:14,781] - step:714/900 train_loss:0.4088 lr:0.0003000000 time/step:116.76s
+ [2025-09-12 19:24:11,349] - step:715/900 train_loss:0.3975 lr:0.0003000000 time/step:116.56s
+ [2025-09-12 19:26:09,929] - step:716/900 train_loss:0.4089 lr:0.0003000000 time/step:118.57s
+ [2025-09-12 19:28:06,399] - step:717/900 train_loss:0.3964 lr:0.0003000000 time/step:116.46s
+ [2025-09-12 19:30:02,907] - step:718/900 train_loss:0.4063 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 19:31:59,817] - step:719/900 train_loss:0.3934 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 19:33:56,770] - step:720/900 train_loss:0.3953 lr:0.0003000000 time/step:116.95s
+ [2025-09-12 19:35:54,685] - step:721/900 train_loss:0.4275 lr:0.0003000000 time/step:117.90s
+ [2025-09-12 19:37:52,051] - step:722/900 train_loss:0.4074 lr:0.0003000000 time/step:117.36s
+ [2025-09-12 19:39:48,232] - step:723/900 train_loss:0.4163 lr:0.0003000000 time/step:116.18s
+ [2025-09-12 19:41:45,737] - step:724/900 train_loss:0.4015 lr:0.0003000000 time/step:117.50s
+ [2025-09-12 19:43:42,903] - step:725/900 train_loss:0.4202 lr:0.0003000000 time/step:117.16s
+ [2025-09-12 19:45:40,142] - step:726/900 train_loss:0.4291 lr:0.0003000000 time/step:117.23s
+ [2025-09-12 19:47:38,767] - step:727/900 train_loss:0.4219 lr:0.0003000000 time/step:118.52s
+ [2025-09-12 19:49:35,311] - step:728/900 train_loss:0.4267 lr:0.0003000000 time/step:116.54s
+ [2025-09-12 19:51:31,352] - step:729/900 train_loss:0.4008 lr:0.0003000000 time/step:116.03s
+ [2025-09-12 19:53:29,152] - step:730/900 train_loss:0.4191 lr:0.0003000000 time/step:117.79s
+ [2025-09-12 19:55:25,960] - step:731/900 train_loss:0.4093 lr:0.0003000000 time/step:116.80s
+ [2025-09-12 19:57:23,584] - step:732/900 train_loss:0.4230 lr:0.0003000000 time/step:117.61s
+ [2025-09-12 19:59:20,804] - step:733/900 train_loss:0.4213 lr:0.0003000000 time/step:117.21s
+ [2025-09-12 20:01:17,340] - step:734/900 train_loss:0.4071 lr:0.0003000000 time/step:116.53s
+ [2025-09-12 20:03:14,230] - step:735/900 train_loss:0.3944 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 20:05:11,216] - step:736/900 train_loss:0.3971 lr:0.0003000000 time/step:116.98s
+ [2025-09-12 20:07:08,901] - step:737/900 train_loss:0.4144 lr:0.0003000000 time/step:117.67s
+ [2025-09-12 20:09:06,344] - step:738/900 train_loss:0.4349 lr:0.0003000000 time/step:117.44s
+ [2025-09-12 20:11:03,286] - step:739/900 train_loss:0.3967 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 20:12:59,810] - step:740/900 train_loss:0.4104 lr:0.0003000000 time/step:116.52s
+ [2025-09-12 20:14:56,833] - step:741/900 train_loss:0.4195 lr:0.0003000000 time/step:117.01s
+ [2025-09-12 20:16:53,836] - step:742/900 train_loss:0.4083 lr:0.0003000000 time/step:116.99s
+ [2025-09-12 20:18:51,981] - step:743/900 train_loss:0.4021 lr:0.0003000000 time/step:118.14s
+ [2025-09-12 20:20:48,901] - step:744/900 train_loss:0.4182 lr:0.0003000000 time/step:116.91s
+ [2025-09-12 20:22:46,747] - step:745/900 train_loss:0.3946 lr:0.0003000000 time/step:117.84s
+ [2025-09-12 20:24:42,792] - step:746/900 train_loss:0.3826 lr:0.0003000000 time/step:116.03s
+ [2025-09-12 20:26:39,772] - step:747/900 train_loss:0.4267 lr:0.0003000000 time/step:116.97s
+ [2025-09-12 20:28:37,994] - step:748/900 train_loss:0.3935 lr:0.0003000000 time/step:118.21s
+ [2025-09-12 20:30:34,939] - step:749/900 train_loss:0.3979 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 20:32:32,443] - step:750/900 train_loss:0.4253 lr:0.0003000000 time/step:117.50s
+ [2025-09-12 20:34:29,466] - step:751/900 train_loss:0.4006 lr:0.0003000000 time/step:117.01s
+ [2025-09-12 20:36:25,608] - step:752/900 train_loss:0.4219 lr:0.0003000000 time/step:116.13s
+ [2025-09-12 20:38:22,452] - step:753/900 train_loss:0.3919 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 20:40:21,076] - step:754/900 train_loss:0.4138 lr:0.0003000000 time/step:118.62s
+ [2025-09-12 20:42:18,879] - step:755/900 train_loss:0.4144 lr:0.0003000000 time/step:117.79s
+ [2025-09-12 20:44:15,840] - step:756/900 train_loss:0.4077 lr:0.0003000000 time/step:116.95s
+ [2025-09-12 20:46:12,684] - step:757/900 train_loss:0.4420 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 20:48:08,449] - step:758/900 train_loss:0.4310 lr:0.0003000000 time/step:115.75s
+ [2025-09-12 20:50:06,514] - step:759/900 train_loss:0.4193 lr:0.0003000000 time/step:118.06s
+ [2025-09-12 20:52:03,393] - step:760/900 train_loss:0.4097 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 20:54:01,558] - step:761/900 train_loss:0.4206 lr:0.0003000000 time/step:118.16s
+ [2025-09-12 20:55:58,603] - step:762/900 train_loss:0.4123 lr:0.0003000000 time/step:117.04s
+ [2025-09-12 20:57:55,067] - step:763/900 train_loss:0.3960 lr:0.0003000000 time/step:116.45s
+ [2025-09-12 20:59:51,936] - step:764/900 train_loss:0.4299 lr:0.0003000000 time/step:116.85s
+ [2025-09-12 21:01:50,033] - step:765/900 train_loss:0.4122 lr:0.0003000000 time/step:118.09s
+ [2025-09-12 21:03:47,856] - step:766/900 train_loss:0.3942 lr:0.0003000000 time/step:117.82s
+ [2025-09-12 21:05:44,878] - step:767/900 train_loss:0.3948 lr:0.0003000000 time/step:117.01s
+ [2025-09-12 21:07:41,799] - step:768/900 train_loss:0.3943 lr:0.0003000000 time/step:116.91s
+ [2025-09-12 21:09:38,205] - step:769/900 train_loss:0.4122 lr:0.0003000000 time/step:116.40s
+ [2025-09-12 21:11:35,911] - step:770/900 train_loss:0.4029 lr:0.0003000000 time/step:117.70s
+ [2025-09-12 21:13:33,673] - step:771/900 train_loss:0.3994 lr:0.0003000000 time/step:117.75s
+ [2025-09-12 21:15:30,614] - step:772/900 train_loss:0.4263 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 21:17:27,398] - step:773/900 train_loss:0.4199 lr:0.0003000000 time/step:116.77s
+ [2025-09-12 21:19:24,243] - step:774/900 train_loss:0.4126 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 21:21:21,644] - step:775/900 train_loss:0.3885 lr:0.0003000000 time/step:117.39s
+ [2025-09-12 21:23:18,489] - step:776/900 train_loss:0.4123 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 21:25:16,373] - step:777/900 train_loss:0.3887 lr:0.0003000000 time/step:117.88s
+ [2025-09-12 21:27:13,296] - step:778/900 train_loss:0.4256 lr:0.0003000000 time/step:116.91s
+ [2025-09-12 21:29:10,200] - step:779/900 train_loss:0.4090 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 21:31:07,409] - step:780/900 train_loss:0.3895 lr:0.0003000000 time/step:117.20s
+ [2025-09-12 21:33:04,490] - step:781/900 train_loss:0.4134 lr:0.0003000000 time/step:117.07s
+ [2025-09-12 21:35:01,686] - step:782/900 train_loss:0.4317 lr:0.0003000000 time/step:117.19s
+ [2025-09-12 21:36:58,773] - step:783/900 train_loss:0.4093 lr:0.0003000000 time/step:117.07s
+ [2025-09-12 21:38:55,697] - step:784/900 train_loss:0.4052 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 21:40:52,704] - step:785/900 train_loss:0.4158 lr:0.0003000000 time/step:117.00s
+ [2025-09-12 21:42:51,059] - step:786/900 train_loss:0.3933 lr:0.0003000000 time/step:118.35s
+ [2025-09-12 21:44:47,908] - step:787/900 train_loss:0.4167 lr:0.0003000000 time/step:116.84s
+ [2025-09-12 21:46:44,911] - step:788/900 train_loss:0.3970 lr:0.0003000000 time/step:116.99s
+ [2025-09-12 21:48:41,985] - step:789/900 train_loss:0.3789 lr:0.0003000000 time/step:117.06s
+ [2025-09-12 21:50:38,911] - step:790/900 train_loss:0.4033 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 21:52:36,518] - step:791/900 train_loss:0.3703 lr:0.0003000000 time/step:117.60s
+ [2025-09-12 21:54:35,924] - step:792/900 train_loss:0.3987 lr:0.0003000000 time/step:119.40s
+ [2025-09-12 21:56:32,089] - step:793/900 train_loss:0.4103 lr:0.0003000000 time/step:116.16s
+ [2025-09-12 21:58:29,152] - step:794/900 train_loss:0.4121 lr:0.0003000000 time/step:117.05s
+ [2025-09-12 22:00:26,076] - step:795/900 train_loss:0.3756 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 22:02:23,114] - step:796/900 train_loss:0.4195 lr:0.0003000000 time/step:117.03s
+ [2025-09-12 22:04:21,556] - step:797/900 train_loss:0.3852 lr:0.0003000000 time/step:118.43s
+ [2025-09-12 22:06:19,445] - step:798/900 train_loss:0.4343 lr:0.0003000000 time/step:117.88s
+ [2025-09-12 22:08:15,683] - step:799/900 train_loss:0.4024 lr:0.0003000000 time/step:116.22s
+ [2025-09-12 22:10:13,431] - step:800/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@800.pt...
+ [2025-09-12 22:10:13,432] - step:800/900 train_loss:0.4081 lr:0.0003000000 time/step:117.14s
+ [2025-09-12 22:12:09,931] - step:801/900 train_loss:0.4091 lr:0.0003000000 time/step:116.49s
+ [2025-09-12 22:14:08,180] - step:802/900 train_loss:0.4188 lr:0.0003000000 time/step:118.24s
+ [2025-09-12 22:16:05,937] - step:803/900 train_loss:0.4227 lr:0.0003000000 time/step:117.74s
+ [2025-09-12 22:18:02,679] - step:804/900 train_loss:0.3994 lr:0.0003000000 time/step:116.73s
+ [2025-09-12 22:19:59,511] - step:805/900 train_loss:0.3885 lr:0.0003000000 time/step:116.82s
+ [2025-09-12 22:21:56,602] - step:806/900 train_loss:0.3937 lr:0.0003000000 time/step:117.08s
+ [2025-09-12 22:23:54,831] - step:807/900 train_loss:0.4143 lr:0.0003000000 time/step:118.22s
+ [2025-09-12 22:25:52,068] - step:808/900 train_loss:0.4324 lr:0.0003000000 time/step:117.23s
+ [2025-09-12 22:27:49,499] - step:809/900 train_loss:0.3988 lr:0.0003000000 time/step:117.42s
+ [2025-09-12 22:29:45,897] - step:810/900 train_loss:0.4016 lr:0.0003000000 time/step:116.39s
+ [2025-09-12 22:31:42,993] - step:811/900 train_loss:0.4106 lr:0.0003000000 time/step:117.08s
+ [2025-09-12 22:33:41,172] - step:812/900 train_loss:0.4097 lr:0.0003000000 time/step:118.17s
+ [2025-09-12 22:35:38,349] - step:813/900 train_loss:0.3838 lr:0.0003000000 time/step:117.17s
+ [2025-09-12 22:37:36,103] - step:814/900 train_loss:0.3802 lr:0.0003000000 time/step:117.74s
+ [2025-09-12 22:39:33,507] - step:815/900 train_loss:0.4195 lr:0.0003000000 time/step:117.40s
+ [2025-09-12 22:41:29,750] - step:816/900 train_loss:0.4333 lr:0.0003000000 time/step:116.23s
+ [2025-09-12 22:43:26,622] - step:817/900 train_loss:0.4108 lr:0.0003000000 time/step:116.87s
+ [2025-09-12 22:45:25,127] - step:818/900 train_loss:0.3866 lr:0.0003000000 time/step:118.49s
+ [2025-09-12 22:47:22,168] - step:819/900 train_loss:0.4197 lr:0.0003000000 time/step:117.03s
+ [2025-09-12 22:49:19,672] - step:820/900 train_loss:0.3791 lr:0.0003000000 time/step:117.50s
+ [2025-09-12 22:51:17,438] - step:821/900 train_loss:0.4053 lr:0.0003000000 time/step:117.76s
+ [2025-09-12 22:53:13,613] - step:822/900 train_loss:0.4096 lr:0.0003000000 time/step:116.16s
+ [2025-09-12 22:55:11,085] - step:823/900 train_loss:0.4086 lr:0.0003000000 time/step:117.46s
+ [2025-09-12 22:57:08,006] - step:824/900 train_loss:0.4028 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 22:59:05,729] - step:825/900 train_loss:0.3960 lr:0.0003000000 time/step:117.72s
+ [2025-09-12 23:01:03,331] - step:826/900 train_loss:0.4060 lr:0.0003000000 time/step:117.59s
+ [2025-09-12 23:03:00,051] - step:827/900 train_loss:0.4147 lr:0.0003000000 time/step:116.71s
+ [2025-09-12 23:04:56,347] - step:828/900 train_loss:0.4173 lr:0.0003000000 time/step:116.28s
+ [2025-09-12 23:06:53,382] - step:829/900 train_loss:0.4136 lr:0.0003000000 time/step:117.02s
+ [2025-09-12 23:08:50,925] - step:830/900 train_loss:0.4135 lr:0.0003000000 time/step:117.53s
+ [2025-09-12 23:10:48,709] - step:831/900 train_loss:0.3960 lr:0.0003000000 time/step:117.78s
+ [2025-09-12 23:12:45,852] - step:832/900 train_loss:0.3999 lr:0.0003000000 time/step:117.13s
+ [2025-09-12 23:14:43,195] - step:833/900 train_loss:0.4046 lr:0.0003000000 time/step:117.33s
+ [2025-09-12 23:16:39,299] - step:834/900 train_loss:0.4188 lr:0.0003000000 time/step:116.10s
+ [2025-09-12 23:18:36,142] - step:835/900 train_loss:0.3957 lr:0.0003000000 time/step:116.83s
+ [2025-09-12 23:20:34,486] - step:836/900 train_loss:0.4188 lr:0.0003000000 time/step:118.34s
+ [2025-09-12 23:22:31,489] - step:837/900 train_loss:0.3849 lr:0.0003000000 time/step:116.99s
+ [2025-09-12 23:24:28,392] - step:838/900 train_loss:0.4255 lr:0.0003000000 time/step:116.90s
+ [2025-09-12 23:26:24,998] - step:839/900 train_loss:0.4019 lr:0.0003000000 time/step:116.59s
+ [2025-09-12 23:28:21,798] - step:840/900 train_loss:0.4149 lr:0.0003000000 time/step:116.78s
+ [2025-09-12 23:30:20,342] - step:841/900 train_loss:0.3937 lr:0.0003000000 time/step:118.54s
+ [2025-09-12 23:32:17,286] - step:842/900 train_loss:0.3996 lr:0.0003000000 time/step:116.94s
+ [2025-09-12 23:34:14,169] - step:843/900 train_loss:0.3911 lr:0.0003000000 time/step:116.88s
+ [2025-09-12 23:36:11,513] - step:844/900 train_loss:0.4199 lr:0.0003000000 time/step:117.34s
+ [2025-09-12 23:38:07,515] - step:845/900 train_loss:0.3990 lr:0.0003000000 time/step:115.99s
+ [2025-09-12 23:40:05,382] - step:846/900 train_loss:0.4059 lr:0.0003000000 time/step:117.86s
+ [2025-09-12 23:42:03,341] - step:847/900 train_loss:0.4217 lr:0.0003000000 time/step:117.95s
+ [2025-09-12 23:44:00,267] - step:848/900 train_loss:0.4059 lr:0.0003000000 time/step:116.92s
+ [2025-09-12 23:45:57,550] - step:849/900 train_loss:0.4140 lr:0.0003000000 time/step:117.28s
+ [2025-09-12 23:47:54,492] - step:850/900 train_loss:0.3920 lr:0.0003000000 time/step:116.93s
+ [2025-09-12 23:49:50,997] - step:851/900 train_loss:0.4194 lr:0.0003000000 time/step:116.50s
+ [2025-09-12 23:51:48,718] - step:852/900 train_loss:0.3914 lr:0.0003000000 time/step:117.71s
+ [2025-09-12 23:53:45,683] - step:853/900 train_loss:0.4012 lr:0.0003000000 time/step:116.96s
+ [2025-09-12 23:55:43,182] - step:854/900 train_loss:0.4198 lr:0.0003000000 time/step:117.47s
+ [2025-09-12 23:57:40,227] - step:855/900 train_loss:0.4059 lr:0.0003000000 time/step:117.03s
+ [2025-09-12 23:59:37,792] - step:856/900 train_loss:0.4026 lr:0.0003000000 time/step:117.56s
+ [2025-09-13 00:01:34,695] - step:857/900 train_loss:0.4171 lr:0.0003000000 time/step:116.89s
+ [2025-09-13 00:03:32,341] - step:858/900 train_loss:0.4017 lr:0.0003000000 time/step:117.64s
+ [2025-09-13 00:05:29,421] - step:859/900 train_loss:0.4011 lr:0.0003000000 time/step:117.07s
+ [2025-09-13 00:07:26,749] - step:860/900 train_loss:0.3910 lr:0.0003000000 time/step:117.32s
+ [2025-09-13 00:09:23,608] - step:861/900 train_loss:0.4093 lr:0.0003000000 time/step:116.85s
+ [2025-09-13 00:11:21,037] - step:862/900 train_loss:0.4295 lr:0.0003000000 time/step:117.42s
+ [2025-09-13 00:13:17,816] - step:863/900 train_loss:0.4025 lr:0.0003000000 time/step:116.77s
+ [2025-09-13 00:15:14,919] - step:864/900 train_loss:0.3978 lr:0.0003000000 time/step:117.10s
+ [2025-09-13 00:17:12,309] - step:865/900 train_loss:0.3941 lr:0.0003000000 time/step:117.38s
+ [2025-09-13 00:19:09,330] - step:866/900 train_loss:0.4150 lr:0.0003000000 time/step:117.01s
+ [2025-09-13 00:21:06,411] - step:867/900 train_loss:0.4101 lr:0.0003000000 time/step:117.01s
+ [2025-09-13 00:23:03,516] - step:868/900 train_loss:0.4156 lr:0.0003000000 time/step:117.10s
+ [2025-09-13 00:25:00,493] - step:869/900 train_loss:0.4128 lr:0.0003000000 time/step:116.97s
+ [2025-09-13 00:26:57,821] - step:870/900 train_loss:0.4182 lr:0.0003000000 time/step:117.31s
+ [2025-09-13 00:28:54,768] - step:871/900 train_loss:0.3940 lr:0.0003000000 time/step:116.93s
+ [2025-09-13 00:30:51,704] - step:872/900 train_loss:0.4091 lr:0.0003000000 time/step:116.93s
+ [2025-09-13 00:32:48,692] - step:873/900 train_loss:0.4066 lr:0.0003000000 time/step:116.98s
+ [2025-09-13 00:34:47,091] - step:874/900 train_loss:0.4061 lr:0.0003000000 time/step:118.39s
+ [2025-09-13 00:36:44,116] - step:875/900 train_loss:0.3712 lr:0.0003000000 time/step:117.01s
+ [2025-09-13 00:38:41,019] - step:876/900 train_loss:0.4040 lr:0.0003000000 time/step:116.89s
+ [2025-09-13 00:40:38,506] - step:877/900 train_loss:0.3807 lr:0.0003000000 time/step:117.48s
+ [2025-09-13 00:42:35,384] - step:878/900 train_loss:0.4103 lr:0.0003000000 time/step:116.87s
+ [2025-09-13 00:44:33,175] - step:879/900 train_loss:0.4001 lr:0.0003000000 time/step:117.79s
+ [2025-09-13 00:46:29,986] - step:880/900 train_loss:0.3966 lr:0.0003000000 time/step:116.79s
+ [2025-09-13 00:48:27,354] - step:881/900 train_loss:0.4188 lr:0.0003000000 time/step:117.29s
+ [2025-09-13 00:50:24,406] - step:882/900 train_loss:0.4164 lr:0.0003000000 time/step:117.05s
+ [2025-09-13 00:52:22,291] - step:883/900 train_loss:0.3936 lr:0.0003000000 time/step:117.88s
+ [2025-09-13 00:54:20,651] - step:884/900 train_loss:0.4148 lr:0.0003000000 time/step:118.35s
+ [2025-09-13 00:56:17,788] - step:885/900 train_loss:0.4173 lr:0.0003000000 time/step:117.13s
+ [2025-09-13 00:58:14,279] - step:886/900 train_loss:0.4260 lr:0.0003000000 time/step:116.46s
+ [2025-09-13 01:00:11,090] - step:887/900 train_loss:0.4037 lr:0.0003000000 time/step:116.80s
+ [2025-09-13 01:02:08,948] - step:888/900 train_loss:0.4117 lr:0.0003000000 time/step:117.85s
+ [2025-09-13 01:04:07,249] - step:889/900 train_loss:0.4068 lr:0.0003000000 time/step:118.29s
+ [2025-09-13 01:06:04,130] - step:890/900 train_loss:0.4187 lr:0.0003000000 time/step:116.87s
+ [2025-09-13 01:08:01,508] - step:891/900 train_loss:0.4159 lr:0.0003000000 time/step:117.36s
+ [2025-09-13 01:09:57,620] - step:892/900 train_loss:0.3978 lr:0.0003000000 time/step:116.10s
+ [2025-09-13 01:11:55,493] - step:893/900 train_loss:0.3925 lr:0.0003000000 time/step:117.86s
+ [2025-09-13 01:13:52,516] - step:894/900 train_loss:0.3845 lr:0.0003000000 time/step:117.01s
+ [2025-09-13 01:15:50,321] - step:895/900 train_loss:0.4062 lr:0.0003000000 time/step:117.80s
+ [2025-09-13 01:17:47,232] - step:896/900 train_loss:0.3879 lr:0.0003000000 time/step:116.90s
+ [2025-09-13 01:19:44,630] - step:897/900 train_loss:0.4272 lr:0.0003000000 time/step:117.39s
+ [2025-09-13 01:21:41,559] - step:898/900 train_loss:0.4121 lr:0.0003000000 time/step:116.92s
+ [2025-09-13 01:23:39,154] - step:899/900 train_loss:0.4079 lr:0.0003000000 time/step:117.59s
+ [2025-09-13 01:25:37,577] - step:900/900 Saved model to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/model@900.pt...
+ [2025-09-13 01:25:37,578] - step:900/900 train_loss:0.3995 lr:0.0003000000 time/step:117.81s
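Each per-step line above follows one fixed format, so the log is straightforward to post-process. A minimal sketch in Python (a hypothetical helper, not part of this repo), which returns None for the "Saved model to ..." checkpoint lines:

    import re

    # Matches e.g. "step:900/900 train_loss:0.3995 lr:0.0003000000 time/step:117.81s"
    STEP_RE = re.compile(
        r"step:(\d+)/(\d+) train_loss:([\d.]+) lr:([\d.]+) time/step:([\d.]+)s"
    )

    def parse_step(line):
        """Return (step, total, loss, lr, secs), or None for non-metric lines."""
        m = STEP_RE.search(line)
        if m is None:
            return None
        return (int(m.group(1)), int(m.group(2)),
                float(m.group(3)), float(m.group(4)), float(m.group(5)))

    print(parse_step("[2025-09-13 01:25:37,578] - step:900/900 "
                     "train_loss:0.3995 lr:0.0003000000 time/step:117.81s"))
    # -> (900, 900, 0.3995, 0.0003, 117.81)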
wandb/run-20250911_200644-y9v5i9gr/files/requirements.txt ADDED
@@ -0,0 +1,125 @@
+ parso==0.8.4
+ pydantic_core==2.27.2
+ charset-normalizer==3.4.1
+ xxhash==3.5.0
+ PyYAML==6.0.2
+ transformers==4.49.0
+ idna==3.10
+ nvidia-cudnn-cu12==9.1.0.70
+ numpy==2.2.3
+ hydra-core==1.3.2
+ Pygments==2.19.1
+ rich==14.0.0
+ nvidia-cusolver-cu12==11.6.1.9
+ urllib3==2.3.0
+ nvidia-cusparselt-cu12==0.6.2
+ contourpy==1.3.1
+ cycler==0.12.1
+ decorator==5.2.1
+ psutil==7.0.0
+ aiohttp==3.11.13
+ einops==0.8.1
+ nvidia-cuda-runtime-cu12==12.4.127
+ exceptiongroup==1.2.2
+ stack-data==0.6.3
+ setproctitle==1.3.5
+ fsspec==2024.12.0
+ tueplots==0.2.0
+ pexpect==4.9.0
+ gitdb==4.0.12
+ fonttools==4.56.0
+ ipython==8.35.0
+ huggingface-hub==0.29.2
+ filelock==3.17.0
+ torchvision==0.21.0+cu124
+ platformdirs==4.3.6
+ peft==0.15.1
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ wandb==0.19.8
+ click==8.1.8
+ mpmath==1.3.0
+ Jinja2==3.1.6
+ scipy==1.14.1
+ markdown-it-py==3.0.0
+ matplotlib-inline==0.1.7
+ wheel==0.45.1
+ setuptools==75.8.2
+ tqdm==4.67.1
+ antlr4-python3-runtime==4.9.3
+ deepspeed==0.16.7
+ omegaconf==2.3.0
+ torchaudio==2.6.0+cu124
+ aiosignal==1.3.2
+ accelerate==1.6.0
+ py-cpuinfo==9.0.0
+ pyparsing==3.2.1
+ ninja==1.11.1.4
+ pandas==2.2.3
+ six==1.17.0
+ wcwidth==0.2.13
+ safetensors==0.5.3
+ attrs==25.1.0
+ python-dateutil==2.9.0.post0
+ nvidia-cufft-cu12==11.2.1.3
+ multiprocess==0.70.16
+ seaborn==0.13.2
+ networkx==3.4.2
+ regex==2024.11.6
+ nvidia-nvtx-cu12==12.4.127
+ tokenizers==0.21.0
+ datasets==3.3.2
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-nvjitlink-cu12==12.4.127
+ MarkupSafe==3.0.2
+ triton==3.1.0
+ pip==25.0.1
+ jedi==0.19.2
+ nvidia-cublas-cu12==12.4.5.8
+ iniconfig==2.0.0
+ pluggy==1.5.0
+ pure_eval==0.2.3
+ docker-pycreds==0.4.0
+ libcirkit==0.2.1
+ mdurl==0.1.2
+ annotated-types==0.7.0
+ sentry-sdk==2.22.0
+ executing==2.2.0
+ pydantic==2.10.6
+ opt_einsum==3.4.0
+ pytz==2025.1
+ nvidia-cuda-cupti-cu12==12.4.127
+ protobuf==5.29.3
+ requests==2.32.3
+ tomli==2.2.1
+ matplotlib==3.10.1
+ hjson==3.1.0
+ frozenlist==1.5.0
+ pillow==11.1.0
+ GitPython==3.1.44
+ typing_extensions==4.12.2
+ pyarrow==19.0.1
+ propcache==0.3.0
+ prompt_toolkit==3.0.51
+ torch==2.6.0+cu124
+ async-timeout==5.0.1
+ bitsandbytes==0.45.5
+ trl==0.16.1
+ ptyprocess==0.7.0
+ dill==0.3.8
+ pytest==8.3.5
+ nvidia-nccl-cu12==2.21.5
+ sympy==1.13.1
+ flash_attn==2.7.4.post1
+ certifi==2025.1.31
+ nvidia-cusparse-cu12==12.3.1.170
+ tzdata==2025.1
+ aiohappyeyeballs==2.5.0
+ msgpack==1.1.0
+ traitlets==5.14.3
+ multidict==6.1.0
+ packaging==24.2
+ kiwisolver==1.4.8
+ smmap==5.0.2
+ asttokens==3.0.0
+ yarl==1.18.3
+ graphviz==0.20.3
wandb/run-20250911_200644-y9v5i9gr/files/wandb-metadata.json ADDED
@@ -0,0 +1,114 @@
+ {
+ "os": "Linux-6.8.0-57-generic-x86_64-with-glibc2.39",
+ "python": "CPython 3.10.16",
+ "startedAt": "2025-09-11T19:06:44.948798Z",
+ "args": [
+ "data=tulu3-evabyte-packed",
+ "training=tulu3-evabyte-1epoch",
+ "lm=evabyte",
+ "model=mtp",
+ "adaptor=none",
+ "mt_head=linear-evabyte",
+ "circuit=cp",
+ "circuit.n_token=8",
+ "circuit.n_component=8",
+ "training.device_batch_size=1",
+ "data.vocab_size=320",
+ "model.model.beta=0",
+ "model.model.gamma=0.9",
+ "data.val_bin=null",
+ "training.learning_rate=0.0003",
+ "training.expname=lr-3e-4-no-lora-cp-n-8-r-8"
+ ],
+ "program": "-m mtp.train",
+ "git": {
+ "remote": "git@github.com:PiotrNawrot/nanoGPT.git",
+ "commit": "26cfb78beb2138c5995ff5a43c8f8e1cc44652fd"
+ },
+ "email": "agrivas@inf.ed.ac.uk",
+ "root": "/disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16",
+ "host": "scotia01.inf.ed.ac.uk",
+ "executable": "/home/agrivas/nanoGPT/.venv/bin/python3",
+ "cpu_count": 24,
+ "cpu_count_logical": 48,
+ "gpu": "NVIDIA L40S",
+ "gpu_count": 4,
+ "disk": {
+ "/": {
+ "total": "184643391488",
+ "used": "37109506048"
+ }
+ },
+ "memory": {
+ "total": "540522938368"
+ },
+ "cpu": {
+ "count": 24,
+ "countLogical": 48
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA L40S",
+ "memoryTotal": "48305799168",
+ "cudaCores": 18176,
+ "architecture": "Ada"
+ },
+ {
+ "name": "NVIDIA L40S",
+ "memoryTotal": "48305799168",
+ "cudaCores": 18176,
+ "architecture": "Ada"
+ },
+ {
+ "name": "NVIDIA L40S",
+ "memoryTotal": "48305799168",
+ "cudaCores": 18176,
+ "architecture": "Ada"
+ },
+ {
+ "name": "NVIDIA L40S",
+ "memoryTotal": "48305799168",
+ "cudaCores": 18176,
+ "architecture": "Ada"
+ }
+ ],
+ "slurm": {
+ "cluster_name": "landoniacluster",
+ "conf": "/etc/slurm/slurm.conf",
+ "cpus_on_node": "16",
+ "cpus_per_gpu": "4",
+ "gpus_on_node": "4",
+ "gtids": "0",
+ "job_account": "research-staff",
+ "job_cpus_per_node": "16",
+ "job_end_time": "1757962299",
+ "job_gid": "10000",
+ "job_gpus": "0,1,2,3",
+ "job_id": "2085792",
+ "job_name": "slurm.sh",
+ "job_nodelist": "scotia01",
+ "job_num_nodes": "1",
+ "job_partition": "PGR-Standard",
+ "job_qos": "normal",
+ "job_start_time": "1757616699",
+ "job_uid": "1782564",
+ "job_user": "agrivas",
+ "jobid": "2085792",
+ "localid": "0",
+ "mem_per_node": "64000",
+ "nnodes": "1",
+ "nodeid": "0",
+ "nodelist": "scotia01",
+ "nprocs": "1",
+ "ntasks": "1",
+ "prio_process": "0",
+ "procid": "0",
+ "submit_dir": "/home/agrivas",
+ "submit_host": "hastings.inf.ed.ac.uk",
+ "task_pid": "2707112",
+ "tasks_per_node": "1",
+ "topology_addr": "scotia01",
+ "topology_addr_pattern": "node"
+ },
+ "cudaVersion": "12.8"
+ }
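The "program" and "args" fields above encode a Hydra launch. A sketch of the equivalent invocation, reconstructed from this metadata rather than taken from any script in the repo:

    python -m mtp.train \
        data=tulu3-evabyte-packed training=tulu3-evabyte-1epoch \
        lm=evabyte model=mtp adaptor=none mt_head=linear-evabyte \
        circuit=cp circuit.n_token=8 circuit.n_component=8 \
        training.device_batch_size=1 data.vocab_size=320 \
        model.model.beta=0 model.model.gamma=0.9 data.val_bin=null \
        training.learning_rate=0.0003 training.expname=lr-3e-4-no-lora-cp-n-8-r-8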
wandb/run-20250911_200644-y9v5i9gr/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"train/ce_loss_at_7":0.6471807956695557,"_step":899,"_runtime":105532.670379318,"train/loss":0.3994542360305786,"_timestamp":1.7577231375784273e+09,"train/ce_loss_at_1":0.18259316682815552,"train/ce_loss_at_3":0.31228378415107727,"global_step":900,"train/ce_loss_at_5":0.45094943046569824,"train/ce_loss_at_4":0.3768407106399536,"_wandb":{"runtime":105533},"train/ce_loss_at_6":0.5358694791793823,"train/ce_loss_at_2":0.25190725922584534,"train/ce_loss_at_8":0.7982796430587769}
wandb/run-20250911_200644-y9v5i9gr/logs/debug-internal.log ADDED
@@ -0,0 +1,52 @@
+ {"time":"2025-09-11T20:06:44.955449103+01:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"/disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug-core.log"}
+ {"time":"2025-09-11T20:06:45.176117844+01:00","level":"INFO","msg":"created new stream","id":"y9v5i9gr"}
+ {"time":"2025-09-11T20:06:45.176201537+01:00","level":"INFO","msg":"stream: started","id":"y9v5i9gr"}
+ {"time":"2025-09-11T20:06:45.176254637+01:00","level":"INFO","msg":"writer: Do: started","stream_id":"y9v5i9gr"}
+ {"time":"2025-09-11T20:06:45.176292219+01:00","level":"INFO","msg":"handler: started","stream_id":"y9v5i9gr"}
+ {"time":"2025-09-11T20:06:45.176341928+01:00","level":"INFO","msg":"sender: started","stream_id":"y9v5i9gr"}
+ {"time":"2025-09-11T20:06:45.680069036+01:00","level":"INFO","msg":"Starting system monitor"}
+ {"time":"2025-09-11T20:19:16.313200337+01:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/circuit-mtp/mtp/y9v5i9gr/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+ {"time":"2025-09-12T00:53:29.590652615+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
+ {"time":"2025-09-12T01:30:18.032795292+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
11
+ {"time":"2025-09-12T01:30:50.327057066+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
12
+ {"time":"2025-09-12T01:31:25.000022545+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
13
+ {"time":"2025-09-12T01:32:03.267256543+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
14
+ {"time":"2025-09-12T02:41:34.535497308+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
15
+ {"time":"2025-09-12T02:42:24.914157379+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
16
+ {"time":"2025-09-12T02:42:57.41051518+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
17
+ {"time":"2025-09-12T10:36:51.38167595+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
18
+ {"time":"2025-09-12T10:38:06.370172425+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
19
+ {"time":"2025-09-12T10:38:38.465480726+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
20
+ {"time":"2025-09-12T10:39:07.484991796+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
21
+ {"time":"2025-09-12T10:39:41.575653023+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
22
+ {"time":"2025-09-12T20:16:55.628544216+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
23
+ {"time":"2025-09-12T20:21:25.750812333+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
24
+ {"time":"2025-09-12T22:43:55.97454382+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
25
+ {"time":"2025-09-13T00:52:11.684482933+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
26
+ {"time":"2025-09-13T00:54:54.045134291+01:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql","body":"{\"errors\":[{\"message\":\"context deadline exceeded\",\"path\":[\"project\",\"run\"]}],\"data\":{\"project\":{\"run\":null}}}"}
27
+ {"time":"2025-09-13T00:55:26.197593179+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
28
+ {"time":"2025-09-13T01:05:42.010380611+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
29
+ {"time":"2025-09-13T01:06:14.056932921+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
30
+ {"time":"2025-09-13T01:06:48.575121732+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
31
+ {"time":"2025-09-13T01:07:28.074495024+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
32
+ {"time":"2025-09-13T01:09:42.005493483+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
33
+ {"time":"2025-09-13T01:10:14.454893184+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
34
+ {"time":"2025-09-13T01:10:49.419226595+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
35
+ {"time":"2025-09-13T01:11:21.445954263+01:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/circuit-mtp/mtp/y9v5i9gr/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
36
+ {"time":"2025-09-13T01:11:57.007348427+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
37
+ {"time":"2025-09-13T01:13:57.010172043+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
38
+ {"time":"2025-09-13T01:14:29.220923193+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
39
+ {"time":"2025-09-13T01:15:27.013535251+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
40
+ {"time":"2025-09-13T01:15:59.276998526+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
41
+ {"time":"2025-09-13T01:16:33.628210655+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
42
+ {"time":"2025-09-13T01:17:42.016257241+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
43
+ {"time":"2025-09-13T01:18:14.389776393+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
44
+ {"time":"2025-09-13T01:19:42.02019871+01:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
45
+ {"time":"2025-09-13T01:25:38.927499163+01:00","level":"INFO","msg":"stream: closing","id":"y9v5i9gr"}
46
+ {"time":"2025-09-13T01:25:38.930059685+01:00","level":"INFO","msg":"Stopping system monitor"}
47
+ {"time":"2025-09-13T01:25:38.990179981+01:00","level":"INFO","msg":"Stopped system monitor"}
48
+ {"time":"2025-09-13T01:25:39.717455712+01:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
49
+ {"time":"2025-09-13T01:25:39.93171592+01:00","level":"INFO","msg":"handler: closed","stream_id":"y9v5i9gr"}
50
+ {"time":"2025-09-13T01:25:39.931829957+01:00","level":"INFO","msg":"writer: Close: closed","stream_id":"y9v5i9gr"}
51
+ {"time":"2025-09-13T01:25:39.932853619+01:00","level":"INFO","msg":"sender: closed","stream_id":"y9v5i9gr"}
52
+ {"time":"2025-09-13T01:25:39.932961632+01:00","level":"INFO","msg":"stream: closed","id":"y9v5i9gr"}
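debug-internal.log shows why the run survived a flaky network: wandb's internal service logs transient 502/500 responses and client timeouts against api.wandb.ai at INFO level (clustered overnight on Sept 12-13) and keeps retrying instead of aborting, then shuts the stream, system monitor, writer, and sender down cleanly at the end. A rough sketch of the same retry-on-transient-error pattern with jittered exponential backoff; this is an illustration, not wandb's actual implementation:

```python
import random
import time

TRANSIENT = {500, 502, 503, 504}  # status codes worth retrying, cf. the log above

def post_with_retries(do_post, max_retries=8, base_delay=1.0):
    """Call do_post() until it succeeds, retrying transient failures.

    do_post is any zero-argument callable returning a response object
    with a .status_code attribute (e.g. a bound requests.post call).
    """
    for attempt in range(max_retries):
        try:
            resp = do_post()
            if resp.status_code not in TRANSIENT:
                return resp
            print(f"retrying HTTP error status={resp.status_code}")
        except TimeoutError as err:
            print(f"retrying error: {err}")
        # Exponential backoff with jitter, capped at 60 s.
        time.sleep(min(base_delay * 2 ** attempt, 60.0) * random.uniform(0.5, 1.5))
    raise RuntimeError("exhausted retries")
```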
wandb/run-20250911_200644-y9v5i9gr/logs/debug.log ADDED
@@ -0,0 +1,23 @@
1
+ 2025-09-11 20:06:44,916 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Configure stats pid to 2716293
3
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from /home/agrivas/.config/wandb/settings
4
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/settings
5
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:setup_run_log_directory():647] Logging user logs to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug.log
7
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to /disk/scratch/agrivas/nanoGPT/logs/2025-09-11/20-06-16/wandb/run-20250911_200644-y9v5i9gr/logs/debug-internal.log
8
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():761] calling init triggers
9
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'compile': True, 'device': 'cuda', 'from_checkpoint': None, 'name': 'nanogpt', 'training': {'random_seed': 13, 'batch_size': 256, 'device_batch_size': 1, 'sequence_length': 8192, 'num_iterations': 900, 'learning_rate': 0.0003, 'use_scheduler': False, 'save_model': True, 'save_optimizer': True, 'save_model_every': 100, 'val_loss_every': 100, 'val_tokens': 4194304, 'expname': 'lr-3e-4-no-lora-cp-n-8-r-8'}, 'model': {'name': 'mtp', 'beta': 0.0, 'gamma': 1, 'kl_algorithm': 'full', 'kl_type': 'forward', 'model': {'_target_': 'mtp.models.mtp.MultiTokenLM', 'lm': '${lm.model}', 'circuit': '${circuit.model}', 'mt_head_kwargs': '${mt_head.hyperparameters}', 'init_from_lm_head': True, 'kl_type': '${model.kl_type}', 'kl_algorithm': '${model.kl_algorithm}', 'beta': 0, 'gamma': 0.9}}, 'circuit': {'name': 'cp', 'n_token': 8, 'n_component': 8, 'model': {'_target_': 'mtp.models.circuits.CircuitModel', 'vocab_size': 320, 'n_token': 8, 'n_component': 8, 'kind': 'cp'}}, 'mt_head': {'name': 'linear-evabyte', 'hyperparameters': {'type': 'evabyte', 'n_embd': 4096, 'transformer_n_head': 32, 'transformer_n_layer': 0, 'expander_type': 'linear', 'expander_n_layer': 1, 'freeze_vocab_unembedding': False, 'share_sum_weights': False, 'contextual_hmm_weights': True, 'init_hmm_identity': True}}, 'adaptor': {'name': 'none', 'hyperparameters': None}, 'lm': {'name': 'evabyte', 'n_embd': 4096, 'n_head': 32, 'model': {'_target_': 'mtp.models.lm.LM', 'lm': None, 'encoder_only': True, 'from_checkpoint': None, 'from_huggingface': 'EvaByte/EvaByte-SFT', 'adaptor_kwargs': None, 'ref_enc': 'model', 'ref_head': 'lm_head', 'freeze': True}}, 'data': {'name': 'tulu3-evabyte', 'train_bin': 'agrv/tulu-v3-sft-evabyte-packed-seq-len-8192', 'val_bin': None, 'vocab_size': 320}, 'generate': {'speculative': False}, '_wandb': {}}
11
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():784] starting backend
12
+ 2025-09-11 20:06:44,917 INFO MainThread:2716293 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-09-11 20:06:44,948 INFO MainThread:2716293 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-09-11 20:06:44,948 INFO MainThread:2716293 [wandb_init.py:init():798] backend started and connected
15
+ 2025-09-11 20:06:44,953 INFO MainThread:2716293 [wandb_init.py:init():891] updated telemetry
16
+ 2025-09-11 20:06:44,961 INFO MainThread:2716293 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-09-11 20:06:45,675 INFO MainThread:2716293 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-09-11 20:06:46,525 INFO MainThread:2716293 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-09-11 20:06:46,526 INFO MainThread:2716293 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-09-11 20:06:46,533 INFO MainThread:2716293 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-09-11 20:06:46,533 INFO MainThread:2716293 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-09-11 20:06:46,549 INFO MainThread:2716293 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-09-13 01:25:38,827 INFO MsgRouterThr:2716293 [mailbox.py:close():129] Closing mailbox, abandoning 1 handles.
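debug.log records the client side of the same startup in process 2716293: settings are loaded from the user and run directories, the fully resolved Hydra config is attached, the backend is spawned (start method: spawn) with a 90-second handshake timeout, and stdout/stderr are wrapped before control returns to the training script. A minimal sketch of the call that produces this sequence, with entity and project inferred from the file-stream URL (api.wandb.ai/files/circuit-mtp/mtp/y9v5i9gr) in debug-internal.log; the config here is abridged from the full dict logged above, and the run id itself is assigned server-side:

```python
import wandb

# Abridged resolved Hydra config; the full dict appears in debug.log above.
config = {
    "compile": True,
    "training": {"batch_size": 256, "sequence_length": 8192,
                 "num_iterations": 900, "learning_rate": 3e-4},
}

run = wandb.init(entity="circuit-mtp", project="mtp", config=config)
# ... training loop logs train/loss and train/ce_loss_at_{1..8} ...
run.finish()
```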
wandb/run-20250911_200644-y9v5i9gr/run-y9v5i9gr.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:049e97c6350380a1f5f79cdc0c647b0d0e33cacb013dc81b82c059a2c5672f20
3
+ size 14919748
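Per the .gitattributes rule added in this commit, the binary .wandb transaction log is stored through Git LFS, so the repo itself holds only this three-line pointer: the spec version, the SHA-256 of the real object, and its size in bytes (~14.9 MB). A small sketch that parses such a pointer and checks a downloaded copy against it; it assumes the checkout still contains the raw pointer text (e.g. cloned with GIT_LFS_SKIP_SMUDGE=1), and the local blob path is hypothetical:

```python
import hashlib
from pathlib import Path

def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS pointer file into its version, hash, and size."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "algo": algo,
            "oid": digest, "size": int(fields["size"])}

pointer = parse_lfs_pointer(
    Path("wandb/run-20250911_200644-y9v5i9gr/run-y9v5i9gr.wandb").read_text()
)

# Hypothetical local copy fetched via `git lfs pull` or huggingface_hub.
blob = Path("run-y9v5i9gr.wandb.downloaded").read_bytes()
assert len(blob) == pointer["size"]
assert hashlib.sha256(blob).hexdigest() == pointer["oid"]
print("LFS object verified:", pointer["oid"][:12], f"({pointer['size']} bytes)")
```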