# NanoChat Easy - SFT Training


## Load the model and tokenizer


In [None]:
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, get_linear_schedule_with_warmup


# Checkpoint to load from the Hub. `revision` selects the "refs/pr/1" ref
# instead of the repository's default branch.
model_id = "karpathy/nanochat-d32"
revision = "refs/pr/1"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
# Weights are loaded as bfloat16 on CUDA and float32 on CPU, then moved to `device`.
# `dtype` is used instead of `torch_dtype`: the installed transformers version
# reports `torch_dtype` as deprecated and asks for `dtype`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    dtype=torch.bfloat16 if device.type == "cuda" else torch.float32,
).to(device)


 from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!


## Demo the model


In [None]:
# Sanity check 1: continue a raw text prompt (no chat template) with sampling disabled.
rule = "=" * 80
print(rule)
print("TEST 1: Plain Autoregressive Prompt")
print(rule)

prompt = "The Eiffel Tower stands in Paris and"
test_inputs = tokenizer(prompt, return_tensors="pt").to(device)
# Number of prompt tokens; used below to separate the prompt from the continuation.
prompt_length = test_inputs["input_ids"].shape[1]

# No gradients are needed for generation.
with torch.no_grad():
    test_outputs = model.generate(
        **test_inputs,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=False,
        max_new_tokens=64,
    )

# Keep only the positions after the prompt for the first (and only) sequence.
generated_tokens = test_outputs[0, prompt_length:]
print(f"Prompt: {prompt}")
print(f"\nGenerated: {tokenizer.decode(generated_tokens, skip_special_tokens=True)}")
print(rule)


TEST 1: Plain Autoregressive Prompt
Prompt: The Eiffel Tower stands in Paris and

Generated: is one of the most famous landmarks in the world. It is located on the Champ de Mars in the heart of the city. The tower was built for the 1889 World's Fair. It was designed by the French engineer Gustave Eiffel and took 2 years to build. The Eiffel Tower stands 324 meters


In [None]:
# Sanity check 2: format a single user turn with the tokenizer's chat template
# and let the model produce the assistant reply with sampling disabled.
rule = "=" * 80
print(rule)
print("TEST 2: Chat Template")
print(rule)

conversation = [{"role": "user", "content": "What is the capital of France?"}]

# Tokenize through the chat template and append the generation prompt so the
# model continues from the start of the assistant turn.
inputs = tokenizer.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(device)

# Show both the rendered prompt (special tokens included) and its raw token ids.
print(f"Formatted prompt: {tokenizer.decode(inputs['input_ids'][0])}")
print(f"Input IDs: {inputs['input_ids'][0].tolist()}")

# No gradients are needed for generation.
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64, do_sample=False)

# Keep only the positions after the prompt; special tokens are left in the
# decoded text on purpose so the end-of-turn marker is visible.
chat_prompt_length = inputs["input_ids"].shape[1]
generated_tokens = outputs[0, chat_prompt_length:]
print(f"\nGenerated: {tokenizer.decode(generated_tokens)}")
print(rule)


TEST 2: Chat Template
Formatted prompt: <|bos|><|user_start|>What is the capital of France?<|user_end|><|assistant_start|>
Input IDs: [65527, 65528, 1442, 309, 261, 3429, 281, 4215, 63, 65529, 65530]

Generated: The capital of France is Paris.<|assistant_end|>
