from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch


def tokenize_function(examples):
    # Tokenize to fixed-length sequences. Only input_ids/attention_mask are
    # produced here; the data collator builds the causal-LM labels later.
    # return_tensors="pt" is dropped: datasets.map stores plain lists, and
    # the collator converts each batch to tensors at training time.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )


# Load the base model in bfloat16 to halve memory versus fp32; full
# fine-tuning at 15B still assumes substantial (multi-)GPU memory.
model_name = "bigcode/starcoder2-15b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
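
# Assumption: like most GPT-style checkpoints, this tokenizer may ship
# without a pad token, which padding="max_length" above requires. Reusing
# EOS as the pad token is the usual workaround.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token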


# Tokenize the corpus, dropping the raw text columns so the Trainer only
# sees model inputs.
dataset = load_dataset("officialweaver/code")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
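
# Assumption: the dataset may ship only a "train" split. If so, carve out a
# small validation set so the eval/save/best-model logic below has data.
if "validation" not in tokenized_dataset:
    split = tokenized_dataset["train"].train_test_split(test_size=0.05, seed=42)
    tokenized_dataset = {"train": split["train"], "validation": split["test"]}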


training_args = TrainingArguments(
    output_dir="./starcoder-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size of 16 per device
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="steps",  # renamed to eval_strategy in newer transformers
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    bf16=True,  # match the bfloat16 weights loaded above; fp16 would conflict
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # mlm=False makes the collator emit causal-LM labels (inputs shifted by
    # one) and mask out pad positions with -100.
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    ),
)
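
# Optional sketch: since load_best_model_at_end tracks eval_loss, early
# stopping pairs naturally with it. The patience value here is illustrative.
from transformers import EarlyStoppingCallback

trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))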


trainer.train()


trainer.save_model("./starcoder-finetuned-final")
# Save the tokenizer alongside the weights so the checkpoint loads standalone.
tokenizer.save_pretrained("./starcoder-finetuned-final")
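
# Quick smoke test with the best checkpoint (reloaded into `model` by
# load_best_model_at_end). The prompt is a placeholder, not from the dataset.
prompt = "def fibonacci(n):"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))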