# smollm3-sft-math-tuned / format_dataset.py
# Uploaded by robbiemu: scripts used in generating my submission.
# Revision 3eb9439 (verified).
import os
from datasets import load_dataset
# --- Configuration ---
# Your Hugging Face username is needed to create the new dataset repo.
# Make sure you are logged in via `huggingface-cli login`.
# Surrounding whitespace is stripped so that a blank or padded value cannot
# slip past the check below and yield an invalid repo id at push time.
HF_USERNAME = (os.getenv("HF_USERNAME") or "").strip()
if not HF_USERNAME:
    raise ValueError("Please set the HF_USERNAME environment variable.")

# Dataset to read from, and the repo id the formatted copy is pushed to.
SOURCE_DATASET = "meta-math/MetaMathQA"
NEW_DATASET_NAME = f"{HF_USERNAME}/MetaMathQA-formatted"
# --- Formatting Function ---
# This function creates the single 'text' column the training script needs.
def format_prompt(example):
    """Build the single ``text`` field from one dataset record.

    Args:
        example: Mapping that provides ``query`` and ``response`` entries.

    Returns:
        A one-key dict ``{"text": ...}`` holding the question and answer
        joined as ``"Question: <query>\\n\\nAnswer: <response>"``.

    Raises:
        KeyError: If ``query`` or ``response`` is missing from ``example``.
    """
    # The prompt format should ideally match the base model's training style;
    # a simple question/answer layout is a good starting point.
    question = example["query"]
    answer = example["response"]
    return {"text": f"Question: {question}\n\nAnswer: {answer}"}
# --- Main Script ---
def main() -> None:
    """Load the source dataset, reduce it to a ``text`` column, and push it.

    Loads the ``train`` split of ``SOURCE_DATASET``, applies
    ``format_prompt`` to every record, and uploads the result to
    ``NEW_DATASET_NAME`` on the Hub, printing progress along the way.
    """
    print(f"Loading original dataset '{SOURCE_DATASET}'...")
    dataset = load_dataset(SOURCE_DATASET, split="train")

    print("Formatting dataset...")
    # Drop every source column in the same pass. A hard-coded list of names
    # leaves behind any column it does not mention (MetaMathQA also ships
    # `original_question`), so the result would not be text-only.
    formatted_dataset = dataset.map(
        format_prompt,
        remove_columns=dataset.column_names,
    )

    print(f"Pushing formatted dataset to '{NEW_DATASET_NAME}'...")
    formatted_dataset.push_to_hub(NEW_DATASET_NAME)

    print("\n✅ Success! Your formatted dataset is ready on the Hub.")
    print("You can now update your train.sh script.")


if __name__ == "__main__":
    main()