robbiemu
/

smollm3-sft-math-tuned

Text Generation

Generated from Trainer

Model card Files Files and versions

smollm3-sft-math-tuned / format_dataset.py

robbiemu's picture

scripts used in generating my submission

3eb9439 verified about 2 months ago

history blame contribute delete

1.45 kB

	import os
	from datasets import load_dataset

	# --- Configuration ---
	# Your Hugging Face username is needed to create the new dataset repo.
	# Make sure you are logged in via `huggingface-cli login`.
	HF_USERNAME = os.getenv("HF_USERNAME")
	if not HF_USERNAME:
	    # Fail fast when the variable is unset or empty: the repo id below is
	    # built from it and would otherwise start with "None/" or "/".
	    raise ValueError("Please set the HF_USERNAME environment variable.")

	# Dataset id passed to `load_dataset`, and the repo id the formatted copy is
	# pushed to.
	SOURCE_DATASET = "meta-math/MetaMathQA"
	NEW_DATASET_NAME = f"{HF_USERNAME}/MetaMathQA-formatted"


	# --- Formatting Function ---
	# This function creates the single 'text' column the training script needs.
	def format_prompt(example):
	    """Build the single ``text`` field for one dataset row.

	    Args:
	        example: Mapping that provides ``query`` and ``response`` entries.
	            Both values are interpolated into the prompt unchanged.

	    Returns:
	        A dict with exactly one key, ``text``, containing the query prefixed
	        with ``Question: `` and the response prefixed with ``Answer: ``,
	        separated by one blank line.

	    Raises:
	        KeyError: If ``example`` is a dict lacking ``query`` or ``response``.
	    """
	    # The prompt format should ideally match the base model's training style.
	    # A simple question/answer format is a good starting point.
	    return {"text": f"Question: {example['query']}\n\nAnswer: {example['response']}"}


	# --- Main Script ---
	def main():
	    """Load the source dataset, add the ``text`` column, and push it to the Hub.

	    Steps, in order: load the ``train`` split of ``SOURCE_DATASET``, map
	    ``format_prompt`` over every row, drop the ``query``/``response``/``type``
	    columns, and push the result to ``NEW_DATASET_NAME``. Progress is
	    reported on stdout.
	    """
	    print(f"Loading original dataset '{SOURCE_DATASET}'...")
	    dataset = load_dataset(SOURCE_DATASET, split="train")

	    print("Formatting dataset...")
	    formatted_dataset = dataset.map(format_prompt)

	    # Optional: remove old columns to keep the dataset clean
	    formatted_dataset = formatted_dataset.remove_columns(["query", "response", "type"])

	    print(f"Pushing formatted dataset to '{NEW_DATASET_NAME}'...")
	    formatted_dataset.push_to_hub(NEW_DATASET_NAME)

	    print("\n✅ Success! Your formatted dataset is ready on the Hub.")
	    print("You can now update your train.sh script.")


	if __name__ == "__main__":
	    main()