"""Reformat the MetaMathQA dataset into a single ``text`` column and publish it.

Run as a script. It requires the ``HF_USERNAME`` environment variable and an
authenticated Hugging Face session (``huggingface-cli login``).
"""

import os

from datasets import load_dataset

# --- Configuration ---
# Your Hugging Face username is needed to create the new dataset repo.
# Make sure you are logged in via `huggingface-cli login`.
HF_USERNAME = os.getenv("HF_USERNAME")
if not HF_USERNAME:
    # Fail fast: without a username the target repo id below is meaningless.
    raise ValueError("Please set the HF_USERNAME environment variable.")

SOURCE_DATASET = "meta-math/MetaMathQA"
NEW_DATASET_NAME = f"{HF_USERNAME}/MetaMathQA-formatted"


# --- Formatting Function ---
def format_prompt(example):
    """Build the single ``text`` field the training script needs.

    The prompt format should ideally match the base model's training style;
    a simple question/answer layout is a good starting point.

    Args:
        example: One dataset row; must provide the ``query`` and ``response``
            keys.

    Returns:
        A dict with one key, ``text``, holding the formatted prompt.

    Raises:
        KeyError: If ``query`` or ``response`` is missing from ``example``.
    """
    return {"text": f"Question: {example['query']}\n\nAnswer: {example['response']}"}


# --- Main Script ---
def main() -> None:
    """Load the source dataset, format every row, and push the result to the Hub."""
    print(f"Loading original dataset '{SOURCE_DATASET}'...")
    dataset = load_dataset(SOURCE_DATASET, split="train")

    print("Formatting dataset...")
    # Drop every source column in the same pass instead of a hard-coded list:
    # the output then contains only 'text' regardless of the upstream schema.
    # Per the `Dataset.map` docs, `remove_columns` is applied before the
    # function's output is merged in, so the new 'text' column is kept.
    formatted_dataset = dataset.map(
        format_prompt,
        remove_columns=dataset.column_names,
    )

    print(f"Pushing formatted dataset to '{NEW_DATASET_NAME}'...")
    formatted_dataset.push_to_hub(NEW_DATASET_NAME)

    print("\n✅ Success! Your formatted dataset is ready on the Hub.")
    print("You can now update your train.sh script.")


if __name__ == "__main__":
    main()