| import os | |
| from datasets import load_dataset | |
# --- Configuration ---
# Your Hugging Face username is needed to create the new dataset repo.
# Make sure you are logged in via `huggingface-cli login`.
# The destination repo lives under this account, so the username must be
# known before anything else runs; fail immediately when it is absent or empty.
HF_USERNAME = os.environ.get("HF_USERNAME")
if HF_USERNAME is None or HF_USERNAME == "":
    raise ValueError("Please set the HF_USERNAME environment variable.")

# Hub identifier of the dataset that is read.
SOURCE_DATASET = "meta-math/MetaMathQA"

# Hub identifier of the dataset that is written: "<username>/MetaMathQA-formatted".
NEW_DATASET_NAME = "{}/MetaMathQA-formatted".format(HF_USERNAME)
# --- Formatting Function ---
# This function creates the single 'text' column the training script needs.
def format_prompt(example):
    """Build the single-column training record for one dataset row.

    Args:
        example: Mapping that provides a 'query' entry and a 'response' entry.

    Returns:
        A dict with one key, 'text', holding the question and the answer
        joined into a "Question: ...", blank line, "Answer: ..." prompt.

    Raises:
        KeyError: If 'query' or 'response' is missing from ``example``.
    """
    # A plain question/answer layout; ideally this should mirror the prompt
    # style the base model was trained with.
    question = example["query"]
    answer = example["response"]
    prompt = "Question: {}\n\nAnswer: {}".format(question, answer)
    return {"text": prompt}
# --- Main Script ---
def main() -> None:
    """Load the source dataset, reduce it to one 'text' column, and push it to the Hub.

    Steps:
        1. Load the 'train' split of ``SOURCE_DATASET``.
        2. Map ``format_prompt`` over every row, dropping all source columns.
        3. Push the result to ``NEW_DATASET_NAME``.

    Pushing requires an authenticated Hugging Face session
    (``huggingface-cli login``).
    """
    print(f"Loading original dataset '{SOURCE_DATASET}'...")
    dataset = load_dataset(SOURCE_DATASET, split="train")

    print("Formatting dataset...")
    # Drop every source column in the same pass instead of naming them by hand.
    # A hard-coded list leaves any column it does not mention in the pushed
    # dataset and raises if a listed column is absent. `map` removes the listed
    # columns before adding the function's output, so the new 'text' column
    # is kept.
    formatted_dataset = dataset.map(
        format_prompt,
        remove_columns=dataset.column_names,
    )

    print(f"Pushing formatted dataset to '{NEW_DATASET_NAME}'...")
    formatted_dataset.push_to_hub(NEW_DATASET_NAME)

    print("\n✅ Success! Your formatted dataset is ready on the Hub.")
    print("You can now update your train.sh script.")


if __name__ == "__main__":
    main()