import os
from datasets import load_dataset

# --- Configuration ---
# Your Hugging Face username is needed to create the new dataset repo.
# Make sure you are logged in via `huggingface-cli login`.
# Fail fast: the target repo id below cannot be built without a username.
HF_USERNAME = os.environ.get("HF_USERNAME")
if not HF_USERNAME:
    raise ValueError("Please set the HF_USERNAME environment variable.")

# Dataset to read, and the repo id the formatted copy is pushed to.
SOURCE_DATASET = "meta-math/MetaMathQA"
NEW_DATASET_NAME = "/".join((HF_USERNAME, "MetaMathQA-formatted"))


# --- Formatting Function ---
# This function creates the single 'text' column the training script needs.
def format_prompt(example):
    """Build the single ``text`` field for one dataset row.

    Args:
        example: A mapping that provides ``query`` and ``response`` entries.

    Returns:
        A one-key dict ``{"text": ...}`` holding the question and answer in a
        plain "Question: ... / Answer: ..." layout separated by a blank line.
    """
    question = example["query"]
    answer = example["response"]
    # Kept deliberately simple; ideally the layout should mirror the prompt
    # style the base model was trained on.
    text = f"Question: {question}\n\nAnswer: {answer}"
    return {"text": text}


# --- Main Script ---
def main():
    """Load the source dataset, reduce it to a ``text`` column, and push it.

    Reads the ``train`` split of ``SOURCE_DATASET``, applies ``format_prompt``
    to every row, drops all of the source columns, and uploads the result to
    ``NEW_DATASET_NAME`` on the Hugging Face Hub.
    """
    print(f"Loading original dataset '{SOURCE_DATASET}'...")
    dataset = load_dataset(SOURCE_DATASET, split="train")

    print("Formatting dataset...")
    # Remove every source column as part of the map instead of naming them by
    # hand: a hard-coded list fails if one of the names is absent and leaves
    # any unlisted column behind (the upstream data is believed to also carry
    # an `original_question` column -- TODO confirm). Columns produced by
    # `format_prompt` are kept, so only 'text' remains.
    formatted_dataset = dataset.map(
        format_prompt,
        remove_columns=dataset.column_names,
    )

    print(f"Pushing formatted dataset to '{NEW_DATASET_NAME}'...")
    formatted_dataset.push_to_hub(NEW_DATASET_NAME)

    print("\n✅ Success! Your formatted dataset is ready on the Hub.")
    print("You can now update your train.sh script.")


if __name__ == "__main__":
    main()