| | import json, os, math, random |
| | from dataclasses import dataclass |
| | from typing import Dict, List, Any |
| |
|
| | import numpy as np |
| | from datasets import Dataset, DatasetDict |
| | from transformers import (AutoTokenizer, AutoModelForSequenceClassification, |
| | DataCollatorWithPadding, TrainingArguments, Trainer) |
| | import evaluate |
| | from sklearn.metrics import precision_recall_fscore_support |
| |
|
| | |
| | |
| | |
| | MODEL_NAME = "bert-base-uncased" |
| | LABELS = ["mentorship", "entrepreneurship", "startup success"] |
| | TEXT_FIELDS = ["original_text", "summary"] |
| | SEED = 42 |
| | HF_REPO_ID = "4hnk/theme-multilabel-model" |
| |
|
| | random.seed(SEED) |
| | np.random.seed(SEED) |
| |
|
| | |
| | |
| | |
| | |
| | DATA_PATH = "theme_response.json" |
| |
|
| | with open(DATA_PATH, "r", encoding="utf-8") as f: |
| | data = json.load(f)["knowledge_theme_training_data"] |
| |
|
def to_example(row: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one raw annotation row into a {"text", "labels"} example.

    Concatenates the TEXT_FIELDS that are present and truthy into a single
    space-joined string, and encodes the row's "themes" list as a multi-hot
    vector over LABELS.

    The labels are floats, not ints: with
    problem_type="multi_label_classification" the model trains with
    BCEWithLogitsLoss, which requires float targets — integer labels make
    the loss raise a dtype error ("result type Long can't be cast ...")
    at the first training step.

    Args:
        row: one annotation record; may lack any of the optional fields.

    Returns:
        dict with "text" (stripped str) and "labels" (list[float], one
        0.0/1.0 entry per label in LABELS, in LABELS order).
    """
    text = " ".join(row.get(k, "") for k in TEXT_FIELDS if row.get(k))
    themes = row.get("themes", [])
    y = [1.0 if lbl in themes else 0.0 for lbl in LABELS]
    return {"text": text.strip(), "labels": y}
| |
|
# Keep only rows that actually have source text, then convert each to a
# {"text", "labels"} example.
examples = [to_example(r) for r in data if r.get("original_text")]
ds_full = Dataset.from_list(examples)

# Deterministic 80/20 train/validation split: seeded shuffle, then slice.
# NOTE(review): with fewer than 5 examples the validation split is empty
# (max(1, ...) only protects the train side) — per-epoch evaluation would
# then fail; confirm the dataset is large enough.
ds_full = ds_full.shuffle(seed=SEED)
n = len(ds_full)
n_train = max(1, int(0.8 * n))
ds = DatasetDict({
    "train": ds_full.select(range(n_train)),
    "validation": ds_full.select(range(n_train, n))
})
| |
|
| | |
| | |
| | |
# --- Tokenization -------------------------------------------------------
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    # Truncate to the model's max length; padding is deliberately deferred
    # to DataCollatorWithPadding so each batch pads only to its own longest
    # sequence instead of a global max.
    return tok(batch["text"], truncation=True)

# Tokenize both splits and drop the raw "text" column — the model consumes
# input_ids/attention_mask, and the "labels" column passes through unchanged.
ds = ds.map(tokenize, batched=True, remove_columns=["text"])
data_collator = DataCollatorWithPadding(tokenizer=tok)
| |
|
| | |
| | |
| | |
# --- Model --------------------------------------------------------------
# problem_type="multi_label_classification" switches the loss to
# BCEWithLogitsLoss (independent sigmoid per label) instead of softmax CE.
# NOTE(review): BCEWithLogitsLoss requires *float* targets — verify the
# dataset's "labels" are floats or training fails with a dtype error.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    problem_type="multi_label_classification"
)
# Store readable label names on the config so pipelines and the Hub widget
# show theme names instead of LABEL_0..LABEL_2.
model.config.id2label = {i: l for i, l in enumerate(LABELS)}
model.config.label2id = {l: i for i, l in enumerate(LABELS)}
| |
|
| | |
| | |
| | |
# NOTE(review): this accuracy metric is loaded but never used below —
# compute_metrics relies on sklearn instead. Consider removing the load
# (it can trigger a network fetch) or actually wiring it into the metrics.
metric = evaluate.load("accuracy")
| |
|
def sigmoid(x):
    """Numerically stable elementwise logistic sigmoid, 1 / (1 + exp(-x)).

    The naive form overflows np.exp for large negative inputs (x < ~-709
    raises a RuntimeWarning and loses precision). Splitting on the sign of
    x keeps exp's argument <= 0 in both branches, so it never overflows.

    Args:
        x: scalar or array-like of logits.

    Returns:
        np.ndarray of float64 probabilities in [0, 1], same shape as x.
    """
    x = np.asarray(x, dtype=np.float64)
    out = np.empty_like(x)
    neg = x < 0
    # x >= 0: exp(-x) <= 1, safe.
    out[~neg] = 1.0 / (1.0 + np.exp(-x[~neg]))
    # x < 0: use the algebraically equivalent exp(x) / (1 + exp(x)).
    ex = np.exp(x[neg])
    out[neg] = ex / (1.0 + ex)
    return out
| |
|
def compute_metrics(eval_pred, threshold=0.5):
    """Micro- and macro-averaged precision/recall/F1 for multi-label eval.

    Logits are mapped to probabilities with a sigmoid and binarized at
    `threshold`; scores are computed against the multi-hot ground truth.

    Args:
        eval_pred: (logits, labels) pair as supplied by the Trainer.
        threshold: probability cutoff for predicting a label as present.

    Returns:
        dict with "micro/..." and "macro/..." precision, recall, and f1.
    """
    logits, labels = eval_pred
    preds = (sigmoid(logits) >= threshold).astype(int)

    scores = {}
    for avg in ("micro", "macro"):
        p, r, f1, _ = precision_recall_fscore_support(
            labels, preds, average=avg, zero_division=0
        )
        scores[f"{avg}/precision"] = p
        scores[f"{avg}/recall"] = r
        scores[f"{avg}/f1"] = f1
    return scores
| |
|
| | |
| | |
| | |
# --- Training configuration ---------------------------------------------
# Evaluate and checkpoint every epoch; keep the checkpoint with the best
# validation micro-F1 (Trainer auto-prefixes the metric name with "eval_").
# NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in
# newer transformers releases — confirm against the pinned version.
args = TrainingArguments(
    output_dir="./theme_model_outputs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="micro/f1",
    greater_is_better=True,
    push_to_hub=True,          # checkpoints are pushed to the repo below
    hub_model_id=HF_REPO_ID
)
| |
|
| | |
| | |
| | |
# --- Training ------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=tok,                  # saved alongside checkpoints and pushes
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()
# Final evaluation runs with the best checkpoint reloaded
# (load_best_model_at_end=True in the TrainingArguments above).
trainer.evaluate()

# Upload the final model, tokenizer, and auto-generated model card to
# HF_REPO_ID. Requires prior `huggingface-cli login` or an HF_TOKEN env var.
trainer.push_to_hub()
| |
|