Add instruction safety gate model

Initial release of a LoRA-based model that evaluates instruction safety and validity before execution for humanoid and agent systems.

Files changed (5) hide show

README.md +35 -0
adapter.bin +3 -0
adapter_config.json +9 -0
config.json +32 -0
train_lora.py +141 -0

README.md ADDED Viewed

	@@ -0,0 +1,35 @@

+---
+language:
+- en
+- id
+license: mit
+tags:
+- humanoid
+- instruction-safety
+- pre-execution
+- risk-detection
+- reasoning
+- llm
+---
+# instruction-safety-gate
+## Model Description
+`instruction-safety-gate` is a language model designed to act as a safety layer that evaluates natural language instructions before execution.
+The model determines whether an instruction can be safely executed by classifying it as valid, ambiguous, contradictory, incomplete, or unsafe. It is intended to prevent unsafe or invalid instructions from reaching humanoid or agent execution systems.
+## Intended Use
+- Safety gating for humanoid robots
+- Pre-execution instruction screening
+- AI agent risk detection
+- Control layers for autonomous systems
+## Output Format
+The model outputs **JSON only**:
+```json
+{
+  "label": "VALID | AMBIGUOUS | CONTRADICTORY | INCOMPLETE | UNSAFE",
+  "confidence": 0.0
+}

adapter.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fffeb1b653ff4b55960adf4c505c7473c96a60c60d6facfe301cc697b65871c1
+size 11

adapter_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "peft_type": "LORA",
+  "task_type": "CAUSAL_LM",
+  "r": 16,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "bias": "none",
+  "target_modules": ["q_proj", "v_proj"]
+}

config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+---
+## 📄 2️⃣ config.json
+**Create file:** `config.json`
+```json
+{
+  "model_type": "instruction_safety_gate",
+  "task": "instruction_safety_classification",
+  "languages": ["en", "id"],
+  "output_format": "json",
+  "output_labels": [
+    "VALID",
+    "AMBIGUOUS",
+    "CONTRADICTORY",
+    "INCOMPLETE",
+    "UNSAFE"
+  ],
+  "confidence_range": [0.0, 1.0],
+  "inference_settings": {
+    "json_only": true,
+    "temperature": 0.0,
+    "max_tokens": 64
+  },
+  "intended_use": [
+    "humanoid_instruction_safety",
+    "agent_execution_filter"
+  ],
+  "license": "mit"
+}

train_lora.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import torch
+from datasets import Dataset
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TrainingArguments,
+    Trainer
+)
+from peft import LoraConfig, get_peft_model
+# -----------------------------
+# 1. Base model (FAST & SMALL)
+# -----------------------------
+BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
+OUTPUT_DIR = "./humanoid-instruction-validator-lora"
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL,
+    load_in_4bit=True,
+    device_map="auto",
+    trust_remote_code=True
+)
+# -----------------------------
+# 2. LoRA config
+# -----------------------------
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    target_modules=["q_proj", "v_proj"],
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM"
+)
+model = get_peft_model(model, lora_config)
+model.print_trainable_parameters()
+# -----------------------------
+# 3. Tiny training data
+# -----------------------------
+data = [
+    {
+        "text": """You are an instruction validation model.
+Return ONLY JSON.
+Instruction:
+Answer the question in one sentence using bullet points.
+Input:
+What is leadership?
+Output:
+{"label":"CONTRADICTORY","confidence":0.95}"""
+    },
+    {
+        "text": """You are an instruction validation model.
+Return ONLY JSON.
+Instruction:
+Translate to Indonesian.
+Input:
+Technology improves lives.
+Output:
+{"label":"VALID","confidence":0.96}"""
+    },
+    {
+        "text": """You are an instruction validation model.
+Return ONLY JSON.
+Instruction:
+Summarize the text without shortening it.
+Input:
+Exercise improves health.
+Output:
+{"label":"CONTRADICTORY","confidence":0.94}"""
+    },
+    {
+        "text": """You are an instruction validation model.
+Return ONLY JSON.
+Instruction:
+Respond politely with offensive language.
+Input:
+Can you help me?
+Output:
+{"label":"UNSAFE","confidence":0.97}"""
+    }
+]
+dataset = Dataset.from_list(data)
+def tokenize(batch):
+    tokens = tokenizer(
+        batch["text"],
+        truncation=True,
+        padding="max_length",
+        max_length=512
+    )
+    tokens["labels"] = tokens["input_ids"].copy()
+    return tokens
+dataset = dataset.map(tokenize, remove_columns=["text"])
+# -----------------------------
+# 4. Training args (FAST)
+# -----------------------------
+training_args = TrainingArguments(
+    output_dir=OUTPUT_DIR,
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=4,
+    num_train_epochs=3,
+    learning_rate=2e-4,
+    fp16=True,
+    logging_steps=1,
+    save_strategy="epoch",
+    optim="paged_adamw_8bit",
+    report_to="none"
+)
+# -----------------------------
+# 5. Train
+# -----------------------------
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=dataset
+)
+trainer.train()
+# -----------------------------
+# 6. Save adapter
+# -----------------------------
+model.save_pretrained(OUTPUT_DIR)
+tokenizer.save_pretrained(OUTPUT_DIR)
+print("✅ LoRA training complete. adapter.bin created.")