Add instruction safety gate model
Browse filesInitial release of a LoRA-based model that evaluates instruction safety and validity before execution for humanoid and agent systems.
- README.md +35 -0
- adapter.bin +3 -0
- adapter_config.json +9 -0
- config.json +32 -0
- train_lora.py +141 -0
README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
- id
|
| 5 |
+
license: mit
|
| 6 |
+
tags:
|
| 7 |
+
- humanoid
|
| 8 |
+
- instruction-safety
|
| 9 |
+
- pre-execution
|
| 10 |
+
- risk-detection
|
| 11 |
+
- reasoning
|
| 12 |
+
- llm
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# instruction-safety-gate
|
| 16 |
+
|
| 17 |
+
## Model Description
|
| 18 |
+
`instruction-safety-gate` is a language model designed to act as a safety layer that evaluates natural language instructions before execution.
|
| 19 |
+
|
| 20 |
+
The model determines whether an instruction can be safely executed by classifying it as valid, ambiguous, contradictory, incomplete, or unsafe. It is intended to prevent unsafe or invalid instructions from reaching humanoid or agent execution systems.
|
| 21 |
+
|
| 22 |
+
## Intended Use
|
| 23 |
+
- Safety gating for humanoid robots
|
| 24 |
+
- Pre-execution instruction screening
|
| 25 |
+
- AI agent risk detection
|
| 26 |
+
- Control layers for autonomous systems
|
| 27 |
+
|
| 28 |
+
## Output Format
|
| 29 |
+
The model outputs **JSON only**:
|
| 30 |
+
|
| 31 |
+
```json
|
| 32 |
+
{
|
| 33 |
+
"label": "VALID | AMBIGUOUS | CONTRADICTORY | INCOMPLETE | UNSAFE",
|
| 34 |
+
"confidence": 0.0
|
| 35 |
+
}
|
adapter.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fffeb1b653ff4b55960adf4c505c7473c96a60c60d6facfe301cc697b65871c1
|
| 3 |
+
size 11
|
adapter_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"peft_type": "LORA",
|
| 3 |
+
"task_type": "CAUSAL_LM",
|
| 4 |
+
"r": 16,
|
| 5 |
+
"lora_alpha": 32,
|
| 6 |
+
"lora_dropout": 0.05,
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"target_modules": ["q_proj", "v_proj"]
|
| 9 |
+
}
|
config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
---
|
| 3 |
+
|
| 4 |
+
## 📄 2️⃣ config.json
|
| 5 |
+
|
| 6 |
+
**Create file:** `config.json`
|
| 7 |
+
|
| 8 |
+
```json
|
| 9 |
+
{
|
| 10 |
+
"model_type": "instruction_safety_gate",
|
| 11 |
+
"task": "instruction_safety_classification",
|
| 12 |
+
"languages": ["en", "id"],
|
| 13 |
+
"output_format": "json",
|
| 14 |
+
"output_labels": [
|
| 15 |
+
"VALID",
|
| 16 |
+
"AMBIGUOUS",
|
| 17 |
+
"CONTRADICTORY",
|
| 18 |
+
"INCOMPLETE",
|
| 19 |
+
"UNSAFE"
|
| 20 |
+
],
|
| 21 |
+
"confidence_range": [0.0, 1.0],
|
| 22 |
+
"inference_settings": {
|
| 23 |
+
"json_only": true,
|
| 24 |
+
"temperature": 0.0,
|
| 25 |
+
"max_tokens": 64
|
| 26 |
+
},
|
| 27 |
+
"intended_use": [
|
| 28 |
+
"humanoid_instruction_safety",
|
| 29 |
+
"agent_execution_filter"
|
| 30 |
+
],
|
| 31 |
+
"license": "mit"
|
| 32 |
+
}
|
train_lora.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from datasets import Dataset
|
| 3 |
+
from transformers import (
|
| 4 |
+
AutoModelForCausalLM,
|
| 5 |
+
AutoTokenizer,
|
| 6 |
+
TrainingArguments,
|
| 7 |
+
Trainer
|
| 8 |
+
)
|
| 9 |
+
from peft import LoraConfig, get_peft_model
|
| 10 |
+
|
| 11 |
+
# -----------------------------
|
| 12 |
+
# 1. Base model (FAST & SMALL)
|
| 13 |
+
# -----------------------------
|
| 14 |
+
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
|
| 15 |
+
OUTPUT_DIR = "./humanoid-instruction-validator-lora"
|
| 16 |
+
|
| 17 |
+
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
| 18 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 19 |
+
BASE_MODEL,
|
| 20 |
+
load_in_4bit=True,
|
| 21 |
+
device_map="auto",
|
| 22 |
+
trust_remote_code=True
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
# -----------------------------
|
| 26 |
+
# 2. LoRA config
|
| 27 |
+
# -----------------------------
|
| 28 |
+
lora_config = LoraConfig(
|
| 29 |
+
r=16,
|
| 30 |
+
lora_alpha=32,
|
| 31 |
+
target_modules=["q_proj", "v_proj"],
|
| 32 |
+
lora_dropout=0.05,
|
| 33 |
+
bias="none",
|
| 34 |
+
task_type="CAUSAL_LM"
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
model = get_peft_model(model, lora_config)
|
| 38 |
+
model.print_trainable_parameters()
|
| 39 |
+
|
| 40 |
+
# -----------------------------
|
| 41 |
+
# 3. Tiny training data
|
| 42 |
+
# -----------------------------
|
| 43 |
+
data = [
|
| 44 |
+
{
|
| 45 |
+
"text": """You are an instruction validation model.
|
| 46 |
+
Return ONLY JSON.
|
| 47 |
+
|
| 48 |
+
Instruction:
|
| 49 |
+
Answer the question in one sentence using bullet points.
|
| 50 |
+
Input:
|
| 51 |
+
What is leadership?
|
| 52 |
+
|
| 53 |
+
Output:
|
| 54 |
+
{"label":"CONTRADICTORY","confidence":0.95}"""
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"text": """You are an instruction validation model.
|
| 58 |
+
Return ONLY JSON.
|
| 59 |
+
|
| 60 |
+
Instruction:
|
| 61 |
+
Translate to Indonesian.
|
| 62 |
+
Input:
|
| 63 |
+
Technology improves lives.
|
| 64 |
+
|
| 65 |
+
Output:
|
| 66 |
+
{"label":"VALID","confidence":0.96}"""
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"text": """You are an instruction validation model.
|
| 70 |
+
Return ONLY JSON.
|
| 71 |
+
|
| 72 |
+
Instruction:
|
| 73 |
+
Summarize the text without shortening it.
|
| 74 |
+
Input:
|
| 75 |
+
Exercise improves health.
|
| 76 |
+
|
| 77 |
+
Output:
|
| 78 |
+
{"label":"CONTRADICTORY","confidence":0.94}"""
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"text": """You are an instruction validation model.
|
| 82 |
+
Return ONLY JSON.
|
| 83 |
+
|
| 84 |
+
Instruction:
|
| 85 |
+
Respond politely with offensive language.
|
| 86 |
+
Input:
|
| 87 |
+
Can you help me?
|
| 88 |
+
|
| 89 |
+
Output:
|
| 90 |
+
{"label":"UNSAFE","confidence":0.97}"""
|
| 91 |
+
}
|
| 92 |
+
]
|
| 93 |
+
|
| 94 |
+
dataset = Dataset.from_list(data)
|
| 95 |
+
|
| 96 |
+
def tokenize(batch):
|
| 97 |
+
tokens = tokenizer(
|
| 98 |
+
batch["text"],
|
| 99 |
+
truncation=True,
|
| 100 |
+
padding="max_length",
|
| 101 |
+
max_length=512
|
| 102 |
+
)
|
| 103 |
+
tokens["labels"] = tokens["input_ids"].copy()
|
| 104 |
+
return tokens
|
| 105 |
+
|
| 106 |
+
dataset = dataset.map(tokenize, remove_columns=["text"])
|
| 107 |
+
|
| 108 |
+
# -----------------------------
|
| 109 |
+
# 4. Training args (FAST)
|
| 110 |
+
# -----------------------------
|
| 111 |
+
training_args = TrainingArguments(
|
| 112 |
+
output_dir=OUTPUT_DIR,
|
| 113 |
+
per_device_train_batch_size=1,
|
| 114 |
+
gradient_accumulation_steps=4,
|
| 115 |
+
num_train_epochs=3,
|
| 116 |
+
learning_rate=2e-4,
|
| 117 |
+
fp16=True,
|
| 118 |
+
logging_steps=1,
|
| 119 |
+
save_strategy="epoch",
|
| 120 |
+
optim="paged_adamw_8bit",
|
| 121 |
+
report_to="none"
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
# -----------------------------
|
| 125 |
+
# 5. Train
|
| 126 |
+
# -----------------------------
|
| 127 |
+
trainer = Trainer(
|
| 128 |
+
model=model,
|
| 129 |
+
args=training_args,
|
| 130 |
+
train_dataset=dataset
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
trainer.train()
|
| 134 |
+
|
| 135 |
+
# -----------------------------
|
| 136 |
+
# 6. Save adapter
|
| 137 |
+
# -----------------------------
|
| 138 |
+
model.save_pretrained(OUTPUT_DIR)
|
| 139 |
+
tokenizer.save_pretrained(OUTPUT_DIR)
|
| 140 |
+
|
| 141 |
+
print("✅ LoRA training complete. adapter.bin created.")
|