YAML Metadata Warning: empty or missing YAML metadata in repo card

Check out the documentation for more information.

SenseVoice-Small-ko (Fine-tuned SenseVoiceSmall on EDIE dataset)

์ด ๋ฆฌํฌ์ง€ํ„ฐ๋ฆฌ๋Š” SenseVoiceSmall๋ฅผ ํ•œ๊ตญ์–ด ์Œ์„ฑ/๊ฐ์ •/์ด๋ฒคํŠธ ์ธ์‹์šฉ EDIE ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ํŒŒ์ธํŠœ๋‹ํ•œ ๋ชจ๋ธ์ž…๋‹ˆ๋‹ค.

  • ๋ฒ ์ด์Šค ๋ชจ๋ธ: iic/SenseVoiceSmall
  • ํ…Œ์Šคํฌ: STT (ASR) + Emotion (SER) + Event (AED)
  • ์ฃผ์š” ๋ผ๋ฒจ:
    • ํ…์ŠคํŠธ ๋ผ๋ฒจ
    • ๊ฐ์ • ๋ผ๋ฒจ: <|HAPPY|>, <|SAD|>, <|ANGRY|>, <|NEUTRAL|>, <|FEARFUL|>, <|DISGUSTED|>, <|SURPRISED|>

0. ๋ชจ๋ธ ์ž…์ถœ๋ ฅ ํฌ๋ฉง

์ž…๋ ฅ

  • input: ๋‹จ์ผ wav ๊ฒฝ๋กœ ๋˜๋Š” ๊ฒฝ๋กœ ๋ฆฌ์ŠคํŠธ

์ถœ๋ ฅ ์˜ˆ์‹œ (AutoModel)

  • text: ์ธ์‹๋œ ํ…์ŠคํŠธ
  • language: ์–ธ์–ด ID (<|ko|> ๋“ฑ)
  • emo: ๊ฐ์ • ๋ผ๋ฒจ (<|HAPPY|>, <|SAD|> ๋“ฑ)
  • event: ์ด๋ฒคํŠธ ๋ผ๋ฒจ (<|Speech|>, <|BGM|> ๋“ฑ)

1. ์„ค์น˜

pip install -U "funasr>=1.2.7" torch

GPU๋ฅผ ์‚ฌ์šฉํ•  ๊ฒฝ์šฐ ์‚ฌ์ „์— CUDA ํ˜ธํ™˜ PyTorch๋ฅผ ์„ค์น˜ํ•ด ์ฃผ์„ธ์š”.

2. ๊ฐ„๋‹จํ•˜๊ฒŒ ๋ชจ๋ธ ์‚ฌ์šฉํ•˜๊ธฐ

FunASR์˜ AutoModel์„ ์ด์šฉํ•˜์—ฌ ํ—ˆ๊น…ํŽ˜์ด์Šค ๋ชจ๋ธ ํ—ˆ๋ธŒ์—์„œ ๋ชจ๋ธ ๋ ˆํŒŒ์ง€ํ† ๋ฆฌ์˜ ๋ชจ๋ธ์„ ๋ฐ”๋กœ ๋กœ๋“œํ•ด์„œ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

#!/usr/bin/env python3
from pathlib import Path
import os
import argparse

from huggingface_hub import snapshot_download
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

HF_REPO_ID = "AeiROBOT/SenseVoice-Small-ko"   # Hugging Face repo ID the fine-tuned model was uploaded to
LOCAL_DIR = "/home/khw/Workspace/SenseVoice/hf_models/SenseVoice-Small-ko"  # local directory the snapshot is downloaded into

# ----- SenseVoice token tables for the output parser -----
# Special tokens SenseVoice prepends to its transcript; they appear in the
# order language -> emotion -> event -> ITN flag and are stripped by
# parse_sensevoice_text() below.
LANG_TOKENS = {"<|zh|>", "<|en|>", "<|yue|>", "<|ja|>", "<|ko|>", "<|nospeech|>"}
EMO_TOKENS = {"<|HAPPY|>", "<|SAD|>", "<|ANGRY|>", "<|NEUTRAL|>", "<|FEARFUL|>", "<|DISGUSTED|>", "<|SURPRISED|>"}
EVENT_TOKENS = {"<|BGM|>", "<|Speech|>", "<|Applause|>", "<|Laughter|>", "<|Cry|>", "<|Sneeze|>", "<|Breath|>", "<|Cough|>"}
WITH_ITN_TOKENS = {"<|withitn|>", "<|woitn|>"}  # with / without inverse text normalization


def _consume(prefixes, text: str):
    for p in prefixes:
        if text.startswith(p):
            return p, text[len(p):]
    return None, text


def parse_sensevoice_text(raw: str):
    """Split a SenseVoice output string into (language, emo, event, with_itn, text).

    Example:
        "<|ko|><|NEUTRAL|><|Speech|><|withitn|>์กฐ ๊ธˆ๋งŒ ์ƒ๊ฐ ์„ ํ•˜ ๋ฉด์„œ ์‚ด ๋ฉด ํ›จ์”ฌ ํŽธํ•  ๊ฑฐ์•ผ." ->
        {
          "language": "<|ko|>",
          "emo": "<|NEUTRAL|>",
          "event": "<|Speech|>",
          "with_itn": "<|withitn|>",
          "text": "์กฐ ๊ธˆ๋งŒ ์ƒ๊ฐ ์„ ํ•˜ ๋ฉด์„œ ์‚ด ๋ฉด ํ›จ์”ฌ ํŽธํ•  ๊ฑฐ์•ผ."
        }

    Tags are consumed in the fixed order language -> emotion -> event -> ITN;
    a missing tag yields None for that field.
    """
    if not raw:
        return {"language": None, "emo": None, "event": None, "with_itn": None, "text": ""}

    remainder = raw.strip()
    result = {}
    for field, token_set in (
        ("language", LANG_TOKENS),
        ("emo", EMO_TOKENS),
        ("event", EVENT_TOKENS),
        ("with_itn", WITH_ITN_TOKENS),
    ):
        result[field], remainder = _consume(token_set, remainder)

    result["text"] = remainder.strip()
    return result


def parse_args():
    """Parse the CLI for the single-file inference demo.

    Returns:
        argparse.Namespace with one attribute, ``wav_file``.
    """
    p = argparse.ArgumentParser()
    # BUG FIX: the help string previously said "pretrained model name or local
    # directory", copied from another script; this option is the wav file to
    # transcribe (see main(), which passes it to model.generate(input=...)).
    p.add_argument(
        "--wav_file",
        default="dataset/wav_dataset/DISGUSTED/test_2025_12_12_040201.wav",
        help="์ถ”๋ก ํ•  wav ํŒŒ์ผ ๊ฒฝ๋กœ",
    )
    return p.parse_args()

def get_model():
    """Download the fine-tuned snapshot from the HF Hub and wrap it in a FunASR AutoModel.

    Returns:
        funasr.AutoModel loaded from the downloaded directory, using the
        repo's own model.py, an fsmn-vad front-end, and CUDA device 0.
    """
    local_path = snapshot_download(
        repo_id=HF_REPO_ID,
        repo_type="model",
        local_dir=LOCAL_DIR,
        # NOTE(review): local_dir_use_symlinks is deprecated in recent
        # huggingface_hub releases — confirm the pinned version still accepts it.
        local_dir_use_symlinks=False,
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),  # needed because the repo is private
    )
    print("๋‹ค์šด๋กœ๋“œ ๊ฒฝ๋กœ:", local_path)

    # 2) Hand the local path to AutoModel
    model_dir = local_path  # or LOCAL_DIR

    model = AutoModel(
        model=model_dir,
        trust_remote_code=True,
        remote_code=str(Path(model_dir) / "model.py"),  # use the model.py shipped in the HF repo
        vad_model="fsmn-vad",
        vad_kwargs={"max_single_segment_time": 30000},  # cap single VAD segments at 30 s
        device="cuda:0",  # assumes a CUDA GPU is present — TODO confirm on target machine
    )

    return model

def main():
    """Download the model, transcribe one wav file, and print the parsed fields."""
    cli = parse_args()
    audio_path = cli.wav_file

    asr_model = get_model()

    results = asr_model.generate(
        input=audio_path,
        cache={},
        language="auto",   # or "ko" to force Korean
        use_itn=True,
        batch_size_s=60,
        merge_vad=True,
        merge_length_s=15,
    )

    raw_text = results[0]["text"]
    parsed = parse_sensevoice_text(raw_text)

    # ITN / rich-transcription post-processing on the bare text only.
    pretty_text = rich_transcription_postprocess(parsed["text"]) if parsed["text"] else ""

    print("=== Raw ===")
    print(raw_text)
    print("=== Parsed ===")
    print("lang   :", parsed["language"])
    print("emo    :", parsed["emo"])
    print("event  :", parsed["event"])
    print("withitn:", parsed["with_itn"])
    print("text   :", pretty_text)


if __name__ == "__main__":
    main()

3. ํ•™์Šต ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ํ‰๊ฐ€ํ•˜๊ธฐ

#!/usr/bin/env python3
import os
import json
import argparse
import unicodedata
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import torch
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess


# =======================
# SenseVoice token parser
# =======================
# Special tokens SenseVoice prepends to its transcript; parse_sensevoice_text()
# consumes them in the order language -> emotion -> event -> ITN flag.
LANG_TOKENS = {"<|zh|>", "<|en|>", "<|yue|>", "<|ja|>", "<|ko|>", "<|nospeech|>"}
EMO_TOKENS = {"<|HAPPY|>", "<|SAD|>", "<|ANGRY|>", "<|NEUTRAL|>", "<|FEARFUL|>", "<|DISGUSTED|>", "<|SURPRISED|>"}
EVENT_TOKENS = {"<|BGM|>", "<|Speech|>", "<|Applause|>", "<|Laughter|>", "<|Cry|>", "<|Sneeze|>", "<|Breath|>", "<|Cough|>"}
WITH_ITN_TOKENS = {"<|withitn|>", "<|woitn|>"}  # with / without inverse text normalization


def _consume(prefixes, text: str):
    for p in prefixes:
        if text.startswith(p):
            return p, text[len(p):]
    return None, text


def parse_sensevoice_text(raw: str) -> Dict[str, Optional[str]]:
    """Split a SenseVoice transcript into its leading tags and the clean text.

    Tags are stripped in the fixed order language -> emotion -> event -> ITN;
    any tag that is absent maps to None. The remaining text is whitespace-trimmed.
    """
    if not raw:
        return {"language": None, "emo": None, "event": None, "with_itn": None, "text": ""}

    remainder = raw.strip()
    fields: Dict[str, Optional[str]] = {}
    for name, tokens in (
        ("language", LANG_TOKENS),
        ("emo", EMO_TOKENS),
        ("event", EVENT_TOKENS),
        ("with_itn", WITH_ITN_TOKENS),
    ):
        fields[name], remainder = _consume(tokens, remainder)

    fields["text"] = remainder.strip()
    return fields


# =======================
# Text normalization & metrics
# =======================

def normalize_text(s: str, lower: bool, strip_punct: bool, strip_spaces: bool) -> str:
    """Normalize a string before text-metric comparison.

    Applies, in order: lower-casing, removal of Unicode punctuation (category
    "P*"), and removal of all whitespace — each only when its flag is set.
    None is treated as the empty string.
    """
    if s is None:
        return ""
    result = s.lower() if lower else s
    if strip_punct:
        kept = [ch for ch in result if not unicodedata.category(ch).startswith("P")]
        result = "".join(kept)
    if strip_spaces:
        result = "".join(result.split())
    return result


def _levenshtein(a: List[str], b: List[str]) -> int:
    n, m = len(a), len(b)
    if n == 0:
        return m
    if m == 0:
        return n
    prev = list(range(m + 1))
    for i in range(1, n + 1):
        curr = [i] + [0] * m
        ai = a[i - 1]
        for j in range(1, m + 1):
            cost = 0 if ai == b[j - 1] else 1
            curr[j] = min(
                prev[j] + 1,
                curr[j - 1] + 1,
                prev[j - 1] + cost,
            )
        prev = curr
    return prev[m]


def cer(ref: str, hyp: str) -> float:
    """Character error rate: edit distance over reference length (min 1 to avoid /0)."""
    ref_chars = list(ref)
    hyp_chars = list(hyp)
    return _levenshtein(ref_chars, hyp_chars) / max(1, len(ref_chars))


def wer(ref: str, hyp: str) -> float:
    """Word error rate over whitespace-split tokens (min 1 reference word to avoid /0)."""
    ref_words = ref.split()
    hyp_words = hyp.split()
    return _levenshtein(ref_words, hyp_words) / max(1, len(ref_words))


def norm_emo(label: Optional[str]) -> str:
    """Normalize an emotion label to its bare upper-case name.

    Accepts either the token form ("<|happy|>") or a plain name ("happy");
    empty/None input maps to "".
    """
    if not label:
        return ""
    token = label.strip()
    wrapped = token.startswith("<|") and token.endswith("|>")
    if wrapped:
        token = token[2:-2]
    return token.upper()


# =======================
# IO & argparse
# =======================

def parse_args():
    """Build and parse the evaluation CLI.

    Defaults are hard-coded to one developer's local paths — override them on
    other machines. Returns an argparse.Namespace with model/data paths,
    device, batch size, language override, text-normalization flags, and the
    output JSONL path.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--model-dir", default="/home/khw/Workspace/SenseVoice/outputs", help="finetune ์‚ฐ์ถœ๋ฌผ ๋””๋ ‰ํ„ฐ๋ฆฌ")
    p.add_argument("--jsonl", default="/home/khw/Workspace/SenseVoice/data/train.jsonl", help="์ž…๋ ฅ JSONL ๊ฒฝ๋กœ")
    p.add_argument("--base-audio-dir", default="/home/khw/Workspace/SenseVoice", help="source ์ƒ๋Œ€๊ฒฝ๋กœ์˜ ๊ธฐ์ค€ ๋””๋ ‰ํ„ฐ๋ฆฌ")
    p.add_argument("--remote-code", default="/home/khw/Workspace/SenseVoice/model.py", help="SenseVoice ๋ชจ๋ธ ๊ตฌํ˜„ ๊ฒฝ๋กœ")
    p.add_argument("--device", default=None, help="cuda:0 / cpu (๋ฏธ์ง€์ • ์‹œ ์ž๋™ ๊ฒฐ์ •)")
    p.add_argument("--batch-size", type=int, default=64, help="๋ฐฐ์น˜ ํฌ๊ธฐ(์งง์€ ์Œ์› ๋‹ค์ˆ˜ ๊ฐ€์ •)")
    # NOTE(review): --use-best-ckpt is declared but never read in this script;
    # prepare_checkpoint() always prefers model.pt.best — confirm intent.
    p.add_argument("--use-best-ckpt", action="store_true", help="model.pt.best๋ฅผ model.pt๋กœ ์‹ฌ๋ณผ๋ฆญ ๋งํฌ ์ƒ์„ฑ")
    p.add_argument("--lang", default="ko", choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"], help="์–ธ์–ด ๊ฐ•์ œ ์„ค์ •. ๊ธฐ๋ณธ ko")
    p.add_argument("--lower", action="store_true", help="์ •๋ฐ€๋„ ๊ณ„์‚ฐ ์‹œ ์†Œ๋ฌธ์žํ™”")
    p.add_argument("--strip-punct", action="store_true", help="์ •๋ฐ€๋„ ๊ณ„์‚ฐ ์‹œ ๋ฌธ์žฅ๋ถ€ํ˜ธ ์ œ๊ฑฐ")
    p.add_argument("--strip-spaces", action="store_true", help="์ •๋ฐ€๋„ ๊ณ„์‚ฐ ์‹œ ๋ชจ๋“  ๊ณต๋ฐฑ ์ œ๊ฑฐ")
    p.add_argument("--out", default="/home/khw/Workspace/SenseVoice/results/preds_train.jsonl", help="์ถ”๋ก  ๊ฒฐ๊ณผ JSONL")
    return p.parse_args()


def _find_latest_epoch_ckpt(model_dir: Path) -> Optional[Path]:
    """model.pt.ep* ์ค‘์—์„œ ๊ฐ€์žฅ ํฐ epoch ๋ฒˆํ˜ธ๋ฅผ ๊ฐ€์ง„ ์ฒดํฌํฌ์ธํŠธ๋ฅผ ์ฐพ๋Š”๋‹ค."""
    candidates = []
    for p in model_dir.glob("model.pt.ep*"):
        name = p.name
        try:
            # ์ด๋ฆ„์—์„œ ์ˆซ์ž ๋ถ€๋ถ„๋งŒ ํŒŒ์‹ฑ: model.pt.ep50 -> 50
            ep_str = name.split("model.pt.ep", 1)[1]
            ep = int(ep_str)
            candidates.append((ep, p))
        except (IndexError, ValueError):
            # ํŒจํ„ด์ด ์•ˆ ๋งž์œผ๋ฉด ๋ฌด์‹œ
            continue

    if not candidates:
        return None

    candidates.sort(key=lambda x: x[0])  # epoch ์˜ค๋ฆ„์ฐจ์ˆœ ์ •๋ ฌ
    return candidates[-1][1]  # ๊ฐ€์žฅ ํฐ epoch


def prepare_checkpoint(model_dir: Path) -> Path:
    """Select a checkpoint inside *model_dir* and make ``model.pt`` point at it.

    Priority:
      1) model.pt.best
      2) model.pt.ep* with the largest epoch number
      3) model.pt (pre-existing file)

    Raises SystemExit when none of the three exists.

    If the selected file is not ``model.pt`` itself, ``model.pt`` is replaced
    by a symlink to it (or a copy when symlinking fails).

    Returns:
        Path of the chosen checkpoint file (not necessarily ``model.pt``).
    """
    best = model_dir / "model.pt.best"
    target = model_dir / "model.pt"  # the file AutoModel will ultimately load

    chosen: Optional[Path] = None

    # 1) model.pt.best has top priority
    if best.exists():
        chosen = best
        reason = "model.pt.best"
    else:
        # 2) the model.pt.ep* checkpoint from the latest epoch
        latest_ep = _find_latest_epoch_ckpt(model_dir)
        if latest_ep is not None:
            chosen = latest_ep
            reason = latest_ep.name
        # 3) fall back to an existing model.pt
        elif target.exists():
            chosen = target
            reason = "existing model.pt"
        else:
            reason = "(none)"

    if chosen is None:
        raise SystemExit(
            f"[fatal] No checkpoint found in {model_dir}. "
            f"Expected one of: model.pt.best, model.pt.ep*, model.pt. Program will exit."
        )

    # Point model.pt at the chosen checkpoint (link, falling back to copy)
    if chosen != target:
        # is_symlink() also catches a broken symlink, which exists() misses
        if target.exists() or target.is_symlink():
            try:
                target.unlink()
            except Exception as e:
                print(f"[warn] failed to remove existing {target}: {e}")

        try:
            # Symlink by relative name: both files live in the same directory
            target.symlink_to(chosen.name)
            print(f"[info] using checkpoint: {chosen.name} (linked as model.pt)")
        except Exception as e:
            # Some filesystems/permission setups cannot symlink, so copy instead
            print(f"[warn] symlink failed ({e}), will try to copy instead.")
            import shutil
            try:
                shutil.copy2(str(chosen), str(target))
                print(f"[info] using checkpoint: {chosen.name} (copied to model.pt)")
            except Exception as e2:
                raise SystemExit(
                    f"[fatal] failed to prepare checkpoint at {target}: {e2}. Program will exit."
                )
    else:
        print(f"[info] using checkpoint: {reason}")

    return chosen


def load_items(jsonl_path: Path) -> List[Dict]:
    """Load records from a JSONL file, skipping blank and malformed lines.

    Args:
        jsonl_path: path to a UTF-8 encoded JSONL file.

    Returns:
        List of parsed JSON objects in file order.
    """
    items: List[Dict] = []
    with jsonl_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                items.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Narrowed from a broad `except Exception`: only malformed JSON
                # is a "bad line"; genuine I/O or programming errors should surface.
                print(f"[warn] skip bad line: {e}")
    return items


def to_abs_paths(items: List[Dict], base_audio_dir: Path) -> Tuple[List[Dict], int]:
    """Annotate each item with an absolute ``abs_source`` path (in place).

    An item without a truthy "source" gets abs_source=None and counts as
    missing; an item whose resolved path does not exist also counts as missing.

    Returns:
        (the same list, number of missing items).
    """
    missing = 0
    for item in items:
        rel = item.get("source")
        if not rel:
            item["abs_source"] = None
            missing += 1
            continue
        abs_path = (base_audio_dir / rel).resolve()
        item["abs_source"] = str(abs_path)
        if not abs_path.exists():
            missing += 1
    return items, missing


def batched(iterable, n: int):
    """Yield consecutive lists of at most *n* items from *iterable*.

    The final chunk may be shorter; an empty iterable yields nothing.
    """
    pending = []
    for element in iterable:
        pending.append(element)
        if len(pending) >= n:
            yield pending
            pending = []
    if pending:
        yield pending


# =======================
# main
# =======================

def main():
    """Run batched inference over a JSONL manifest and report ASR/emotion metrics.

    Each JSONL item may carry "source" (audio path relative to
    --base-audio-dir), "target" (reference transcript), "emo_target"
    (reference emotion label), and "key" (sample ID). Per-sample predictions
    are written to --out as JSONL; CER/WER/exact-match and emotion accuracy
    are printed at the end.
    """
    args = parse_args()

    model_dir = Path(args.model_dir)
    jsonl_path = Path(args.jsonl)
    base_audio_dir = Path(args.base_audio_dir)

    # Checkpoint priority: model.pt.best > model.pt.ep* (max epoch) > model.pt
    ckpt = prepare_checkpoint(model_dir)
    print(f"[info] final checkpoint file: {ckpt}")

    device = args.device or ("cuda:0" if torch.cuda.is_available() else "cpu")

    # model.py (remote_code) is mandatory; exit immediately when absent.
    remote_code_path = Path(args.remote_code)
    if not remote_code_path.exists():
        raise SystemExit(
            f"[fatal] remote_code not found at {remote_code_path}. "
            f"Expected model.py for SenseVoice. Program will exit."
        )

    trust_remote = True

    model = AutoModel(
        model=str(model_dir),  # local directory only
        trust_remote_code=trust_remote,
        remote_code=str(remote_code_path),
        device=device,
        vad_model=None,
    )

    items = load_items(jsonl_path)
    items, _ = to_abs_paths(items, base_audio_dir)

    # Keep only items whose resolved audio file actually exists on disk.
    valid_items = [it for it in items if it.get("abs_source") and Path(it["abs_source"]).exists()]
    missing = len(items) - len(valid_items)
    if missing:
        print(f"[warn] {missing} items skipped due to missing files")

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    total = len(valid_items)
    print(f"[info] total inputs used: {total}, device: {device}, language: {args.lang}")
    if total == 0:
        print("[exit] No valid audio found. Check --base-audio-dir or 'source' paths.")
        # Still create an empty output file so downstream consumers find it.
        with out_path.open("w", encoding="utf-8") as wf:
            pass
        return

    # Metric accumulators
    exact_matches = 0
    cer_sum = 0.0
    wer_sum = 0.0
    text_pairs = 0

    emo_correct = 0
    emo_total = 0

    written = 0
    with out_path.open("w", encoding="utf-8") as wf:
        for batch in batched(valid_items, args.batch_size):
            wav_list = [b["abs_source"] for b in batch]

            try:
                res = model.generate(
                    input=wav_list,
                    cache={},
                    language=args.lang,
                    use_itn=True,
                    batch_size=len(wav_list),
                )
            except Exception as e:
                # Best-effort: skip the failed batch and keep evaluating the rest.
                print(f"[error] inference failed on batch starting key={batch[0].get('key')}: {e}")
                continue

            for it, r in zip(batch, res):
                raw_text = r.get("text", "") or ""
                parsed = parse_sensevoice_text(raw_text)
                pretty_text = rich_transcription_postprocess(parsed["text"]) if parsed["text"] else ""

                ref_text = it.get("target") or ""

                # Text metrics (only when a reference transcript exists)
                if ref_text:
                    nt_ref = normalize_text(ref_text, args.lower, args.strip_punct, args.strip_spaces)
                    nt_hyp = normalize_text(pretty_text, args.lower, args.strip_punct, args.strip_spaces)

                    if nt_ref == nt_hyp:
                        exact_matches += 1
                    cer_sum += cer(nt_ref, nt_hyp)
                    wer_sum += wer(nt_ref, nt_hyp)
                    text_pairs += 1

                # Emotion metrics (only when a reference emotion exists)
                tgt_emo_n = norm_emo(it.get("emo_target"))
                pred_emo_n = norm_emo(parsed["emo"])
                if tgt_emo_n:
                    emo_total += 1
                    if pred_emo_n == tgt_emo_n:
                        emo_correct += 1

                out_obj = {
                    "key": it.get("key"),
                    "audio": it.get("abs_source"),
                    "pred_raw": raw_text,
                    "pred_text": pretty_text,
                    "ref_text": ref_text,
                    "pred_language": parsed["language"],
                    "pred_emo": pred_emo_n or parsed["emo"] or "",
                    "ref_emo": tgt_emo_n or it.get("emo_target") or "",
                    "pred_event": parsed["event"] or "",
                    "with_itn": parsed["with_itn"] or "",
                }
                wf.write(json.dumps(out_obj, ensure_ascii=False) + "\n")

                # ===== Human-readable per-sample output =====
                idx = written + 1
                print("\n[{}] key={}".format(idx, it.get("key")))
                print("REF_TEXT :", ref_text)
                print("REF_EMO  :", tgt_emo_n or it.get("emo_target"))
                print("PRED_TEXT:", pretty_text)
                print("PRED_EMO :", pred_emo_n or parsed["emo"])  # showing the raw token is fine too
                print("PRED_EVT :", parsed["event"])  # also show the event tag
                print("-" * 80)

                written += 1

    # Summary output
    print("\n===== Summary =====")
    print(f"Samples inferred: {written}")
    if text_pairs > 0:
        exact_acc = exact_matches / text_pairs * 100.0
        avg_cer = cer_sum / text_pairs
        avg_wer = wer_sum / text_pairs
        print(f"Text pairs (with ref): {text_pairs}")
        print(f"- Exact match accuracy: {exact_acc:.2f}%")
        print(f"- Avg CER: {avg_cer:.4f}")
        print(f"- Avg WER: {avg_wer:.4f}")
    else:
        print("No text references found; text metrics skipped.")

    if emo_total > 0:
        emo_acc = emo_correct / emo_total * 100.0
        print(f"Emotion pairs: {emo_total}")
        print(f"- Emotion accuracy: {emo_acc:.2f}%")
    else:
        print("No emotion references found; emotion metrics skipped.")

    print(f"Results saved to: {out_path}")


if __name__ == "__main__":
    main()

4. ํ•™์Šต ํ›„ ํ—ˆ๊น…ํŽ˜์ด์Šค์— ๋ชจ๋ธ ์—…๋กœ๋“œ

upload_model_to_huggingface.py


#!/usr/bin/env python3
import os
from pathlib import Path

from huggingface_hub import HfApi, create_repo, upload_folder

# ===== User configuration =====
# Hugging Face model repo ID to create/upload to (example value)
REPO_ID = "AeiROBOT/SenseVoice-Small-ko"  # <-- change to the name you want

# Local folder to upload (training outputs)
MODEL_DIR = Path("/home/khw/Workspace/SenseVoice/outputs")

# Extra files (e.g. model.py for FunASR/SenseVoice) to upload alongside;
# can be skipped if already copied into the outputs directory
EXTRA_FILES = [
    Path("/home/khw/Workspace/SenseVoice/model.py"),  # comment out if absent
]


def main():
    """Create the Hugging Face repo (if needed) and upload MODEL_DIR to it.

    Requires the HUGGINGFACE_HUB_TOKEN environment variable; raises
    RuntimeError when it is missing. Epoch checkpoints (model.pt.ep*) are
    excluded from the upload.
    """
    # 1) Get the token (environment variable recommended):
    #    run `export HUGGINGFACE_HUB_TOKEN=hf_xxx` beforehand
    token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if token is None:
        raise RuntimeError(
            "HUGGINGFACE_HUB_TOKEN ํ™˜๊ฒฝ๋ณ€์ˆ˜๊ฐ€ ์„ค์ •๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค. "
            "https://huggingface.co/settings/tokens ์—์„œ ํ† ํฐ์„ ๋งŒ๋“ค๊ณ ,\n"
            "export HUGGINGFACE_HUB_TOKEN=hf_xxx ๋กœ ์„ค์ •ํ•œ ๋’ค ๋‹ค์‹œ ์‹คํ–‰ํ•˜์„ธ์š”."
        )

    # FIX: removed the unused local `api = HfApi()` — create_repo/upload_folder
    # are called as module-level helpers and the client object was never used.

    # 2) Create the repository (exist_ok=True silently passes if it already exists)
    create_repo(
        repo_id=REPO_ID,
        token=token,
        private=True,   # keep the repo private
        exist_ok=True,
        repo_type="model",
    )

    # 3) Optionally copy extra files (model.py etc.) into MODEL_DIR so the repo
    #    root ends up with README.md, model.pt, config.yaml, configuration.json,
    #    model.py side by side
    for extra in EXTRA_FILES:
        if extra.is_file():
            target = MODEL_DIR / extra.name
            if not target.exists():
                print(f"[info] copy {extra} -> {target}")
                target.write_bytes(extra.read_bytes())
        else:
            print(f"[warn] extra file not found: {extra}")

    # 3-1) Model card (README): copy README_huggingface.md from the current
    #      working directory to MODEL_DIR/README.md — the HF model hub renders
    #      the repo root's README.md as the model card.
    readme_src = Path.cwd() / "README_huggingface.md"
    readme_dst = MODEL_DIR / "README.md"
    if readme_src.is_file():
        print(f"[info] copy {readme_src} -> {readme_dst}")
        readme_dst.write_text(readme_src.read_text(encoding="utf-8"), encoding="utf-8")
    else:
        print(f"[warn] README_huggingface.md not found in CWD: {Path.cwd()}")

    # 4) Upload the whole folder
    print(f"[info] uploading folder: {MODEL_DIR} -> {REPO_ID}")
    upload_folder(
        repo_id=REPO_ID,
        folder_path=str(MODEL_DIR),
        path_in_repo=".",         # upload straight into the repo root
        token=token,
        repo_type="model",
        ignore_patterns=[
            "model.pt.ep*",   # exclude intermediate checkpoints
            "*.pt.ep*",       # also exclude similarly named files
        ],
    )

    print("[done] uploaded to:", f"https://huggingface.co/{REPO_ID}")


if __name__ == "__main__":
    main()

Downloads last month
5
Inference Providers NEW
This model isn't deployed by any Inference Provider. ๐Ÿ™‹ Ask for provider support

Collection including AeiROBOT/SenseVoice-Small-ko