LJKJHI committed
Commit 4904365 · 1 Parent(s): 08929e1

refactor: Externalize `espeak` dependency to `packages.txt` and refine Gradio app initialization and input handling.

Files changed (3)
  1. app.py +34 -25
  2. packages.txt +1 -0
  3. requirements.txt +6 -0
app.py CHANGED
@@ -1,35 +1,44 @@
 
+ import gradio as gr
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
  import torch
- import gradio as gr
- import librosa
+ import librosa
  import os
- import subprocess

- # Install system dependencies
- subprocess.run(["apt-get", "update"], check=True)
- subprocess.run(["apt-get", "install", "-y", "espeak"], check=True)
+ # 1. Load the model (runs once at startup)
+ print("Loading model...")
+ MODEL_ID = "facebook/wav2vec2-lv-60-espeak-cv-ft"
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+ print("Model is ready!")

- # load model and processor
- processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
- model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
-
- # define prediction function
+ # 2. Main processing function
  def audio2phoneme(audio_path):
-     audio, sr = librosa.load(audio_path, sr=16000)
-     input_values = processor(audio, return_tensors="pt", padding=True).input_values
-     with torch.no_grad():
-         logits = model(input_values).logits
-     predicted_ids = torch.argmax(logits, dim=-1)
-     transcription = processor.batch_decode(predicted_ids)
-     return ' '.join(transcription)
+     if audio_path is None:
+         return "No audio file found."
+
+     # Load the audio file, forcing a 16 kHz sample rate
+     audio, sr = librosa.load(audio_path, sr=16000)
+
+     # Run it through the model
+     input_values = processor(audio, return_tensors="pt", padding=True).input_values
+     with torch.no_grad():
+         logits = model(input_values).logits
+
+     # Decode into phonemes
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.batch_decode(predicted_ids)
+
+     return ' '.join(transcription)

+ # 3. Build the interface and the API
+ # Note: "api_name" is the endpoint name you will call
  app = gr.Interface(
      fn=audio2phoneme,
-     inputs=gr.Audio(sources=["upload","microphone"], type="filepath"),
-     outputs=gr.Textbox(label="Phoneme Transcription", show_copy_button=True, show_label=True),
-     description="Get phonemes from audio",
-     title="Audio to Phoneme Transcription using facebook/wav2vec2-lv-60-espeak-cv",
- )
+     inputs=gr.Audio(type="filepath"),
+     outputs=gr.Textbox(label="Phoneme Transcription"),
+     title="Phoneme Analysis API",
+     description="Send a recording of spoken English to receive the corresponding string of IPA phonemes."
+ )

- # start space
- app.launch(share=True)
+ if __name__ == "__main__":
+     app.launch()
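
The `api_name` note in the new `app.py` refers to calling this Space as an API rather than through the web UI. Below is a minimal client-side sketch using `gradio_client`; the Space ID `LJKJHI/audio2phoneme` and the sample file name are placeholders, and a single `gr.Interface` exposes its endpoint as `/predict` by default.

```python
# Sketch only: the Space ID and audio file are hypothetical, not part of this commit.
from gradio_client import Client, handle_file

client = Client("LJKJHI/audio2phoneme")   # hypothetical Space ID
result = client.predict(
    handle_file("recording.wav"),         # local recording of spoken English
    api_name="/predict",                  # default endpoint name for a single gr.Interface
)
print(result)                             # space-separated phoneme string
```

Older `gradio_client` releases accept a plain file path in place of `handle_file(...)`.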
packages.txt ADDED
@@ -0,0 +1 @@
+ espeak-ng
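
With the runtime `apt-get` calls removed from `app.py`, the Space now relies on the build step installing everything listed in `packages.txt`. The snippet below is an optional sanity check, not part of this commit, that the `espeak-ng` binary is actually on the PATH when the app starts.

```python
# Optional startup check (assumption, not in this commit): verify espeak-ng from packages.txt.
import shutil
import subprocess

if shutil.which("espeak-ng") is None:
    raise RuntimeError("espeak-ng not found on PATH; check packages.txt")
subprocess.run(["espeak-ng", "--version"], check=True)  # prints the installed version string
```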
requirements.txt CHANGED
@@ -6,3 +6,9 @@ transformers==4.44.2
  datasets==3.0.2
  librosa==0.10.2.post1
  phonemizer==3.3.0
+ torch
+ torchaudio
+ transformers
+ librosa
+ phonemizer
+ gradio