frascuchon HF Staff committed on
Commit
5dc3b1e
·
1 Parent(s): a5d8e64

fixing voice removal download URLs

Browse files
mcp_server.py CHANGED
@@ -1318,8 +1318,8 @@ def replace_voice_mcp(
1318
  the target audio while preserving the linguistic content and timing.
1319
 
1320
  Args:
1321
- source_audio_path: Path to the source audio file (voice to be replaced)
1322
- target_audio_path: Path to the target audio file (voice to use)
1323
  diffusion_steps: Number of diffusion steps for inference (default: 10)
1324
  length_adjust: Length adjustment factor (default: 1.0)
1325
  inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
@@ -1337,6 +1337,12 @@ def replace_voice_mcp(
1337
  >>> replace_voice_mcp("speech.mp3", "singer.wav", diffusion_steps=15, pitch_shift=2)
1338
  # Returns path to voice-replaced audio with custom settings
1339
 
 
 
 
 
 
 
1340
  Note:
1341
  - Uses Seed-VC model for high-quality voice conversion
1342
  - Preserves linguistic content and timing from source audio
@@ -1953,11 +1959,13 @@ def create_interface() -> gr.TabbedInterface:
1953
  inputs=[
1954
  gr.Audio(
1955
  type="filepath",
1956
- label="Source Audio (voice to be replaced)",
1957
  sources=["upload"],
1958
  ),
1959
  gr.Audio(
1960
- type="filepath", label="Target Audio (voice to use)", sources=["upload"]
 
 
1961
  ),
1962
  gr.Number(value=10, label="Diffusion Steps", minimum=1, maximum=50),
1963
  gr.Number(value=1.0, label="Length Adjust", minimum=0.1, maximum=3.0),
 
1318
  the target audio while preserving the linguistic content and timing.
1319
 
1320
  Args:
1321
+ source_audio_path: Path to the source audio file or URL (voice to be replaced)
1322
+ target_audio_path: Path to the target audio file or URL (voice to use)
1323
  diffusion_steps: Number of diffusion steps for inference (default: 10)
1324
  length_adjust: Length adjustment factor (default: 1.0)
1325
  inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
 
1337
  >>> replace_voice_mcp("speech.mp3", "singer.wav", diffusion_steps=15, pitch_shift=2)
1338
  # Returns path to voice-replaced audio with custom settings
1339
 
1340
+ >>> replace_voice_mcp("https://example.com/source.wav", "target.wav")
1341
+ # Downloads source audio and replaces voice with target voice
1342
+
1343
+ >>> replace_voice_mcp("source.wav", "https://example.com/voice.mp3", pitch_shift=2)
1344
+ # Downloads target voice and applies to source with pitch shift
1345
+
1346
  Note:
1347
  - Uses Seed-VC model for high-quality voice conversion
1348
  - Preserves linguistic content and timing from source audio
 
1959
  inputs=[
1960
  gr.Audio(
1961
  type="filepath",
1962
+ label="Source Audio (voice to be replaced) - Local file or URL",
1963
  sources=["upload"],
1964
  ),
1965
  gr.Audio(
1966
+ type="filepath",
1967
+ label="Target Audio (voice to use) - Local file or URL",
1968
+ sources=["upload"],
1969
  ),
1970
  gr.Number(value=10, label="Diffusion Steps", minimum=1, maximum=50),
1971
  gr.Number(value=1.0, label="Length Adjust", minimum=0.1, maximum=3.0),
tools/audio_cleaning.py CHANGED
@@ -256,3 +256,78 @@ def remove_noise(
256
 
257
  except Exception as e:
258
  raise RuntimeError(f"Error removing noise: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  except Exception as e:
258
  raise RuntimeError(f"Error removing noise: {str(e)}")
259
+
260
+
def remove_noise_wrapper(audio_path: str, noise_reduction_factor: float = 0.5) -> str:
    """
    Run ``remove_noise`` and report any failure as text instead of raising.

    Intended for MCP integration, where the caller expects a string back in
    every case.

    Args:
        audio_path: Path to the input audio file
        noise_reduction_factor: Noise reduction strength (0.1-1.0, default: 0.5)

    Returns:
        Whatever ``remove_noise`` returns on success (the path to the cleaned
        audio file), otherwise a message of the form ``"Error: <details>"``.
    """
    try:
        # "general" is forwarded as the second positional argument of
        # remove_noise -- presumably the noise profile; confirm against
        # remove_noise's signature.
        cleaned_path = remove_noise(audio_path, "general", noise_reduction_factor)
    except Exception as exc:
        return f"Error: {str(exc)}"
    return cleaned_path
276
+
277
+
if __name__ == "__main__":
    # Command-line entry point for running audio cleaning locally.
    #
    # Usage:
    #     python tools/audio_cleaning.py input.wav
    #     python tools/audio_cleaning.py input.wav --reduction 0.7
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Remove noise from audio files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python tools/audio_cleaning.py noisy.wav
  python tools/audio_cleaning.py noisy.wav --reduction 0.7
  python tools/audio_cleaning.py noisy.wav --output cleaned/
        """,
    )

    parser.add_argument("audio_path", help="Path to the input audio file")
    parser.add_argument(
        "--reduction",
        type=float,
        default=0.5,
        help="Noise reduction factor (0.1-1.0, default: 0.5)",
    )
    parser.add_argument("--output", help="Output directory (default: output/)")

    args = parser.parse_args()

    print("Audio Cleaning Tool")
    print("=" * 25)
    print(f"Input: {args.audio_path}")
    print(f"Noise reduction: {args.reduction}")
    if args.output:
        # remove_noise_wrapper only accepts the input path and the reduction
        # factor, so the requested directory cannot be honoured. Say so
        # explicitly instead of echoing it as if it were in effect.
        print(
            f"Warning: --output is not supported by this script and is ignored: {args.output}",
            file=sys.stderr,
        )
    print()

    try:
        result = remove_noise_wrapper(
            audio_path=args.audio_path, noise_reduction_factor=args.reduction
        )

        # The wrapper signals failure through an "Error:"-prefixed string
        # rather than an exception.
        if result.startswith("Error:"):
            print(f"❌ {result}")
            sys.exit(1)
        else:
            print("✅ Audio cleaning completed!")
            print(f"Output saved to: {result}")

    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)
tools/audio_insertion.py CHANGED
@@ -372,3 +372,196 @@ def replace_section(
372
 
373
  except Exception as e:
374
  raise RuntimeError(f"Error replacing audio section: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
  except Exception as e:
374
  raise RuntimeError(f"Error replacing audio section: {str(e)}")
375
+
376
+
def insert_section_wrapper(
    audio_path: str,
    insert_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Call ``insert_section`` and turn any exception into an error string.

    Intended for MCP integration, where the caller expects a string back in
    every case.

    Args:
        audio_path: Path to the main audio file
        insert_path: Path to the audio section to insert (forwarded to
            ``insert_section`` as ``section_path``)
        insert_time: Time to insert the section (in seconds)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        The value returned by ``insert_section`` (path to the output file),
        or ``"Error: <details>"`` if it raised.
    """
    # output_path is always None here, so insert_section picks the location.
    call_arguments = {
        "audio_path": audio_path,
        "section_path": insert_path,
        "insert_time": insert_time,
        "crossfade_duration": crossfade_duration,
        "output_path": None,
        "output_format": output_format,
    }
    try:
        produced_path = insert_section(**call_arguments)
    except Exception as exc:
        return f"Error: {str(exc)}"
    return produced_path
408
+
409
+
def replace_section_wrapper(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Call ``replace_section`` and turn any exception into an error string.

    Intended for MCP integration, where the caller expects a string back in
    every case.

    Args:
        audio_path: Path to the main audio file
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        The value returned by ``replace_section`` (path to the output file),
        or ``"Error: <details>"`` if it raised.
    """
    # output_path is always None here, so replace_section picks the location.
    call_arguments = {
        "audio_path": audio_path,
        "start_time": start_time,
        "end_time": end_time,
        "replacement_path": replacement_path,
        "crossfade_duration": crossfade_duration,
        "output_path": None,
        "output_format": output_format,
    }
    try:
        produced_path = replace_section(**call_arguments)
    except Exception as exc:
        return f"Error: {str(exc)}"
    return produced_path
444
+
445
+
if __name__ == "__main__":
    # Command-line entry point for running audio insertion/replacement locally.
    #
    # Usage:
    #     python tools/audio_insertion.py insert main.wav insert.wav 30.0
    #     python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Insert or replace audio sections",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Insert section at 30 seconds
  python tools/audio_insertion.py insert main.wav insert.wav 30.0

  # Replace section from 10s to 20s
  python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav

  # With custom crossfade
  python tools/audio_insertion.py insert main.wav insert.wav 30.0 --crossfade 0.2
        """,
    )

    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # Insert command
    insert_parser = subparsers.add_parser("insert", help="Insert audio section")
    insert_parser.add_argument("main", help="Main audio file")
    insert_parser.add_argument("insert", help="Audio section to insert")
    insert_parser.add_argument("time", type=float, help="Insert time in seconds")

    # Replace command
    replace_parser = subparsers.add_parser("replace", help="Replace audio section")
    replace_parser.add_argument("main", help="Main audio file")
    replace_parser.add_argument("start", type=float, help="Start time in seconds")
    replace_parser.add_argument("end", type=float, help="End time in seconds")
    replace_parser.add_argument("replacement", help="Replacement audio section")

    # Both commands share the same optional flags; declare them once.
    for command_parser in (insert_parser, replace_parser):
        command_parser.add_argument(
            "--crossfade",
            type=float,
            default=0.1,
            help="Crossfade duration in seconds (default: 0.1)",
        )
        command_parser.add_argument(
            "--format",
            choices=["wav", "mp3"],
            default="wav",
            help="Output format (default: wav)",
        )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    print("Audio Insertion Tool")
    print("=" * 25)

    try:
        result = None

        if args.command == "insert":
            print(f"Main audio: {args.main}")
            print(f"Insert section: {args.insert}")
            print(f"Insert time: {args.time}s")
            print(f"Crossfade: {args.crossfade}s")
            print()

            result = insert_section_wrapper(
                audio_path=args.main,
                insert_path=args.insert,
                insert_time=args.time,
                crossfade_duration=args.crossfade,
                output_format=args.format,
            )

        elif args.command == "replace":
            print(f"Main audio: {args.main}")
            print(f"Replace section: {args.start}s - {args.end}s")
            print(f"Replacement: {args.replacement}")
            print(f"Crossfade: {args.crossfade}s")
            print()

            result = replace_section_wrapper(
                audio_path=args.main,
                start_time=args.start,
                end_time=args.end,
                replacement_path=args.replacement,
                crossfade_duration=args.crossfade,
                output_format=args.format,
            )

        # The wrappers signal failure through an "Error:"-prefixed string
        # rather than an exception.
        if result is None:
            print("❌ No command executed")
            sys.exit(1)
        elif result.startswith("Error:")            :
            print(f"❌ {result}")
            sys.exit(1)
        else:
            # Appending "ion" to the command name only works for "insert";
            # "replace" would be reported as "replaceion".
            action_names = {"insert": "insertion", "replace": "replacement"}
            print(f"✅ Audio {action_names[args.command]} completed!")
            print(f"Output saved to: {result}")

    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)
tools/voice_replacement.py CHANGED
@@ -1,10 +1,138 @@
1
- import os
 
 
2
  from datetime import datetime
3
  from pathlib import Path
 
4
 
5
  from gradio_client import Client, handle_file
 
6
 
7
- from tools.audio_info import validate_audio_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  def replace_voice(
@@ -20,13 +148,25 @@ def replace_voice(
20
  """
21
  Replace voice in source audio with voice from target audio using Seed-VC.
22
 
23
- This function uses the Seed-VC Gradio space to perform voice conversion,
24
- replacing the voice characteristics in the source audio with those from
25
- the target audio while preserving the linguistic content and timing.
 
 
 
 
 
 
 
 
 
 
26
 
27
  Args:
28
- source_audio_path: Path to the source audio file (voice to be replaced)
29
- target_audio_path: Path to the target audio file (voice to use)
 
 
30
  diffusion_steps: Number of diffusion steps for inference (default: 10)
31
  length_adjust: Length adjustment factor (default: 1.0)
32
  inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
@@ -35,17 +175,26 @@ def replace_voice(
35
  pitch_shift: Pitch shift in semitones (default: 0)
36
 
37
  Returns:
38
- Path to the generated voice-replaced audio file
39
 
40
  Raises:
41
  FileNotFoundError: If source or target audio files don't exist
42
  ValueError: If parameters are invalid
43
  RuntimeError: If voice replacement fails
44
  """
 
 
 
45
  try:
46
- # Validate input paths
47
- source_abs_path = validate_audio_path(source_audio_path)
48
- target_abs_path = validate_audio_path(target_audio_path)
 
 
 
 
 
 
49
 
50
  # Validate parameters
51
  if diffusion_steps < 1 or diffusion_steps > 50:
@@ -57,13 +206,17 @@ def replace_voice(
57
  if pitch_shift < -12 or pitch_shift > 12:
58
  raise ValueError("pitch_shift must be between -12 and 12 semitones")
59
 
60
- # Initialize Seed-VC client
61
- client = Client("Plachta/Seed-VC")
 
 
 
 
62
 
63
  # Perform voice replacement
64
  result = client.predict(
65
- source_audio_path=handle_file(source_abs_path),
66
- target_audio_path=handle_file(target_abs_path),
67
  diffusion_steps=diffusion_steps,
68
  length_adjust=length_adjust,
69
  inference_cfg_rate=inference_cfg_rate,
@@ -74,7 +227,7 @@ def replace_voice(
74
  )
75
 
76
  # Create output directory
77
- output_dir = Path("output")
78
  output_dir.mkdir(exist_ok=True)
79
 
80
  # Generate output filename with timestamp
@@ -86,14 +239,38 @@ def replace_voice(
86
  )
87
  output_path = output_dir / output_filename
88
 
89
- # Save the result
90
- if isinstance(result, str) and os.path.exists(result):
91
- # If result is a file path, copy it to output location
 
 
 
92
  import shutil
93
 
94
  shutil.copy2(result, output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  else:
96
- # If result is audio data, save it using soundfile
97
  import soundfile as sf
98
 
99
  sf.write(str(output_path), result, 22050)
@@ -101,7 +278,49 @@ def replace_voice(
101
  return str(output_path)
102
 
103
  except Exception as e:
104
- raise RuntimeError(f"Voice replacement failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
 
107
  def replace_voice_wrapper(
@@ -118,8 +337,8 @@ def replace_voice_wrapper(
118
  Wrapper function for voice replacement with error handling for MCP integration.
119
 
120
  Args:
121
- source_audio_path: Path to the source audio file
122
- target_audio_path: Path to the target audio file
123
  diffusion_steps: Number of diffusion steps (default: 10)
124
  length_adjust: Length adjustment factor (default: 1.0)
125
  inference_cfg_rate: CFG rate (default: 0.7)
@@ -129,6 +348,13 @@ def replace_voice_wrapper(
129
 
130
  Returns:
131
  Path to generated audio file or error message
 
 
 
 
 
 
 
132
  """
133
  try:
134
  return replace_voice(
@@ -143,3 +369,97 @@ def replace_voice_wrapper(
143
  )
144
  except Exception as e:
145
  return f"Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
+ import ssl
2
+ import tempfile
3
+ import urllib.request
4
  from datetime import datetime
5
  from pathlib import Path
6
+ from typing import Optional
7
 
8
  from gradio_client import Client, handle_file
9
+ from gradio_client.client import DEFAULT_TEMP_DIR
10
 
11
+ # Handle imports for both module and script usage
12
+ try:
13
+ from tools.audio_info import validate_audio_path
14
+ except ImportError:
15
+ from audio_info import validate_audio_path
16
+
17
+
def resolve_audio_path(audio_path: str) -> str:
    """
    Turn a user-supplied audio reference into a local file path.

    References starting with ``http://`` or ``https://`` are downloaded via
    ``download_audio_from_url``; anything else is treated as a local path and
    passed to ``validate_audio_path``.

    Args:
        audio_path: Path to local audio file or URL

    Returns:
        Path to local audio file (downloads if URL)

    Raises:
        ValueError: If path is invalid
        RuntimeError: If URL download fails
    """
    if not audio_path:
        raise ValueError("Audio path cannot be empty")

    # The scheme test is a case-sensitive prefix match.
    is_remote = audio_path.startswith(("http://", "https://"))
    resolver = download_audio_from_url if is_remote else validate_audio_path
    return resolver(audio_path)
41
+
42
+
def download_audio_from_url(
    url: str, output_path: Optional[str] = None, timeout: float = 60.0
) -> str:
    """
    Download audio from URL to temporary file or specified output path.

    Three strategies are tried in order: verified TLS, TLS with certificate
    verification disabled, and urllib's default handling. The first one that
    yields a non-empty file wins.

    Args:
        url: URL to audio file
        output_path: Optional custom output path (if None, a uniquely named
            ``.wav`` file is created in the system temp directory)
        timeout: Socket timeout in seconds applied to each attempt

    Returns:
        Path to downloaded file

    Raises:
        RuntimeError: If download fails
    """
    if output_path:
        temp_path = output_path
    else:
        # A timestamp alone has one-second resolution, so two downloads in the
        # same second (e.g. source and target both given as URLs) would share
        # a name and overwrite each other. Let tempfile reserve a unique name.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        with tempfile.NamedTemporaryFile(
            prefix=f"voice_replacement_{stamp}_", suffix=".wav", delete=False
        ) as placeholder:
            temp_path = placeholder.name

    destination = Path(temp_path)

    # Try multiple download methods
    download_methods = [
        # Method 1: Standard SSL context (certificate and hostname verified)
        lambda: _download_with_ssl_context(
            url, temp_path, ssl.VerifyMode.CERT_REQUIRED, timeout
        ),
        # Method 2: Relaxed SSL (ignore cert errors)
        # NOTE(security): this fallback accepts any certificate, so the
        # payload may come from an impersonated host. Kept for compatibility
        # with servers that have broken certificates; review whether it is
        # acceptable for untrusted URLs.
        lambda: _download_with_ssl_context(
            url, temp_path, ssl.VerifyMode.CERT_NONE, timeout
        ),
        # Method 3: urllib defaults, no custom SSL context
        lambda: _download_no_ssl(url, temp_path, timeout),
    ]

    last_error = None
    for download_method in download_methods:
        try:
            download_method()
            if not destination.exists() or destination.stat().st_size == 0:
                raise RuntimeError(f"Downloaded file is empty or missing: {temp_path}")
            return temp_path
        except Exception as e:
            last_error = e
            # Remove any partial download (or the empty placeholder) after
            # every failed attempt, including the last one, so nothing is
            # left behind when the function raises.
            try:
                destination.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup; the download error is what matters.
                pass

    raise RuntimeError(
        f"Failed to download audio from URL {url}. Last error: {str(last_error)}"
    ) from last_error


def _stream_url_to_file(
    url: str, temp_path: str, timeout: float, context: Optional[ssl.SSLContext] = None
) -> None:
    """Fetch ``url`` and write the response body to ``temp_path`` in chunks."""
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; Voice-Replacement-Tool/1.0)")

    with urllib.request.urlopen(req, timeout=timeout, context=context) as response:
        with open(temp_path, "wb") as f:
            # Copy in fixed-size chunks so large files are never held in
            # memory all at once.
            while chunk := response.read(64 * 1024):
                f.write(chunk)


def _download_with_ssl_context(
    url: str, temp_path: str, verify_mode: ssl.VerifyMode, timeout: float = 60.0
) -> None:
    """
    Download with specific SSL certificate mode.

    Hostname checking stays enabled unless ``verify_mode`` is ``CERT_NONE``.

    Args:
        url: URL to fetch
        temp_path: File path the response body is written to
        verify_mode: Certificate verification mode for the SSL context
        timeout: Socket timeout in seconds
    """
    ssl_context = ssl.create_default_context()
    if verify_mode == ssl.VerifyMode.CERT_NONE:
        # The ssl module refuses CERT_NONE while check_hostname is on, so it
        # has to be switched off first -- but only in this case.
        ssl_context.check_hostname = False
    ssl_context.verify_mode = verify_mode

    _stream_url_to_file(url, temp_path, timeout, context=ssl_context)


def _download_no_ssl(url: str, temp_path: str, timeout: float = 60.0) -> None:
    """
    Download without supplying a custom SSL context.

    Despite the name, this does not turn verification off: with no context
    given, urllib uses its own default HTTPS handling.

    Args:
        url: URL to fetch
        temp_path: File path the response body is written to
        timeout: Socket timeout in seconds
    """
    _stream_url_to_file(url, temp_path, timeout)
121
+
122
+
def cleanup_temp_file(file_path: str) -> None:
    """
    Clean up temporary file if it exists.

    Only files located under the system temp directory are removed; any other
    path is left untouched. Failures are ignored because cleanup is
    best-effort and must not mask the result of the main operation.

    Args:
        file_path: Path to temporary file
    """
    try:
        candidate = Path(file_path)
        temp_root = Path(tempfile.gettempdir())
        # Compare path components rather than raw string prefixes, so a
        # sibling directory such as "/tmpfoo" is not mistaken for "/tmp".
        if temp_root in candidate.parents and candidate.is_file():
            candidate.unlink()
    except (OSError, TypeError, ValueError):
        # Ignore cleanup errors
        pass
136
 
137
 
138
  def replace_voice(
 
148
  """
149
  Replace voice in source audio with voice from target audio using Seed-VC.
150
 
151
+ This function uses Seed-VC Gradio space to perform voice conversion,
152
+ replacing voice characteristics in source audio with those from
153
+ target audio while preserving linguistic content and timing.
154
+
155
+ Examples:
156
+ >>> replace_voice("source.wav", "target.wav")
157
+ # Returns 'path/to/source_voice_replaced_by_target_20251126_143022.wav'
158
+
159
+ >>> replace_voice("https://example.com/source.wav", "target.wav", diffusion_steps=15)
160
+ # Downloads source audio and replaces voice with target voice
161
+
162
+ >>> replace_voice("source.wav", "https://example.com/voice.mp3", pitch_shift=2)
163
+ # Downloads target voice and applies to source with pitch shift
164
 
165
  Args:
166
+ source_audio_path: Path to source audio file or URL (voice to be replaced)
167
+ Supports local files and HTTP/HTTPS URLs
168
+ target_audio_path: Path to target audio file or URL (voice to use)
169
+ Supports local files and HTTP/HTTPS URLs
170
  diffusion_steps: Number of diffusion steps for inference (default: 10)
171
  length_adjust: Length adjustment factor (default: 1.0)
172
  inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
 
175
  pitch_shift: Pitch shift in semitones (default: 0)
176
 
177
  Returns:
178
+ Path to generated voice-replaced audio file
179
 
180
  Raises:
181
  FileNotFoundError: If source or target audio files don't exist
182
  ValueError: If parameters are invalid
183
  RuntimeError: If voice replacement fails
184
  """
185
+ source_temp_file = None
186
+ target_temp_file = None
187
+
188
  try:
189
+ # Resolve input paths (handle both URLs and local files)
190
+ source_abs_path = resolve_audio_path(source_audio_path)
191
+ target_abs_path = resolve_audio_path(target_audio_path)
192
+
193
+ # Track temporary files for cleanup
194
+ if source_audio_path.startswith(("http://", "https://")):
195
+ source_temp_file = source_abs_path
196
+ if target_audio_path.startswith(("http://", "https://")):
197
+ target_temp_file = target_abs_path
198
 
199
  # Validate parameters
200
  if diffusion_steps < 1 or diffusion_steps > 50:
 
206
  if pitch_shift < -12 or pitch_shift > 12:
207
  raise ValueError("pitch_shift must be between -12 and 12 semitones")
208
 
209
+ # Initialize Seed-VC client with manual file handling
210
+ client = Client("Plachta/Seed-VC", download_files=False)
211
+
212
+ # Prepare file handles for manual upload
213
+ source_handle = handle_file(source_abs_path)
214
+ target_handle = handle_file(target_abs_path)
215
 
216
  # Perform voice replacement
217
  result = client.predict(
218
+ source_audio_path=source_handle,
219
+ target_audio_path=target_handle,
220
  diffusion_steps=diffusion_steps,
221
  length_adjust=length_adjust,
222
  inference_cfg_rate=inference_cfg_rate,
 
227
  )
228
 
229
  # Create output directory
230
+ output_dir = Path(DEFAULT_TEMP_DIR)
231
  output_dir.mkdir(exist_ok=True)
232
 
233
  # Generate output filename with timestamp
 
239
  )
240
  output_path = output_dir / output_filename
241
 
242
+ # Handle result - check if it's a file path or needs manual download
243
+ if hasattr(result, "url") and result.url:
244
+ # Result is a file object with URL - download manually
245
+ download_audio_from_url(result.url, str(output_path))
246
+ elif isinstance(result, str) and os.path.exists(result):
247
+ # Result is a local file path - copy it
248
  import shutil
249
 
250
  shutil.copy2(result, output_path)
251
+ elif isinstance(result, (tuple, list)):
252
+ import shutil
253
+
254
+ # Only download the second item if multiple outputs
255
+ item = result[0]
256
+ if len(result) > 1:
257
+ item = result[1]
258
+
259
+ if url:= item.get("url"):
260
+ # Download each URL to a separate file
261
+ item_output = str(output_path)
262
+ download_audio_from_url(url, item_output)
263
+
264
+ elif isinstance(item, str) and os.path.exists(item):
265
+ # Copy each local file
266
+ item_output = str(output_path)
267
+ shutil.copy2(item, item_output)
268
+ else:
269
+ raise RuntimeError(f"Unexpected result format in tuple: {item}")
270
+
271
+ shutil.move(item_output, output_path)
272
  else:
273
+ # Result is audio data - save it directly
274
  import soundfile as sf
275
 
276
  sf.write(str(output_path), result, 22050)
 
278
  return str(output_path)
279
 
280
  except Exception as e:
281
+ # Handle specific Seed-VC errors
282
+ error_msg = str(e)
283
+ if "403" in error_msg or "Forbidden" in error_msg:
284
+ raise RuntimeError(
285
+ "Seed-VC access denied. This may indicate:\n"
286
+ "1. Files are in unsupported format\n"
287
+ "2. Files are too large\n"
288
+ "3. Temporary space restrictions\n"
289
+ "4. Authentication required\n\n"
290
+ "TROUBLESHOOTING:\n"
291
+ "• Try different audio files (WAV, MP3, FLAC, M4A)\n"
292
+ "• Use smaller files (< 30MB recommended)\n"
293
+ "• Check if files are corrupted\n"
294
+ "• Try again later if rate limited\n"
295
+ "• Consider using a different voice source/target"
296
+ )
297
+ elif "404" in error_msg or "Not Found" in error_msg:
298
+ raise RuntimeError(
299
+ "Seed-VC cannot find one or both files. "
300
+ "Check if:\n"
301
+ "• Files exist and are accessible\n"
302
+ "• File paths are correct\n"
303
+ "• Files are in supported format (WAV, MP3, FLAC, M4A)\n"
304
+ "• Manual download was successful"
305
+ )
306
+ elif "timeout" in error_msg.lower():
307
+ raise RuntimeError(
308
+ "Seed-VC connection timeout. "
309
+ "Try:\n"
310
+ "• Using fewer diffusion steps (5-10)\n"
311
+ "• Smaller audio files\n"
312
+ "• Processing again later\n"
313
+ "• Checking internet connection"
314
+ )
315
+ else:
316
+ raise RuntimeError(f"Voice replacement failed: {error_msg}")
317
+
318
+ finally:
319
+ # Always clean up temporary files
320
+ if source_temp_file:
321
+ cleanup_temp_file(source_temp_file)
322
+ if target_temp_file:
323
+ cleanup_temp_file(target_temp_file)
324
 
325
 
326
  def replace_voice_wrapper(
 
337
  Wrapper function for voice replacement with error handling for MCP integration.
338
 
339
  Args:
340
+ source_audio_path: Path to input audio file or URL
341
+ target_audio_path: Path to target audio file or URL
342
  diffusion_steps: Number of diffusion steps (default: 10)
343
  length_adjust: Length adjustment factor (default: 1.0)
344
  inference_cfg_rate: CFG rate (default: 0.7)
 
348
 
349
  Returns:
350
  Path to generated audio file or error message
351
+
352
+ Note for URL usage:
353
+ Some URLs may be blocked by Seed-VC space restrictions.
354
+ If URL processing fails with access errors, try:
355
+ 1. Download the file manually using your browser
356
+ 2. Save it locally and use the local file path
357
+ 3. Use a different audio source or target
358
  """
359
  try:
360
  return replace_voice(
 
369
  )
370
  except Exception as e:
371
  return f"Error: {str(e)}"
372
+
373
+
374
+ if __name__ == "__main__":
375
+ """
376
+ Script section for running voice replacement locally.
377
+
378
+ Usage:
379
+ python tools/voice_replacement.py source.wav target.wav
380
+ python tools/voice_replacement.py source.wav target.wav --steps 15 --pitch 2
381
+ python tools/voice_replacement.py https://example.com/source.wav target.wav
382
+ python tools/voice_replacement.py source.wav https://example.com/target.mp3 --pitch 2
383
+ """
384
+ import argparse
385
+ import sys
386
+ import os
387
+
388
+ # Add parent directory to path for imports
389
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
390
+
391
+ parser = argparse.ArgumentParser(
392
+ description="Voice replacement using Seed-VC",
393
+ formatter_class=argparse.RawDescriptionHelpFormatter,
394
+ epilog="""
395
+ Examples:
396
+ python tools/voice_replacement.py source.wav target.wav
397
+ python tools/voice_replacement.py source.wav target.wav --steps 15 --pitch 2
398
+ python tools/voice_replacement.py source.wav target.wav --f0-condition --no-auto-f0
399
+ python tools/voice_replacement.py https://example.com/source.wav target.wav
400
+ python tools/voice_replacement.py source.wav https://example.com/target.mp3 --pitch 2
401
+ """,
402
+ )
403
+
404
+ parser.add_argument(
405
+ "source", help="Source audio path or URL (voice to be replaced)"
406
+ )
407
+ parser.add_argument("target", help="Target audio path or URL (voice to use)")
408
+ parser.add_argument(
409
+ "--steps", type=int, default=10, help="Diffusion steps (1-50, default: 10)"
410
+ )
411
+ parser.add_argument(
412
+ "--length",
413
+ type=float,
414
+ default=1.0,
415
+ help="Length adjustment (0.1-3.0, default: 1.0)",
416
+ )
417
+ parser.add_argument(
418
+ "--cfg",
419
+ type=float,
420
+ default=0.7,
421
+ help="Inference CFG rate (0.0-1.0, default: 0.7)",
422
+ )
423
+ parser.add_argument(
424
+ "--f0-condition", action="store_true", help="Enable F0 conditioning"
425
+ )
426
+ parser.add_argument(
427
+ "--no-auto-f0", action="store_true", help="Disable auto F0 adjustment"
428
+ )
429
+ parser.add_argument(
430
+ "--pitch",
431
+ type=int,
432
+ default=0,
433
+ help="Pitch shift semitones (-12 to 12, default: 0)",
434
+ )
435
+
436
+ args = parser.parse_args()
437
+
438
+ print("Voice Replacement Tool")
439
+ print("=" * 30)
440
+ print(f"Source: {args.source}")
441
+ print(f"Target: {args.target}")
442
+ print(f"Parameters: steps={args.steps}, length={args.length}, cfg={args.cfg}")
443
+ print(
444
+ f"F0 condition={args.f0_condition}, auto F0={not args.no_auto_f0}, pitch={args.pitch}"
445
+ )
446
+ print()
447
+
448
+ try:
449
+ result = replace_voice(
450
+ source_audio_path=args.source,
451
+ target_audio_path=args.target,
452
+ diffusion_steps=args.steps,
453
+ length_adjust=args.length,
454
+ inference_cfg_rate=args.cfg,
455
+ f0_condition=args.f0_condition,
456
+ auto_f0_adjust=not args.no_auto_f0,
457
+ pitch_shift=args.pitch,
458
+ )
459
+
460
+ print("✅ Voice replacement completed!")
461
+ print(f"Output saved to: {result}")
462
+
463
+ except Exception as e:
464
+ print(f"❌ Error: {e}")
465
+ sys.exit(1)