science-storyteller / tests /convert_cache_to_mp3.py
tuhulab's picture
chore: Organize repository structure - move docs to docs/ and tests to tests/
28b3cfa
"""
Convert cached WAV files to MP3 for smaller size (GitHub 10MB limit)
Requires: pydub and ffmpeg
"""
import os
import json
from pathlib import Path
try:
    from pydub import AudioSegment
except ImportError:
    # pydub is missing: install it on the fly, then retry the import.
    # Invoke pip through the interpreter running this script
    # (`sys.executable -m pip`) rather than whatever `pip` happens to be first
    # on PATH, so the package lands in the environment that will import it.
    # An argument list (no shell) is used instead of a shell command string.
    import subprocess
    import sys

    print("Installing pydub...")
    # check=False: if the install fails, the import below raises ImportError,
    # which is the same failure mode the script had before.
    subprocess.run([sys.executable, "-m", "pip", "install", "pydub"], check=False)
    from pydub import AudioSegment
def convert_wav_to_mp3(wav_path: str, mp3_path: str, bitrate: str = "128k"):
    """Transcode one WAV file to MP3 and print the before/after sizes.

    Args:
        wav_path: Path of the WAV file to read.
        mp3_path: Path the MP3 is written to.
        bitrate: Bitrate string passed to the exporter (default ``"128k"``).
    """
    print(f"Converting {wav_path} -> {mp3_path}")
    AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3", bitrate=bitrate)

    # Report both sizes in mebibytes, along with the space saved.
    bytes_per_mb = 1024 * 1024
    source_mb, target_mb = (
        os.path.getsize(path) / bytes_per_mb for path in (wav_path, mp3_path)
    )
    saved_mb = source_mb - target_mb
    print(f" WAV: {source_mb:.2f}MB -> MP3: {target_mb:.2f}MB (saved {saved_mb:.2f}MB)")
def update_cache_metadata(cache_dir: Path):
    """Rewrite ``metadata.json`` so converted entries reference their MP3 files.

    Every entry whose ``audio_file`` ends in ``.wav`` is switched to the same
    path with an ``.mp3`` extension, but only when that MP3 is present on
    disk; entries without an MP3 are left pointing at the WAV so the metadata
    never references a file that does not exist. The file is written back
    even when no entry changed.

    Args:
        cache_dir: Directory that contains ``metadata.json``.

    Returns:
        None. Prints a message and returns early if ``metadata.json`` is
        missing.
    """
    metadata_file = cache_dir / "metadata.json"
    if not metadata_file.exists():
        print("No metadata.json found")
        return

    with open(metadata_file, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    wav_suffix = '.wav'
    for entry in metadata.values():
        if not isinstance(entry, dict):
            continue
        audio_file = entry.get('audio_file', '')
        if not isinstance(audio_file, str) or not audio_file.endswith(wav_suffix):
            continue

        # Swap only the trailing extension: str.replace() would also rewrite
        # a ".wav" that appears earlier in the path (e.g. a directory name).
        mp3_file = audio_file[:-len(wav_suffix)] + '.mp3'

        # The path is resolved the same way the rest of the script resolves
        # it (relative to the current working directory unless absolute).
        if not Path(mp3_file).exists():
            print(f"Skipped metadata update (MP3 missing): {audio_file}")
            continue

        entry['audio_file'] = mp3_file
        print(f"Updated metadata: {audio_file} -> {mp3_file}")

    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)
    print(f"\nβœ… Updated {metadata_file}")
def main():
    """Convert every cached WAV listed in ``./cache/metadata.json`` to MP3.

    Steps:
      1. Load the cache metadata; stop with a message if it is missing.
      2. Convert each referenced ``.wav`` that exists and has no MP3 yet.
      3. Call ``update_cache_metadata`` to rewrite the metadata.
      4. Delete the original WAV files, but only those for which a non-empty
         MP3 is present, so a failed conversion never destroys the only copy
         of the audio.
      5. Print the git commands to commit the result.
    """
    cache_dir = Path("./cache")

    print("🎡 Converting cached WAV files to MP3\n")
    print("This reduces file size by ~90% to fit GitHub's 10MB limit\n")

    # Load metadata to find all cached audio files
    metadata_file = cache_dir / "metadata.json"
    if not metadata_file.exists():
        print("❌ No cache metadata found. Run generate_cache.py first.")
        return

    with open(metadata_file, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    converted = 0
    for entry in metadata.values():
        audio_file = entry.get('audio_file', '')
        if not audio_file or not audio_file.endswith('.wav'):
            continue

        wav_path = Path(audio_file)
        if not wav_path.exists():
            print(f"⚠️ Not found: {wav_path}")
            continue

        # Convert to MP3
        mp3_path = wav_path.with_suffix('.mp3')
        if mp3_path.exists():
            print(f"βœ“ Already exists: {mp3_path}")
            continue

        try:
            convert_wav_to_mp3(str(wav_path), str(mp3_path), bitrate="128k")
        except Exception as e:
            # Best effort: report the failure and carry on with the rest.
            print(f"❌ Error converting {wav_path}: {e}")
            # The MP3 did not exist before this attempt, so anything at that
            # path now is a leftover of the failed export. Remove it so a
            # later run does not mistake it for a finished conversion.
            if mp3_path.exists():
                mp3_path.unlink()
        else:
            converted += 1

    print(f"\nπŸ“Š Converted {converted} files")

    # Update metadata
    print("\nπŸ”„ Updating cache metadata...")
    update_cache_metadata(cache_dir)

    # Clean up WAV files
    print("\nπŸ—‘οΈ Removing original WAV files...")
    for entry in metadata.values():
        audio_file = entry.get('audio_file', '')
        if not audio_file:
            continue

        # `metadata` still holds the paths as loaded above, so an entry may
        # name either the WAV or (from an earlier run) the MP3. Derive both
        # paths from the extension only, never from a substring replace.
        recorded = Path(audio_file)
        if recorded.suffix not in ('.wav', '.mp3'):
            continue
        wav_path = recorded.with_suffix('.wav')
        mp3_path = recorded.with_suffix('.mp3')

        if not wav_path.exists():
            continue

        # Deleting is irreversible: require a real, non-empty MP3 first.
        if not (mp3_path.is_file() and mp3_path.stat().st_size > 0):
            print(f" Keeping {wav_path.name} (no MP3 to replace it)")
            continue

        size_mb = os.path.getsize(wav_path) / (1024 * 1024)
        os.remove(wav_path)
        print(f" Removed {wav_path.name} ({size_mb:.2f}MB)")

    print("\nβœ… Conversion complete!")
    print("\nNow you can commit:")
    print(" git add cache/ assets/audio/podcast_*.mp3")
    print(" git commit -m 'Add pre-generated MP3 podcast cache'")
    print(" git push")


if __name__ == "__main__":
    main()