<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Whisper Fine-Tune vs Commercial APIs | STT Comparison</title>
<link rel="stylesheet" href="style.css" />
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.0/dist/chart.umd.min.js"></script>
</head>
<body>
<div class="container">
<!-- Header -->
<header class="header">
<h1>Whisper Fine-Tune vs Commercial APIs</h1>
<p class="subtitle">Local Fine-Tunes Beat Commercial STT Services</p>
<div class="metadata">
<span>November 17, 2025</span>
<span>137-word test sample</span>
<span>7 models tested</span>
</div>
</header>
<!-- Executive Summary -->
<section class="card summary">
<h2>Executive Summary</h2>
<p>This evaluation compared transcription accuracy across four fine-tuned Whisper models and three commercial STT APIs (OpenAI Whisper, Assembly, Gladia). All models were tested on identical audio against a verified ground-truth transcription.</p>
<div class="key-finding">
<h3>Key Finding</h3>
<p>The fine-tuned <strong>Whisper Large V3 Turbo</strong> achieved the best result with an accuracy of <strong class="highlight">94.16%</strong>, beating the best commercial API (Assembly at 92.70%) and every other service tested, including OpenAI's own Whisper API.</p>
</div>
</section>
<!-- Accuracy Comparison Chart -->
<section class="card">
<h2>Transcription Accuracy Comparison</h2>
<p class="chart-subtitle">Higher is better - percentage of words transcribed correctly</p>
<div class="chart-container">
<canvas id="accuracyChart"></canvas>
</div>
</section>
<!-- Model Rankings -->
<section class="card">
<h2>Model Rankings</h2>
<table class="results-table">
<thead>
<tr>
<th>Rank</th>
<th>Model</th>
<th>Type</th>
<th>Accuracy</th>
<th>WER</th>
</tr>
</thead>
<tbody>
<tr class="best-model">
<td>1</td>
<td><strong>Whisper Large V3 Turbo (Fine-Tune)</strong></td>
<td>Local</td>
<td><strong>94.16%</strong></td>
<td><strong>5.84%</strong></td>
</tr>
<tr>
<td>2</td>
<td><strong>Assembly API</strong></td>
<td>Commercial</td>
<td>92.70%</td>
<td>7.30%</td>
</tr>
<tr>
<td>3</td>
<td><strong>Gladia API</strong></td>
<td>Commercial</td>
<td>91.97%</td>
<td>8.03%</td>
</tr>
<tr>
<td>4</td>
<td>Whisper Small (Fine-Tune)</td>
<td>Local</td>
<td>91.24%</td>
<td>8.76%</td>
</tr>
<tr>
<td>5</td>
<td>Whisper (OpenAI API)</td>
<td>Commercial</td>
<td>91.24%</td>
<td>8.76%</td>
</tr>
<tr style="border-top: 3px solid #ef4444;">
<td colspan="5" style="background-color: #fef2f2; text-align: center; font-weight: bold; color: #dc2626; padding: 8px;">
Below 90% Accuracy Threshold
</td>
</tr>
<tr>
<td>6</td>
<td>Whisper Base (Fine-Tune)</td>
<td>Local</td>
<td>85.40%</td>
<td>14.60%</td>
</tr>
<tr>
<td>7</td>
<td>Whisper Tiny (Fine-Tune)</td>
<td>Local</td>
<td>85.40%</td>
<td>14.60%</td>
</tr>
</tbody>
</table>
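<p class="chart-subtitle">Accuracy here is 100% minus WER. WER counts substitutions (S), deletions (D), and insertions (I) against the 137 reference words (N): WER = (S + D + I) / N. For the winning model, (7 + 0 + 1) / 137 ≈ 5.84%, which gives the 94.16% accuracy shown above.</p>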
</section>
<!-- Local vs Commercial Chart -->
<section class="card">
<h2>Fine-Tuned Whisper vs Commercial APIs</h2>
<p class="chart-subtitle">Comparing local fine-tuned models against commercial Whisper services</p>
<div class="chart-container">
<canvas id="localVsCommercialChart"></canvas>
</div>
</section>
<!-- Error Breakdown Chart -->
<section class="card">
<h2>Error Breakdown by Type</h2>
<p class="chart-subtitle">Distribution of substitutions, deletions, and insertions</p>
<div class="chart-container">
<canvas id="errorChart"></canvas>
</div>
</section>
<!-- Information Preserved Chart -->
<section class="card">
<h2>Word Information Preserved (WIP)</h2>
<p class="chart-subtitle">Higher is better - measures how much word-level information the transcript retains</p>
<div class="chart-container">
<canvas id="wipChart"></canvas>
</div>
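<p class="chart-subtitle">WIP is jiwer's Word Information Preserved score: WIP = (H / N<sub>ref</sub>) × (H / N<sub>hyp</sub>), where H is the hit count and N<sub>ref</sub>, N<sub>hyp</sub> are the reference and hypothesis word counts. For the winning model, (130 / 137) × (130 / 138) ≈ 89.39%, matching its bar above.</p>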
</section>
<!-- Detailed Metrics -->
<section class="card">
<h2>Detailed Metrics</h2>
<div class="model-detail">
<h3>1. Whisper Large V3 Turbo (Fine-Tune) - WINNER</h3>
<div class="metrics-grid">
<div class="metric">
<div class="metric-label">Accuracy</div>
<div class="metric-value">94.16%</div>
</div>
<div class="metric">
<div class="metric-label">Hits</div>
<div class="metric-value">130/137</div>
</div>
<div class="metric">
<div class="metric-label">Substitutions</div>
<div class="metric-value">7</div>
</div>
<div class="metric">
<div class="metric-label">Deletions</div>
<div class="metric-value">0</div>
</div>
<div class="metric">
<div class="metric-label">Insertions</div>
<div class="metric-value">1</div>
</div>
<div class="metric">
<div class="metric-label">Info Preserved</div>
<div class="metric-value">89.39%</div>
</div>
</div>
<p><strong>Analysis:</strong> Excellent performance with zero deletions (no lost content). Clear winner for production use, beating all commercial APIs.</p>
</div>
<div class="model-detail">
<h3>2. Whisper Small (Fine-Tune)</h3>
<div class="metrics-grid">
<div class="metric">
<div class="metric-label">Accuracy</div>
<div class="metric-value">91.24%</div>
</div>
<div class="metric">
<div class="metric-label">Hits</div>
<div class="metric-value">127/137</div>
</div>
<div class="metric">
<div class="metric-label">Substitutions</div>
<div class="metric-value">9</div>
</div>
<div class="metric">
<div class="metric-label">Deletions</div>
<div class="metric-value">1</div>
</div>
<div class="metric">
<div class="metric-label">Insertions</div>
<div class="metric-value">2</div>
</div>
<div class="metric">
<div class="metric-label">Info Preserved</div>
<div class="metric-value">85.31%</div>
</div>
</div>
<p><strong>Analysis:</strong> Strong performance from a smaller model. Matches OpenAI's Whisper API while running locally. Good choice for real-time applications where speed matters.</p>
</div>
<div class="model-detail">
<h3>2. Assembly API - BEST COMMERCIAL</h3>
<div class="metrics-grid">
<div class="metric">
<div class="metric-label">Accuracy</div>
<div class="metric-value">92.70%</div>
</div>
<div class="metric">
<div class="metric-label">Hits</div>
<div class="metric-value">129/137</div>
</div>
<div class="metric">
<div class="metric-label">Substitutions</div>
<div class="metric-value">8</div>
</div>
<div class="metric">
<div class="metric-label">Deletions</div>
<div class="metric-value">0</div>
</div>
<div class="metric">
<div class="metric-label">Insertions</div>
<div class="metric-value">2</div>
</div>
<div class="metric">
<div class="metric-label">Info Preserved</div>
<div class="metric-value">87.39%</div>
</div>
</div>
<p><strong>Analysis:</strong> Best commercial API, but still beaten by fine-tuned Large V3 Turbo (92.70% → 94.16% accuracy).</p>
</div>
<div class="model-detail">
<h3>3. Gladia API</h3>
<div class="metrics-grid">
<div class="metric">
<div class="metric-label">Accuracy</div>
<div class="metric-value">91.97%</div>
</div>
<div class="metric">
<div class="metric-label">Hits</div>
<div class="metric-value">128/137</div>
</div>
<div class="metric">
<div class="metric-label">Substitutions</div>
<div class="metric-value">9</div>
</div>
<div class="metric">
<div class="metric-label">Deletions</div>
<div class="metric-value">0</div>
</div>
<div class="metric">
<div class="metric-label">Insertions</div>
<div class="metric-value">2</div>
</div>
<div class="metric">
<div class="metric-label">Info Preserved</div>
<div class="metric-value">86.04%</div>
</div>
</div>
<p><strong>Analysis:</strong> Third overall, second-best commercial API. Competitive performance.</p>
</div>
</section>
<!-- Key Conclusions -->
<section class="card conclusions">
<h2>Key Conclusions</h2>
<div class="conclusion-item">
<h3>1. A Local Fine-Tune Beat Every Commercial API</h3>
<p>The fine-tuned Whisper Large V3 Turbo achieved <strong>94.16% accuracy</strong>, beating the best commercial API (Assembly at 92.70%). This demonstrates that targeted fine-tuning of an open model can outperform premium commercial services, including OpenAI's own Whisper API.</p>
</div>
<div class="conclusion-item">
<h3>2. Cost & Privacy Advantages</h3>
<p>Running local fine-tuned models eliminates per-minute API costs and keeps sensitive audio data on-premises. The performance advantage makes this even more compelling.</p>
</div>
<div class="conclusion-item">
<h3>3. Commercial APIs Are Competitive</h3>
<p>All three commercial APIs (Assembly 92.70%, Gladia 91.97%, OpenAI Whisper 91.24%) delivered production-ready performance. They're viable alternatives when local inference isn't feasible.</p>
</div>
<div class="conclusion-item">
<h3>4. Production Recommendations</h3>
<ul>
<li><strong>Best Performance:</strong> Whisper Large V3 Turbo (Fine-Tune) - 94.16% accuracy for local deployment</li>
<li><strong>Best Commercial:</strong> Assembly API - 92.70% accuracy when cloud processing is required</li>
<li><strong>Balanced Local:</strong> Whisper Small (Fine-Tune) - 91.24% accuracy, matching OpenAI's Whisper API while running locally on a much smaller model</li>
</ul>
</div>
</section>
<!-- Fine-Tunes vs Commercial -->
<section class="card">
<h2>Local vs Commercial Performance</h2>
<table class="comparison-table">
<thead>
<tr>
<th>Model</th>
<th>Type</th>
<th>Accuracy</th>
<th>vs Best Commercial</th>
</tr>
</thead>
<tbody>
<tr class="improvement">
<td><strong>Large V3 Turbo (Fine-Tune)</strong></td>
<td>Local</td>
<td><strong>94.16%</strong></td>
<td class="positive">+1.46% better</td>
</tr>
<tr class="baseline-row">
<td><strong>Assembly API</strong></td>
<td>Commercial</td>
<td><strong>92.70%</strong></td>
<td><strong>baseline (best commercial)</strong></td>
</tr>
<tr>
<td>Gladia API</td>
<td>Commercial</td>
<td>91.97%</td>
<td class="negative">-0.73% worse</td>
</tr>
<tr class="improvement">
<td>Whisper Small (Fine-Tune)</td>
<td>Local</td>
<td>91.24%</td>
<td class="negative">-1.46% worse</td>
</tr>
<tr>
<td>Whisper (OpenAI API)</td>
<td>Commercial</td>
<td>91.24%</td>
<td class="negative">-1.46% worse</td>
</tr>
</tbody>
</table>
</section>
<!-- Methodology -->
<section class="card methodology">
<h2>Methodology</h2>
<ul>
<li><strong>Ground Truth:</strong> Manual transcription of test audio</li>
<li><strong>WER Calculation:</strong> jiwer library (see the sketch below)</li>
<li><strong>Framework:</strong> Hugging Face Transformers pipeline</li>
<li><strong>Additional Metrics:</strong> MER, WIL, WIP, error breakdown</li>
<li><strong>Environment:</strong> Python 3.12, CPU inference</li>
<li><strong>Test Sample:</strong> 137-word narrative about a coastal town</li>
</ul>
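<p>For reference, the per-model scoring step looks roughly like the sketch below. It is a minimal illustration, not the exact evaluation script: the model identifier and file names are placeholders, and the jiwer normalization chain shown is one reasonable choice so that case and punctuation are not counted as word errors.</p>
<pre><code># Minimal sketch: transcribe one model and score it against the ground truth.
# Model ID and file names are illustrative placeholders.
import jiwer
from transformers import pipeline

# A fine-tuned checkpoint can be a local directory instead of a Hub ID.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3-turbo")
hypothesis = asr("test-audio.wav")["text"]

with open("ground-truth.txt") as f:
    reference = f.read()

# Normalize both sides so punctuation and casing are not counted as errors.
transform = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.ReduceToListOfListOfWords(),
])

out = jiwer.process_words(
    reference, hypothesis,
    reference_transform=transform,
    hypothesis_transform=transform,
)
print(f"WER {out.wer:.2%} | MER {out.mer:.2%} | WIL {out.wil:.2%} | WIP {out.wip:.2%}")
print(f"hits {out.hits}, sub {out.substitutions}, del {out.deletions}, ins {out.insertions}")
</code></pre>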
</section>
<!-- Audio Sample -->
<section class="card">
<h2>Test Audio Sample</h2>
<p>Listen to the audio sample used for this evaluation:</p>
<div style="margin: 20px 0;">
<audio controls style="width: 100%; max-width: 600px;">
<source src="eval/eval/test-audio.wav" type="audio/wav">
Your browser does not support the audio element.
</audio>
</div>
</section>
<!-- Ground Truth Transcription -->
<section class="card">
<h2>Ground Truth Transcription</h2>
<p class="chart-subtitle">Reference transcription used to evaluate all models</p>
<div style="background-color: #f8f9fa; padding: 20px; border-radius: 8px; border-left: 4px solid #6366f1; line-height: 1.8; font-family: Georgia, serif;">
<p style="margin: 0;">I once wandered through a coastal town that smelled like sea salt and fresh bread. The locals said the tide wrote stories on the sand—short tales at low tide, epics when the moon grew bold. Every morning the boardwalk baker pulled loaves out of a brick oven, tapping the crusts so they sang a hollow, golden note. Kids would line up for the first slice, steam fogging their glasses while gulls staged slow-motion dives overhead. The best part was the lighthouse keeper, who claimed he could forecast the weather by listening to the bells on distant fishing boats. If the chimes sounded playful, the day would be calm; if they rang flat, storms stampeded in. I never learned whether his method worked, but I liked believing in a town where music, bread, and tides kept time together.</p>
</div>
<p style="margin-top: 15px; font-size: 0.9em; color: #666;">
<strong>Word count:</strong> 137 words
</p>
</section>
<!-- Footer -->
<footer class="footer">
<p>Generated by automated Whisper evaluation framework</p>
<p>Data source: Local inference testing on CPU | November 2025</p>
<p style="margin-top: 20px; padding-top: 20px; border-top: 1px solid rgba(99, 102, 241, 0.2);">
<strong>Daniel Rosehill</strong> | <a href="https://danielrosehill.com" target="_blank" style="color: #6366f1;">danielrosehill.com</a> | MIT License
</p>
</footer>
</div>
<!-- Chart.js Initialization -->
<script>
// Accuracy Comparison Chart
const accuracyCtx = document.getElementById('accuracyChart').getContext('2d');
new Chart(accuracyCtx, {
type: 'bar',
data: {
labels: [
['Large V3 Turbo', '(Fine-Tune)'],
'Assembly API',
'Gladia API',
['Whisper Small', '(Fine-Tune)'],
['Whisper', '(OpenAI API)'],
['Whisper Base', '(Fine-Tune)'],
['Whisper Tiny', '(Fine-Tune)']
],
datasets: [{
label: 'Transcription Accuracy (%)',
data: [94.16, 92.70, 91.97, 91.24, 91.24, 85.40, 85.40],
backgroundColor: [
'rgba(34, 197, 94, 0.7)', // Green for best (Local)
'rgba(99, 102, 241, 0.7)', // Indigo for commercial
'rgba(99, 102, 241, 0.7)', // Indigo for commercial
'rgba(34, 197, 94, 0.6)', // Green for local
'rgba(99, 102, 241, 0.7)', // Indigo for commercial
'rgba(239, 68, 68, 0.7)', // Red for poor (Local)
'rgba(239, 68, 68, 0.7)' // Red for poor (Local)
],
borderColor: [
'rgba(34, 197, 94, 1)',
'rgba(99, 102, 241, 1)',
'rgba(99, 102, 241, 1)',
'rgba(34, 197, 94, 1)',
'rgba(99, 102, 241, 1)',
'rgba(239, 68, 68, 1)',
'rgba(239, 68, 68, 1)'
],
borderWidth: 2
}]
},
options: {
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: {
display: false
},
tooltip: {
callbacks: {
label: function(context) {
const types = ['Local', 'Commercial', 'Commercial', 'Local', 'Commercial', 'Local', 'Local'];
return types[context.dataIndex] + ' - Accuracy: ' + context.parsed.y.toFixed(2) + '%';
}
}
}
},
scales: {
y: {
beginAtZero: false,
min: 82,
max: 96,
title: {
display: true,
text: 'Transcription Accuracy (%)'
}
}
}
}
});
// Local vs Commercial Comparison Chart
const localVsCommercialCtx = document.getElementById('localVsCommercialChart').getContext('2d');
new Chart(localVsCommercialCtx, {
type: 'bar',
data: {
labels: ['Fine-Tuned Models', 'Commercial APIs'],
datasets: [
{
label: 'Large V3 Turbo (FT)',
data: [94.16, null],
backgroundColor: 'rgba(34, 197, 94, 0.8)',
borderColor: 'rgba(34, 197, 94, 1)',
borderWidth: 2
},
{
label: 'Assembly API',
data: [null, 92.70],
backgroundColor: 'rgba(99, 102, 241, 0.8)',
borderColor: 'rgba(99, 102, 241, 1)',
borderWidth: 2
},
{
label: 'Whisper Small (FT)',
data: [91.24, null],
backgroundColor: 'rgba(34, 197, 94, 0.6)',
borderColor: 'rgba(34, 197, 94, 1)',
borderWidth: 2
},
{
label: 'Whisper (OpenAI API)',
data: [null, 91.24],
backgroundColor: 'rgba(99, 102, 241, 0.6)',
borderColor: 'rgba(99, 102, 241, 1)',
borderWidth: 2
}
]
},
options: {
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: {
position: 'top',
},
tooltip: {
callbacks: {
label: function(context) {
return context.dataset.label + ': ' + context.parsed.y.toFixed(2) + '%';
}
}
}
},
scales: {
y: {
beginAtZero: false,
min: 88,
max: 96,
title: {
display: true,
text: 'Accuracy (%)'
}
}
}
}
});
// Error Breakdown Chart
const errorCtx = document.getElementById('errorChart').getContext('2d');
new Chart(errorCtx, {
type: 'bar',
data: {
labels: [
['Large V3 Turbo', '(Fine-Tune)'],
'Assembly API',
'Gladia API',
['Whisper Small', '(Fine-Tune)'],
['Whisper', '(OpenAI API)'],
['Whisper Base', '(Fine-Tune)'],
['Whisper Tiny', '(Fine-Tune)']
],
datasets: [
{
label: 'Substitutions',
data: [7, 8, 9, 9, 10, 15, 15],
backgroundColor: 'rgba(239, 68, 68, 0.7)',
borderColor: 'rgba(239, 68, 68, 1)',
borderWidth: 1
},
{
label: 'Deletions',
data: [0, 0, 0, 1, 0, 2, 1],
backgroundColor: 'rgba(251, 191, 36, 0.7)',
borderColor: 'rgba(251, 191, 36, 1)',
borderWidth: 1
},
{
label: 'Insertions',
data: [1, 2, 2, 2, 2, 3, 4],
backgroundColor: 'rgba(59, 130, 246, 0.7)',
borderColor: 'rgba(59, 130, 246, 1)',
borderWidth: 1
}
]
},
options: {
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: {
position: 'top',
}
},
scales: {
x: {
stacked: true,
},
y: {
stacked: true,
beginAtZero: true,
title: {
display: true,
text: 'Number of Errors'
}
}
}
}
});
// Information Preserved Chart
const wipCtx = document.getElementById('wipChart').getContext('2d');
new Chart(wipCtx, {
type: 'bar',
data: {
labels: [
['Large V3 Turbo', '(Fine-Tune)'],
'Assembly API',
'Gladia API',
['Whisper Small', '(Fine-Tune)'],
['Whisper', '(OpenAI API)'],
['Whisper Base', '(Fine-Tune)'],
['Whisper Tiny', '(Fine-Tune)']
],
datasets: [{
label: 'Information Preserved (%)',
data: [89.39, 87.39, 86.04, 85.31, 84.70, 76.17, 76.33],
backgroundColor: [
'rgba(34, 197, 94, 0.7)', // Green for best (Local)
'rgba(99, 102, 241, 0.7)', // Indigo for commercial
'rgba(99, 102, 241, 0.7)', // Indigo for commercial
'rgba(34, 197, 94, 0.6)', // Green for local
'rgba(99, 102, 241, 0.7)', // Indigo for commercial
'rgba(239, 68, 68, 0.7)', // Red for poor (Local)
'rgba(239, 68, 68, 0.7)' // Red for poor (Local)
],
borderColor: [
'rgba(34, 197, 94, 1)',
'rgba(99, 102, 241, 1)',
'rgba(99, 102, 241, 1)',
'rgba(34, 197, 94, 1)',
'rgba(99, 102, 241, 1)',
'rgba(239, 68, 68, 1)',
'rgba(239, 68, 68, 1)'
],
borderWidth: 2
}]
},
options: {
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: {
display: false
},
tooltip: {
callbacks: {
label: function(context) {
return 'Info Preserved: ' + context.parsed.y.toFixed(2) + '%';
}
}
}
},
scales: {
y: {
beginAtZero: false,
min: 74,
max: 92,
title: {
display: true,
text: 'Information Preserved (%)'
}
}
}
}
});
</script>
</body>
</html>