You Need Live Transcription, Not Batch Processing
Standard Whisper processes complete audio files — but you need live transcription from a microphone, phone call, or streaming source. The audio arrives continuously and the user expects to see words appear in near-real-time, not after the recording ends. Building this on a GPU server requires a WebSocket server that accepts audio chunks, buffers them intelligently, and feeds them to Whisper with minimal latency.
WebSocket Server Architecture
The server accepts raw PCM audio over a WebSocket connection, accumulates chunks, and sends transcription results back:
import asyncio
import json

import numpy as np
import websockets
from faster_whisper import WhisperModel
# Loaded once at module import and shared by every connection handler.
# INT8 compute on CUDA reduces VRAM use relative to fp16 for the medium model.
model = WhisperModel("medium", device="cuda", compute_type="int8")
def _transcribe_buffer(audio):
    """Blocking: transcribe `audio` with Whisper and return the joined text.

    Must run in a worker thread. faster-whisper's transcribe() returns a lazy
    generator — the actual GPU decode happens while iterating `segments`, so
    the iteration has to happen here too, off the event loop.
    """
    segments, _ = model.transcribe(
        audio, language="en", vad_filter=True, beam_size=3
    )
    return " ".join(s.text for s in segments)


async def transcription_handler(websocket):
    """Receive raw float32 PCM over a WebSocket and stream transcripts back.

    Accumulates incoming chunks until ~3 s of audio is buffered, transcribes
    the buffer in a worker thread (so other connections are not starved),
    sends the text to the client, and keeps a 0.5 s tail of audio so words
    spanning a chunk boundary are not cut off.
    """
    audio_buffer = np.array([], dtype=np.float32)
    sample_rate = 16000   # client must send 16 kHz mono float32 PCM
    chunk_duration = 3.0  # process every 3 seconds of audio
    chunk_size = int(sample_rate * chunk_duration)
    loop = asyncio.get_running_loop()
    async for message in websocket:
        # Raw little-endian float32 samples from the client
        audio_chunk = np.frombuffer(message, dtype=np.float32)
        audio_buffer = np.concatenate([audio_buffer, audio_chunk])
        if len(audio_buffer) >= chunk_size:
            # Run the blocking GPU transcription off the event loop so the
            # server keeps servicing other WebSocket connections meanwhile.
            text = await loop.run_in_executor(
                None, _transcribe_buffer, audio_buffer
            )
            # Send result back to client
            await websocket.send(text)
            # Keep a 0.5 s overlap for word-boundary continuity
            overlap = int(sample_rate * 0.5)
            audio_buffer = audio_buffer[-overlap:]
async def main():
    """Start the WebSocket transcription server and run until killed."""
    async with websockets.serve(transcription_handler, "0.0.0.0", 8765):
        print("WebSocket Whisper server running on ws://0.0.0.0:8765")
        await asyncio.Future()  # Run forever


if __name__ == "__main__":
    # Guard the entry point so importing this module (e.g. from tests or
    # another service) does not immediately start the server.
    asyncio.run(main())
Voice Activity Detection for Smart Chunking
Instead of fixed-interval chunking, use VAD to detect when the speaker pauses and transcribe complete utterances:
import torch

# Fetch the Silero VAD model from torch hub (downloaded once, then cached).
vad_model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
# `utils` bundles several helper callables; only the first is needed here.
get_speech_timestamps = utils[0]
class VADChunker:
    """Accumulates streamed audio and emits complete utterances via Silero VAD.

    Feed raw float32 chunks with feed(); when the speaker pauses for longer
    than the silence threshold, the buffered utterance is returned and the
    internal buffer advances past it.
    """

    def __init__(self, sample_rate=16000, max_silence_s=10.0):
        # max_silence_s bounds how much speech-free audio is retained, so the
        # buffer cannot grow without limit while the line is silent.
        self.sr = sample_rate
        self.buffer = np.array([], dtype=np.float32)
        self.silence_threshold_ms = 500  # pause length that ends an utterance
        self.min_speech_ms = 300         # ignore speech blips shorter than this
        self.max_silence_samples = int(sample_rate * max_silence_s)

    def feed(self, audio_chunk):
        """Append audio; return a finished utterance (float32 array) or None."""
        self.buffer = np.concatenate([self.buffer, audio_chunk])
        # Require at least 0.5 s of audio before running VAD at all
        if len(self.buffer) < self.sr * 0.5:
            return None
        audio_tensor = torch.from_numpy(self.buffer)
        timestamps = get_speech_timestamps(
            audio_tensor, vad_model,
            threshold=0.5,
            min_silence_duration_ms=self.silence_threshold_ms,
            min_speech_duration_ms=self.min_speech_ms,
            sampling_rate=self.sr
        )
        if not timestamps:
            # No speech anywhere in the buffer: trim old silence so memory
            # stays bounded during long quiet periods.
            if len(self.buffer) > self.max_silence_samples:
                self.buffer = self.buffer[-self.max_silence_samples:]
            return None
        if self._has_speech_ended(timestamps):
            # Extract the completed speech segment and advance the buffer
            end_sample = timestamps[-1]['end']
            speech = self.buffer[:end_sample]
            self.buffer = self.buffer[end_sample:]
            return speech
        return None

    def _has_speech_ended(self, timestamps):
        """True when trailing silence after the last speech exceeds the threshold."""
        last_end = timestamps[-1]['end'] / self.sr
        buffer_end = len(self.buffer) / self.sr
        silence_duration = buffer_end - last_end
        return silence_duration > (self.silence_threshold_ms / 1000)
Browser Client Implementation
Capture microphone audio in the browser and stream it to your WebSocket server:
<script>
const ws = new WebSocket('wss://your-gpu-server.com:8765');
async function startStreaming() {
const stream = await navigator.mediaDevices.getUserMedia({
audio: { sampleRate: 16000, channelCount: 1, echoCancellation: true }
});
const audioContext = new AudioContext({ sampleRate: 16000 });
const source = audioContext.createMediaStreamSource(stream);
const processor = audioContext.createScriptProcessor(4096, 1, 1);
processor.onaudioprocess = (event) => {
const audioData = event.inputBuffer.getChannelData(0);
// Send raw float32 PCM over WebSocket
ws.send(audioData.buffer);
};
source.connect(processor);
processor.connect(audioContext.destination);
}
ws.onmessage = (event) => {
document.getElementById('transcript').textContent += event.data + ' ';
};
startStreaming();
</script>
Streaming Partial Results
For a more responsive experience, send partial (unstable) results alongside final (stable) results:
async def transcription_handler_with_partials(websocket):
    """WebSocket handler that streams both partial and final transcripts.

    VAD-based chunking: when the speaker pauses, the completed utterance is
    transcribed and sent as a "final" JSON message; while speech is still in
    progress and more than one second is buffered, the current buffer is
    transcribed and sent as an unstable "partial" message.
    """
    chunker = VADChunker()
    running_text = ""
    loop = asyncio.get_running_loop()

    def transcribe_text(audio):
        # Blocking; runs in a worker thread via run_in_executor. Iterating
        # the segments generator performs the actual decode, so it is
        # consumed here, off the event loop.
        segments, _ = model.transcribe(audio, language="en")
        return " ".join(s.text for s in segments)

    async for message in websocket:
        audio_chunk = np.frombuffer(message, dtype=np.float32)
        # Try to get a complete utterance from the VAD chunker
        complete_speech = chunker.feed(audio_chunk)
        if complete_speech is not None:
            # Final, stable result for the finished utterance
            final_text = await loop.run_in_executor(
                None, transcribe_text, complete_speech
            )
            running_text += final_text + " "
            await websocket.send(json.dumps({
                "type": "final",
                "text": final_text,
                "full_transcript": running_text
            }))
        elif len(chunker.buffer) > chunker.sr:  # >1 second buffered
            # Partial (unstable) result from the in-progress buffer; use the
            # chunker's configured sample rate rather than a hard-coded 16000.
            partial_text = await loop.run_in_executor(
                None, transcribe_text, chunker.buffer
            )
            await websocket.send(json.dumps({
                "type": "partial",
                "text": partial_text
            }))
Production Deployment Considerations
Harden the WebSocket server for production traffic:
# Run behind Nginx with WebSocket proxy
# nginx.conf
# Upstream pool pointing at the local WebSocket Whisper server
upstream whisper_ws {
server 127.0.0.1:8765;
}
server {
listen 443 ssl;
# NOTE(review): ssl_certificate / ssl_certificate_key directives are not
# shown here — they must be added for this TLS server block to start.
location /ws/transcribe {
proxy_pass http://whisper_ws;
# WebSocket upgrade requires HTTP/1.1 plus the Upgrade/Connection headers
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
proxy_read_timeout 3600s; # Keep connections alive for long sessions
}
}
# Connection limits and authentication
# Add API key validation in the WebSocket handler
# Limit concurrent connections per client IP
# Set maximum session duration (e.g., 60 minutes)
For production real-time transcription on your GPU server, faster-whisper with INT8 on a mid-range GPU handles 10-20 concurrent streams comfortably. The Whisper hosting platform supports WebSocket deployments. See the tutorials section for more patterns, vLLM production guide for Nginx reverse proxy details, and the benchmarks for concurrent stream capacity. Our PyTorch guide covers the underlying environment.
Real-Time Transcription Servers
Stream audio to Whisper on GigaGPU dedicated servers. Low latency, high concurrency, always on.
Browse GPU Servers