Published on

Voice AI Backend — Speech-to-Text, NLU, and Response Generation

Authors

Introduction

Voice interfaces unlock hands-free interaction, accessibility, and natural communication. Building production voice AI requires handling real-time transcription, processing spoken language nuances (incomplete sentences, filler words, casual grammar), and generating natural responses. This guide covers the full voice stack: transcription, NLU, response generation, and optimization for ultra-low latency.

Whisper for Transcription

Use OpenAI's Whisper for accurate speech-to-text:

// Supported audio container formats for transcription input.
type AudioFormat = 'mp3' | 'wav' | 'aac' | 'flac';
// How audio is delivered: a complete buffer ('batch') or incrementally ('real-time'/'streaming').
type TranscriptionMode = 'real-time' | 'batch' | 'streaming';

// Result of one speech-to-text pass.
interface TranscriptionResult {
  text: string;       // full transcript
  confidence: number; // 0-1 aggregate confidence (heuristic default when per-word data is absent)
  duration: number; // seconds
  language: string;   // language code, e.g. 'en'
  segments?: Array<{ start: number; end: number; text: string }>; // timed segments (verbose output only)
}

/**
 * Dispatch audio to the streaming or batch transcription path.
 *
 * Improvement: the original blind-cast `audioData` based on `mode`, so a
 * mismatched caller (e.g. streaming mode with a Buffer) crashed deep inside
 * the pipeline. The payload type is now checked at runtime.
 *
 * @param audioData complete Buffer or a live stream of chunks
 * @param format    audio container format
 * @param mode      delivery mode; defaults to 'batch'
 * @throws TypeError when batch mode is requested with a non-Buffer payload
 */
async function transcribeAudio(
  audioData: Buffer | ReadableStream<Buffer>,
  format: AudioFormat,
  mode: TranscriptionMode = 'batch'
): Promise<TranscriptionResult> {
  if (mode === 'real-time' || mode === 'streaming') {
    if (Buffer.isBuffer(audioData)) {
      // Streaming was requested but the audio is already complete: batch is correct.
      return transcribeBatch(audioData, format);
    }
    return transcribeStreaming(audioData, format);
  }

  // Batch mode
  if (!Buffer.isBuffer(audioData)) {
    throw new TypeError('batch transcription requires a Buffer, got a stream');
  }
  return transcribeBatch(audioData, format);
}

/**
 * Transcribe a complete audio buffer with Whisper.
 *
 * Improvements: the language hint is now a parameter (default 'en' preserves
 * old behavior), and the returned `language` reflects what the API detected
 * (verbose_json includes it) rather than being hardcoded.
 *
 * @param audioBuffer complete audio payload
 * @param format      container format, used only for the synthetic filename
 * @param language    ISO language hint passed to Whisper (default 'en')
 */
async function transcribeBatch(
  audioBuffer: Buffer,
  format: AudioFormat,
  language: string = 'en'
): Promise<TranscriptionResult> {
  const response = await openai.audio.transcriptions.create({
    file: new File([audioBuffer], `audio.${format}`),
    model: 'whisper-1',
    language,
    response_format: 'verbose_json'
  });

  return {
    text: response.text,
    // Fall back to a fixed heuristic when per-word confidences are missing.
    confidence: response.words ? calculateConfidence(response.words) : 0.95,
    duration: response.duration || 0,
    // Prefer the API-detected language; fall back to the request hint.
    language: response.language ?? language,
    segments: response.segments
  };
}

/**
 * Average the per-word confidence scores of a transcription.
 *
 * Fix: the original used `w.confidence || 1`, which treated a legitimate
 * confidence of 0 as "missing" and counted it as 1. `??` only substitutes
 * the default when the field is null/undefined.
 *
 * @param words per-word results; items may lack a `confidence` field
 * @returns mean confidence in [0, 1]; 0 for an empty list
 */
function calculateConfidence(words: any[]): number {
  if (words.length === 0) return 0;
  const avg = words.reduce((sum, w) => sum + (w.confidence ?? 1), 0) / words.length;
  return avg;
}

Streaming Transcription with WebSockets

Enable real-time voice input:

// Mutable per-connection state for one live transcription session.
interface StreamingTranscriptionSession {
  sessionId: string;         // unique id for logging/correlation
  startTime: Date;           // when the session opened
  interimTranscript: string; // latest partial (non-final) transcript
  finalTranscript: string;   // committed transcript so far
  confidence: number;        // confidence of the most recent result (0-1)
}

async function transcribeStreaming(
  audioStream: ReadableStream<Buffer>,
  format: AudioFormat
): Promise<TranscriptionResult> {
  const chunks: Buffer[] = [];
  let interimText = '';

  return new Promise((resolve, reject) => {
    audioStream.on('data', async (chunk: Buffer) => {
      chunks.push(chunk);

      // Process every 1 second of audio (~32KB at 16kHz)
      if (chunks.reduce((sum, c) => sum + c.length, 0) &gt; 32000) {
        const audioBuffer = Buffer.concat(chunks);

        try {
          const interim = await transcribeBatch(audioBuffer, format);
          interimText = interim.text;

          // Stream interim result back to client
          console.log('Interim:', interimText);
        } catch (error) {
          console.error('Transcription error:', error);
        }

        chunks.length = 0; // Clear buffer
      }
    });

    audioStream.on('end', async () => {
      const audioBuffer = Buffer.concat(chunks);
      try {
        const final = await transcribeBatch(audioBuffer, format);
        resolve(final);
      } catch (error) {
        reject(error);
      }
    });

    audioStream.on('error', reject);
  });
}

/**
 * Wire a WebSocket to per-message transcription: each binary message is
 * treated as a complete WAV chunk, transcribed, and the interim transcript
 * is echoed back to the client as JSON.
 *
 * Fix: a transcription failure inside the async message handler previously
 * produced an unhandled promise rejection; it is now caught and logged so
 * the socket stays alive.
 */
async function setupWebSocketTranscription(ws: WebSocket): Promise<void> {
  const session: StreamingTranscriptionSession = {
    sessionId: generateId(),
    startTime: new Date(),
    interimTranscript: '',
    finalTranscript: '',
    confidence: 0
  };

  ws.on('message', async (data: Buffer) => {
    try {
      // Process audio chunks
      const result = await transcribeBatch(data, 'wav');

      session.interimTranscript = result.text;
      session.confidence = result.confidence;

      ws.send(JSON.stringify({
        type: 'interim',
        transcript: result.text,
        confidence: result.confidence
      }));
    } catch (error) {
      console.error(`Session ${session.sessionId} transcription failed:`, error);
    }
  });

  ws.on('close', () => {
    console.log(`Session ${session.sessionId} ended`);
  });
}

Speaker Diarization

Identify who is speaking:

// One detected speaker and the time ranges (seconds) where they talk.
interface Speaker {
  id: string;
  name?: string; // optional human-readable label, if known
  segments: Array<{ start: number; end: number }>;
}

// Transcript annotated with who said what, and when.
interface DiarizedTranscript {
  speakers: Speaker[];
  segments: Array<{
    speakerId: string; // references Speaker.id
    text: string;
    start: number; // seconds
    end: number;   // seconds
  }>;
}

/**
 * Combine speaker diarization with transcription into a "who said what,
 * when" transcript.
 *
 * Fix: the original emitted `segment.speakerId` using the raw diarization
 * model's speaker ids, while the returned speakers list renamed everyone to
 * `speaker_${i}` — so segment ids never matched `speakers[].id`. Ids are now
 * normalized consistently on both sides. The two independent awaits also run
 * concurrently now.
 */
async function diarizeAudio(
  audioBuffer: Buffer
): Promise<DiarizedTranscript> {
  // Diarization (Pyannote or similar) and transcription are independent.
  const [rawSpeakers, transcription] = await Promise.all([
    diarizationModel.diarize(audioBuffer),
    transcribeBatch(audioBuffer, 'wav')
  ]);

  // Align speakers with transcript segments using normalized ids.
  const segments: DiarizedTranscript['segments'] = [];

  for (const segment of transcription.segments || []) {
    const raw = findSpeakerAtTime(rawSpeakers, segment.start);
    // Map the raw speaker back to its index so ids match the speakers list.
    const index = Math.max(0, rawSpeakers.indexOf(raw));

    segments.push({
      speakerId: `speaker_${index}`,
      text: segment.text,
      start: segment.start,
      end: segment.end
    });
  }

  return {
    speakers: rawSpeakers.map((s: any, i: number) => ({
      id: `speaker_${i}`,
      segments: s.segments
    })),
    segments
  };
}

function findSpeakerAtTime(
  speakers: any[],
  time: number
): any {
  for (const speaker of speakers) {
    for (const segment of speaker.segments) {
      if (time &gt;= segment.start && time &lt; segment.end) {
        return speaker;
      }
    }
  }
  return speakers[0]; // Default to first speaker
}

/**
 * Render a diarized transcript as newline-terminated "Speaker: text" lines.
 * Falls back to the raw speaker id when no display name is known.
 */
async function formatDiarizedTranscript(
  diarized: DiarizedTranscript
): Promise<string> {
  const lines = diarized.segments.map((segment) => {
    const speaker = diarized.speakers.find((s) => s.id === segment.speakerId);
    const label = speaker?.name || segment.speakerId;
    return `${label}: ${segment.text}\n`;
  });

  return lines.join('');
}

Punctuation Restoration

Clean up transcripts:

/**
 * Use the LLM to re-insert punctuation and capitalization that raw
 * speech-to-text output typically lacks.
 *
 * The transcript is interpolated into the prompt verbatim and the model is
 * instructed to return only the corrected text.
 *
 * NOTE(review): the transcript is embedded unescaped — a transcript that
 * itself contains quotes could confuse the model; confirm acceptable.
 */
async function restorePunctuation(
  transcript: string
): Promise<string> {
  const punctuationPrompt = `
    Add proper punctuation and capitalization to this transcript:

    "${transcript}"

    Rules:
    - Fix sentence capitalization
    - Add periods, commas, question marks
    - Keep [pause] markers if present
    - Preserve technical terms

    Return ONLY the corrected text.
  `;

  return llm.generate(punctuationPrompt);
}

/**
 * Normalize a raw transcript: strip common filler words, collapse
 * whitespace, then restore punctuation via the LLM.
 *
 * Fixes: the filler pattern listed 'uh' twice (duplicate dropped) and was
 * case-sensitive, so sentence-initial fillers like "Um" survived — the 'i'
 * flag now catches them.
 *
 * NOTE(review): blind removal of 'like'/'you know' also deletes legitimate
 * uses ("I like cats" -> "I cats"); confirm this trade-off is acceptable.
 */
async function cleanupTranscript(transcript: string): Promise<string> {
  let cleaned = transcript
    .replace(/\b(uh|um|hmm|like|you know)\b/gi, '') // Remove filler words
    .replace(/\s+/g, ' ') // Normalize whitespace
    .trim();

  cleaned = await restorePunctuation(cleaned);

  return cleaned;
}

Intent Extraction from Transcription

Understand what the user wants:

// Coarse classification of what a spoken utterance is doing.
type VoiceIntent = 'query' | 'command' | 'response' | 'affirmation' | 'negation' | 'clarification';

// LLM-extracted intent plus any entities found in the utterance.
interface ExtractedVoiceIntent {
  intent: VoiceIntent;
  entities: Record<string, any>; // e.g. dates, numbers, names
  confidence: number;            // 0-1, self-reported by the model
  followUp?: string;             // optional suggested follow-up question
}

/**
 * Classify a spoken utterance into a VoiceIntent and extract entities.
 *
 * Fixes: the prompt contained a garbled doubled apostrophe (`user''s`);
 * the model's reply is also stripped of markdown code fences before
 * JSON.parse, since LLMs often add them despite instructions.
 *
 * @param transcript cleaned transcript of the user's speech
 * @param context    optional conversational context for disambiguation
 * @throws SyntaxError when the model reply is not valid JSON
 */
async function extractVoiceIntent(
  transcript: string,
  context?: string
): Promise<ExtractedVoiceIntent> {
  const intentPrompt = `
    Determine the user's intent from this spoken message:

    Message: "${transcript}"
    ${context ? `Context: ${context}` : ''}

    Intent types:
    - query: User asking a question
    - command: User giving a direct instruction
    - response: User answering a question
    - affirmation: User saying yes/agree
    - negation: User saying no/disagree
    - clarification: User asking for clarification

    Also extract entities (dates, numbers, names, etc.).

    Return JSON: { intent, entities (object), confidence (0-1) }
  `;

  // Strip optional ```json fences the model may wrap around its reply.
  const raw = (await llm.generate(intentPrompt))
    .replace(/^\s*```(?:json)?\s*/i, '')
    .replace(/\s*```\s*$/, '');
  const result = JSON.parse(raw);

  return {
    intent: result.intent,
    entities: result.entities,
    confidence: result.confidence
  };
}

Voice-Specific Prompt Design

Adapt prompts for spoken language:

/**
 * Generate a short, conversational reply suitable for speaking aloud.
 *
 * Fix: the prompt contained a garbled doubled apostrophe (`user''s`).
 *
 * @param transcript what the user said
 * @param context    conversational context injected into the prompt
 * @returns plain response text (no markup), ready for TTS
 */
async function generateVoiceResponse(
  transcript: string,
  context: string
): Promise<string> {
  // Voice-specific prompt: spoken language is more casual
  const voicePrompt = `
    A user said this to a voice assistant:

    "${transcript}"

    Context: ${context}

    Generate a natural voice response (as if you were speaking).

    Guidelines:
    - Use conversational language (contractions ok)
    - Keep response under 100 words (15-20 seconds of speech)
    - Ask clarifying questions if needed
    - No URLs, markdown, or technical jargon
    - Acknowledge the user's intent
    - Be friendly and natural

    Return ONLY the response text, no descriptions.
  `;

  return llm.generate(voicePrompt);
}

/**
 * Produce the response text that will be fed to the TTS engine:
 * generate a spoken-style reply, then normalize it for synthesis.
 */
async function generateTextResponseForVoice(
  userQuestion: string
): Promise<string> {
  const draft = await generateVoiceResponse(userQuestion, '');
  return optimizeForTTS(draft);
}

/** Escape regex metacharacters so a replacement key matches literally. */
function escapeRegExp(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Expand abbreviations and symbols that TTS engines mispronounce.
 *
 * Fix: the replacement keys were passed to `new RegExp` unescaped. '$' as a
 * bare pattern matches the empty string at end-of-input, so EVERY input had
 * 'dollars' appended; 'Mr.' matched any 'Mr' + one character (e.g. 'Mrs').
 * Keys are now regex-escaped so they match literally.
 */
function optimizeForTTS(text: string): string {
  // Replace common abbreviations
  const replacements: Record<string, string> = {
    'Mr.': 'Mister',
    'Dr.': 'Doctor',
    'Inc.': 'Incorporated',
    '&': 'and',
    '$': 'dollars',
    '100%': 'one hundred percent'
  };

  let optimized = text;
  for (const [from, to] of Object.entries(replacements)) {
    optimized = optimized.replace(new RegExp(escapeRegExp(from), 'g'), to);
  }

  return optimized;
}

Text-to-Speech for Responses

Convert responses back to audio:

// Voice presets offered by the TTS service.
type TTSVoice = 'echo' | 'nova' | 'onyx' | 'fable' | 'shimmer';
// Playback speed multipliers accepted by the TTS service.
type TTSSpeed = 0.25 | 0.5 | 1.0 | 1.25 | 1.5;

// Synthesized speech plus basic playback metadata.
interface SpeechOutput {
  audioBuffer: Buffer;
  duration: number; // seconds
  format: 'mp3' | 'aac' | 'opus' | 'flac';
}

/**
 * Synthesize speech for `text` with OpenAI TTS (HD model).
 *
 * Fix: the OpenAI speech API takes the text under the `input` key; the
 * original passed an unrecognized `text` property.
 *
 * @returns MP3 audio plus an ESTIMATED duration (the API does not report one)
 */
async function generateSpeech(
  text: string,
  voice: TTSVoice = 'nova',
  speed: TTSSpeed = 1.0
): Promise<SpeechOutput> {
  const audioResponse = await openai.audio.speech.create({
    model: 'tts-1-hd', // Use HD for voice interfaces
    voice,
    input: text, // API parameter is `input`, not `text`
    speed,
    response_format: 'mp3'
  });

  const audioBuffer = await audioResponse.arrayBuffer();

  return {
    audioBuffer: Buffer.from(audioBuffer),
    duration: estimateSpeechDuration(text),
    format: 'mp3'
  };
}

/**
 * Estimate how long `text` takes to speak, in seconds, assuming an average
 * rate of ~150 words per minute. Word count is whitespace-delimited.
 */
function estimateSpeechDuration(text: string): number {
  const WORDS_PER_MINUTE = 150;
  const words = text.split(/\s+/).length;
  return (words / WORDS_PER_MINUTE) * 60;
}

/**
 * Synthesize speech and return the raw response body stream for low-latency
 * playback (faster `tts-1` model, AAC output).
 *
 * Fix: the OpenAI speech API takes the text under the `input` key; the
 * original passed an unrecognized `text` property.
 */
async function streamSpeech(
  text: string,
  voice: TTSVoice = 'nova'
): Promise<ReadableStream<Buffer>> {
  // Use streaming TTS for lower latency
  const response = await openai.audio.speech.create({
    model: 'tts-1', // Faster model for streaming
    voice,
    input: text, // API parameter is `input`, not `text`
    response_format: 'aac'
  });

  return response.body as ReadableStream<Buffer>;
}

/**
 * Pick a TTS voice whose tone matches the conversation context.
 * Falls back to 'nova' (neutral, balanced) when no keyword matches.
 */
async function selectVoiceForContext(
  context: string
): Promise<TTSVoice> {
  const lowered = context.toLowerCase();
  const mentions = (...keywords: string[]) =>
    keywords.some((k) => lowered.includes(k));

  if (mentions('medical', 'professional')) {
    return 'echo'; // Formal, professional
  }
  if (mentions('children', 'friendly')) {
    return 'fable'; // Warm, friendly
  }
  return 'nova';
}

Latency Optimization for Voice

Minimize end-to-end latency:

// Wall-clock timings (milliseconds) for each stage of the voice pipeline.
interface VoiceLatencyMetrics {
  transcriptionLatency: number; // speech-to-text
  nlpLatency: number;           // intent extraction
  generationLatency: number;    // LLM response generation
  ttsLatency: number;           // text-to-speech
  totalLatency: number;         // end-to-end
}

/**
 * One full voice turn: audio in -> transcript -> (intent + response) ->
 * audio out, with per-stage wall-clock timings.
 *
 * Improvement: intent extraction and response generation depend only on the
 * transcript, so they now run concurrently (the original ran them serially
 * despite comments promising parallelism). Each stage's latency is still
 * measured individually.
 *
 * NOTE(review): the extracted intent is computed but never used to shape
 * the response (matching the original) — confirm whether it should be.
 */
async function optimizedVoiceFlow(
  audioChunk: Buffer
): Promise<{ response: Buffer; metrics: VoiceLatencyMetrics }> {
  const start = Date.now();

  // 1. Transcription
  const transcribeStart = Date.now();
  const transcript = await transcribeBatch(audioChunk, 'wav');
  const transcriptionLatency = Date.now() - transcribeStart;

  // 2 + 3. Intent extraction and response generation, in parallel.
  const parallelStart = Date.now();
  let nlpLatency = 0;
  let generationLatency = 0;
  const [, responseText] = await Promise.all([
    extractVoiceIntent(transcript.text).then((intent) => {
      nlpLatency = Date.now() - parallelStart;
      return intent;
    }),
    generateTextResponseForVoice(transcript.text).then((text) => {
      generationLatency = Date.now() - parallelStart;
      return text;
    })
  ]);

  // 4. TTS
  const ttsStart = Date.now();
  const speech = await generateSpeech(responseText);
  const ttsLatency = Date.now() - ttsStart;

  const totalLatency = Date.now() - start;

  return {
    response: speech.audioBuffer,
    metrics: {
      transcriptionLatency,
      nlpLatency,
      generationLatency,
      ttsLatency,
      totalLatency
    }
  };
}

/**
 * Rough end-to-end latency estimate (ms) for the voice pipeline, as a fixed
 * sum of per-stage constants.
 *
 * NOTE(review): `textLength` is currently unread — the estimate does not
 * scale with input size; confirm whether length-based scaling was intended.
 */
async function predictLatency(textLength: number): Promise<number> {
  const stageEstimatesMs = [
    200, // Whisper transcription
    100, // intent extraction
    800, // LLM generation
    500  // text-to-speech
  ];

  let total = 0;
  for (const ms of stageEstimatesMs) {
    total += ms;
  }
  return total;
}

Telephony Integration

Connect to phone systems:

// Credentials/configuration for a telephony provider.
interface TelephonyConfig {
  provider: 'twilio' | 'vonage' | 'amazon-chime';
  apiKey: string;            // provider API key (auth token for Twilio)
  twilioAccountSid?: string; // required when provider === 'twilio'
}

/**
 * Build provider-specific call setup. For Twilio, returns TwiML (as a
 * string) that streams inbound call audio to `callbackUrl`.
 *
 * Fix: the original was declared `Promise<void>` yet returned the TwiML
 * string in the Twilio branch and fell through with no return value
 * otherwise; the return type now reflects both paths.
 *
 * NOTE(review): only 'twilio' is implemented — 'vonage' and 'amazon-chime'
 * resolve to undefined.
 */
async function setupVoiceCall(
  config: TelephonyConfig,
  callbackUrl: string
): Promise<string | undefined> {
  if (config.provider === 'twilio') {
    const twilio = require('twilio')(
      config.twilioAccountSid,
      config.apiKey
    );

    // Register webhook for incoming calls
    const twiml = new twilio.twiml.VoiceResponse();

    twiml.connect()
      .stream({
        url: callbackUrl,
        track: 'inbound_track'
      });

    return twiml.toString();
  }

  return undefined;
}

/**
 * Produce the TwiML for an incoming call: offer a DTMF menu and open a
 * media stream so the caller can simply speak.
 *
 * Fix: the declared return type was `Promise<void>` although the function
 * returns the TwiML string; also dropped a dead local binding (`stream`).
 *
 * NOTE(review): `call` and `config` are never read here — confirm whether
 * per-call or per-provider behavior was intended.
 */
async function handleIncomingCall(
  call: any,
  config: TelephonyConfig
): Promise<string> {
  const twilio = require('twilio');

  const response = new twilio.twiml.VoiceResponse();

  // Gather speech input
  const gather = response.gather({
    numDigits: 1,
    action: '/handle-dtmf',
    method: 'POST'
  });

  gather.say('Press 1 for support, 2 for sales, or speak your request');

  // Alternatively, enable speech recognition over a media stream.
  response.connect().stream({
    url: process.env.WS_URL,
    track: 'inbound_track'
  });

  return response.toString();
}

/**
 * Fetch a completed call recording and run batch transcription on it.
 *
 * NOTE(review): `callSid` is not used by this implementation — the recording
 * is located solely via `recordingUrl`; confirm whether it should be.
 */
async function recordAndTranscribeCall(
  callSid: string,
  recordingUrl: string
): Promise<TranscriptionResult> {
  const audio = await downloadRecording(recordingUrl);
  return transcribeBatch(audio, 'wav');
}

Checklist

  • Use Whisper for batch transcription with > 95% accuracy
  • Implement streaming transcription with WebSockets for real-time input
  • Enable speaker diarization to identify multiple speakers
  • Restore punctuation and capitalize sentences properly
  • Extract intent from spoken language (query, command, response, etc.)
  • Design prompts for spoken language (conversational, no URLs/markdown)
  • Use TTS with voice selection optimized for context
  • Stream TTS output for lower latency
  • Optimize end-to-end latency: target <2s for voice interaction
  • Integrate with telephony providers (Twilio, Vonage)
  • Record and transcribe calls for compliance and training
  • Set confidence thresholds: escalate low-confidence transcriptions
  • Test with various accents, backgrounds, and audio quality
  • Monitor latency metrics and identify bottlenecks

Conclusion

Voice AI backends transform accessibility and user experience. By combining Whisper for transcription, intent extraction, and fast TTS, you can build responsive voice interfaces. The key is optimizing latency: stream audio processing, parallelize operations, and use faster models where accuracy allows. Start with simple voice commands, gradually expanding to conversational AI as your latency and accuracy improve.