Voice + Text

Voice Conversational Chatbot

Build a real-time voice chatbot: speech-to-text transcription, LLM reasoning, and text-to-speech synthesis — all powered through OneInfer's unified API.

OneInfer Chat API · OneInfer Audio API · Whisper · Python · FastAPI

Pipeline overview

Microphone audio → Whisper STT → LLM response → TTS synthesis → Speaker output

Step-by-step guide

Transcribe audio with Whisper

python

import os

import openai

# Shared client used by every snippet below. The key is taken from the
# ONEINFER_API_KEY environment variable when it is set, so a real secret does
# not have to be written into the source; otherwise the placeholder is used
# and must be replaced by hand.
# NOTE(review): ONEINFER_API_KEY is a name chosen here, not one defined by the
# service — rename it to match your deployment conventions.
client = openai.OpenAI(
    api_key=os.environ.get("ONEINFER_API_KEY", "your-oneinfer-api-key"),
    base_url="https://api.oneinfer.ai/v1",
)

def transcribe(audio_path: str) -> str:
    """Transcribe the audio file at ``audio_path``.

    The file is opened in binary mode and handed to
    ``client.audio.transcriptions.create`` with the ``whisper-1`` model and
    ``response_format="text"``.

    Args:
        audio_path: Path of the audio file to upload.

    Returns:
        Whatever the transcription call returns for the ``"text"`` response
        format (annotated as ``str``).
    """
    with open(audio_path, "rb") as audio_file:
        # The upload happens while the file is still open; the handle is
        # closed as soon as the call has returned.
        return client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            response_format="text",
        )

Get a chat response from the LLM

python

from typing import Generator

def chat_stream(history: list, user_message: str) -> Generator[str, None, None]:
    """Stream the assistant's reply to ``user_message``, updating ``history``.

    The user turn is appended to ``history`` before the request is sent. The
    system prompt is added to the request only and is never stored in
    ``history``. Once the stream has been fully consumed, the complete reply
    is appended to ``history`` as an assistant turn.

    If the stream is interrupted (an exception from the client, or the caller
    abandoning the generator early), ``history`` is still left with matching
    turns: any text received so far is stored as the assistant turn, and if
    nothing was received the just-added user turn is removed again.

    Args:
        history: Mutable list of ``{"role": ..., "content": ...}`` message
            dicts from earlier turns. Modified in place.
        user_message: Text of the new user turn.

    Yields:
        Text fragments of the reply in arrival order. A fragment may be an
        empty string when a chunk carries no text.

    Raises:
        Any exception raised by the underlying client call is propagated
        after ``history`` has been tidied up.
    """
    user_turn = {"role": "user", "content": user_message}
    history.append(user_turn)

    fragments = []
    finished = False
    try:
        stream = client.chat.completions.create(
            model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
            messages=[
                {"role": "system", "content": "You are a helpful voice assistant. Be concise."},
                *history
            ],
            stream=True
        )

        for chunk in stream:
            # A streamed chunk can arrive with an empty ``choices`` list;
            # indexing [0] on it would raise IndexError.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content or ""
            fragments.append(delta)
            yield delta
        finished = True
    finally:
        reply = "".join(fragments)
        if finished or reply:
            history.append({"role": "assistant", "content": reply})
        elif history and history[-1] is user_turn:
            # Nothing was answered: drop the dangling user turn so a retry
            # does not send the same message twice.
            history.pop()

Synthesize speech and play it back

python

import io
import sounddevice as sd
import soundfile as sf

def speak(text: str):
    """Synthesize ``text`` to speech and play it, blocking until playback ends.

    The audio is requested from ``client.audio.speech.create`` with the
    ``tts-1`` model and the ``alloy`` voice, decoded in memory with
    ``soundfile`` and played with ``sounddevice``.

    Args:
        text: The text to speak. Empty or whitespace-only text is skipped
            without contacting the service, since there is nothing to say.
    """
    if not text or not text.strip():
        return

    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text
    )
    # NOTE(review): no response_format is passed, so the audio container is
    # whatever the service returns by default; sf.read must be able to decode
    # it — confirm against the installed libsndfile.
    audio_bytes = io.BytesIO(response.content)
    data, samplerate = sf.read(audio_bytes)
    sd.play(data, samplerate)
    sd.wait()  # block until the clip has finished playing

# Full voice loop
# Each pass transcribes "recorded_input.wav", sends the transcript to the LLM
# and speaks the reply. The loop itself does not record audio: the file is
# expected to be (re)written by something else between passes.
# Press Ctrl+C to stop.
history = []
try:
    while True:
        transcript = transcribe("recorded_input.wav")
        print(f"You: {transcript}")

        # Nothing was said: do not send an empty user turn to the LLM.
        if not transcript.strip():
            continue

        reply = "".join(chat_stream(history, transcript))

        print(f"Bot: {reply}")
        # Skip synthesis when the model produced no speakable text.
        if reply.strip():
            speak(reply)
except KeyboardInterrupt:
    # Ctrl+C is the only way out of the loop; exit without a traceback.
    print("\nStopped.")