Using Deepgram for live transcription and AI-driven conversation in React Native.
In this post, we’ll explore how to integrate Deepgram into an Android React Native application to handle live audio streams, transcription, and AI-driven conversation. We'll walk through:
- The custom React Hook for managing the Deepgram connection
- The native Kotlin modules for audio recording and playback
- How to wire those modules into your React Native project using Expo config plugins
It was quite challenging to get this working, as Deepgram's documentation is not in great shape. But it works now, so hopefully my code will save somebody some time 🙇♂️.
1. The useDeepgramConversation Hook
Below is an excerpt of the React Native hook that sets up WebSocket communication with Deepgram’s Agent endpoint and forwards audio data from our native Kotlin recorder:
import { useEffect, useRef } from "react"
import { Audio } from "expo-av"
import type { AgentLiveSchema } from "@deepgram/sdk"
import { AgentEvents } from "@deepgram/sdk"
import { NativeModules, NativeEventEmitter } from "react-native"
import type {
UseConversationHook,
Message,
} from "@/types/use-conversation-hook"
import type { VoiceAgentController } from "@/types/voice-agent-controller"
const { CustomAudioRecorder, CustomAudioPlayer } = NativeModules
const recorderEmitter = new NativeEventEmitter(CustomAudioRecorder)
const DEEPGRAM_API_KEY = "---your-deepgram-api-key---"
export type Props = {
onBeforeStarting: () => void
onStarted: (vac: VoiceAgentController) => void
onAfterStarted: () => void
onError: (err: unknown) => void
onEnd: () => void
onMessage: (event: Message) => void
}
const arrayBufferToBase64 = (buffer: ArrayBuffer) => {
const bytes = new Uint8Array(buffer)
const binary = String.fromCharCode(...bytes)
return global.btoa(binary)
}
const useDeepgramConversation: UseConversationHook = ({
onBeforeStarting,
onStarted,
onAfterStarted,
onEnd,
onError,
onMessage,
}: Props): {
startSession: () => void
stopSession: () => void
} => {
const ws = useRef<WebSocket | null>(null)
const interval = useRef<NodeJS.Timeout | null>(null)
const subscription = useRef<any | null>(null)
const conversationContext = useRef<
{
role: "user" | "assistant"
content: string
}[] | null
>(null)
const instructions = useRef<string | null>(null)
const startSession = async () => {
try {
onBeforeStarting()
const { granted } = await Audio.requestPermissionsAsync()
if (!granted) {
throw new Error("Permission to use microphone was denied")
}
CustomAudioRecorder.startRecording()
CustomAudioPlayer.startAudio()
console.log("Agent starting...")
console.log("Connection opened")
const vac: VoiceAgentController = {
sendInitialIntructions: (initialInstructions: string) => {
instructions.current = initialInstructions
return Promise.resolve()
},
setInitialConversationPhrases: (
phrases: {
role: "user" | "assistant"
text: string
}[]
) => {
conversationContext.current =
phrases.map((phrase) => ({
role: phrase.role,
content: phrase.text,
}))
return Promise.resolve()
},
makeAgentSay: (text: string, instructions?: string) => {
// ws.current?.send(
// JSON.stringify({
// type: "InjectAgentMessage",
// message: text,
// })
// )
return Promise.resolve()
},
startConversation: () => {
const settingsConfig: AgentLiveSchema = {
audio: {
input: {
encoding: "linear16",
// @ts-ignore
sample_rate: 16000,
},
output: {
encoding: "linear16",
// @ts-ignore
sample_rate: 16000,
container: "none",
},
},
agent: {
listen: { model: "nova-2" },
speak: { model: "aura-asteria-en" },
think: {
model: "gpt-4o-mini",
provider: {
type: "open_ai",
},
instructions: instructions.current || 'You are a helpful agent',
},
},
context: {
  // Replay the seeded conversation history (if any) so the agent resumes from it.
  // @ts-ignore
  messages: conversationContext.current ?? [],
  replay: (conversationContext.current?.length ?? 0) > 0,
},
}
const settingsConfigJson = JSON.stringify({
type: "SettingsConfiguration",
...settingsConfig,
})
ws.current = new WebSocket(
"wss://agent.deepgram.com/agent",
null,
// @ts-ignore
{
headers: {
Authorization: "Token " + DEEPGRAM_API_KEY,
},
}
)
console.log("Connection created")
ws.current.onopen = () => {
console.log("Sending settings config:", settingsConfigJson)
ws.current?.send(settingsConfigJson)
console.log("Deepgram Agent configured.")
}
ws.current.onmessage = (event) => {
console.log("Received message:", event.data)
if (typeof event.data === "string") {
const msgObj = JSON.parse(event.data)
const { type: messageType } = msgObj
console.log("Message type:", messageType, msgObj)
switch (messageType) {
case AgentEvents.SettingsApplied:
// Once Deepgram has accepted our settings, start forwarding raw PCM chunks
// from the native recorder straight into the WebSocket.
subscription.current = recorderEmitter.addListener(
  "AudioChunk",
  (chunk) => {
    const byteArray = new Uint8Array(chunk.data)
    ws.current?.send(byteArray)
  }
)
break
case AgentEvents.ConversationText: {
  const { role, content } = msgObj
  onMessage({
    role,
    content,
    timestamp: Date.now(),
  })
  break
}
case AgentEvents.UserStartedSpeaking:
case AgentEvents.AgentThinking:
case AgentEvents.AgentStartedSpeaking:
break
}
} else if (event.data instanceof ArrayBuffer) {
const base64StringData = arrayBufferToBase64(event.data)
CustomAudioPlayer.playAudioChunk(base64StringData)
}
}
ws.current.onclose = () => {
console.log("WebSocket closed")
}
ws.current.onerror = (err) => {
console.error("WebSocket error:", err)
}
interval.current = setInterval(() => {
console.log("Keep alive!")
ws.current?.send(JSON.stringify({ type: "KeepAlive" }))
}, 5000)
return Promise.resolve()
},
}
onStarted(vac)
} catch (err) {
console.error("Error starting session:", err)
onError(err)
} finally {
onAfterStarted()
}
}
const stopSession = async () => {
try {
subscription.current?.remove()
CustomAudioRecorder.stopRecording()
CustomAudioPlayer.stopAudio()
interval.current && clearInterval(interval.current)
if (ws.current) {
ws.current.close(1000, "Component unmounted")
}
onEnd()
} catch (err: any) {
console.error("Error stopping session:", err)
onError(err?.message || "Error stopping session")
}
}
useEffect(() => {
return () => {
stopSession()
}
}, [])
return {
startSession,
stopSession,
}
}
export default useDeepgramConversation
Key points:
- We request microphone permission using Expo’s Audio.requestPermissionsAsync().
- We start a native audio recorder (CustomAudioRecorder) that emits raw audio data via an event listener (recorderEmitter.addListener('AudioChunk', ...)).
- We pass raw audio data to Deepgram through the open WebSocket connection.
- We receive audio data (as ArrayBuffer) back from Deepgram, convert it to Base64, and play it using CustomAudioPlayer.
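The hook also relies on a few project-local types (Message, VoiceAgentController, UseConversationHook) imported from the app's @/types folder, which aren't shown here. A minimal sketch of what they might look like, inferred purely from how the hook uses them (hypothetical shapes, not a Deepgram API):
// Hypothetical type definitions, reconstructed from usage above; adjust to your own files.
export type Message = {
  role: "user" | "assistant"
  content: string
  timestamp: number
}

export type VoiceAgentController = {
  sendInitialIntructions: (initialInstructions: string) => Promise<void>
  setInitialConversationPhrases: (
    phrases: { role: "user" | "assistant"; text: string }[]
  ) => Promise<void>
  makeAgentSay: (text: string, instructions?: string) => Promise<void>
  startConversation: () => Promise<void>
}

export type UseConversationHook = (props: {
  onBeforeStarting: () => void
  onStarted: (vac: VoiceAgentController) => void
  onAfterStarted: () => void
  onError: (err: unknown) => void
  onEnd: () => void
  onMessage: (event: Message) => void
}) => {
  startSession: () => void
  stopSession: () => void
}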
2. The Native Android Kotlin Modules
Custom Audio Recorder (Android)
Below is the CustomAudioRecorderModule which uses AudioRecord to capture audio data and emit “AudioChunk” events to JavaScript:
package com.example.package
import android.media.AudioRecord
import android.media.MediaRecorder
import android.media.AudioFormat
import android.util.Log
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod
import com.facebook.react.bridge.WritableNativeArray
import com.facebook.react.bridge.WritableNativeMap
import com.facebook.react.modules.core.DeviceEventManagerModule
class CustomAudioRecorderModule(private val reactContext: ReactApplicationContext) :
ReactContextBaseJavaModule(reactContext) {
companion object {
private const val TAG = "CustomAudioRecorder"
private const val SAMPLE_RATE = 16000
private const val CHANNEL_CONFIG = AudioFormat.CHANNEL_IN_MONO
private const val AUDIO_FORMAT = AudioFormat.ENCODING_PCM_16BIT
private const val MAX_FAILED_READS = 5
private const val MAX_RESTARTS = 3
}
private val minBufferSize: Int = AudioRecord.getMinBufferSize(SAMPLE_RATE, CHANNEL_CONFIG, AUDIO_FORMAT)
private var bufferSize: Int = minBufferSize * 4
private var audioRecord: AudioRecord? = null
private var isRecording = false
private var recordingThread: Thread? = null
private var failedReads = 0
private var restartCount = 0
override fun getName(): String {
return "CustomAudioRecorder"
}
@ReactMethod
fun startRecording() {
if (isRecording) return
try {
Log.d(TAG, "Initializing AudioRecord...")
audioRecord = AudioRecord(
MediaRecorder.AudioSource.VOICE_RECOGNITION,
SAMPLE_RATE,
CHANNEL_CONFIG,
AUDIO_FORMAT,
bufferSize
)
if (audioRecord?.state != AudioRecord.STATE_INITIALIZED) {
Log.e(TAG, "AudioRecord initialization failed")
return
}
audioRecord?.startRecording()
isRecording = true
Log.d(TAG, "Recording started")
startRecordingThread()
} catch (e: Exception) {
Log.e(TAG, "Error starting recording", e)
}
}
@ReactMethod
fun stopRecording() {
isRecording = false
try {
Log.d(TAG, "Stopping recording...")
recordingThread?.interrupt()
recordingThread = null
audioRecord?.stop()
audioRecord?.release()
audioRecord = null
Log.d(TAG, "Recording stopped")
} catch (e: Exception) {
Log.e(TAG, "Error stopping recording", e)
}
}
private fun startRecordingThread() {
recordingThread = Thread {
try {
Thread.currentThread().priority = Thread.NORM_PRIORITY
val buffer = ByteArray(bufferSize)
Log.d(TAG, "Recording thread started")
failedReads = 0
restartCount = 0
while (isRecording && audioRecord != null) {
// Restart if the recorder is either uninitialized or no longer recording.
if (audioRecord?.state != AudioRecord.STATE_INITIALIZED || audioRecord?.recordingState != AudioRecord.RECORDSTATE_RECORDING) {
Log.e(TAG, "AudioRecord is not in recording state. State: ${audioRecord?.state}, Recording State: ${audioRecord?.recordingState}")
restartRecording()
break
}
val read = audioRecord!!.read(buffer, 0, buffer.size)
if (read > 0) {
failedReads = 0 // Reset failure counter
sendEvent("AudioChunk", buffer.copyOf(read))
} else {
failedReads++
Log.w(TAG, "AudioRecord read error ($failedReads times)")
if (failedReads >= MAX_FAILED_READS) {
Log.e(TAG, "Too many read failures, restarting recording...")
restartRecording()
break
}
}
}
} catch (e: Exception) {
Log.e(TAG, "Recording thread error", e)
} finally {
// Ensure resources are released even if an exception occurs
audioRecord?.release()
audioRecord = null
}
}
recordingThread?.start()
}
private fun restartRecording() {
if (restartCount < MAX_RESTARTS) {
stopRecording()
Thread.sleep(200)
startRecording()
restartCount++
} else {
Log.e(TAG, "Too many restarts, stopping recording...")
stopRecording()
sendErrorEvent("Too many restarts, stopping recording")
}
}
private fun sendEvent(eventName: String, byteArray: ByteArray) {
val eventData = WritableNativeMap()
val array = WritableNativeArray()
for (byte in byteArray) {
array.pushInt(byte.toInt())
}
eventData.putArray("data", array)
reactContext
.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
.emit(eventName, eventData)
Log.d(TAG, "Audio chunk sent to JS")
}
private fun sendErrorEvent(errorMessage: String) {
val eventData = WritableNativeMap()
eventData.putString("error", errorMessage)
reactContext
.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
.emit("AudioError", eventData)
Log.e(TAG, "Error event sent to JS: $errorMessage")
}
}
We also need to expose this module to React Native via a ReactPackage, for example:
package com.example.package
import com.facebook.react.ReactPackage
import com.facebook.react.bridge.NativeModule
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.uimanager.ViewManager
class CustomAudioRecorderPackage : ReactPackage {
override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
return listOf(CustomAudioRecorderModule(reactContext))
}
override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
return emptyList()
}
}
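One detail worth noting: besides "AudioChunk", the recorder emits an "AudioError" event (via sendErrorEvent) when it gives up after too many restarts. The hook above doesn't subscribe to it, but you can listen for it on the same emitter; a small sketch (the handler body is just an example):
import { NativeModules, NativeEventEmitter } from "react-native"

const { CustomAudioRecorder } = NativeModules
const recorderEmitter = new NativeEventEmitter(CustomAudioRecorder)

// Surface fatal recorder errors to JS; remove this subscription when you stop the session.
const errorSubscription = recorderEmitter.addListener("AudioError", (event) => {
  console.error("Native recorder error:", event.error)
})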
Custom Audio Player (Android)
This module receives Base64-encoded audio from the JavaScript side and plays it with an AudioTrack.
package com.example.package
import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioTrack
import android.util.Base64
import android.util.Log
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod
class CustomAudioPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {
companion object {
private const val TAG = "CustomAudioPlayer"
}
// Default configuration; mutable to allow changes.
private var sampleRate: Int = 16000
private var channelCount: Int = 1 // 1 for mono
private val audioEncoding: Int = AudioFormat.ENCODING_PCM_16BIT // 16-bit PCM
// Create an AudioTrack using the current configuration.
private var audioTrack: AudioTrack? = createAudioTrack()
/**
* Helper function to create an AudioTrack based on current parameters.
*/
private fun createAudioTrack(): AudioTrack {
val channelMask = if (channelCount == 1) AudioFormat.CHANNEL_OUT_MONO else AudioFormat.CHANNEL_OUT_STEREO
val minBufferSize = AudioTrack.getMinBufferSize(sampleRate, channelMask, audioEncoding)
return AudioTrack.Builder()
.setAudioAttributes(
AudioAttributes.Builder()
.setUsage(AudioAttributes.USAGE_MEDIA)
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.build()
)
.setAudioFormat(
AudioFormat.Builder()
.setEncoding(audioEncoding)
.setSampleRate(sampleRate)
.setChannelMask(channelMask)
.build()
)
.setBufferSizeInBytes(minBufferSize)
.setTransferMode(AudioTrack.MODE_STREAM)
.build()
}
init {
// Start playback so that the AudioTrack is ready to accept data.
audioTrack?.play()
}
override fun getName(): String {
return "CustomAudioPlayer"
}
/**
* Allows the React Native side to customize audio parameters.
*
* @param sampleRate The sample rate in Hz (e.g., 48000).
* @param channelCount The number of channels (1 for mono, 2 for stereo).
*/
@ReactMethod
fun setAudioConfig(sampleRate: Int, channelCount: Int) {
try {
// Stop and release the current AudioTrack.
audioTrack?.stop()
audioTrack?.release()
} catch (e: Exception) {
Log.e(TAG, "Error releasing AudioTrack: ", e)
}
// Update parameters.
this.sampleRate = sampleRate
this.channelCount = channelCount
// Re-create the AudioTrack with the new configuration.
audioTrack = createAudioTrack()
audioTrack?.play()
Log.i(TAG, "Audio config updated: sampleRate=$sampleRate, channelCount=$channelCount")
}
/**
* Decodes a Base64-encoded string representing raw 16-bit PCM audio data (no endianness conversion needed)
* and writes it to the AudioTrack for playback.
*
* @param base64Audio The Base64-encoded audio data.
*/
@ReactMethod
fun playAudioChunk(base64Audio: String) {
try {
// Decode the Base64 string into a byte array.
val audioData: ByteArray = Base64.decode(base64Audio, Base64.DEFAULT)
// Write the audio data into the AudioTrack.
val written = audioTrack?.write(audioData, 0, audioData.size) ?: -1
if (written < 0) {
Log.e(TAG, "Error writing audio data: $written")
}
} catch (e: Exception) {
Log.e(TAG, "Error in playAudioChunk: ", e)
}
}
@ReactMethod
fun startAudio() {
if (audioTrack == null) {
audioTrack = createAudioTrack()
}
audioTrack?.play()
}
@ReactMethod
fun stopAudio() {
try {
audioTrack?.stop()
audioTrack?.release()
} catch (e: Exception) {
Log.e(TAG, "Error stopping audio: ", e)
} finally {
audioTrack = null
}
}
}
And similarly, a package for it:
package com.example.package
import com.facebook.react.ReactPackage
import com.facebook.react.bridge.NativeModule
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.uimanager.ViewManager
class CustomAudioPlayerPackage : ReactPackage {
override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
return listOf(CustomAudioPlayerModule(reactContext))
}
override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
return emptyList()
}
}
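With both packages registered, JavaScript reaches the modules through NativeModules, which is untyped by default. If you want editor support, here is a hedged sketch of typings that mirror the @ReactMethod functions defined above (not an official API surface, just the methods we wrote):
import { NativeModules } from "react-native"

type CustomAudioRecorderModule = {
  startRecording: () => void
  stopRecording: () => void
}

type CustomAudioPlayerModule = {
  setAudioConfig: (sampleRate: number, channelCount: number) => void
  playAudioChunk: (base64Audio: string) => void
  startAudio: () => void
  stopAudio: () => void
}

export const CustomAudioRecorder =
  NativeModules.CustomAudioRecorder as CustomAudioRecorderModule
export const CustomAudioPlayer =
  NativeModules.CustomAudioPlayer as CustomAudioPlayerModule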
3. Integrating the Native Packages into Expo (app.json)
If you’re using Expo’s config plugins, you can automate adding these packages to your Android MainApplication with a file like withCustomAudioPlayerPackage.js. (In a bare React Native project, you would add them manually in MainApplication.java or MainApplication.kt.)
// plugins/withCustomAudioPlayerPackage.js
const { withMainApplication } = require("@expo/config-plugins")
function addCustomPackage(src) {
const packageImport = 'import com.example.package.CustomAudioPlayerPackage'
const packageInstance = 'packages.add(CustomAudioPlayerPackage())'
if (!src.includes(packageImport)) {
src = src.replace(
/^(package\s+[\w\.]+\s*\n)/m,
`$1\n${packageImport}\n`
)
}
if (!src.includes(packageInstance)) {
src = src.replace(
/(\s*return\s+packages)/,
`\n ${packageInstance}$1`
)
}
return src
}
module.exports = function withCustomAudioPlayerPackage(config) {
return withMainApplication(config, (config) => {
if (config.modResults.language === "java" || config.modResults.language === "kt") {
config.modResults.contents = addCustomPackage(config.modResults.contents)
}
return config
})
}
Repeat a similar approach in another file (e.g., withCustomAudioRecorderPackage.js) for the CustomAudioRecorderPackage. Then in your app.json or app.config.js:
{
"expo": {
...
"plugins": [
[
"./plugins/withCustomAudioPlayerPackage",
{
"platforms": "android"
}
],
[
"./plugins/withCustomAudioRecorderPackage",
{
"platforms": "android"
}
]
]
}
}
This ensures that when you run expo prebuild, the native packages for audio recording and playback are injected into your Android codebase automatically.
Usage
Here's how you can use the useDeepgramConversation hook in a React Native component:
import React, { useState } from "react"
import { Button, ScrollView, Text, View } from "react-native"
import useDeepgramConversation from "./useDeepgramConversation"
const VoiceComponent = () => {
const [messages, setMessages] = useState([])
const { startSession, stopSession } = useDeepgramConversation({
onBeforeStarting: () => console.log("Starting session..."),
onStarted: (vac) => {
console.log("Session started")
vac.startConversation()
},
onAfterStarted: () => console.log("Session after started"),
onError: (err) => console.error("Error:", err),
onEnd: () => console.log("Session ended"),
onMessage: (message) => {
setMessages((prevMessages) => [...prevMessages, message])
},
})
return (
    <View>
      <Button title="Start" onPress={startSession} />
      <Button title="Stop" onPress={stopSession} />
      <ScrollView>
        {messages.map((message, index) => (
          <Text key={index}>
            <Text style={{ fontWeight: "bold" }}>{message.role}: </Text>
            {message.content}
          </Text>
        ))}
      </ScrollView>
    </View>
)
}
export default VoiceComponent
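One last note on audio formats: the agent settings request linear16 output at 16000 Hz, which matches the player's default configuration (16 kHz, mono). If you ever change the output sample rate in the settings, reconfigure the native player to match before audio starts flowing; for example (24000 Hz here is purely hypothetical):
import { NativeModules } from "react-native"

const { CustomAudioPlayer } = NativeModules

// Keep the AudioTrack in sync with whatever sample rate you asked Deepgram to send back.
CustomAudioPlayer.setAudioConfig(24000, 1)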
Wrapping Up
With these pieces in place:
- React Native Hook: Manages Deepgram’s WebSocket connection, sends audio up, receives audio down, and triggers UI events.
- Native Kotlin Modules: Record audio locally (as raw PCM) and play back audio from Deepgram in real time.
- Expo Config Plugins (or manual linking): Ensure these modules are registered in the Android app's MainApplication.
You now have a complete pipeline for conversational AI with Deepgram in your React Native application!