Deepgram Integration in React Native: End-to-End Voice Conversation

January 10, 2025

Deepgram React Native Integration

Using Deepgram for live transcription and AI-driven conversation in React Native.

In this post, we’ll explore how to integrate Deepgram into an Android React Native application to handle live audio streams, transcription, and AI-driven conversation. We'll walk through:

  1. The custom React Hook for managing the Deepgram connection
  2. The native Kotlin modules for audio recording and playback
  3. How to wire those modules into your React Native project using Expo config plugins

Getting this to work was quite challenging, as Deepgram's documentation is not in great shape. But it does work now, so hopefully this code saves somebody some time 🙇‍♂️.

1. The useDeepgramConversation Hook

Below is an excerpt of the React Native hook that sets up WebSocket communication with Deepgram’s Agent endpoint and forwards audio data from our native Kotlin recorder:

import { useEffect, useRef } from "react"
import { Audio } from "expo-av"
import type { AgentLiveSchema } from "@deepgram/sdk"
import { AgentEvents } from "@deepgram/sdk"
import { NativeModules, NativeEventEmitter } from "react-native"
import type {
  UseConversationHook,
  Message,
} from "@/types/use-conversation-hook"
import type { VoiceAgentController } from "@/types/voice-agent-controller"

const { CustomAudioRecorder, CustomAudioPlayer } = NativeModules
const recorderEmitter = new NativeEventEmitter(CustomAudioRecorder)

const DEEPGRAM_API_KEY = "---your-deepgram-api-key---"

export type Props = {
  onBeforeStarting: () => void
  onStarted: (vac: VoiceAgentController) => void
  onAfterStarted: () => void
  onError: (err: unknown) => void
  onEnd: () => void
  onMessage: (event: Message) => void
}

const arrayBufferToBase64 = (buffer: ArrayBuffer) => {
  const bytes = new Uint8Array(buffer)
  // Build the binary string in chunks so large buffers don't exceed the
  // maximum argument count of String.fromCharCode.
  let binary = ""
  const chunkSize = 8192
  for (let i = 0; i < bytes.length; i += chunkSize) {
    binary += String.fromCharCode(...bytes.subarray(i, i + chunkSize))
  }
  return global.btoa(binary)
}

const useDeepgramConversation: UseConversationHook = ({
  onBeforeStarting,
  onStarted,
  onAfterStarted,
  onEnd,
  onError,
  onMessage,
}: Props): {
  startSession: () => void
  stopSession: () => void
} => {
  const ws = useRef<WebSocket | null>(null)
  const interval = useRef<NodeJS.Timeout | null>(null)
  const subscription = useRef<any | null>(null)
  const conversationContext = useRef<
    {
      role: "user" | "assistant"
      content: string
    }[] | null
  >(null)
  const instructions = useRef<string | null>(null)

  const startSession = async () => {
    try {
      onBeforeStarting()

      const { granted } = await Audio.requestPermissionsAsync()

      if (!granted) {
        throw new Error("Permission to use microphone was denied")
      }

      CustomAudioRecorder.startRecording()
      CustomAudioPlayer.startAudio()

      console.log("Agent starting...")

      console.log("Connection opened")

      const vac: VoiceAgentController = {
        sendInitialIntructions: (initialInstructions: string) => {
          instructions.current = initialInstructions

          return Promise.resolve()
        },
        setInitialConversationPhrases: (
          phrases: {
            role: "user" | "assistant"
            text: string
          }[]
        ) => {
          conversationContext.current =
            phrases.map((phrase) => ({
              role: phrase.role,
              content: phrase.text,
            }))

          return Promise.resolve()
        },
        makeAgentSay: (text: string, instructions?: string) => {
          // ws.current?.send(
          //   JSON.stringify({
          //     type: "InjectAgentMessage",
          //     message: text,
          //   })
          // )

          return Promise.resolve()
        },
        startConversation: () => {
          const settingsConfig: AgentLiveSchema = {
            audio: {
              input: {
                encoding: "linear16",
                // @ts-ignore
                sample_rate: 16000,
              },
              output: {
                encoding: "linear16",
                // @ts-ignore
                sample_rate: 16000,
                container: "none",
              },
            },
            agent: {
              listen: { model: "nova-2" },
              speak: { model: "aura-asteria-en" },
              think: {
                model: "gpt-4o-mini",
                provider: {
                  type: "open_ai",
                },
                instructions: instructions.current || "You are a helpful agent",
              },
            },
            context: {
              // @ts-ignore
              messages: conversationContext.current ?? [],
              replay: (conversationContext.current?.length ?? 0) > 0,
            },
          }

          const settingsConfigJson = JSON.stringify({
            type: "SettingsConfiguration",
            ...settingsConfig,
          })

          ws.current = new WebSocket(
            "wss://agent.deepgram.com/agent",
            null,
            // @ts-ignore
            {
              headers: {
                Authorization: "Token " + DEEPGRAM_API_KEY,
              },
            }
          )

          console.log("Connection created")

          ws.current.onopen = () => {
            console.log("Sending settings config:", settingsConfigJson)

            ws.current?.send(settingsConfigJson)

            console.log("Deepgram Agent configured.")
          }

          ws.current.onmessage = (event) => {
            console.log("Received message:", event.data)

            if (typeof event.data === "string") {
              const msgObj = JSON.parse(event.data)
              const { type: messageType } = msgObj

              console.log("Message type:", messageType, msgObj)

              switch (messageType) {
                case AgentEvents.SettingsApplied:
                  subscription.current = recorderEmitter.addListener(
                    "AudioChunk",
                    (event) => {
                      const byteArray = new Uint8Array(event.data)
                      ws.current?.send(byteArray)
                    }
                  )

                  break
                case AgentEvents.ConversationText: {
                  const { role, content } = msgObj
                  onMessage({
                    role,
                    content,
                    timestamp: Date.now(),
                  })
                  break
                }
                case AgentEvents.UserStartedSpeaking:
                case AgentEvents.AgentThinking:
                case AgentEvents.AgentStartedSpeaking:
                  break
              }
            } else if (event.data instanceof ArrayBuffer) {
              const base64StringData = arrayBufferToBase64(event.data)
              CustomAudioPlayer.playAudioChunk(base64StringData)
            }
          }

          ws.current.onclose = () => {
            console.log("WebSocket closed")
          }

          ws.current.onerror = (err) => {
            console.error("WebSocket error:", err)
          }

          interval.current = setInterval(() => {
            console.log("Keep alive!")
            ws.current?.send(JSON.stringify({ type: "KeepAlive" }))
          }, 5000)

          return Promise.resolve()
        },
      }

      onStarted(vac)
    } catch (err) {
      console.error("Error starting session:", err)
      onError(err)
    } finally {
      onAfterStarted()
    }
  }

  const stopSession = async () => {
    try {
      subscription.current?.remove()
      CustomAudioRecorder.stopRecording()
      CustomAudioPlayer.stopAudio()
      interval.current && clearInterval(interval.current)

      if (ws.current) {
        ws.current.close(1000, "Component unmounted")
      }

      onEnd()
    } catch (err: any) {
      console.error("Error stopping session:", err)
      onError(err?.message || "Error stopping session")
    }
  }

  useEffect(() => {
    return () => {
      stopSession()
    }
  }, [])

  return {
    startSession,
    stopSession,
  }
}

export default useDeepgramConversation

Key points:

  • We request microphone permission using Expo’s Audio.requestPermissionsAsync().
  • We start a native audio recorder (CustomAudioRecorder) that emits raw audio data via an event listener (recorderEmitter.addListener('AudioChunk', ...)).
  • We pass raw audio data to Deepgram through the open WebSocket connection.
  • We receive audio data (as ArrayBuffer) back from Deepgram, convert it to Base64, and play it using CustomAudioPlayer.
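
The hook also imports Message and UseConversationHook from "@/types/use-conversation-hook" and VoiceAgentController from "@/types/voice-agent-controller", which aren't shown above. Here is a minimal sketch of what they could look like, inferred purely from how the hook uses them (treat these as assumptions, not the project's exact definitions):

export type Message = {
  role: "user" | "assistant"
  content: string
  timestamp: number
}

export type VoiceAgentController = {
  // Spelling kept as in the hook above.
  sendInitialIntructions: (initialInstructions: string) => Promise<void>
  setInitialConversationPhrases: (
    phrases: { role: "user" | "assistant"; text: string }[]
  ) => Promise<void>
  makeAgentSay: (text: string, instructions?: string) => Promise<void>
  startConversation: () => Promise<void>
}

export type UseConversationHook = (props: {
  onBeforeStarting: () => void
  onStarted: (vac: VoiceAgentController) => void
  onAfterStarted: () => void
  onError: (err: unknown) => void
  onEnd: () => void
  onMessage: (event: Message) => void
}) => {
  startSession: () => void
  stopSession: () => void
}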

2. The Native Android Kotlin Modules

Custom Audio Recorder (Android)

Below is the CustomAudioRecorderModule which uses AudioRecord to capture audio data and emit “AudioChunk” events to JavaScript:

package com.example.audio

import android.media.AudioRecord
import android.media.MediaRecorder
import android.media.AudioFormat
import android.util.Log
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod
import com.facebook.react.bridge.WritableNativeArray
import com.facebook.react.bridge.WritableNativeMap
import com.facebook.react.modules.core.DeviceEventManagerModule

class CustomAudioRecorderModule(private val reactContext: ReactApplicationContext) :
    ReactContextBaseJavaModule(reactContext) {

    companion object {
        private const val TAG = "CustomAudioRecorder"
        private const val SAMPLE_RATE = 16000
        private const val CHANNEL_CONFIG = AudioFormat.CHANNEL_IN_MONO
        private const val AUDIO_FORMAT = AudioFormat.ENCODING_PCM_16BIT

        private const val MAX_FAILED_READS = 5
        private const val MAX_RESTARTS = 3
    }

    private val minBufferSize: Int = AudioRecord.getMinBufferSize(SAMPLE_RATE, CHANNEL_CONFIG, AUDIO_FORMAT)
    private var bufferSize: Int = minBufferSize * 4
    private var audioRecord: AudioRecord? = null
    private var isRecording = false
    private var recordingThread: Thread? = null

    private var failedReads = 0
    private var restartCount = 0

    override fun getName(): String {
        return "CustomAudioRecorder"
    }

    @ReactMethod
    fun startRecording() {
        if (isRecording) return

        try {
            Log.d(TAG, "Initializing AudioRecord...")
            audioRecord = AudioRecord(
                MediaRecorder.AudioSource.VOICE_RECOGNITION,
                SAMPLE_RATE,
                CHANNEL_CONFIG,
                AUDIO_FORMAT,
                bufferSize
            )

            if (audioRecord?.state != AudioRecord.STATE_INITIALIZED) {
                Log.e(TAG, "AudioRecord initialization failed")
                return
            }

            audioRecord?.startRecording()
            isRecording = true
            Log.d(TAG, "Recording started")
            startRecordingThread()
        } catch (e: Exception) {
            Log.e(TAG, "Error starting recording", e)
        }
    }

    @ReactMethod
    fun stopRecording() {
        isRecording = false
        try {
            Log.d(TAG, "Stopping recording...")
            recordingThread?.interrupt()
            recordingThread = null
            audioRecord?.stop()
            audioRecord?.release()
            audioRecord = null
            Log.d(TAG, "Recording stopped")
        } catch (e: Exception) {
            Log.e(TAG, "Error stopping recording", e)
        }
    }

    private fun startRecordingThread() {
        recordingThread = Thread {
            try {
                Thread.currentThread().priority = Thread.NORM_PRIORITY
                val buffer = ByteArray(bufferSize)
                Log.d(TAG, "Recording thread started")
                failedReads = 0
                restartCount = 0

                while (isRecording && audioRecord != null) {
                    if (audioRecord?.state != AudioRecord.STATE_INITIALIZED && audioRecord?.recordingState != AudioRecord.RECORDSTATE_RECORDING) {
                        Log.e(TAG, "AudioRecord is not in recording state. State: ${audioRecord?.state}, Recording State: ${audioRecord?.recordingState}")
                        restartRecording()
                        break
                    }

                    val read = audioRecord!!.read(buffer, 0, buffer.size)

                    if (read > 0) {
                        failedReads = 0 // Reset failure counter
                        sendEvent("AudioChunk", buffer.copyOf(read))
                    } else {
                        failedReads++
                        Log.w(TAG, "AudioRecord read error ($failedReads times)")
                        if (failedReads >= MAX_FAILED_READS) {
                            Log.e(TAG, "Too many read failures, restarting recording...")
                            restartRecording()
                            break
                        }
                    }
                }
            } catch (e: Exception) {
                Log.e(TAG, "Recording thread error", e)
            } finally {
                // Ensure resources are released even if an exception occurs
                audioRecord?.release()
                audioRecord = null
            }
        }
        recordingThread?.start()
    }

    private fun restartRecording() {
        if (restartCount < MAX_RESTARTS) {
            stopRecording()
            Thread.sleep(200)
            startRecording()
            restartCount++
        } else {
            Log.e(TAG, "Too many restarts, stopping recording...")
            stopRecording()
            sendErrorEvent("Too many restarts, stopping recording")
        }
    }

    private fun sendEvent(eventName: String, byteArray: ByteArray) {
        val eventData = WritableNativeMap()
        val array = WritableNativeArray()
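        // Kotlin bytes are signed (-128..127); the JS side restores 0..255 values
        // by passing this array into a Uint8Array before sending it to Deepgram.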
        for (byte in byteArray) {
            array.pushInt(byte.toInt())
        }
        eventData.putArray("data", array)
        reactContext
            .getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
            .emit(eventName, eventData)
        Log.d(TAG, "Audio chunk sent to JS")
    }

    private fun sendErrorEvent(errorMessage: String) {
        val eventData = WritableNativeMap()
        eventData.putString("error", errorMessage)
        reactContext
            .getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter::class.java)
            .emit("AudioError", eventData)
        Log.e(TAG, "Error event sent to JS: $errorMessage")
    }
}

We also need to expose this module to React Native via a ReactPackage, for example:

package com.example.audio

import com.facebook.react.ReactPackage
import com.facebook.react.bridge.NativeModule
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.uimanager.ViewManager

class CustomAudioRecorderPackage : ReactPackage {
  override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
    return listOf(CustomAudioRecorderModule(reactContext))
  }

  override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
    return emptyList()
  }
}

Custom Audio Player (Android)

This module receives Base64-encoded audio from the JavaScript side and plays it with an AudioTrack.

package com.example.audio

import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioTrack
import android.util.Base64
import android.util.Log
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.bridge.ReactContextBaseJavaModule
import com.facebook.react.bridge.ReactMethod

class CustomAudioPlayerModule(reactContext: ReactApplicationContext) : ReactContextBaseJavaModule(reactContext) {

  companion object {
    private const val TAG = "CustomAudioPlayer"
  }

  // Default configuration; mutable to allow changes.
  private var sampleRate: Int = 16000
  private var channelCount: Int = 1 // 1 for mono
  private val audioEncoding: Int = AudioFormat.ENCODING_PCM_16BIT  // 16-bit PCM

  // Create an AudioTrack using the current configuration.
  private var audioTrack: AudioTrack? = createAudioTrack()

  /**
   * Helper function to create an AudioTrack based on current parameters.
   */
  private fun createAudioTrack(): AudioTrack {
    val channelMask = if (channelCount == 1) AudioFormat.CHANNEL_OUT_MONO else AudioFormat.CHANNEL_OUT_STEREO
    val minBufferSize = AudioTrack.getMinBufferSize(sampleRate, channelMask, audioEncoding)
    return AudioTrack.Builder()
      .setAudioAttributes(
        AudioAttributes.Builder()
          .setUsage(AudioAttributes.USAGE_MEDIA)
          .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
          .build()
      )
      .setAudioFormat(
        AudioFormat.Builder()
          .setEncoding(audioEncoding)
          .setSampleRate(sampleRate)
          .setChannelMask(channelMask)
          .build()
      )
      .setBufferSizeInBytes(minBufferSize)
      .setTransferMode(AudioTrack.MODE_STREAM)
      .build()
  }

  init {
    // Start playback so that the AudioTrack is ready to accept data.
    audioTrack?.play()
  }

  override fun getName(): String {
    return "CustomAudioPlayer"
  }

  /**
   * Allows the React Native side to customize audio parameters.
   *
   * @param sampleRate The sample rate in Hz (e.g., 48000).
   * @param channelCount The number of channels (1 for mono, 2 for stereo).
   */
  @ReactMethod
  fun setAudioConfig(sampleRate: Int, channelCount: Int) {
    try {
      // Stop and release the current AudioTrack.
      audioTrack?.stop()
      audioTrack?.release()
    } catch (e: Exception) {
      Log.e(TAG, "Error releasing AudioTrack: ", e)
    }
    // Update parameters.
    this.sampleRate = sampleRate
    this.channelCount = channelCount

    // Re-create the AudioTrack with the new configuration.
    audioTrack = createAudioTrack()
    audioTrack?.play()
    Log.i(TAG, "Audio config updated: sampleRate=$sampleRate, channelCount=$channelCount")
  }

  /**
   * Decodes a Base64-encoded string representing raw 16-bit PCM audio data (no endianness conversion needed)
   * and writes it to the AudioTrack for playback.
   *
   * @param base64Audio The Base64-encoded audio data.
   */
  @ReactMethod
  fun playAudioChunk(base64Audio: String) {
    try {
      // Decode the Base64 string into a byte array.
      val audioData: ByteArray = Base64.decode(base64Audio, Base64.DEFAULT)
      // Write the audio data into the AudioTrack.
      val written = audioTrack?.write(audioData, 0, audioData.size) ?: -1
      if (written < 0) {
        Log.e(TAG, "Error writing audio data: $written")
      }
    } catch (e: Exception) {
      Log.e(TAG, "Error in playAudioChunk: ", e)
    }
  }

  @ReactMethod
  fun startAudio() {
    if (audioTrack == null) {
      audioTrack = createAudioTrack()
    }
    audioTrack?.play()
  }

  @ReactMethod
  fun stopAudio() {
    try {
      audioTrack?.stop()
      audioTrack?.release()
    } catch (e: Exception) {
      Log.e(TAG, "Error stopping audio: ", e)
    } finally {
      audioTrack = null
    }
  }
}
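
On the JavaScript side you can reconfigure playback at runtime through setAudioConfig, for example if you request a different output sample rate from Deepgram. A small sketch (it assumes the module is registered under the CustomAudioPlayer name used throughout this post):

import { NativeModules } from "react-native"

const { CustomAudioPlayer } = NativeModules

// Switch playback to 24 kHz mono; only needed if you also change the
// agent's audio.output.sample_rate away from the 16000 used in the hook.
CustomAudioPlayer.setAudioConfig(24000, 1)

The player defaults to 16 kHz mono, which matches the sample_rate: 16000 configured in the hook's output settings, so this call is only needed if you deviate from that.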

As with the recorder, we also need a ReactPackage for the player module:

package com.example.audio

import com.facebook.react.ReactPackage
import com.facebook.react.bridge.NativeModule
import com.facebook.react.bridge.ReactApplicationContext
import com.facebook.react.uimanager.ViewManager

class CustomAudioPlayerPackage : ReactPackage {
  override fun createNativeModules(reactContext: ReactApplicationContext): List<NativeModule> {
    return listOf(CustomAudioPlayerModule(reactContext))
  }

  override fun createViewManagers(reactContext: ReactApplicationContext): List<ViewManager<*, *>> {
    return emptyList()
  }
}

3. Integrating the Native Packages into Expo (app.json)

If you’re using Expo’s config plugins, you can automate adding these packages to your Android MainApplication with a file like withCustomAudioPlayerPackage.js. The plugin below injects Kotlin syntax, so it assumes a Kotlin MainApplication (the default in recent Expo and React Native templates). (In a bare React Native project, you could instead register the packages manually in MainApplication.kt or MainApplication.java.)

// plugins/withCustomAudioPlayerPackage.js
const { withMainApplication } = require("@expo/config-plugins")

function addCustomPackage(src) {
  const packageImport = 'import com.example.audio.CustomAudioPlayerPackage'
  const packageInstance = 'packages.add(CustomAudioPlayerPackage())'

  if (!src.includes(packageImport)) {
    src = src.replace(
      /^(package\s+[\w\.]+\s*\n)/m,
      `$1\n${packageImport}\n`
    )
  }

  if (!src.includes(packageInstance)) {
    src = src.replace(
      /(\s*return\s+packages)/,
      `\n            ${packageInstance}$1`
    )
  }
  return src
}

module.exports = function withCustomAudioPlayerPackage(config) {
  return withMainApplication(config, (config) => {
    if (config.modResults.language === "kt") {
      config.modResults.contents = addCustomPackage(config.modResults.contents)
    }
    return config
  })
}

Repeat a similar approach in another file (e.g., withCustomAudioRecorderPackage.js) for the CustomAudioRecorderPackage. Then in your app.json or app.config.js:

{
  "expo": {
    ...
    "plugins": [
      [
        "./plugins/withCustomAudioPlayerPackage",
        {
          "platforms": "android"
        }
      ],
      [
        "./plugins/withCustomAudioRecorderPackage",
        {
          "platforms": "android"
        }
      ]
    ]
  }
}

This ensures that when you run expo prebuild, the native packages for audio recording and playback are injected into your Android codebase automatically.
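
Since the two plugin files differ only in the package class name, you could also factor the logic into one shared helper. Here is a hedged sketch (createAudioPackagePlugin and its file name are my own invention; withMainApplication is the same Expo API used above, and com.example.audio is this post's placeholder namespace):

// plugins/createAudioPackagePlugin.js (hypothetical shared helper)
const { withMainApplication } = require("@expo/config-plugins")

const createAudioPackagePlugin = (packageClassName) => (config) =>
  withMainApplication(config, (cfg) => {
    if (cfg.modResults.language === "kt") {
      const packageImport = `import com.example.audio.${packageClassName}`
      const packageInstance = `packages.add(${packageClassName}())`
      let src = cfg.modResults.contents

      // Add the import right after the package declaration, once.
      if (!src.includes(packageImport)) {
        src = src.replace(/^(package\s+[\w.]+\s*\n)/m, `$1\n${packageImport}\n`)
      }
      // Register the package just before "return packages", once.
      if (!src.includes(packageInstance)) {
        src = src.replace(/(\s*return\s+packages)/, `\n            ${packageInstance}$1`)
      }

      cfg.modResults.contents = src
    }
    return cfg
  })

module.exports = createAudioPackagePlugin

Each per-package plugin file then becomes a one-liner, e.g. module.exports = require("./createAudioPackagePlugin")("CustomAudioRecorderPackage").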

Usage

Here's how you can use the useDeepgramConversation hook in a React Native component:

import React, { useState } from "react"
import { Button, ScrollView, Text, View } from "react-native"
import useDeepgramConversation from "./useDeepgramConversation"

const VoiceComponent = () => {
  const [messages, setMessages] = useState([])
  const { startSession, stopSession } = useDeepgramConversation({
    onBeforeStarting: () => console.log("Starting session..."),
    onStarted: (vac) => {
      console.log("Session started")
      vac.startConversation()
    },
    onAfterStarted: () => console.log("Session after started"),
    onError: (err) => console.error("Error:", err),
    onEnd: () => console.log("Session ended"),
    onMessage: (message) => {
      setMessages((prevMessages) => [...prevMessages, message])
    },
  })

  return (
    <View>
      <Button title="Start" onPress={startSession} />
      <Button title="Stop" onPress={stopSession} />
      <ScrollView>
        {messages.map((message, index) => (
          <Text key={index}>
            {message.role}: {message.content}
          </Text>
        ))}
      </ScrollView>
    </View>
  )
}

export default VoiceComponent

Wrapping Up

With these pieces in place:

  • React Native Hook: Manages Deepgram’s WebSocket connection, sends audio up, receives audio down, and triggers UI events.
  • Native Kotlin Modules: Record audio locally (as raw PCM) and play back audio from Deepgram in real time.
  • Expo Config Plugins (or manual linking): Ensures these modules are added to the Android app’s classpath.

You now have a complete pipeline for conversational AI with Deepgram in your React Native application!

react native, deepgram, voice, kotlin, android, ai, speech