Skip to content

useSpeechToText

Speech to text hook. Powered by the Web Speech API for speech recognition.

Experimental: DO NOT use this hook in production.
The Web Speech API may not be supported by all browsers. See Browser compatibility.

Import

import {useSpeechToText} from '@kaiverse/k/hooks'

Usage

import {useSpeechToText} from '@kaiverse/k/hooks'

// NOTE: this snippet is the body of a React component (hence the bare `return`s);
// the enclosing function is omitted for brevity.
const {transcript, isSpeechAPIAvailable, isListening, startListening, stopListening} =
  useSpeechToText({
    // Language of the speech.
    lang: 'en',
    // Called when the recognition service has begun listening to incoming audio.
    onStart: () => console.log('Start listening'),
    // Fired when a final result comes back with no significant recognition.
    onUnMatch: () => console.error('Cannot recognize speech.'),
    // Fired when a speech recognition error occurs.
    onError: (event) => console.error(event.error),
  })

// Render a fallback when the Web Speech API is not available.
if (!isSpeechAPIAvailable) {
  return <div>Browser doesn't support the Web Speech API</div>
}

// Two buttons drive the microphone; the paragraph shows the listening state
// and the latest transcript.
return (
  <div>
    <button type="button" onClick={startListening}>
      Turn on mic
    </button>
    <button type="button" onClick={stopListening}>
      Stop
    </button>
    <p>
      {isListening ? 'Listening...' : 'Not listening'}
      <br />
      {transcript}
    </p>
  </div>
)

import {useSpeechToText, type SpeechToTextHookErrorCode} from '@kaiverse/k/hooks'
import {IconCheck, IconCopy, IconMicrophone, IconMicrophoneOff} from '@tabler/icons-react'
import {useCallback, useRef, type MouseEventHandler} from 'react'

/**
 * Interactive demo for `useSpeechToText`.
 *
 * Renders a mic toggle, an error line, and a read-only textarea holding the
 * transcript together with a "copy to clipboard" button. Falls back to a plain
 * message when the Web Speech API is not available.
 */
export default function SpeechToTextDemo() {
  const speechErrRef = useRef<HTMLParagraphElement>(null)
  const transcriptInputRef = useRef<HTMLTextAreaElement>(null)

  // Errors are written straight into the paragraph's text content through the
  // ref instead of going through React state.
  const writeErr = (err: string) => {
    if (!speechErrRef.current) {
      return
    }

    speechErrRef.current.textContent = err
  }

  const {transcript, isSpeechAPIAvailable, isListening, startListening, stopListening} =
    useSpeechToText({
      lang: 'en',
      onStart: () => {
        // Clear any error left over from the previous session.
        if (speechErrRef.current?.textContent) speechErrRef.current.textContent = null
      },
      onUnMatch: () => writeErr('Cannot recognize speech.'),
      onError: (event) =>
        writeErr(
          // Prefer the friendly message; fall back to the raw error code.
          `Error occurred in recognition: ${SPEECH_ERROR_MAPPING[event.error as SpeechToTextHookErrorCode] ?? event.error}`,
        ),
    })

  /** Copies the trimmed transcript and briefly flips the button into its "copied" look. */
  const handleCopyClipboard = useCallback<MouseEventHandler<HTMLButtonElement>>((e) => {
    transcriptInputRef.current?.focus()
    const transcriptVal = transcriptInputRef.current?.value.trim()
    if (!transcriptVal) {
      return
    }

    // The Clipboard API is only exposed in secure contexts; without this guard
    // `navigator.clipboard.writeText` throws synchronously when it is missing.
    if (!navigator.clipboard) {
      console.error('Clipboard API is not available in this context.')
      return
    }

    // Grab the class list now: the promise callbacks run after the handler returns.
    const targetClasses = e.currentTarget.classList
    navigator.clipboard
      .writeText(transcriptVal)
      .then(() => {
        targetClasses.add(...BTN_COPIED)
        // Revert to the normal look after 1.5 s.
        setTimeout(() => targetClasses.remove(...BTN_COPIED), 1500)
      })
      .catch(console.error)
  }, [])

  if (!isSpeechAPIAvailable) {
    return <div>Browser doesn't support the Web Speech API</div>
  }

  return (
    <>
      <p className="mb-4">
        Language of the Speech Recognition:{' '}
        <strong>
          <code>en</code> - English
        </strong>
      </p>
      <div>
        {isListening ? (
          <div className="flex items-center gap-4">
            <div>Listening...</div>
            <button className="btn btn-neutral" type="button" onClick={stopListening}>
              <IconMicrophoneOff size={20} /> Stop
            </button>
          </div>
        ) : (
          <button className="btn btn-neutral btn-outline" type="button" onClick={startListening}>
            <IconMicrophone size={20} /> Turn on mic
          </button>
        )}

        {/* role="alert" lets assistive tech announce errors written by writeErr. */}
        <p ref={speechErrRef} role="alert" className="mt-2 min-h-6 text-red-500"></p>
      </div>
      <label className="group relative">
        Transcript
        {/* Controlled + readOnly so the text always tracks `transcript`;
            `defaultValue` is only specified as the initial value. */}
        <textarea
          ref={transcriptInputRef}
          className="block min-h-12 w-full resize-none rounded-md p-2 shadow-sm [field-sizing:content]"
          name="transcript"
          value={transcript}
          wrap="soft"
          placeholder="The transcript will be displayed here"
          readOnly
        />
        {transcript ? (
          <button
            className="btn btn-ghost btn-outline btn-square btn-sm absolute top-8 right-2 hidden h-fit group-focus-within:block"
            type="button"
            aria-label="Copy transcript to clipboard"
            onClick={handleCopyClipboard}
          >
            <IconCopy size={16} />
            <IconCheck className="hidden mx-auto" size={16} />
          </button>
        ) : null}
      </label>
    </>
  )
}

/**
 * Human-readable messages for a subset of speech-recognition error codes.
 * The map is `Partial`: codes without an entry here simply have no message.
 */
const SPEECH_ERROR_MAPPING: Partial<Record<SpeechToTextHookErrorCode, string>> = {
  // Microphone could not be captured.
  'audio-capture':
    'Cannot detect your microphone! Please check your bluetooth/cable connection.',

  // Nothing was heard.
  'no-speech': 'No speech',

  // The requested recognition language is not supported.
  'language-not-supported': 'Language not supported',
}

/**
 * Class names toggled on the copy button while it shows its "copied" state.
 * They are added and later removed as a group via `classList`.
 */
const BTN_COPIED = [
  // Look and interactivity of the button itself.
  'text-green-500', 'pointer-events-none', 'opacity-80',

  // Selectors aimed at the direct children carrying the tabler icon classes:
  // hide the copy icon, reveal the check icon.
  '[&>.tabler-icon-copy]:hidden', '[&>.tabler-icon-check]:block',
]

Type Definition

/**
 * Speech to text hook, powered by the Web Speech API for speech recognition.
 *
 * @param options - `useSpeechToText` options (`lang`, `onStart`, `onUnMatch`,
 *   `onError`, `onTranscriptChange`).
 * @returns The recognition state and the controls to start/stop listening.
 */
function useSpeechToText(options: SpeechToTextHookOptions): {
  /** Flag to check if the Web Speech API is available. */
  isSpeechAPIAvailable: boolean
  /** Whether the service is listening to incoming audio. */
  isListening: boolean
  /** Turn on microphone and start listening. */
  startListening: () => void
  /** Turn off microphone and stop listening. */
  stopListening: () => void
  /** The result of the speech recognition. */
  transcript: string
}

Special Types

Name	Type	Description
`SpeechToTextHookErrorCode`	`SpeechRecognitionErrorCode`	Possible error codes that can be returned by the Web Speech API. Enum: `SpeechRecognitionErrorCode`.
`SpeechToTextHookOptions`	See `useSpeechToText` Options below	`useSpeechToText` options.

`useSpeechToText` Options

Name	Type	Default	Description
`lang`	`string`	HTML `lang` attribute value	Language of the speech. If not specified and the HTML `lang` attribute isn’t set either, the user agent’s language setting will be used. Read more.
`onStart`	`() => void`	—	Callback function that is called when the speech recognition service has begun listening to incoming audio.
`onUnMatch`	`(event: SpeechRecognitionEvent) => void`	—	Fired when the speech recognition service returns a final result with no significant recognition.
`onError`	`(event: SpeechRecognitionErrorEvent) => void`	—	Fired when a speech recognition error occurs.
`onTranscriptChange`	`(transcript: string) => void`	—	Fired when the speech recognition service returns a final result with significant recognition.

Return Types

Name	Type	Description
`isSpeechAPIAvailable`	`boolean`	Flag to check if the Web Speech API is available.
`isListening`	`boolean`	Whether the service is listening to incoming audio.
`startListening`	`() => void`	Turn on microphone and start listening.
`stopListening`	`() => void`	Turn off microphone and stop listening.
`transcript`	`string`	The result of the speech recognition.