iOS/SpeechTranscriber.swift
Bruno Pantaleão Gonçalves f823d8936f
Experimental Assist UI (#4139)
<!-- Thank you for submitting a Pull Request and helping to improve Home
Assistant. Please complete the following sections to help the processing
and review of your changes. Please do not delete anything from this
template. -->

## Summary
<!-- Provide a brief summary of the changes you have made and most
importantly what they aim to achieve -->

## Screenshots
<!-- If this is a user-facing change not in the frontend, please include
screenshots in light and dark mode. -->
<img width="1776" height="1124" alt="CleanShot 2025-12-23 at 03 29
25@2x"
src="https://github.com/user-attachments/assets/ab1a5c9f-72a9-4dc8-ae44-1f7d574738f6"
/>

## Link to pull request in Documentation repository
<!-- Pull requests that add, change or remove functionality must have a
corresponding pull request in the Companion App Documentation repository
(https://github.com/home-assistant/companion.home-assistant). Please add
the number of this pull request after the "#" -->
Documentation: home-assistant/companion.home-assistant#

## Any other notes
<!-- If there is any other information of note, like if this Pull
Request is part of a bigger change, please include it here. -->

---------

Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
2025-12-23 15:38:35 +00:00

184 lines
6.0 KiB
Swift

import AVFoundation
import Speech
/// Manages live speech transcription from the microphone and publishes the results.
///
/// Audio is captured through an `AVAudioEngine` input tap and forwarded to a
/// `Speech.SpeechTranscriber`. A session ends when `stopListening()` is called, when a
/// final result arrives, when an error occurs, or when the auto-stop heuristic in
/// `handleTranscriptionResult(_:session:)` fires.
///
/// NOTE(review): the `Speech.SpeechTranscriber` members used here (`init(locale:)`,
/// `addsPunctuation()`, `onAudioPacketAvailable`, `transcribe(audioBuffer:)`,
/// `transcribedResults()`, `finishTranscription()`), the `SFTranscriptionResult` type and the
/// async `SFSpeechRecognizer.requestAuthorization()` could not be confirmed from this file.
/// Verify them, and the `iOS 18.0` availability, against the Speech framework SDK.
@available(iOS 18.0, *)
@MainActor
final class SpeechTranscriber: ObservableObject {
    // MARK: - Published Properties

    /// The current transcription text
    @Published private(set) var transcription: String = ""

    /// Whether the transcriber is currently listening
    @Published private(set) var isListening: Bool = false

    /// The current authorization status
    @Published private(set) var authorizationStatus: SFSpeechRecognizerAuthorizationStatus = .notDetermined

    /// Any error that occurred during transcription
    @Published private(set) var error: Error?

    // MARK: - Private Properties

    private var speechTranscriber: Speech.SpeechTranscriber?
    private var transcriptionTask: Task<Void, Never>?
    private let audioEngine = AVAudioEngine()

    /// Whether a tap is currently installed on bus 0 of the engine's input node.
    /// Tracked separately from `audioEngine.isRunning` because the tap is installed
    /// before the engine is started and must be removed even if that start fails.
    private var isTapInstalled = false

    /// Incremented on every `startListening(locale:)`. Results, errors and scheduled
    /// stops carry the value of the session that produced them so that leftovers from
    /// an earlier session cannot alter or tear down a newer one.
    private var sessionID = 0

    // MARK: - Initialization

    init() {
        self.authorizationStatus = SFSpeechRecognizer.authorizationStatus()
    }

    // MARK: - Public Methods

    /// Requests authorization to use speech recognition and publishes the resulting status.
    func requestAuthorization() async {
        let status = await SFSpeechRecognizer.requestAuthorization()
        authorizationStatus = status
    }

    /// Starts listening and transcribing speech.
    ///
    /// Any session that is already running is stopped first.
    ///
    /// - Parameter locale: The locale for speech recognition (defaults to device locale)
    /// - Throws: `TranscriberError.notAuthorized` when speech recognition is not authorized,
    ///   or the error thrown while configuring/activating the shared `AVAudioSession`.
    func startListening(locale: Locale = .current) async throws {
        // Check authorization
        guard authorizationStatus == .authorized else {
            throw TranscriberError.notAuthorized
        }

        // Stop any existing transcription
        if isListening {
            await stopListening()
        }

        // Reset state
        transcription = ""
        error = nil

        // Configure the audio session *before* flagging the session as active, so a
        // throw here cannot leave `isListening` stuck at `true`.
        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(.record, mode: .measurement, options: .duckOthers)
        try audioSession.setActive(true, options: .notifyOthersOnDeactivation)

        sessionID += 1
        let session = sessionID
        isListening = true

        // Create the speech transcriber
        let transcriber = Speech.SpeechTranscriber(locale: locale)
        speechTranscriber = transcriber

        // Start transcription task
        transcriptionTask = Task {
            do {
                // `stopListening()` may already have cancelled this task before its body
                // ran; bail out instead of installing a tap nobody will remove.
                try Task.checkCancellation()

                // Get the audio input
                let inputNode = audioEngine.inputNode
                let recordingFormat = inputNode.outputFormat(forBus: 0)

                // Create transcription session. The packet callback is a placeholder
                // (e.g. for audio visualization) and the returned value is not needed.
                _ = transcriber.addsPunctuation().onAudioPacketAvailable { _ in }

                // Install tap on audio engine
                inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { [weak self] buffer, _ in
                    guard let self else { return }
                    // Send audio to transcriber
                    Task { @MainActor in
                        try? await self.speechTranscriber?.transcribe(audioBuffer: buffer)
                    }
                }
                isTapInstalled = true

                // Start the audio engine
                audioEngine.prepare()
                try audioEngine.start()

                // Process transcription results
                for try await result in transcriber.transcribedResults() {
                    await handleTranscriptionResult(result, session: session)
                }

                // The results stream ended on its own; make sure the session is closed
                // so `isListening` does not stay `true` forever.
                if !Task.isCancelled {
                    scheduleStop(forSession: session)
                }
            } catch {
                // Cancellation is how `stopListening()` ends this task; it is a normal
                // shutdown, not a failure to publish.
                guard !Task.isCancelled, !(error is CancellationError) else { return }
                await handleError(error, session: session)
            }
        }
    }

    /// Stops listening and transcribing, and deactivates the shared audio session.
    func stopListening() async {
        isListening = false
        let endedSession = sessionID

        // Stop audio engine
        if audioEngine.isRunning {
            audioEngine.stop()
        }
        // Remove the tap independently of the engine state: if the engine failed to start
        // the tap is still installed, and AVAudioNode allows only one tap per bus.
        if isTapInstalled {
            audioEngine.inputNode.removeTap(onBus: 0)
            isTapInstalled = false
        }

        // Detach the task before suspending so a session started while we are suspended
        // below does not get its own task cancelled by this call.
        let task = transcriptionTask
        transcriptionTask = nil

        // Finish transcription
        try? await speechTranscriber?.finishTranscription()

        // Cancel task
        task?.cancel()

        // If `startListening(locale:)` ran while we were suspended, the transcriber and the
        // audio session now belong to the new session; leave them alone.
        guard sessionID == endedSession else { return }
        speechTranscriber = nil

        // Deactivate audio session
        try? AVAudioSession.sharedInstance().setActive(false, options: .notifyOthersOnDeactivation)
    }

    // MARK: - Private Methods

    /// Publishes the text of `result` and decides whether the session should end.
    ///
    /// - Parameters:
    ///   - result: The result produced by the transcriber.
    ///   - session: The `sessionID` of the session that produced `result`; results from
    ///     a session that has since been replaced are ignored.
    private func handleTranscriptionResult(_ result: SFTranscriptionResult, session: Int) {
        guard session == sessionID else { return }

        // Update transcription
        transcription = result.bestTranscription.formattedString

        // Check if speech has finished
        if result.isFinal {
            scheduleStop(forSession: session)
            return
        }

        // Auto-stop heuristic: the last segment is reported with confidence above 0.5
        // and a duration above 2 seconds. Adjust these thresholds to tune detection.
        // NOTE(review): this tests the length of the last segment, not the silence that
        // follows it — confirm this is the intended "pause" signal.
        guard let lastSegment = result.bestTranscription.segments.last,
              lastSegment.confidence > 0.5,
              lastSegment.duration > 2.0 else { return }
        scheduleStop(forSession: session, after: .seconds(1))
    }

    /// Publishes `error` and ends the session that produced it.
    ///
    /// - Parameters:
    ///   - error: The failure raised while capturing or transcribing audio.
    ///   - session: The `sessionID` of the failing session; errors from a session that
    ///     has since been replaced are ignored.
    private func handleError(_ error: Error, session: Int) {
        guard session == sessionID else { return }
        self.error = error
        scheduleStop(forSession: session)
    }

    /// Stops the given session from an unstructured task, optionally after a delay.
    ///
    /// The stop is skipped when the session is no longer the current one or has already
    /// been stopped, so a late-firing request can never tear down a newer session.
    ///
    /// - Parameters:
    ///   - session: The `sessionID` the stop request belongs to.
    ///   - delay: How long to wait before stopping; `nil` stops as soon as the task runs.
    private func scheduleStop(forSession session: Int, after delay: Duration? = nil) {
        Task { [weak self] in
            if let delay {
                try? await Task.sleep(for: delay)
            }
            guard let self, self.isListening, self.sessionID == session else { return }
            await self.stopListening()
        }
    }

    // MARK: - Error Types

    /// Failures reported by `SpeechTranscriber`.
    enum TranscriberError: LocalizedError {
        case notAuthorized
        case audioEngineFailure
        case transcriptionFailed

        var errorDescription: String? {
            switch self {
            case .notAuthorized:
                return "Speech recognition is not authorized. Please enable it in Settings."
            case .audioEngineFailure:
                return "Failed to start audio engine."
            case .transcriptionFailed:
                return "Speech transcription failed."
            }
        }
    }
}