#[cfg(windows)] use std::collections::HashMap; use std::sync::Mutex; use lazy_static::lazy_static; use log::{info, trace}; use winrt::*; use tts_winrt_bindings::windows::media::playback::{ CurrentMediaPlaybackItemChangedEventArgs, MediaPlaybackItem, MediaPlaybackList, MediaPlaybackState, MediaPlayer, MediaPlayerAudioCategory, }; use tts_winrt_bindings::windows::media::speech_synthesis::SpeechSynthesizer; use tts_winrt_bindings::windows::{foundation::TypedEventHandler, media::core::MediaSource}; use crate::{Backend, BackendId, Error, Features, UtteranceId, CALLBACKS}; impl From for Error { fn from(e: winrt::Error) -> Self { Error::WinRT(e) } } #[derive(Clone, Debug)] pub struct WinRT { id: BackendId, synth: SpeechSynthesizer, player: MediaPlayer, playback_list: MediaPlaybackList, } lazy_static! { static ref NEXT_BACKEND_ID: Mutex = Mutex::new(0); static ref NEXT_UTTERANCE_ID: Mutex = Mutex::new(0); static ref UTTERANCE_MAPPINGS: Mutex> = Mutex::new(Vec::new()); static ref BACKEND_TO_MEDIA_PLAYER: Mutex> = { let v: HashMap = HashMap::new(); Mutex::new(v) }; static ref BACKEND_TO_PLAYBACK_LIST: Mutex> = { let v: HashMap = HashMap::new(); Mutex::new(v) }; static ref LAST_SPOKEN_UTTERANCE: Mutex> = { let v: HashMap = HashMap::new(); Mutex::new(v) }; } impl WinRT { pub fn new() -> std::result::Result { info!("Initializing WinRT backend"); let playback_list = MediaPlaybackList::new()?; let player = MediaPlayer::new()?; player.set_real_time_playback(true)?; player.set_audio_category(MediaPlayerAudioCategory::Speech)?; player.set_source(&playback_list)?; let mut backend_id = NEXT_BACKEND_ID.lock().unwrap(); let bid = BackendId::WinRT(*backend_id); *backend_id += 1; drop(backend_id); let mut backend_to_media_player = BACKEND_TO_MEDIA_PLAYER.lock().unwrap(); backend_to_media_player.insert(bid, player.clone()); drop(backend_to_media_player); player.media_ended(TypedEventHandler::new(|sender: &MediaPlayer, _args| { let source = sender.source()?; let source: MediaPlaybackList = source.try_into()?; source.items()?.clear()?; let backend_to_media_player = BACKEND_TO_MEDIA_PLAYER.lock().unwrap(); let id = backend_to_media_player.iter().find(|v| v.1 == sender); if let Some(id) = id { let id = id.0; let mut callbacks = CALLBACKS.lock().unwrap(); let callbacks = callbacks.get_mut(&id).unwrap(); if let Some(callback) = callbacks.utterance_end.as_mut() { let last_spoken_utterance = LAST_SPOKEN_UTTERANCE.lock().unwrap(); if let Some(utterance_id) = last_spoken_utterance.get(&id) { callback(*utterance_id); } } } Ok(()) }))?; let mut backend_to_playback_list = BACKEND_TO_PLAYBACK_LIST.lock().unwrap(); backend_to_playback_list.insert(bid, playback_list.clone()); drop(backend_to_playback_list); playback_list.current_item_changed(TypedEventHandler::new( |sender: &MediaPlaybackList, args: &CurrentMediaPlaybackItemChangedEventArgs| { let backend_to_playback_list = BACKEND_TO_PLAYBACK_LIST.lock().unwrap(); let id = backend_to_playback_list.iter().find(|v| v.1 == sender); if let Some(id) = id { let id = id.0; let mut callbacks = CALLBACKS.lock().unwrap(); let callbacks = callbacks.get_mut(&id).unwrap(); let old_item = args.old_item()?; if !old_item.is_null() { let mut mappings = UTTERANCE_MAPPINGS.lock().unwrap(); if let Some(callback) = callbacks.utterance_end.as_mut() { for mapping in &*mappings { if mapping.1 == old_item { callback(mapping.2); } } mappings.retain(|v| v.1 != old_item); } } let new_item = args.new_item()?; if !new_item.is_null() { let mut last_spoken_utterance = LAST_SPOKEN_UTTERANCE.lock().unwrap(); let mappings = UTTERANCE_MAPPINGS.lock().unwrap(); for mapping in &*mappings { if mapping.1 == new_item { last_spoken_utterance.insert(*id, mapping.2); } } if let Some(callback) = callbacks.utterance_begin.as_mut() { for mapping in &*mappings { if mapping.1 == new_item { callback(mapping.2); } } } } } Ok(()) }, ))?; Ok(Self { id: bid, synth: SpeechSynthesizer::new()?, player, playback_list, }) } } impl Backend for WinRT { fn id(&self) -> Option { Some(self.id) } fn supported_features(&self) -> Features { Features { stop: true, rate: true, pitch: true, volume: true, is_speaking: true, utterance_callbacks: true, } } fn speak( &mut self, text: &str, interrupt: bool, ) -> std::result::Result, Error> { let stream = self.synth.synthesize_text_to_stream_async(text)?.get()?; let content_type = stream.content_type()?; let source = MediaSource::create_from_stream(stream, content_type)?; let item = MediaPlaybackItem::create(source)?; if interrupt && self.is_speaking()? { self.stop()?; } self.playback_list.items()?.append(&item)?; if self.player.playback_session()?.playback_state()? != MediaPlaybackState::Playing { self.player.play()?; } let mut uid = NEXT_UTTERANCE_ID.lock().unwrap(); let utterance_id = UtteranceId::WinRT(*uid); *uid += 1; drop(uid); let mut mappings = UTTERANCE_MAPPINGS.lock().unwrap(); mappings.push((self.id, item, utterance_id)); Ok(Some(utterance_id)) } fn stop(&mut self) -> std::result::Result<(), Error> { trace!("stop()"); if !self.is_speaking()? { return Ok(()); } self.playback_list.items()?.clear()?; let mut callbacks = CALLBACKS.lock().unwrap(); let callbacks = callbacks.get_mut(&self.id).unwrap(); let mut mappings = UTTERANCE_MAPPINGS.lock().unwrap(); if let Some(callback) = callbacks.utterance_stop.as_mut() { for mapping in &*mappings { callback(mapping.2); } } mappings.retain(|v| v.0 != self.id); Ok(()) } fn min_rate(&self) -> f32 { 0.5 } fn max_rate(&self) -> f32 { 6.0 } fn normal_rate(&self) -> f32 { 1. } fn get_rate(&self) -> std::result::Result { let rate = self.synth.options()?.speaking_rate()?; Ok(rate as f32) } fn set_rate(&mut self, rate: f32) -> std::result::Result<(), Error> { self.synth.options()?.set_speaking_rate(rate.into())?; Ok(()) } fn min_pitch(&self) -> f32 { 0. } fn max_pitch(&self) -> f32 { 2. } fn normal_pitch(&self) -> f32 { 1. } fn get_pitch(&self) -> std::result::Result { let pitch = self.synth.options()?.audio_pitch()?; Ok(pitch as f32) } fn set_pitch(&mut self, pitch: f32) -> std::result::Result<(), Error> { self.synth.options()?.set_audio_pitch(pitch.into())?; Ok(()) } fn min_volume(&self) -> f32 { 0. } fn max_volume(&self) -> f32 { 1. } fn normal_volume(&self) -> f32 { 1. } fn get_volume(&self) -> std::result::Result { let volume = self.synth.options()?.audio_volume()?; Ok(volume as f32) } fn set_volume(&mut self, volume: f32) -> std::result::Result<(), Error> { self.synth.options()?.set_audio_volume(volume.into())?; Ok(()) } fn is_speaking(&self) -> std::result::Result { let item_count = self.playback_list.items()?.size()?; let state = self.player.playback_session()?.playback_state()?; let playing = state == MediaPlaybackState::Playing; Ok(item_count != 0 || playing) } } impl Drop for WinRT { fn drop(&mut self) { let id = self.id; let mut backend_to_playback_list = BACKEND_TO_PLAYBACK_LIST.lock().unwrap(); backend_to_playback_list.remove(&id); let mut backend_to_media_player = BACKEND_TO_MEDIA_PLAYER.lock().unwrap(); backend_to_media_player.remove(&id); let mut last_spoken_utterance = LAST_SPOKEN_UTTERANCE.lock().unwrap(); last_spoken_utterance.remove(&id); let mut mappings = UTTERANCE_MAPPINGS.lock().unwrap(); mappings.retain(|v| v.0 != id); } }