tts-rs/src/backends/winrt.rs

#[cfg(windows)]
use log::{info, trace};

use tts_winrt_bindings::windows::media::core::MediaSource;
use tts_winrt_bindings::windows::media::playback::{
    MediaPlaybackItem, MediaPlaybackList, MediaPlaybackState, MediaPlayer,
};
use tts_winrt_bindings::windows::media::speech_synthesis::SpeechSynthesizer;

use crate::{Backend, Error, Features, UtteranceId};

impl From<winrt::Error> for Error {
    fn from(e: winrt::Error) -> Self {
        Error::WinRT(e)
    }
}

/// Text-to-speech backend built on the WinRT speech-synthesis and
/// media-playback bindings.
pub struct WinRT {
    /// Turns text into audio streams and carries the rate/pitch/volume options.
    synth: SpeechSynthesizer,
    /// Plays whatever is queued in `playback_list`.
    player: MediaPlayer,
    /// Queue of synthesized utterances; installed as the player's source.
    playback_list: MediaPlaybackList,
}

impl WinRT {
    pub fn new() -> std::result::Result<Self, Error> {
        info!("Initializing WinRT backend");
        let playback_list = MediaPlaybackList::new()?;
        let player = MediaPlayer::new()?;
        player.set_auto_play(true)?;
        player.set_source(&playback_list)?;
        Ok(Self {
            synth: SpeechSynthesizer::new()?,
            player: player,
            playback_list: playback_list,
        })
    }

    fn reinit_player(&mut self) -> std::result::Result<(), Error> {
        self.playback_list = MediaPlaybackList::new()?;
        self.player = MediaPlayer::new()?;
        self.player.set_auto_play(true)?;
        self.player.set_source(&self.playback_list)?;
        Ok(())
    }
}

/// [`Backend`] implementation that synthesizes text with the WinRT
/// `SpeechSynthesizer` and plays the result through a `MediaPlayer` queue.
impl Backend for WinRT {
    /// Reports every optional feature as supported.
    fn supported_features(&self) -> Features {
        Features {
            stop: true,
            rate: true,
            pitch: true,
            volume: true,
            is_speaking: true,
        }
    }

    /// Synthesizes `text`, queues it for playback, and returns the queued
    /// item as the utterance ID.
    ///
    /// When `interrupt` is set, the backend is stopped before the new
    /// utterance is synthesized.
    ///
    /// # Errors
    ///
    /// Propagates any error from the WinRT calls involved.
    fn speak(
        &mut self,
        text: &str,
        interrupt: bool,
    ) -> std::result::Result<Option<UtteranceId>, Error> {
        trace!("speak({}, {})", text, interrupt);
        if interrupt {
            self.stop()?;
        }

        // Text -> audio stream -> media source -> playback item.
        let audio = self.synth.synthesize_text_to_stream_async(text)?.get()?;
        let mime = audio.content_type()?;
        let utterance = MediaPlaybackItem::create(MediaSource::create_from_stream(audio, mime)?)?;

        // A paused player sitting on the last queued item gets a fresh
        // player/queue before the new item is appended.
        let session_state = self.player.playback_session()?.playback_state()?;
        if session_state == MediaPlaybackState::Paused {
            let current = self.playback_list.current_item_index()?;
            let queued = self.playback_list.items()?.size()?;
            // `checked_sub` yields `None` for an empty queue, so no underflow.
            if queued.checked_sub(1) == Some(current) {
                self.reinit_player()?;
            }
        }

        self.playback_list.items()?.append(&utterance)?;
        let already_speaking = self.is_speaking()?;
        if !already_speaking {
            self.player.play()?;
        }
        Ok(Some(UtteranceId::WinRT(utterance)))
    }

    /// Stops speech by replacing the player and its queue.
    fn stop(&mut self) -> std::result::Result<(), Error> {
        trace!("stop()");
        self.reinit_player()
    }

    /// Lowest speaking rate reported by this backend.
    fn min_rate(&self) -> f32 {
        0.5
    }

    /// Highest speaking rate reported by this backend.
    fn max_rate(&self) -> f32 {
        6.0
    }

    /// Default speaking rate.
    fn normal_rate(&self) -> f32 {
        1.0
    }

    /// Reads the synthesizer's current speaking rate, narrowed to `f32`.
    fn get_rate(&self) -> std::result::Result<f32, Error> {
        Ok(self.synth.options()?.speaking_rate()? as f32)
    }

    /// Writes `rate` to the synthesizer options; no range check is done here.
    fn set_rate(&mut self, rate: f32) -> std::result::Result<(), Error> {
        let options = self.synth.options()?;
        options.set_speaking_rate(f64::from(rate))?;
        Ok(())
    }

    /// Lowest pitch reported by this backend.
    fn min_pitch(&self) -> f32 {
        0.0
    }

    /// Highest pitch reported by this backend.
    fn max_pitch(&self) -> f32 {
        2.0
    }

    /// Default pitch.
    fn normal_pitch(&self) -> f32 {
        1.0
    }

    /// Reads the synthesizer's current audio pitch, narrowed to `f32`.
    fn get_pitch(&self) -> std::result::Result<f32, Error> {
        Ok(self.synth.options()?.audio_pitch()? as f32)
    }

    /// Writes `pitch` to the synthesizer options; no range check is done here.
    fn set_pitch(&mut self, pitch: f32) -> std::result::Result<(), Error> {
        let options = self.synth.options()?;
        options.set_audio_pitch(f64::from(pitch))?;
        Ok(())
    }

    /// Lowest volume reported by this backend.
    fn min_volume(&self) -> f32 {
        0.0
    }

    /// Highest volume reported by this backend.
    fn max_volume(&self) -> f32 {
        1.0
    }

    /// Default volume.
    fn normal_volume(&self) -> f32 {
        1.0
    }

    /// Reads the synthesizer's current audio volume, narrowed to `f32`.
    fn get_volume(&self) -> std::result::Result<f32, Error> {
        Ok(self.synth.options()?.audio_volume()? as f32)
    }

    /// Writes `volume` to the synthesizer options; no range check is done here.
    fn set_volume(&mut self, volume: f32) -> std::result::Result<(), Error> {
        let options = self.synth.options()?;
        options.set_audio_volume(f64::from(volume))?;
        Ok(())
    }

    /// Returns `true` while the player's session is opening or playing.
    fn is_speaking(&self) -> std::result::Result<bool, Error> {
        let session = self.player.playback_session()?;
        let current = session.playback_state()?;
        Ok(current == MediaPlaybackState::Playing || current == MediaPlaybackState::Opening)
    }
}
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected, use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`#[cfg(windows)]`
			`use log::{info, trace};`
Various WinRT refinements. * Move autogenerated code to subcrate to speed up compilation. * `is_speaking` also checks whether a source is opening, in addition to whether it is playing. * Return to using autoplay. 2020-06-17 21:46:42 +00:00
Refactor to use separate `tts_winrt_bindings` crate, and bump version. 2020-06-17 22:25:43 +00:00			`use tts_winrt_bindings::windows::media::core::MediaSource;`
			`use tts_winrt_bindings::windows::media::playback::{`
Don't close `MediaPlayer` when stopping speech, and actually support interruption. 2020-06-14 23:56:01 +00:00			`MediaPlaybackItem, MediaPlaybackList, MediaPlaybackState, MediaPlayer,`
			`};`
Refactor to use separate `tts_winrt_bindings` crate, and bump version. 2020-06-17 22:25:43 +00:00			`use tts_winrt_bindings::windows::media::speech_synthesis::SpeechSynthesizer;`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00
Make speak calls return an utterance ID, where possible. 2020-09-22 17:40:03 +00:00			`use crate::{Backend, Error, Features, UtteranceId};`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00
			`impl From<winrt::Error> for Error {`
			`fn from(e: winrt::Error) -> Self {`
			`Error::WinRT(e)`
			`}`
			`}`

			`pub struct WinRT {`
			`synth: SpeechSynthesizer,`
			`player: MediaPlayer,`
			`playback_list: MediaPlaybackList,`
			`}`

			`impl WinRT {`
			`pub fn new() -> std::result::Result<Self, Error> {`
			`info!("Initializing WinRT backend");`
Under WinRT, recreate player completely when interruption is requested. 2020-07-06 17:52:18 +00:00			`let playback_list = MediaPlaybackList::new()?;`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`let player = MediaPlayer::new()?;`
Various WinRT refinements. * Move autogenerated code to subcrate to speed up compilation. * `is_speaking` also checks whether a source is opening, in addition to whether it is playing. * Return to using autoplay. 2020-06-17 21:46:42 +00:00			`player.set_auto_play(true)?;`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`player.set_source(&playback_list)?;`
			`Ok(Self {`
			`synth: SpeechSynthesizer::new()?,`
			`player: player,`
			`playback_list: playback_list,`
			`})`
			`}`
Under WinRT, recreate player completely when interruption is requested. 2020-07-06 17:52:18 +00:00
			`fn reinit_player(&mut self) -> std::result::Result<(), Error> {`
			`self.playback_list = MediaPlaybackList::new()?;`
			`self.player = MediaPlayer::new()?;`
			`self.player.set_auto_play(true)?;`
			`self.player.set_source(&self.playback_list)?;`
			`Ok(())`
			`}`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`}`

			`impl Backend for WinRT {`
			`fn supported_features(&self) -> Features {`
			`Features {`
			`stop: true,`
Clean up speech synthesis properties, and implement everything for WinRT. I'd previously attempted to normalize everything to `u8`, but this had some drawbacks: * It failed to account for some synthesis drivers defining normal as mid-range, while most define it very low. * It didn't track the normal value for a given synthesizer. * There was no clean way to map a curve between the minimum, normal, and maximum rates. Here we track the minimum, normal, and maximum values of rate, pitch, and volume. Sanity checks are done on set. Also, as a further proof-of-concept, all properties are now implemented for the WinRT driver. 2020-05-18 23:12:59 +00:00			`rate: true,`
			`pitch: true,`
			`volume: true,`
Correctly indicate that WinRT supports detection of speaking. 2020-07-07 14:08:44 +00:00			`is_speaking: true,`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`}`
			`}`

Make speak calls return an utterance ID, where possible. 2020-09-22 17:40:03 +00:00			`fn speak(`
			`&mut self,`
			`text: &str,`
			`interrupt: bool,`
			`) -> std::result::Result<Option<UtteranceId>, Error> {`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`trace!("speak({}, {})", text, interrupt);`
Don't close `MediaPlayer` when stopping speech, and actually support interruption. 2020-06-14 23:56:01 +00:00			`if interrupt {`
			`self.stop()?;`
			`}`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`let stream = self.synth.synthesize_text_to_stream_async(text)?.get()?;`
			`let content_type = stream.content_type()?;`
			`let source = MediaSource::create_from_stream(stream, content_type)?;`
			`let item = MediaPlaybackItem::create(source)?;`
Handle corner case where WinRT speech that doesn't interrupt, and is played after a delay, causes recently-spoken utterances to replay. `MediaPlayer` only seems to have states for playing and paused, but not stopped. Further, playing when the queue is finished seems to restart playback from the beginning. Here we clear the list of items to play if the player is paused and we're on the last item. We assume we're done with all items to speak, and clear the list before appending a new item and beginning playback again. The correct solution is probably to investigate how events work in winrt-rs, but callbacks and Rust have always been a disaster when I've tried them, so I'm hesitant. This does seem to handle the basic scenarios I've thrown at it. 2020-06-17 23:54:34 +00:00			`let state = self.player.playback_session()?.playback_state()?;`
			`if state == MediaPlaybackState::Paused {`
			`let index = self.playback_list.current_item_index()?;`
			`let total = self.playback_list.items()?.size()?;`
Sanity-check value to prevent overflow. 2020-07-06 17:14:50 +00:00			`if total != 0 && index == total - 1 {`
Under WinRT, recreate player completely when interruption is requested. 2020-07-06 17:52:18 +00:00			`self.reinit_player()?;`
Handle corner case where WinRT speech that doesn't interrupt, and is played after a delay, causes recently-spoken utterances to replay. `MediaPlayer` only seems to have states for playing and paused, but not stopped. Further, playing when the queue is finished seems to restart playback from the beginning. Here we clear the list of items to play if the player is paused and we're on the last item. We assume we're done with all items to speak, and clear the list before appending a new item and beginning playback again. The correct solution is probably to investigate how events work in winrt-rs, but callbacks and Rust have always been a disaster when I've tried them, so I'm hesitant. This does seem to handle the basic scenarios I've thrown at it. 2020-06-17 23:54:34 +00:00			`}`
			`}`
Switch to using MediaPlaybackItem as WinRT utterance ID. 2020-09-22 19:51:59 +00:00			`self.playback_list.items()?.append(&item)?;`
Ensure that `MediaPlayer` for speech is playing. 2020-06-15 00:42:48 +00:00			`if !self.is_speaking()? {`
			`self.player.play()?;`
			`}`
Switch to using MediaPlaybackItem as WinRT utterance ID. 2020-09-22 19:51:59 +00:00			`Ok(Some(UtteranceId::WinRT(item)))`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`}`

Under WinRT, recreate player completely when interruption is requested. 2020-07-06 17:52:18 +00:00			`fn stop(&mut self) -> std::result::Result<(), Error> {`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`trace!("stop()");`
Under WinRT, recreate player completely when interruption is requested. 2020-07-06 17:52:18 +00:00			`self.reinit_player()?;`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`Ok(())`
			`}`

Clean up speech synthesis properties, and implement everything for WinRT. I'd previously attempted to normalize everything to `u8`, but this had some drawbacks: * It failed to account for some synthesis drivers defining normal as mid-range, while most define it very low. * It didn't track the normal value for a given synthesizer. * There was no clean way to map a curve between the minimum, normal, and maximum rates. Here we track the minimum, normal, and maximum values of rate, pitch, and volume. Sanity checks are done on set. Also, as a further proof-of-concept, all properties are now implemented for the WinRT driver. 2020-05-18 23:12:59 +00:00			`fn min_rate(&self) -> f32 {`
			`0.5`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`}`

Clean up speech synthesis properties, and implement everything for WinRT. I'd previously attempted to normalize everything to `u8`, but this had some drawbacks: * It failed to account for some synthesis drivers defining normal as mid-range, while most define it very low. * It didn't track the normal value for a given synthesizer. * There was no clean way to map a curve between the minimum, normal, and maximum rates. Here we track the minimum, normal, and maximum values of rate, pitch, and volume. Sanity checks are done on set. Also, as a further proof-of-concept, all properties are now implemented for the WinRT driver. 2020-05-18 23:12:59 +00:00			`fn max_rate(&self) -> f32 {`
			`6.0`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`}`

Clean up speech synthesis properties, and implement everything for WinRT. I'd previously attempted to normalize everything to `u8`, but this had some drawbacks: * It failed to account for some synthesis drivers defining normal as mid-range, while most define it very low. * It didn't track the normal value for a given synthesizer. * There was no clean way to map a curve between the minimum, normal, and maximum rates. Here we track the minimum, normal, and maximum values of rate, pitch, and volume. Sanity checks are done on set. Also, as a further proof-of-concept, all properties are now implemented for the WinRT driver. 2020-05-18 23:12:59 +00:00			`fn normal_rate(&self) -> f32 {`
			`1.`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`}`

Clean up speech synthesis properties, and implement everything for WinRT. I'd previously attempted to normalize everything to `u8`, but this had some drawbacks: * It failed to account for some synthesis drivers defining normal as mid-range, while most define it very low. * It didn't track the normal value for a given synthesizer. * There was no clean way to map a curve between the minimum, normal, and maximum rates. Here we track the minimum, normal, and maximum values of rate, pitch, and volume. Sanity checks are done on set. Also, as a further proof-of-concept, all properties are now implemented for the WinRT driver. 2020-05-18 23:12:59 +00:00			`fn get_rate(&self) -> std::result::Result<f32, Error> {`
			`let rate = self.synth.options()?.speaking_rate()?;`
			`Ok(rate as f32)`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`}`

Clean up speech synthesis properties, and implement everything for WinRT. I'd previously attempted to normalize everything to `u8`, but this had some drawbacks: * It failed to account for some synthesis drivers defining normal as mid-range, while most define it very low. * It didn't track the normal value for a given synthesizer. * There was no clean way to map a curve between the minimum, normal, and maximum rates. Here we track the minimum, normal, and maximum values of rate, pitch, and volume. Sanity checks are done on set. Also, as a further proof-of-concept, all properties are now implemented for the WinRT driver. 2020-05-18 23:12:59 +00:00			`fn set_rate(&mut self, rate: f32) -> std::result::Result<(), Error> {`
			`self.synth.options()?.set_speaking_rate(rate.into())?;`
			`Ok(())`
			`}`

			`fn min_pitch(&self) -> f32 {`
			`0.`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`}`

Clean up speech synthesis properties, and implement everything for WinRT. I'd previously attempted to normalize everything to `u8`, but this had some drawbacks: * It failed to account for some synthesis drivers defining normal as mid-range, while most define it very low. * It didn't track the normal value for a given synthesizer. * There was no clean way to map a curve between the minimum, normal, and maximum rates. Here we track the minimum, normal, and maximum values of rate, pitch, and volume. Sanity checks are done on set. Also, as a further proof-of-concept, all properties are now implemented for the WinRT driver. 2020-05-18 23:12:59 +00:00			`fn max_pitch(&self) -> f32 {`
			`2.`
			`}`

			`fn normal_pitch(&self) -> f32 {`
			`1.`
			`}`

			`fn get_pitch(&self) -> std::result::Result<f32, Error> {`
			`let pitch = self.synth.options()?.audio_pitch()?;`
			`Ok(pitch as f32)`
			`}`

			`fn set_pitch(&mut self, pitch: f32) -> std::result::Result<(), Error> {`
			`self.synth.options()?.set_audio_pitch(pitch.into())?;`
			`Ok(())`
			`}`

			`fn min_volume(&self) -> f32 {`
			`0.`
			`}`

			`fn max_volume(&self) -> f32 {`
			`1.`
			`}`

			`fn normal_volume(&self) -> f32 {`
			`1.`
			`}`

			`fn get_volume(&self) -> std::result::Result<f32, Error> {`
			`let volume = self.synth.options()?.audio_volume()?;`
			`Ok(volume as f32)`
			`}`

			`fn set_volume(&mut self, volume: f32) -> std::result::Result<(), Error> {`
			`self.synth.options()?.set_audio_volume(volume.into())?;`
			`Ok(())`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`}`
Implement support for detecting when TTS is speaking. 2020-06-02 19:53:14 +00:00
Forgot we have to fully-qualify the type here. 2020-06-02 21:59:04 +00:00			`fn is_speaking(&self) -> std::result::Result<bool, Error> {`
WinRT fixes. * Use released `winrt` crate. * Implement `is_speaking`. 2020-06-09 16:00:37 +00:00			`let state = self.player.playback_session()?.playback_state()?;`
Various WinRT refinements. * Move autogenerated code to subcrate to speed up compilation. * `is_speaking` also checks whether a source is opening, in addition to whether it is playing. * Return to using autoplay. 2020-06-17 21:46:42 +00:00			`let playing = state == MediaPlaybackState::Opening || state == MediaPlaybackState::Playing;`
WinRT fixes. * Use released `winrt` crate. * Implement `is_speaking`. 2020-06-09 16:00:37 +00:00			`Ok(playing)`
Implement support for detecting when TTS is speaking. 2020-06-02 19:53:14 +00:00			`}`
Initial WinRT backend. * Add WinRT backend * Refactor to use thiserror and unify error-handling * If a screen reader is detected. use Tolk. Otherwise, use the WinRT backend. 2020-05-18 20:01:28 +00:00			`}`