Add voice support to WinRT backend.

2024-11-17 10:49:38 +00:00 · 2022-03-30 20:13:27 -05:00 · 2022-03-30 20:13:27 -05:00 · b1f60811bf
commit b1f60811bf
parent 51cd84a6cd
5 changed files with 78 additions and 26 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -27,7 +27,7 @@ env_logger = "0.9"

 [target.'cfg(windows)'.dependencies]
 tolk = { version = "0.5", optional = true }
-windows = { version = "0.34", features = ["alloc", "Foundation", "Media_Core", "Media_Playback", "Media_SpeechSynthesis", "Storage_Streams"] }
+windows = { version = "0.34", features = ["alloc", "Foundation", "Foundation_Collections", "Media_Core", "Media_Playback", "Media_SpeechSynthesis", "Storage_Streams"] }

 [target.'cfg(target_os = "linux")'.dependencies]
 speech-dispatcher = { version = "0.13", default-features = false }
--- a/examples/hello_world.rs
+++ b/examples/hello_world.rs
@ -71,19 +71,22 @@ fn main() -> Result<(), Error> {
        tts.speak("This is normal volume.", false)?;
        tts.set_volume(original_volume)?;
    }
-    let Features { voices, .. } = tts.supported_features();
-    if voices {
-        let original_voice = tts.voice()?;
-        let voices_list = tts.list_voices();
+    let Features { voice, .. } = tts.supported_features();
+    if voice {
+        let voices = tts.voices()?;
        println!("Available voices:\n===");
-        for v in voices_list.iter() {
-            println!("{}", v);
-            tts.set_voice(v)?;
-            println!("voice set");
-            println!("{}", tts.voice()?);
-            tts.speak(v, false)?;
+        for v in &voices {
+            println!("{:?}", v);
+        }
+        let Features { get_voice, .. } = tts.supported_features();
+        if get_voice {
+            let original_voice = tts.voice()?;
+            for v in &voices {
+                tts.set_voice(v)?;
+                tts.speak(format!("This is {}.", v.name), false)?;
+            }
+            tts.set_voice(&original_voice)?;
        }
-        tts.set_voice(original_voice)?;
    }
    tts.speak("Goodbye.", false)?;
    let mut _input = String::new();
--- a/src/backends/speech_dispatcher.rs
+++ b/src/backends/speech_dispatcher.rs
@ -198,7 +198,7 @@ impl Backend for SpeechDispatcher {
        Ok(rv)
    }

-    fn voice(&self) -> Result<String, Error> {
+    fn voice(&self) -> Result<Voice, Error> {
        unimplemented!()
    }

--- a/src/backends/winrt.rs
+++ b/src/backends/winrt.rs
@ -1,19 +1,23 @@
 #[cfg(windows)]
-use std::collections::{HashMap, VecDeque};
-use std::sync::Mutex;
+use std::{
+    collections::{HashMap, VecDeque},
+    str::FromStr,
+    sync::Mutex,
+};

 use lazy_static::lazy_static;
 use log::{info, trace};
+use unic_langid::LanguageIdentifier;
 use windows::{
    Foundation::TypedEventHandler,
    Media::{
        Core::MediaSource,
        Playback::{MediaPlayer, MediaPlayerAudioCategory},
-        SpeechSynthesis::SpeechSynthesizer,
+        SpeechSynthesis::{SpeechSynthesizer, VoiceGender, VoiceInformation},
    },
 };

-use crate::{Backend, BackendId, Error, Features, UtteranceId, CALLBACKS};
+use crate::{Backend, BackendId, Error, Features, Gender, UtteranceId, Voice, CALLBACKS};

 impl From<windows::core::Error> for Error {
    fn from(e: windows::core::Error) -> Self {
@ -29,6 +33,7 @@ pub struct WinRt {
    rate: f32,
    pitch: f32,
    volume: f32,
+    voice: VoiceInformation,
 }

 struct Utterance {
@ -37,6 +42,7 @@ struct Utterance {
    rate: f32,
    pitch: f32,
    volume: f32,
+    voice: VoiceInformation,
 }

 lazy_static! {
@ -102,6 +108,7 @@ impl WinRt {
                                        tts.Options()?.SetSpeakingRate(utterance.rate.into())?;
                                        tts.Options()?.SetAudioPitch(utterance.pitch.into())?;
                                        tts.Options()?.SetAudioVolume(utterance.volume.into())?;
+                                        tts.SetVoice(utterance.voice.clone())?;
                                        let stream = tts
                                            .SynthesizeTextToStreamAsync(utterance.text.as_str())?
                                            .get()?;
@ -129,6 +136,7 @@ impl WinRt {
            rate: 1.,
            pitch: 1.,
            volume: 1.,
+            voice: SpeechSynthesizer::DefaultVoice()?,
        })
    }
 }
@ -145,7 +153,8 @@ impl Backend for WinRt {
            pitch: true,
            volume: true,
            is_speaking: true,
-            voices: true,
+            voice: true,
+            get_voice: true,
            utterance_callbacks: true,
        }
    }
@ -175,6 +184,7 @@ impl Backend for WinRt {
                    rate: self.rate,
                    pitch: self.pitch,
                    volume: self.volume,
+                    voice: self.voice.clone(),
                };
                utterances.push_back(utterance);
            }
@ -291,16 +301,28 @@ impl Backend for WinRt {
        Ok(!utterances.is_empty())
    }

-    fn voice(&self) -> Result<String, Error> {
-        unimplemented!()
+    fn voice(&self) -> Result<Voice, Error> {
+        // Return the voice selected through `set_voice`; `self.synth` may not reflect it yet.
+        self.voice.clone().try_into()
    }

-    fn list_voices(&self) -> Vec<String> {
-        unimplemented!()
+    fn voices(&self) -> Result<Vec<Voice>, Error> {
+        // Convert every installed voice, stopping at the first failed conversion.
+        SpeechSynthesizer::AllVoices()?
+            .into_iter()
+            .map(TryInto::<Voice>::try_into)
+            .collect()
    }

-    fn set_voice(&mut self, voice: &str) -> Result<(), Error> {
-        unimplemented!()
+    fn set_voice(&mut self, voice: &Voice) -> Result<(), Error> {
+        // Adopt the installed voice whose id matches `voice.id`; fail if none does.
+        for candidate in SpeechSynthesizer::AllVoices()? {
+            if String::try_from(candidate.Id()?)? == voice.id {
+                self.voice = candidate;
+                return Ok(());
+            }
+        }
+        Err(Error::OperationFailed)
    }
 }

@ -315,3 +337,24 @@ impl Drop for WinRt {
        utterances.remove(&id);
    }
 }
+
+impl TryInto<Voice> for VoiceInformation {
+    type Error = Error;
+    /// Builds a `Voice` from this WinRT voice's id, display name, gender and language tag.
+    fn try_into(self) -> Result<Voice, Self::Error> {
+        let gender = if self.Gender()? == VoiceGender::Male {
+            Gender::Male
+        } else {
+            Gender::Female // every value other than `Male` is reported as `Female`
+        };
+        let language: String = self.Language()?.try_into()?;
+        // Surface an unparsable language tag as an error instead of panicking.
+        let language = LanguageIdentifier::from_str(&language).map_err(|_| Error::OperationFailed)?;
+        Ok(Voice {
+            id: self.Id()?.try_into()?,
+            name: self.DisplayName()?.try_into()?,
+            gender,
+            language,
+        })
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -16,6 +16,7 @@ use std::collections::HashMap;
 #[cfg(target_os = "macos")]
 use std::ffi::CStr;
 use std::fmt;
+use std::string::FromUtf16Error;
 use std::sync::{Arc, Mutex};
 use std::{boxed::Box, sync::RwLock};

@ -200,6 +201,9 @@ pub enum Error {
    #[cfg(windows)]
    #[error("WinRT error")]
    WinRt(windows::core::Error),
+    #[cfg(windows)]
+    #[error("UTF string conversion failed")]
+    UtfStringConversionFailed(#[from] FromUtf16Error),
    #[error("Unsupported feature")]
    UnsupportedFeature,
    #[error("Out of range")]
@ -232,7 +236,7 @@ pub trait Backend: Clone {
    fn set_volume(&mut self, volume: f32) -> Result<(), Error>;
    fn is_speaking(&self) -> Result<bool, Error>;
    fn voices(&self) -> Result<Vec<Voice>, Error>;
-    fn voice(&self) -> Result<String, Error>;
+    fn voice(&self) -> Result<Voice, Error>;
    fn set_voice(&mut self, voice: &Voice) -> Result<(), Error>;
 }

@ -577,7 +581,7 @@ impl Tts {
    /**
     * Return the current speaking voice.
     */
-    pub fn voice(&self) -> Result<String, Error> {
+    pub fn voice(&self) -> Result<Voice, Error> {
        let Features { get_voice, .. } = self.supported_features();
        if get_voice {
            self.0.read().unwrap().voice()
@ -697,12 +701,14 @@ impl Drop for Tts {
    }
 }

+#[derive(Debug)]
 pub enum Gender {
    Unspecified,
    Male,
    Female,
 }

+#[derive(Debug)]
 pub struct Voice {
    pub id: String,
    pub name: String,