Merge branch 'develop' into feature/voices

2024-11-17 12:39:36 +00:00 · 2020-09-26 18:20:10 +02:00 · 2020-09-26 18:20:10 +02:00 · 47cbb80595
commit 47cbb80595
parent 97f1de5724 ace5d2fd1f
10 changed files with 461 additions and 85 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "tts"
-version = "0.6.3"
+version = "0.8.0"
 authors = ["Nolan Darilek <nolan@thewordnerd.info>"]
 repository = "https://github.com/ndarilek/tts-rs"
 description = "High-level Text-To-Speech (TTS) interface"
@ -9,7 +9,7 @@ exclude = ["*.cfg", "*.yml"]
 edition = "2018"

 [lib]
-crate-type = ["lib", "staticlib"]
+crate-type = ["lib", "cdylib", "staticlib"]

 [dependencies]
 lazy_static = "1"
@ -20,12 +20,12 @@ thiserror = "1"
 env_logger = "0.7"

 [target.'cfg(windows)'.dependencies]
-tolk = "0.2"
+tolk = ">= 0.2.1"
 winrt = "0.7"
 tts_winrt_bindings = { version = "0.1", path="winrt_bindings" }

 [target.'cfg(target_os = "linux")'.dependencies]
-speech-dispatcher = "0.6"
+speech-dispatcher = "0.7"

 [target.'cfg(any(target_os = "macos", target_os = "ios"))'.dependencies]
 cocoa-foundation = "0.1"
@ -35,4 +35,4 @@ objc = "0.2"

 [target.wasm32-unknown-unknown.dependencies]
 wasm-bindgen = "0.2"
-web-sys = { version = "0.3", features = ["SpeechSynthesis", "SpeechSynthesisUtterance", "Window", ] }
+web-sys = { version = "0.3", features = ["EventTarget", "SpeechSynthesis", "SpeechSynthesisEvent", "SpeechSynthesisUtterance", "Window", ] }
--- a/examples/hello_world.rs
+++ b/examples/hello_world.rs
@ -12,6 +12,18 @@ use tts::*;
 fn main() -> Result<(), Error> {
    env_logger::init();
    let mut tts = TTS::default()?;
+    let Features {
+        utterance_callbacks,
+        ..
+    } = tts.supported_features();
+    if utterance_callbacks {
+        tts.on_utterance_begin(Some(Box::new(|utterance| {
+            println!("Started speaking {:?}", utterance)
+        })))?;
+        tts.on_utterance_end(Some(Box::new(|utterance| {
+            println!("Finished speaking {:?}", utterance)
+        })))?;
+    }
    tts.speak("Hello, world.", false)?;
    let Features { rate, .. } = tts.supported_features();
    if rate {
@ -63,6 +75,8 @@ fn main() -> Result<(), Error> {
    }*/
    tts.speak("Goodbye.", false)?;
    let mut _input = String::new();
+    // The code below is only needed to make the example run on macOS, because there is no NSRunLoop in this context.
+    // It shouldn't be needed in an app or game, which almost certainly has one already.
    #[cfg(target_os = "macos")]
    {
        let run_loop: id = unsafe { NSRunLoop::currentRunLoop() };
--- a/src/backends/appkit.rs
+++ b/src/backends/appkit.rs
@ -7,12 +7,12 @@ use objc::declare::ClassDecl;
 use objc::runtime::*;
 use objc::*;

-use crate::{Backend, Error, Features};
+use crate::{Backend, BackendId, Error, Features, UtteranceId};

-pub struct AppKit(*mut Object, *mut Object);
+pub(crate) struct AppKit(*mut Object, *mut Object);

 impl AppKit {
-    pub fn new() -> Self {
+    pub(crate) fn new() -> Self {
        info!("Initializing AppKit backend");
        unsafe {
            let obj: *mut Object = msg_send![class!(NSSpeechSynthesizer), new];
@ -91,6 +91,10 @@ impl AppKit {
 }

 impl Backend for AppKit {
+    /// Always returns `None`: this backend reports no `BackendId`.
+    fn id(&self) -> Option<BackendId> {
+        None
+    }
+
    fn supported_features(&self) -> Features {
        Features {
            stop: true,
@ -101,7 +105,7 @@ impl Backend for AppKit {
        }
    }

-    fn speak(&mut self, text: &str, interrupt: bool) -> Result<(), Error> {
+    fn speak(&mut self, text: &str, interrupt: bool) -> Result<Option<UtteranceId>, Error> {
        trace!("speak({}, {})", text, interrupt);
        if interrupt {
            self.stop()?;
@ -110,7 +114,7 @@ impl Backend for AppKit {
            let str = NSString::alloc(nil).init_str(text);
            let _: () = msg_send![self.1, enqueueAndSpeak: str];
        }
-        Ok(())
+        Ok(None)
    }

    fn stop(&mut self) -> Result<(), Error> {
--- a/src/backends/av_foundation.rs
+++ b/src/backends/av_foundation.rs
@ -1,17 +1,22 @@
 #[cfg(any(target_os = "macos", target_os = "ios"))]
 #[link(name = "AVFoundation", kind = "framework")]
+use std::sync::Mutex;
+
 use cocoa_foundation::base::{id, nil};
 use cocoa_foundation::foundation::NSString;
+use lazy_static::lazy_static;
 use log::{info, trace};
-use objc::runtime::*;
-use objc::*;
+use objc::runtime::{Object, Sel};
+use objc::{class, declare::ClassDecl, msg_send, sel, sel_impl};

-use crate::{Backend, Error, Features};
+use crate::{Backend, BackendId, Error, Features, UtteranceId, CALLBACKS};

 mod voices;
 use voices::AVSpeechSynthesisVoice;

-pub struct AvFoundation {
+pub(crate) struct AvFoundation {
+    id: BackendId,
+    delegate: *mut Object,
    synth: *mut Object,
    rate: f32,
    volume: f32,
@ -19,23 +24,95 @@ pub struct AvFoundation {
    voice: AVSpeechSynthesisVoice,
 }

+lazy_static! {
+    // Source of the numeric part of `BackendId::AvFoundation`; `AvFoundation::new` reads the
+    // value under the lock and then increments it.
+    static ref NEXT_BACKEND_ID: Mutex<u64> = Mutex::new(0);
+}
+
 impl AvFoundation {
-    pub fn new() -> Self {
+    /// Creates an AVFoundation backend with its own delegate object and backend id.
+    ///
+    /// The delegate forwards `didStartSpeechUtterance` / `didFinishSpeechUtterance` to the
+    /// `utterance_begin` / `utterance_end` handlers stored in `CALLBACKS` under this backend's
+    /// id. Rate starts at `0.5`, volume and pitch at `1.`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the delegate class cannot be declared, if the delegate object is null, or if
+    /// a lock is poisoned.
+    pub(crate) fn new() -> Self {
        info!("Initializing AVFoundation backend");
+
+        // Delegate method: reports the start of `utterance` to the registered handler, if any.
+        extern "C" fn speech_synthesizer_did_start_speech_utterance(
+            this: &Object,
+            _: Sel,
+            _synth: *const Object,
+            utterance: id,
+        ) {
+            unsafe {
+                let backend_id: u64 = *this.get_ivar("backend_id");
+                let backend_id = BackendId::AvFoundation(backend_id);
+                let mut callbacks = CALLBACKS.lock().unwrap();
+                // `TTS::drop` removes the entry, so a late event may find nothing. Ignore it
+                // rather than panic inside a function called from Objective-C.
+                if let Some(callbacks) = callbacks.get_mut(&backend_id) {
+                    if let Some(callback) = callbacks.utterance_begin.as_mut() {
+                        callback(UtteranceId::AvFoundation(utterance));
+                    }
+                }
+            }
+        }
+
+        // Delegate method: reports the end of `utterance` to the registered handler, if any.
+        extern "C" fn speech_synthesizer_did_finish_speech_utterance(
+            this: &Object,
+            _: Sel,
+            _synth: *const Object,
+            utterance: id,
+        ) {
+            unsafe {
+                let backend_id: u64 = *this.get_ivar("backend_id");
+                let backend_id = BackendId::AvFoundation(backend_id);
+                let mut callbacks = CALLBACKS.lock().unwrap();
+                // Same tolerance for an already-removed entry as in the start handler.
+                if let Some(callbacks) = callbacks.get_mut(&backend_id) {
+                    if let Some(callback) = callbacks.utterance_end.as_mut() {
+                        callback(UtteranceId::AvFoundation(utterance));
+                    }
+                }
+            }
+        }
+
+        // Taken before the class lookup so that concurrent constructors cannot both try to
+        // declare the delegate class.
+        let mut backend_id = NEXT_BACKEND_ID.lock().unwrap();
+        // Objective-C class names are process-global and `ClassDecl::new` returns `None` for a
+        // name that is already registered, so only the first backend declares the class and
+        // later ones reuse it.
+        let delegate_class = match objc::runtime::Class::get("MyNSSpeechSynthesizerDelegate") {
+            Some(existing) => existing,
+            None => {
+                let mut decl =
+                    ClassDecl::new("MyNSSpeechSynthesizerDelegate", class!(NSObject)).unwrap();
+                decl.add_ivar::<u64>("backend_id");
+                unsafe {
+                    decl.add_method(
+                        sel!(speechSynthesizer:didStartSpeechUtterance:),
+                        speech_synthesizer_did_start_speech_utterance
+                            as extern "C" fn(&Object, Sel, *const Object, id) -> (),
+                    );
+                    decl.add_method(
+                        sel!(speechSynthesizer:didFinishSpeechUtterance:),
+                        speech_synthesizer_did_finish_speech_utterance
+                            as extern "C" fn(&Object, Sel, *const Object, id) -> (),
+                    );
+                }
+                decl.register()
+            }
+        };
+        let delegate_obj: *mut Object = unsafe { msg_send![delegate_class, new] };
-        unsafe {
+        let rv = unsafe {
            let synth: *mut Object = msg_send![class!(AVSpeechSynthesizer), new];
+            // The delegate methods read this ivar to find the matching `CALLBACKS` entry.
+            delegate_obj
+                .as_mut()
+                .unwrap()
+                .set_ivar("backend_id", *backend_id);
+            let _: () = msg_send![synth, setDelegate: delegate_obj];
            AvFoundation {
+                id: BackendId::AvFoundation(*backend_id),
+                delegate: delegate_obj,
                synth: synth,
                rate: 0.5,
                volume: 1.,
                pitch: 1.,
                voice: AVSpeechSynthesisVoice::default(),
            }
-        }
+        };
+        *backend_id += 1;
+        rv
    }
 }

 impl Backend for AvFoundation {
+    /// Returns the `BackendId` stored in this instance's `id` field.
+    fn id(&self) -> Option<BackendId> {
+        Some(self.id)
+    }
+
    fn supported_features(&self) -> Features {
        Features {
            stop: true,
@ -43,18 +120,23 @@ impl Backend for AvFoundation {
            pitch: true,
            volume: true,
            is_speaking: true,
            voices: true,
+            utterance_callbacks: true,
        }
    }

-    fn speak(&mut self, text: &str, interrupt: bool) -> Result<(), Error> {
+    fn speak(&mut self, text: &str, interrupt: bool) -> Result<Option<UtteranceId>, Error> {
        trace!("speak({}, {})", text, interrupt);
        if interrupt {
            self.stop()?;
        }
+        let utterance: id;
        unsafe {
            let str = NSString::alloc(nil).init_str(text);
-            let utterance: id = msg_send![class!(AVSpeechUtterance), alloc];
+            utterance = msg_send![class!(AVSpeechUtterance), alloc];
            let _: () = msg_send![utterance, initWithString: str];
            let _: () = msg_send![utterance, setRate: self.rate];
            let _: () = msg_send![utterance, setVolume: self.volume];
@ -62,7 +144,7 @@ impl Backend for AvFoundation {
            let _: () = msg_send![utterance, setVoice: self.voice];
            let _: () = msg_send![self.synth, speakUtterance: utterance];
        }
-        Ok(())
+        Ok(Some(UtteranceId::AvFoundation(utterance)))
    }

    fn stop(&mut self) -> Result<(), Error> {
@ -159,6 +241,7 @@ impl Backend for AvFoundation {
 impl Drop for AvFoundation {
    fn drop(&mut self) {
        unsafe {
+            let _: Object = msg_send![self.delegate, release];
            let _: Object = msg_send![self.synth, release];
        }
    }
--- a/src/backends/mod.rs
+++ b/src/backends/mod.rs
@ -17,16 +17,16 @@ mod appkit;
 mod av_foundation;

 #[cfg(target_os = "linux")]
-pub use self::speech_dispatcher::*;
+pub(crate) use self::speech_dispatcher::*;

 #[cfg(windows)]
-pub use self::tolk::*;
+pub(crate) use self::tolk::*;

 #[cfg(target_arch = "wasm32")]
 pub use self::web::*;

 #[cfg(target_os = "macos")]
-pub use self::appkit::*;
+pub(crate) use self::appkit::*;

 #[cfg(any(target_os = "macos", target_os = "ios"))]
-pub use self::av_foundation::*;
+pub(crate) use self::av_foundation::*;
--- a/src/backends/speech_dispatcher.rs
+++ b/src/backends/speech_dispatcher.rs
@ -1,14 +1,15 @@
 #[cfg(target_os = "linux")]
 use std::collections::HashMap;
+use std::convert::TryInto;
 use std::sync::Mutex;

 use lazy_static::*;
 use log::{info, trace};
 use speech_dispatcher::*;

-use crate::{Backend, Error, Features};
+use crate::{Backend, BackendId, Error, Features, UtteranceId, CALLBACKS};

-pub struct SpeechDispatcher(Connection);
+pub(crate) struct SpeechDispatcher(Connection);

 lazy_static! {
    static ref SPEAKING: Mutex<HashMap<u64, bool>> = {
@ -18,37 +19,55 @@ lazy_static! {
 }

 impl SpeechDispatcher {
-    pub fn new() -> Self {
+    /// Opens a threaded Speech Dispatcher connection and installs its event handlers.
+    ///
+    /// Begin and resume events mark the client as speaking in `SPEAKING`; end, cancel and
+    /// pause events mark it as not speaking. Begin and end events additionally invoke the
+    /// `utterance_begin` / `utterance_end` handlers registered in `CALLBACKS` for this client.
+    pub(crate) fn new() -> Self {
        info!("Initializing SpeechDispatcher backend");
        let connection = speech_dispatcher::Connection::open("tts", "tts", "tts", Mode::Threaded);
        let sd = SpeechDispatcher(connection);
        let mut speaking = SPEAKING.lock().unwrap();
        speaking.insert(sd.0.client_id(), false);
-        sd.0.on_begin(Some(|_msg_id, client_id| {
+        sd.0.on_begin(Some(Box::new(|msg_id, client_id| {
            let mut speaking = SPEAKING.lock().unwrap();
            speaking.insert(client_id, true);
-        }));
-        sd.0.on_end(Some(|_msg_id, client_id| {
+            let mut callbacks = CALLBACKS.lock().unwrap();
+            let backend_id = BackendId::SpeechDispatcher(client_id);
+            // `TTS::drop` removes the entry, so an event arriving afterwards may find nothing.
+            // Skip it instead of panicking in the event handler.
+            if let Some(cb) = callbacks.get_mut(&backend_id) {
+                if let Some(f) = cb.utterance_begin.as_mut() {
+                    f(UtteranceId::SpeechDispatcher(msg_id));
+                }
+            }
+        })));
+        sd.0.on_end(Some(Box::new(|msg_id, client_id| {
            let mut speaking = SPEAKING.lock().unwrap();
            speaking.insert(client_id, false);
-        }));
-        sd.0.on_cancel(Some(|_msg_id, client_id| {
+            let mut callbacks = CALLBACKS.lock().unwrap();
+            let backend_id = BackendId::SpeechDispatcher(client_id);
+            // Same tolerance for an already-removed entry as in the begin handler.
+            if let Some(cb) = callbacks.get_mut(&backend_id) {
+                if let Some(f) = cb.utterance_end.as_mut() {
+                    f(UtteranceId::SpeechDispatcher(msg_id));
+                }
+            }
+        })));
+        sd.0.on_cancel(Some(Box::new(|_msg_id, client_id| {
            let mut speaking = SPEAKING.lock().unwrap();
            speaking.insert(client_id, false);
-        }));
-        sd.0.on_pause(Some(|_msg_id, client_id| {
+        })));
+        sd.0.on_pause(Some(Box::new(|_msg_id, client_id| {
            let mut speaking = SPEAKING.lock().unwrap();
            speaking.insert(client_id, false);
-        }));
-        sd.0.on_resume(Some(|_msg_id, client_id| {
+        })));
+        sd.0.on_resume(Some(Box::new(|_msg_id, client_id| {
            let mut speaking = SPEAKING.lock().unwrap();
            speaking.insert(client_id, true);
-        }));
+        })));
        sd
    }
 }

 impl Backend for SpeechDispatcher {
+    /// Builds the backend id from the connection's client id.
+    fn id(&self) -> Option<BackendId> {
+        Some(BackendId::SpeechDispatcher(self.0.client_id()))
+    }
+
    fn supported_features(&self) -> Features {
        Features {
            stop: true,
@ -56,10 +75,11 @@ impl Backend for SpeechDispatcher {
            pitch: true,
            volume: true,
            is_speaking: true,
+            utterance_callbacks: true,
        }
    }

-    fn speak(&mut self, text: &str, interrupt: bool) -> Result<(), Error> {
+    fn speak(&mut self, text: &str, interrupt: bool) -> Result<Option<UtteranceId>, Error> {
        trace!("speak({}, {})", text, interrupt);
        if interrupt {
            self.stop()?;
@ -68,11 +88,15 @@ impl Backend for SpeechDispatcher {
        if single_char {
            self.0.set_punctuation(Punctuation::All);
        }
-        self.0.say(Priority::Important, text);
+        let id = self.0.say(Priority::Important, text);
        if single_char {
            self.0.set_punctuation(Punctuation::None);
        }
-        Ok(())
+        if let Some(id) = id {
+            Ok(Some(UtteranceId::SpeechDispatcher(id.try_into().unwrap())))
+        } else {
+            Err(Error::NoneError)
+        }
    }

    fn stop(&mut self) -> Result<(), Error> {
--- a/src/backends/tolk.rs
+++ b/src/backends/tolk.rs
@ -2,12 +2,12 @@
 use log::{info, trace};
 use tolk::Tolk as TolkPtr;

-use crate::{Backend, Error, Features};
+use crate::{Backend, BackendId, Error, Features, UtteranceId};

-pub struct Tolk(TolkPtr);
+pub(crate) struct Tolk(TolkPtr);

 impl Tolk {
-    pub fn new() -> Option<Self> {
+    pub(crate) fn new() -> Option<Self> {
        info!("Initializing Tolk backend");
        let tolk = TolkPtr::new();
        if tolk.detect_screen_reader().is_some() {
@ -19,6 +19,10 @@ impl Tolk {
 }

 impl Backend for Tolk {
+    /// Always returns `None`: this backend reports no `BackendId`.
+    fn id(&self) -> Option<BackendId> {
+        None
+    }
+
    fn supported_features(&self) -> Features {
        Features {
            stop: true,
@ -26,28 +30,10 @@ impl Backend for Tolk {
        }
    }

-    fn speak(&mut self, text: &str, interrupt: bool) -> Result<(), Error> {
+    /// Passes `text` and `interrupt` straight to the wrapped Tolk handle and returns
+    /// `Ok(None)`, i.e. no utterance id is reported.
+    fn speak(&mut self, text: &str, interrupt: bool) -> Result<Option<UtteranceId>, Error> {
        trace!("speak({}, {})", text, interrupt);
-        const BUFFER_LENGTH: usize = 300;
-        if text.len() <= BUFFER_LENGTH {
-            self.0.speak(text, interrupt);
-        } else {
-            if interrupt {
-                self.stop()?;
-            }
-            let tokens = text.split_whitespace();
-            let mut buffer = String::new();
-            for token in tokens {
-                if buffer.len() + token.len() > BUFFER_LENGTH {
-                    self.0.speak(buffer, false);
-                    buffer = String::new();
-                } else {
-                    buffer.push_str(token);
-                    buffer.push(' ');
-                }
-            }
-        }
-        Ok(())
+        self.0.speak(text, interrupt);
+        Ok(None)
    }

    fn stop(&mut self) -> Result<(), Error> {
--- a/src/backends/web.rs
+++ b/src/backends/web.rs
@ -1,27 +1,45 @@
 #[cfg(target_arch = "wasm32")]
-use log::{info, trace};
-use web_sys::SpeechSynthesisUtterance;
+use std::sync::Mutex;

-use crate::{Backend, Error, Features};
+use lazy_static::lazy_static;
+use log::{info, trace};
+use wasm_bindgen::prelude::*;
+use wasm_bindgen::JsCast;
+use web_sys::{SpeechSynthesisEvent, SpeechSynthesisUtterance};
+
+use crate::{Backend, BackendId, Error, Features, UtteranceId, CALLBACKS};

+/// Backend state: an id plus the rate, pitch and volume that `speak` applies to each new
+/// utterance.
 pub struct Web {
+    // Assigned in `Web::new` from `NEXT_BACKEND_ID`.
+    id: BackendId,
    rate: f32,
    pitch: f32,
    volume: f32,
 }

+lazy_static! {
+    // Source of the numeric part of `BackendId::Web`; `Web::new` reads the value under the
+    // lock and then increments it.
+    static ref NEXT_BACKEND_ID: Mutex<u64> = Mutex::new(0);
+}
+
 impl Web {
+    /// Creates a backend with rate, pitch and volume set to `1.` and the next free
+    /// `BackendId::Web`. Always returns `Ok`; panics only if the id counter's lock is poisoned.
    pub fn new() -> Result<Self, Error> {
        info!("Initializing Web backend");
-        Ok(Web {
+        let mut backend_id = NEXT_BACKEND_ID.lock().unwrap();
+        let rv = Web {
+            id: BackendId::Web(*backend_id),
            rate: 1.,
            pitch: 1.,
            volume: 1.,
-        })
+        };
+        // Advance the counter so the next instance gets a different id.
+        *backend_id += 1;
+        Ok(rv)
    }
 }

 impl Backend for Web {
+    /// Returns the `BackendId` stored in this instance's `id` field.
+    fn id(&self) -> Option<BackendId> {
+        Some(self.id)
+    }
+
    fn supported_features(&self) -> Features {
        Features {
            stop: true,
@ -29,23 +47,46 @@ impl Backend for Web {
            pitch: true,
            volume: true,
            is_speaking: true,
+            utterance_callbacks: true,
        }
    }

-    fn speak(&mut self, text: &str, interrupt: bool) -> Result<(), Error> {
+    /// Speaks `text` with the current rate, pitch and volume, stopping current speech first
+    /// when `interrupt` is set.
+    ///
+    /// The utterance's `start` and `end` events are forwarded to the `utterance_begin` /
+    /// `utterance_end` handlers registered in `CALLBACKS` for this backend.
+    ///
+    /// Returns the utterance wrapped in `UtteranceId::Web`, or `Error::NoneError` when there
+    /// is no `window`.
+    fn speak(&mut self, text: &str, interrupt: bool) -> Result<Option<UtteranceId>, Error> {
        trace!("speak({}, {})", text, interrupt);
        let utterance = SpeechSynthesisUtterance::new_with_text(text).unwrap();
        utterance.set_rate(self.rate);
        utterance.set_pitch(self.pitch);
        utterance.set_volume(self.volume);
+        let id = self.id;
+        let utterance_id = UtteranceId::Web(utterance.clone());
+        let on_start = Closure::wrap(Box::new(move |evt: SpeechSynthesisEvent| {
+            let mut callbacks = CALLBACKS.lock().unwrap();
+            // The entry is removed when the owning `TTS` is dropped; ignore late events.
+            if let Some(f) = callbacks
+                .get_mut(&id)
+                .and_then(|cb| cb.utterance_begin.as_mut())
+            {
+                f(UtteranceId::Web(evt.utterance()));
+            }
+        }) as Box<dyn Fn(_)>);
+        utterance.set_onstart(Some(on_start.as_ref().unchecked_ref()));
+        // Dropping a `Closure` invalidates the JS function handed to the browser, and the
+        // event fires only after this method has returned. Leak it so the handler stays valid.
+        on_start.forget();
+        let on_end = Closure::wrap(Box::new(move |evt: SpeechSynthesisEvent| {
+            let mut callbacks = CALLBACKS.lock().unwrap();
+            if let Some(f) = callbacks
+                .get_mut(&id)
+                .and_then(|cb| cb.utterance_end.as_mut())
+            {
+                f(UtteranceId::Web(evt.utterance()));
+            }
+        }) as Box<dyn Fn(_)>);
+        utterance.set_onend(Some(on_end.as_ref().unchecked_ref()));
+        // Leaked for the same reason as `on_start`.
+        on_end.forget();
        if interrupt {
            self.stop()?;
        }
        if let Some(window) = web_sys::window() {
            let speech_synthesis = window.speech_synthesis().unwrap();
            speech_synthesis.speak(&utterance);
+            Ok(Some(utterance_id))
+        } else {
+            Err(Error::NoneError)
        }
-        Ok(())
    }

    fn stop(&mut self) -> Result<(), Error> {
--- a/src/backends/winrt.rs
+++ b/src/backends/winrt.rs
@ -1,13 +1,19 @@
 #[cfg(windows)]
-use log::{info, trace};
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+use lazy_static::lazy_static;
+use log::{info, trace};
+use winrt::ComInterface;

-use tts_winrt_bindings::windows::media::core::MediaSource;
 use tts_winrt_bindings::windows::media::playback::{
-    MediaPlaybackItem, MediaPlaybackList, MediaPlaybackState, MediaPlayer,
+    CurrentMediaPlaybackItemChangedEventArgs, MediaPlaybackItem, MediaPlaybackList,
+    MediaPlaybackState, MediaPlayer,
 };
 use tts_winrt_bindings::windows::media::speech_synthesis::SpeechSynthesizer;
+use tts_winrt_bindings::windows::{foundation::TypedEventHandler, media::core::MediaSource};

-use crate::{Backend, Error, Features};
+use crate::{Backend, BackendId, Error, Features, UtteranceId, CALLBACKS};

 impl From<winrt::Error> for Error {
    fn from(e: winrt::Error) -> Self {
@ -16,11 +22,28 @@ impl From<winrt::Error> for Error {
 }

 pub struct WinRT {
+    id: BackendId,
    synth: SpeechSynthesizer,
    player: MediaPlayer,
    playback_list: MediaPlaybackList,
 }

+lazy_static! {
+    // Source of the numeric part of `BackendId::WinRT`.
+    static ref NEXT_BACKEND_ID: Mutex<u64> = Mutex::new(0);
+    // Lets the `media_ended` handler find the backend id from the sending player.
+    static ref BACKEND_TO_MEDIA_PLAYER: Mutex<HashMap<BackendId, MediaPlayer>> =
+        Mutex::new(HashMap::new());
+    // Lets the `current_item_changed` handler find the backend id from the sending list.
+    static ref BACKEND_TO_PLAYBACK_LIST: Mutex<HashMap<BackendId, MediaPlaybackList>> =
+        Mutex::new(HashMap::new());
+    // Most recent utterance that became current for each backend.
+    static ref LAST_SPOKEN_UTTERANCE: Mutex<HashMap<BackendId, UtteranceId>> =
+        Mutex::new(HashMap::new());
+}
+
 impl WinRT {
    pub fn new() -> std::result::Result<Self, Error> {
        info!("Initializing WinRT backend");
@ -28,11 +51,17 @@ impl WinRT {
        let player = MediaPlayer::new()?;
        player.set_auto_play(true)?;
        player.set_source(&playback_list)?;
-        Ok(Self {
+        let mut backend_id = NEXT_BACKEND_ID.lock().unwrap();
+        let bid = BackendId::WinRT(*backend_id);
+        let mut rv = Self {
+            id: bid,
            synth: SpeechSynthesizer::new()?,
            player: player,
            playback_list: playback_list,
-        })
+        };
+        *backend_id += 1;
+        Self::init_callbacks(&mut rv)?;
+        Ok(rv)
    }

    fn reinit_player(&mut self) -> std::result::Result<(), Error> {
@ -40,11 +69,70 @@ impl WinRT {
        self.player = MediaPlayer::new()?;
        self.player.set_auto_play(true)?;
        self.player.set_source(&self.playback_list)?;
+        self.init_callbacks()?;
+        Ok(())
+    }
+
+    fn init_callbacks(&mut self) -> Result<(), winrt::Error> {
+        let id = self.id().unwrap();
+        let mut backend_to_media_player = BACKEND_TO_MEDIA_PLAYER.lock().unwrap();
+        backend_to_media_player.insert(id, self.player.clone());
+        self.player
+            .media_ended(TypedEventHandler::new(|sender, _args| {
+                let backend_to_media_player = BACKEND_TO_MEDIA_PLAYER.lock().unwrap();
+                let id = backend_to_media_player.iter().find(|v| v.1 == sender);
+                if let Some(id) = id {
+                    let id = id.0;
+                    let mut callbacks = CALLBACKS.lock().unwrap();
+                    let callbacks = callbacks.get_mut(&id).unwrap();
+                    if let Some(callback) = callbacks.utterance_end.as_mut() {
+                        let last_spoken_utterance = LAST_SPOKEN_UTTERANCE.lock().unwrap();
+                        if let Some(utterance_id) = last_spoken_utterance.get(&id) {
+                            callback(utterance_id.clone());
+                        }
+                    }
+                }
+                Ok(())
+            }))?;
+        let mut backend_to_playback_list = BACKEND_TO_PLAYBACK_LIST.lock().unwrap();
+        backend_to_playback_list.insert(id, self.playback_list.clone());
+        self.playback_list
+            .current_item_changed(TypedEventHandler::new(
+                |sender: &MediaPlaybackList, args: &CurrentMediaPlaybackItemChangedEventArgs| {
+                    let backend_to_playback_list = BACKEND_TO_PLAYBACK_LIST.lock().unwrap();
+                    let id = backend_to_playback_list.iter().find(|v| v.1 == sender);
+                    if let Some(id) = id {
+                        let id = id.0;
+                        let mut callbacks = CALLBACKS.lock().unwrap();
+                        let callbacks = callbacks.get_mut(&id).unwrap();
+                        let old_item = args.old_item()?;
+                        if !old_item.is_null() {
+                            if let Some(callback) = callbacks.utterance_end.as_mut() {
+                                callback(UtteranceId::WinRT(old_item));
+                            }
+                        }
+                        let new_item = args.new_item()?;
+                        if !new_item.is_null() {
+                            let mut last_spoken_utterance = LAST_SPOKEN_UTTERANCE.lock().unwrap();
+                            let utterance_id = UtteranceId::WinRT(new_item);
+                            last_spoken_utterance.insert(*id, utterance_id.clone());
+                            if let Some(callback) = callbacks.utterance_begin.as_mut() {
+                                callback(utterance_id);
+                            }
+                        }
+                    }
+                    Ok(())
+                },
+            ))?;
        Ok(())
    }
 }

 impl Backend for WinRT {
+    /// Returns the `BackendId` stored in this instance's `id` field.
+    fn id(&self) -> Option<BackendId> {
+        Some(self.id)
+    }
+
    fn supported_features(&self) -> Features {
        Features {
            stop: true,
@ -52,10 +140,15 @@ impl Backend for WinRT {
            pitch: true,
            volume: true,
            is_speaking: true,
+            utterance_callbacks: true,
        }
    }

-    fn speak(&mut self, text: &str, interrupt: bool) -> std::result::Result<(), Error> {
+    fn speak(
+        &mut self,
+        text: &str,
+        interrupt: bool,
+    ) -> std::result::Result<Option<UtteranceId>, Error> {
        trace!("speak({}, {})", text, interrupt);
        if interrupt {
            self.stop()?;
@ -72,11 +165,12 @@ impl Backend for WinRT {
                self.reinit_player()?;
            }
        }
-        self.playback_list.items()?.append(item)?;
+        self.playback_list.items()?.append(&item)?;
        if !self.is_speaking()? {
            self.player.play()?;
        }
-        Ok(())
+        let utterance_id = UtteranceId::WinRT(item);
+        Ok(Some(utterance_id))
    }

    fn stop(&mut self) -> std::result::Result<(), Error> {
@ -169,3 +263,15 @@ impl Backend for WinRT {
        unimplemented!()
    }
 }
+
+impl Drop for WinRT {
+    /// Removes this backend's entries from the three lookup tables.
+    fn drop(&mut self) {
+        let id = self.id;
+        // Each lock is released before the next is taken. `init_callbacks` locks
+        // `BACKEND_TO_MEDIA_PLAYER` before `BACKEND_TO_PLAYBACK_LIST`, so holding them together
+        // in the opposite order here could deadlock against another instance initializing.
+        BACKEND_TO_PLAYBACK_LIST.lock().unwrap().remove(&id);
+        BACKEND_TO_MEDIA_PLAYER.lock().unwrap().remove(&id);
+        LAST_SPOKEN_UTTERANCE.lock().unwrap().remove(&id);
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -12,16 +12,24 @@
 */

 use std::boxed::Box;
+use std::collections::HashMap;
 #[cfg(target_os = "macos")]
 use std::ffi::CStr;
+use std::sync::Mutex;

-#[cfg(target_os = "macos")]
+#[cfg(any(target_os = "macos", target_os = "ios"))]
 use cocoa_foundation::base::id;
+use lazy_static::lazy_static;
 #[cfg(target_os = "macos")]
 use libc::c_char;
 #[cfg(target_os = "macos")]
 use objc::{class, msg_send, sel, sel_impl};
 use thiserror::Error;
+#[cfg(target_arch = "wasm32")]
+use web_sys::SpeechSynthesisUtterance;
+
+#[cfg(windows)]
+use tts_winrt_bindings::windows::media::playback::MediaPlaybackItem;

 mod backends;

@ -40,6 +48,30 @@ pub enum Backends {
    AvFoundation,
 }

+/// Identifies one backend instance; used as the key of the `CALLBACKS` map.
+///
+/// Each variant exists only on the platform named by its `cfg` attribute.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub enum BackendId {
+    #[cfg(target_os = "linux")]
+    SpeechDispatcher(u64),
+    #[cfg(target_arch = "wasm32")]
+    Web(u64),
+    #[cfg(windows)]
+    WinRT(u64),
+    #[cfg(any(target_os = "macos", target_os = "ios"))]
+    AvFoundation(u64),
+}
+
+/// Identifies one utterance; returned by `speak` and passed to the utterance callbacks.
+///
+/// The payload type differs per backend, and each variant exists only on the platform named
+/// by its `cfg` attribute.
+#[derive(Clone, Debug, PartialEq)]
+pub enum UtteranceId {
+    #[cfg(target_os = "linux")]
+    SpeechDispatcher(u64),
+    #[cfg(target_arch = "wasm32")]
+    Web(SpeechSynthesisUtterance),
+    #[cfg(windows)]
+    WinRT(MediaPlaybackItem),
+    #[cfg(any(target_os = "macos", target_os = "ios"))]
+    AvFoundation(id),
+}
+
 pub struct Features {
    pub stop: bool,
    pub rate: bool,
@ -47,6 +79,7 @@ pub struct Features {
    pub volume: bool,
    pub is_speaking: bool,
    pub voices: bool,
+    pub utterance_callbacks: bool,
 }

 impl Default for Features {
@ -58,6 +91,7 @@ impl Default for Features {
            volume: false,
            is_speaking: false,
            voices: false,
+            utterance_callbacks: false,
        }
    }
 }
@ -81,8 +115,9 @@ pub enum Error {
 }

 pub trait Backend {
+    fn id(&self) -> Option<BackendId>;
    fn supported_features(&self) -> Features;
-    fn speak(&mut self, text: &str, interrupt: bool) -> Result<(), Error>;
+    fn speak(&mut self, text: &str, interrupt: bool) -> Result<Option<UtteranceId>, Error>;
    fn stop(&mut self) -> Result<(), Error>;
    fn min_rate(&self) -> f32;
    fn max_rate(&self) -> f32;
@ -105,6 +140,23 @@ pub trait Backend {
    fn set_voice(&mut self, voice: &str) -> Result<(),Error>;
 }

+/// Optional handlers for one backend instance; both default to `None`.
+#[derive(Default)]
+struct Callbacks {
+    // Called by a backend when an utterance starts.
+    utterance_begin: Option<Box<dyn FnMut(UtteranceId)>>,
+    // Called by a backend when an utterance ends.
+    utterance_end: Option<Box<dyn FnMut(UtteranceId)>>,
+}
+
+// NOTE(review): the boxed closures carry no `Send`/`Sync` bound, so these impls assert
+// thread-safety that the compiler cannot check -- confirm callers never register closures
+// that capture thread-bound state.
+unsafe impl Send for Callbacks {}
+
+unsafe impl Sync for Callbacks {}
+
+lazy_static! {
+    // Handlers per backend instance; entries are added in `TTS::new` and removed when the
+    // `TTS` is dropped.
+    static ref CALLBACKS: Mutex<HashMap<BackendId, Callbacks>> = Mutex::new(HashMap::new());
+}
+
 pub struct TTS(Box<dyn Backend>);

 unsafe impl std::marker::Send for TTS {}
@ -116,7 +168,7 @@ impl TTS {
     * Create a new `TTS` instance with the specified backend.
     */
    pub fn new(backend: Backends) -> Result<TTS, Error> {
-        match backend {
+        let backend = match backend {
            #[cfg(target_os = "linux")]
            Backends::SpeechDispatcher => Ok(TTS(Box::new(backends::SpeechDispatcher::new()))),
            #[cfg(target_arch = "wasm32")]
@ -142,6 +194,16 @@ impl TTS {
            Backends::AppKit => Ok(TTS(Box::new(backends::AppKit::new()))),
            #[cfg(any(target_os = "macos", target_os = "ios"))]
            Backends::AvFoundation => Ok(TTS(Box::new(backends::AvFoundation::new()))),
+        };
+        if backend.is_ok() {
+            let backend = backend.unwrap();
+            if let Some(id) = backend.0.id() {
+                let mut callbacks = CALLBACKS.lock().unwrap();
+                callbacks.insert(id, Callbacks::default());
+            }
+            Ok(backend)
+        } else {
+            backend
        }
    }

@ -189,9 +251,12 @@ impl TTS {
    /**
     * Speaks the specified text, optionally interrupting current speech.
+     *
+     * On success, returns the `UtteranceId` reported by the backend for the new utterance,
+     * or `None` if the backend reports none.
     */
-    pub fn speak<S: Into<String>>(&mut self, text: S, interrupt: bool) -> Result<&Self, Error> {
-        self.0.speak(text.into().as_str(), interrupt)?;
-        Ok(self)
+    pub fn speak<S: Into<String>>(
+        &mut self,
+        text: S,
+        interrupt: bool,
+    ) -> Result<Option<UtteranceId>, Error> {
+        self.0.speak(text.into().as_str(), interrupt)
    }

    /**
@ -410,4 +475,57 @@ impl TTS {
            Err(Error::UnsupportedFeature)
        }
    }
+
+    /**
+     * Sets or clears (`None`) the handler invoked when this synthesizer starts speaking an
+     * utterance.
+     *
+     * Returns `Error::UnsupportedFeature` if the backend does not report
+     * `utterance_callbacks`.
+     */
+    pub fn on_utterance_begin(
+        &self,
+        callback: Option<Box<dyn FnMut(UtteranceId)>>,
+    ) -> Result<(), Error> {
+        if !self.supported_features().utterance_callbacks {
+            return Err(Error::UnsupportedFeature);
+        }
+        let mut registry = CALLBACKS.lock().unwrap();
+        let id = self.0.id().unwrap();
+        registry.get_mut(&id).unwrap().utterance_begin = callback;
+        Ok(())
+    }
+
+    /**
+     * Sets or clears (`None`) the handler invoked when this synthesizer finishes speaking an
+     * utterance.
+     *
+     * Returns `Error::UnsupportedFeature` if the backend does not report
+     * `utterance_callbacks`.
+     */
+    pub fn on_utterance_end(
+        &self,
+        callback: Option<Box<dyn FnMut(UtteranceId)>>,
+    ) -> Result<(), Error> {
+        if !self.supported_features().utterance_callbacks {
+            return Err(Error::UnsupportedFeature);
+        }
+        let mut registry = CALLBACKS.lock().unwrap();
+        let id = self.0.id().unwrap();
+        registry.get_mut(&id).unwrap().utterance_end = callback;
+        Ok(())
+    }
+}
+
+impl Drop for TTS {
+    /// Removes this instance's entry from `CALLBACKS`, if its backend has an id.
+    fn drop(&mut self) {
+        if let Some(id) = self.0.id() {
+            CALLBACKS.lock().unwrap().remove(&id);
+        }
+    }
 }