diff options
author | bruvzg <7645683+bruvzg@users.noreply.github.com> | 2021-11-04 14:33:37 +0200 |
---|---|---|
committer | bruvzg <7645683+bruvzg@users.noreply.github.com> | 2022-04-28 14:35:41 +0300 |
commit | 6ab672d1ef7ece5c3019d46aeb98df3686f37e26 (patch) | |
tree | be10d088e90c6a9e60efef823f54f9aa0d70aa07 /platform/windows/tts_windows.cpp | |
parent | 3e1b824c050b765095285c67b3e4c8092e1f88c6 (diff) |
Implement text-to-speech support on Android, iOS, HTML5, Linux, macOS and Windows.
Implement TextServer word break method.
Diffstat (limited to 'platform/windows/tts_windows.cpp')
-rw-r--r-- | platform/windows/tts_windows.cpp | 269 |
1 files changed, 269 insertions, 0 deletions
diff --git a/platform/windows/tts_windows.cpp b/platform/windows/tts_windows.cpp new file mode 100644 index 0000000000..05249934ba --- /dev/null +++ b/platform/windows/tts_windows.cpp @@ -0,0 +1,269 @@ +/*************************************************************************/ +/* tts_windows.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#include "tts_windows.h" + +TTS_Windows *TTS_Windows::singleton = nullptr; + +void __stdcall TTS_Windows::speech_event_callback(WPARAM wParam, LPARAM lParam) { + TTS_Windows *tts = TTS_Windows::get_singleton(); + SPEVENT event; + while (tts->synth->GetEvents(1, &event, NULL) == S_OK) { + if (tts->ids.has(event.ulStreamNum)) { + if (event.eEventId == SPEI_START_INPUT_STREAM) { + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_STARTED, tts->ids[event.ulStreamNum].id); + } else if (event.eEventId == SPEI_END_INPUT_STREAM) { + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_ENDED, tts->ids[event.ulStreamNum].id); + tts->ids.erase(event.ulStreamNum); + tts->_update_tts(); + } else if (event.eEventId == SPEI_WORD_BOUNDARY) { + const Char16String &string = tts->ids[event.ulStreamNum].string; + int pos = 0; + for (int i = 0; i < MIN(event.lParam, string.length()); i++) { + char16_t c = string[i]; + if ((c & 0xfffffc00) == 0xd800) { + i++; + } + pos++; + } + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_BOUNDARY, tts->ids[event.ulStreamNum].id, pos - tts->ids[event.ulStreamNum].offset); + } + } + } +} + +void TTS_Windows::_update_tts() { + if (!is_speaking() && !paused && queue.size() > 0) { + DisplayServer::TTSUtterance &message = queue.front()->get(); + + String text; + DWORD flags = SPF_ASYNC | SPF_PURGEBEFORESPEAK | SPF_IS_XML; + String pitch_tag = String("<pitch absmiddle=\"") + String::num_int64(message.pitch * 10 - 10, 10) + String("\">"); + text = pitch_tag + message.text + String("</pitch>"); + + IEnumSpObjectTokens *cpEnum; + ISpObjectToken *cpVoiceToken; + ULONG ulCount = 0; + ULONG stream_number = 0; + ISpObjectTokenCategory *cpCategory; + HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (void **)&cpCategory); + if (SUCCEEDED(hr)) { + hr = cpCategory->SetId(SPCAT_VOICES, false); + if (SUCCEEDED(hr)) { + hr = cpCategory->EnumTokens(nullptr, nullptr, &cpEnum); + if (SUCCEEDED(hr)) { + hr = cpEnum->GetCount(&ulCount); + while (SUCCEEDED(hr) && ulCount--) { + wchar_t *w_id = 0L; + hr = cpEnum->Next(1, &cpVoiceToken, nullptr); + cpVoiceToken->GetId(&w_id); + if (String::utf16((const char16_t *)w_id) == message.voice) { + synth->SetVoice(cpVoiceToken); + cpVoiceToken->Release(); + break; + } + cpVoiceToken->Release(); + } + cpEnum->Release(); + } + } + cpCategory->Release(); + } + + UTData ut; + ut.string = text.utf16(); + ut.offset = pitch_tag.length(); // Substract injected <pitch> tag offset. + ut.id = message.id; + + synth->SetVolume(message.volume); + synth->SetRate(10.f * log10(message.rate) / log10(3.f)); + synth->Speak((LPCWSTR)ut.string.get_data(), flags, &stream_number); + + ids[stream_number] = ut; + + queue.pop_front(); + } +} + +bool TTS_Windows::is_speaking() const { + ERR_FAIL_COND_V(!synth, false); + + SPVOICESTATUS status; + synth->GetStatus(&status, nullptr); + return (status.dwRunningState == SPRS_IS_SPEAKING); +} + +bool TTS_Windows::is_paused() const { + ERR_FAIL_COND_V(!synth, false); + return paused; +} + +Array TTS_Windows::get_voices() const { + Array list; + IEnumSpObjectTokens *cpEnum; + ISpObjectToken *cpVoiceToken; + ISpDataKey *cpDataKeyAttribs; + ULONG ulCount = 0; + ISpObjectTokenCategory *cpCategory; + HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (void **)&cpCategory); + if (SUCCEEDED(hr)) { + hr = cpCategory->SetId(SPCAT_VOICES, false); + if (SUCCEEDED(hr)) { + hr = cpCategory->EnumTokens(nullptr, nullptr, &cpEnum); + if (SUCCEEDED(hr)) { + hr = cpEnum->GetCount(&ulCount); + while (SUCCEEDED(hr) && ulCount--) { + hr = cpEnum->Next(1, &cpVoiceToken, nullptr); + HRESULT hr_attr = cpVoiceToken->OpenKey(SPTOKENKEY_ATTRIBUTES, &cpDataKeyAttribs); + if (SUCCEEDED(hr_attr)) { + wchar_t *w_id = nullptr; + wchar_t *w_lang = nullptr; + wchar_t *w_name = nullptr; + cpVoiceToken->GetId(&w_id); + cpDataKeyAttribs->GetStringValue(L"Language", &w_lang); + cpDataKeyAttribs->GetStringValue(nullptr, &w_name); + LCID locale = wcstol(w_lang, nullptr, 16); + + int locale_chars = GetLocaleInfoW(locale, LOCALE_SISO639LANGNAME, nullptr, 0); + int region_chars = GetLocaleInfoW(locale, LOCALE_SISO3166CTRYNAME, nullptr, 0); + wchar_t *w_lang_code = new wchar_t[locale_chars]; + wchar_t *w_reg_code = new wchar_t[region_chars]; + GetLocaleInfoW(locale, LOCALE_SISO639LANGNAME, w_lang_code, locale_chars); + GetLocaleInfoW(locale, LOCALE_SISO3166CTRYNAME, w_reg_code, region_chars); + + Dictionary voice_d; + voice_d["id"] = String::utf16((const char16_t *)w_id); + if (w_name) { + voice_d["name"] = String::utf16((const char16_t *)w_name); + } else { + voice_d["name"] = voice_d["id"].operator String().replace("HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\", ""); + } + voice_d["language"] = String::utf16((const char16_t *)w_lang_code) + "_" + String::utf16((const char16_t *)w_reg_code); + list.push_back(voice_d); + + delete[] w_lang_code; + delete[] w_reg_code; + + cpDataKeyAttribs->Release(); + } + cpVoiceToken->Release(); + } + cpEnum->Release(); + } + } + cpCategory->Release(); + } + return list; +} + +void TTS_Windows::speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) { + ERR_FAIL_COND(!synth); + if (p_interrupt) { + stop(); + } + + if (p_text.is_empty()) { + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, p_utterance_id); + return; + } + + DisplayServer::TTSUtterance message; + message.text = p_text; + message.voice = p_voice; + message.volume = CLAMP(p_volume, 0, 100); + message.pitch = CLAMP(p_pitch, 0.f, 2.f); + message.rate = CLAMP(p_rate, 0.1f, 10.f); + message.id = p_utterance_id; + queue.push_back(message); + + if (is_paused()) { + resume(); + } else { + _update_tts(); + } +} + +void TTS_Windows::pause() { + ERR_FAIL_COND(!synth); + if (!paused) { + if (synth->Pause() == S_OK) { + paused = true; + } + } +} + +void TTS_Windows::resume() { + ERR_FAIL_COND(!synth); + synth->Resume(); + paused = false; +} + +void TTS_Windows::stop() { + ERR_FAIL_COND(!synth); + + SPVOICESTATUS status; + synth->GetStatus(&status, nullptr); + if (ids.has(status.ulCurrentStream)) { + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, ids[status.ulCurrentStream].id); + ids.erase(status.ulCurrentStream); + } + for (DisplayServer::TTSUtterance &message : queue) { + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, message.id); + } + queue.clear(); + synth->Speak(nullptr, SPF_PURGEBEFORESPEAK, nullptr); + synth->Resume(); + paused = false; +} + +TTS_Windows *TTS_Windows::get_singleton() { + return singleton; +} + +TTS_Windows::TTS_Windows() { + singleton = this; + CoInitialize(nullptr); + + if (SUCCEEDED(CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_ISpVoice, (void **)&synth))) { + ULONGLONG event_mask = SPFEI(SPEI_END_INPUT_STREAM) | SPFEI(SPEI_START_INPUT_STREAM) | SPFEI(SPEI_WORD_BOUNDARY); + synth->SetInterest(event_mask, event_mask); + synth->SetNotifyCallbackFunction(&speech_event_callback, (WPARAM)(this), 0); + print_verbose("Text-to-Speech: SAPI initialized."); + } else { + print_verbose("Text-to-Speech: Cannot initialize ISpVoice!"); + } +} + +TTS_Windows::~TTS_Windows() { + if (synth) { + synth->Release(); + } + singleton = nullptr; +} |