From 015dbc39fa642c2c51cd7220b430f2861588773b Mon Sep 17 00:00:00 2001
From: sklett
Date: Wed, 18 Nov 2015 10:54:26 +0100
Subject: [PATCH] New messages for communicating with text-to-speech systems

---
 proto/sandbox/rst/tts/TextToSpeechCommand.proto |  58 ++++++++++++
 proto/sandbox/rst/tts/TextWithProsody.proto     | 114 ++++++++++++++++++++++++
 2 files changed, 172 insertions(+)
 create mode 100644 proto/sandbox/rst/tts/TextToSpeechCommand.proto
 create mode 100644 proto/sandbox/rst/tts/TextWithProsody.proto

diff --git a/proto/sandbox/rst/tts/TextToSpeechCommand.proto b/proto/sandbox/rst/tts/TextToSpeechCommand.proto
new file mode 100644
index 0000000..ef6c5e0
--- /dev/null
+++ b/proto/sandbox/rst/tts/TextToSpeechCommand.proto
@@ -0,0 +1,58 @@
+package rst.tts;
+
+import "rst/tts/TextWithProsody.proto";
+
+option java_outer_classname = "TextToSpeechCommandType";
+
+/**
+ * Instructions to a Text-to-Speech module regarding the production of
+ * text.
+ *
+ * @author Soeren Klett
+ * @author Birte Carlmeyer
+ */
+message TextToSpeechCommand {
+
+    /**
+     * The text to produce in case of @ref .PlaybackOption.PLAY. In all
+     * other cases this needs to be empty.
+     */
+    optional TextWithProsody text = 1;
+
+    enum PlaybackOption {
+
+        /**
+         * Produce the text given in @ref .text.
+         *
+         * TODO what happens if a previous text production was
+         * interrupted with PAUSE? Will it be replaced by this call?
+         */
+        PLAY = 0;
+
+        /**
+         * Stop the current production and discard it.
+         */
+        STOP = 1;
+
+        /**
+         * Pause the current production. This allows resuming it using
+         * @ref .RESUME.
+         */
+        PAUSE = 2;
+
+        /**
+         * Resume a previously paused text production.
+         *
+         * TODO Specify the pre-conditions (what happens if nothing was
+         * paused before?)
+         */
+        RESUME = 3;
+
+    }
+
+    /**
+     * Action to be executed by the Text-to-Speech engine.
+     */
+    optional PlaybackOption playback_option = 2 [default = PLAY];
+
+}
diff --git a/proto/sandbox/rst/tts/TextWithProsody.proto b/proto/sandbox/rst/tts/TextWithProsody.proto
new file mode 100644
index 0000000..b513eb0
--- /dev/null
+++ b/proto/sandbox/rst/tts/TextWithProsody.proto
@@ -0,0 +1,114 @@
+package rst.tts;
+
+option java_outer_classname = "TextWithProsodyType";
+
+/**
+ * Represents a text and its prosody information. Text can be
+ * anything from a few syllables to multiple sentences. The prosody is
+ * constant for the whole text.
+ *
+ * @author Soeren Klett
+ * @author Birte Carlmeyer
+ */
+message TextWithProsody {
+
+    /**
+     * The text to which the prosody information is attached.
+     *
+     * Must not be empty.
+     */
+    optional string text = 1;
+
+    /**
+     * Describes a constant prosody for an amount of text. It is assumed
+     * that there are application-specific default values for all
+     * aspects of the prosody (pitch, range, etc.). The prosody can be
+     * expressed either in relation to the baseline values or with
+     * absolute values.
+     *
+     * All aspects of the prosody are optional. In case an aspect is not
+     * defined, an executing TTS engine can decide on these aspects.
+     */
+    message Prosody {
+
+        /**
+         * Specifies the value for a prosody aspect using multiple
+         * possible formulations, which are represented by the different
+         * attributes of the message.
+         *
+         * Exactly one of the attributes needs to be set.
+         */
+        message Value {
+
+            /**
+             * Absolute value in the target unit.
+             */
+            optional float absolute = 1;
+
+            /**
+             * Offset to an application-specific default value given in
+             * the target unit.
+             */
+            optional float relative = 2;
+
+            /**
+             * Percentage of the application-specific default value.
+             *
+             * 100% equals 1.0.
+             */
+            // @constraint(value > 0)
+            optional float percentage = 3 [default = 1];
+
+        }
+
+        /**
+         * The baseline pitch for the contained words.
+         *
+         * Absolute and relative values are expressed in Hz.
+         */
+        optional Value pitch = 1;
+
+        /**
+         * The pitch range (variability) of the contained words.
+         *
+         * Absolute and relative values are expressed in Hz.
+         */
+        optional Value range = 2;
+
+        /**
+         * The desired change of volume of the contained words.
+         *
+         * Absolute and relative values are expressed in dB.
+         */
+        optional Value volume = 3;
+
+        /**
+         * A value in milliseconds for the desired time to take to read
+         * the contained words.
+         *
+         * TODO we usually use SI units. Can we change this to float in
+         * seconds?
+         */
+        // @unit(millisecond)
+        // @constraint(value >= 0)
+        optional uint32 duration = 4;
+
+        /**
+         * Relative speech rate given as a percentage of the
+         * application-specific base rate (presumably 1.0 = 100%; confirm).
+         */
+        // @constraint(value >= 0)
+        optional float rate = 5 [default = 1];
+
+    }
+
+    /**
+     * Prosody to be applied to everything contained in @ref .text.
+     *
+     * TODO since all attributes inside the prosody are optional, we
+     * can always require the instance, which conforms with the message
+     * name. Is this ok? NOTE(review): required is hard to relax later.
+     */
+    required Prosody prosody = 2;
+
+}
-- 
1.9.1