From bd8b63a357bb374f5875f0fedf2d677589419810 Mon Sep 17 00:00:00 2001 From: yum Date: Tue, 22 Nov 2022 15:36:19 -0800 Subject: Rework input controls Press joystick once to start recording, again to stop. When you start recording, any previous text on the board is cleared. Add 2 visual indicators: one to indicate speech, another to indicate that audio is paging. --- TaSTT.shader | 139 ++++++++++++++++++++++++++++++++++++++++++----------- generate_params.py | 6 +++ generate_utils.py | 6 +++ libtastt.py | 57 ++++++++++++++++++++++ libunity.py | 5 ++ osc_ctrl.py | 8 +++ string_matcher.py | 2 +- transcribe.py | 55 ++++++++++++--------- 8 files changed, 225 insertions(+), 53 deletions(-) diff --git a/TaSTT.shader b/TaSTT.shader index b722d63..272f08b 100644 --- a/TaSTT.shader +++ b/TaSTT.shader @@ -10,6 +10,11 @@ _Font_0xA000_0xBFFF ("Font 5 (unicode 0xA000 - 0xBFFFF)", 2D) = "white" {} _Font_0xC000_0xDFFF ("Font 6 (unicode 0xC000 - 0xDFFFF)", 2D) = "white" {} + TaSTT_Backplate("TaSTT_Backplate", 2D) = "black" {} + + TaSTT_Indicator_0("TaSTT_Indicator_0", float) = 0 + TaSTT_Indicator_1("TaSTT_Indicator_1", float) = 0 + // software "engineering" LULW _Letter_Row00_Col00_Byte0("_Letter_Row00_Col00_Byte0", float) = 0 _Letter_Row00_Col01_Byte0("_Letter_Row00_Col01_Byte0", float) = 0 @@ -403,6 +408,15 @@ Texture2D _Font_0xA000_0xBFFF; Texture2D _Font_0xC000_0xDFFF; + float TaSTT_Indicator_0; + static const fixed4 TaSTT_Indicator_0_Off_Color = fixed4(0.0, 1.0, 0.0, 2) * 0.7; + static const fixed4 TaSTT_Indicator_0_On_Color = fixed4(0.8, 0.2, 0.0, 2) * 0.9; + float TaSTT_Indicator_1; + static const fixed4 TaSTT_Indicator_1_Off_Color = fixed4(0.0, 1.0, 0.0, 2) * 0.7; + static const fixed4 TaSTT_Indicator_1_On_Color = fixed4(0.8, 0.2, 0.0, 2) * 0.9; + + Texture2D TaSTT_Backplate; + float _Letter_Row00_Col00_Byte0; float _Letter_Row00_Col01_Byte0; float _Letter_Row00_Col02_Byte0; @@ -764,14 +778,29 @@ return o; } - float2 AddMarginToUV(float2 uv, float x_frac, float y_frac) + 
float2 AddMarginToUV(float2 uv, float2 margin) { - float2 lo = float2(-x_frac / 2, -y_frac / 2); - float2 hi = float2(1.0 + x_frac / 2, 1.0 + y_frac / 2); + float2 lo = float2(-margin.x / 2, -margin.y / 2); + float2 hi = float2(1.0 + margin.x / 2, 1.0 + margin.y / 2); return clamp(lerp(lo, hi, uv), 0.0, 1.0); } + bool InMargin(float2 uv, float2 margin) + { + return uv.x < margin.x / 2 || + uv.x > 1 - margin.x / 2 || + uv.y < margin.y / 2 || + uv.y > 1 - margin.y / 2; + } + + // dist = sqrt(dx^2 + dy^2) = sqrt(dot(delta, delta)); compare squared distance to radius^2 to skip the sqrt. + bool InRadius2(float2 uv, float2 pos, float radius2) + { + float2 delta = uv - pos; + return dot(delta, delta) < radius2; + } + // Write the nth letter in the current cell and return the value of the // pixel. float2 GetLetter(float2 uv, int nth_letter) @@ -1238,34 +1267,88 @@ uv.x = 1.0 - uv.x; } - float uv_x_margin = 0.03; - float uv_y_margin = 0.03; - uv = AddMarginToUV(uv, uv_x_margin, uv_y_margin); + float2 uv_margin = float2(0.03, 0.06); + if (InMargin(uv, uv_margin)) { + // Margin is uv_margin/2 wide/tall. + // We want a circle whose radius is ~95% of the margin width (radius_factor). + float radius_factor = 0.95; + float radius = (uv_margin.x / 2) * radius_factor; + // We want this circle to be centered halfway through the margin + // vertically, and one radius past the margin width horizontally. + float2 indicator_center = float2( + uv_margin.x * 0.5 + radius, + uv_margin.y * 0.5 * 0.5 + ); + // Finally, translate it to the top of the board instead of the + // bottom. + indicator_center.y = 1.0 - indicator_center.y; - int2 letter_bytes = (int2) floor(GetLetterParameter(uv)); - int letter = letter_bytes[0] | (letter_bytes[1] << 8); + if (InRadius2(uv, indicator_center, radius * radius)) { + if (floor(TaSTT_Indicator_0) == 0.0) { + return TaSTT_Indicator_0_Off_Color; + } else { + return TaSTT_Indicator_0_On_Color; + } + } - uv = GetLetter(uv, letter); + // Next, draw the second indicator. Same size as before, just shifted + // over a little. 
+ indicator_center.x += radius * 2.5; + if (InRadius2(uv, indicator_center, radius * radius)) { + if (floor(TaSTT_Indicator_1) == 0.0) { + return TaSTT_Indicator_1_Off_Color; + } else { + return TaSTT_Indicator_1_On_Color; + } + } - int which_texture = (int) floor(letter / (64 * 128)); - [forcecase] switch (which_texture) - { - case 0: - return _Font_0x0000_0x1FFF.Sample(sampler_linear_repeat, uv); - case 1: - return _Font_0x2000_0x3FFF.Sample(sampler_linear_repeat, uv); - case 2: - return _Font_0x4000_0x5FFF.Sample(sampler_linear_repeat, uv); - case 3: - return _Font_0x6000_0x7FFF.Sample(sampler_linear_repeat, uv); - case 4: - return _Font_0x8000_0x9FFF.Sample(sampler_linear_repeat, uv); - case 5: - return _Font_0xA000_0xBFFF.Sample(sampler_linear_repeat, uv); - case 6: - return _Font_0xC000_0xDFFF.Sample(sampler_linear_repeat, uv); - default: - return _Font_0x0000_0x1FFF.Sample(sampler_linear_repeat, uv); + return fixed4(1,1,1,1); + } else { + uv_margin *= 2; + uv = AddMarginToUV(uv, uv_margin); + + int2 letter_bytes = (int2) floor(GetLetterParameter(uv)); + int letter = letter_bytes[0] | (letter_bytes[1] << 8); + + uv = GetLetter(uv, letter); + + fixed4 background = TaSTT_Backplate.Sample(sampler_linear_repeat, uv); + fixed4 text; + + int which_texture = (int) floor(letter / (64 * 128)); + [forcecase] switch (which_texture) + { + case 0: + text = _Font_0x0000_0x1FFF.Sample(sampler_linear_repeat, uv); + break; + case 1: + text = _Font_0x2000_0x3FFF.Sample(sampler_linear_repeat, uv); + break; + case 2: + text = _Font_0x4000_0x5FFF.Sample(sampler_linear_repeat, uv); + break; + case 3: + text = _Font_0x6000_0x7FFF.Sample(sampler_linear_repeat, uv); + break; + case 4: + text = _Font_0x8000_0x9FFF.Sample(sampler_linear_repeat, uv); + break; + case 5: + text = _Font_0xA000_0xBFFF.Sample(sampler_linear_repeat, uv); + break; + case 6: + text = _Font_0xC000_0xDFFF.Sample(sampler_linear_repeat, uv); + break; + default: + text = 
_Font_0x0000_0x1FFF.Sample(sampler_linear_repeat, uv); + break; + } + fixed4 black = fixed4(0,0,0,0); + if (text.r == black.r && text.g == black.g && text.b == black.b && text.a == black.a) { + return background; + } else { + return text; + } } } ENDCG diff --git a/generate_params.py b/generate_params.py index 63203d0..1146ee5 100644 --- a/generate_params.py +++ b/generate_params.py @@ -62,6 +62,12 @@ print(generate_utils.replaceMacros(BOOL_PARAM, params)) params["PARAM_NAME"] = generate_utils.getEnableParam() print(generate_utils.replaceMacros(BOOL_PARAM, params)) +params["PARAM_NAME"] = generate_utils.getIndicator0Param() +print(generate_utils.replaceMacros(BOOL_PARAM, params)) + +params["PARAM_NAME"] = generate_utils.getIndicator1Param() +print(generate_utils.replaceMacros(BOOL_PARAM, params)) + params["PARAM_NAME"] = generate_utils.getToggleParam() print(generate_utils.replaceMacros(BOOL_PARAM, params)) diff --git a/generate_utils.py b/generate_utils.py index 119714d..c4cbf4c 100644 --- a/generate_utils.py +++ b/generate_utils.py @@ -94,6 +94,12 @@ def getSelectParam() -> str: def getEnableParam(): return "TaSTT_Enable" +def getIndicator0Param(): + return "TaSTT_Indicator_0" + +def getIndicator1Param(): + return "TaSTT_Indicator_1" + def getBoardIndex(which_layer, select): # Because we divide the board into a multiple of 8 cells, some cells may # describe animations which don't exist, depending on the size of the board. diff --git a/libtastt.py b/libtastt.py index f580c1e..658e9ff 100644 --- a/libtastt.py +++ b/libtastt.py @@ -190,9 +190,54 @@ def generateClearAnimation(anim_dir, guid_map): guid_map[anim_path] = meta.guid guid_map[meta.guid] = anim_path +# Generate a toggle animation for a shader parameter. +def generateToggleAnimations(anim_dir, shader_param, guid_map): + print("Generating shader toggle animation", file=sys.stderr) + + parser = libunity.UnityParser() + parser.parse(LETTER_ANIMATION_TEMPLATE) + + # 0.0 represents false, 1.0 represents true. 
Don't forget that we add + # `UNITY_ANIMATION_FUDGE_MARGIN` to everything. + for shader_value in range(0, 2): + anim_node = parser.nodes[0] + anim_clip = anim_node.mapping['AnimationClip'] + curve_template = anim_clip.mapping['m_FloatCurves'].sequence[0] + anim_clip.mapping['m_FloatCurves'].sequence = [] + anim_clip.mapping['m_EditorCurves'].sequence = [] + + curve = curve_template.copy() + for keyframe in curve.mapping['curve'].mapping['m_Curve'].sequence: + keyframe.mapping['value'] = str(float(shader_value) + + UNITY_ANIMATION_FUDGE_MARGIN) + curve.mapping['attribute'] = "material.{}".format(shader_param) + curve.mapping['path'] = "World Constraint/Container/TaSTT" + # Add curve to animation + anim_clip.mapping['m_FloatCurves'].sequence.append(curve) + anim_clip.mapping['m_EditorCurves'].sequence.append(curve) + + # Serialize animation to file + anim_name = generate_utils.getClearAnimationName() + anim_suffix = "_Off" + if shader_value == 1: + anim_suffix = "_On" + anim_path = anim_dir + shader_param + anim_suffix + ".anim" + with open(anim_path, "w") as f: + f.write(libunity.unityYamlToString([anim_node])) + # Generate metadata + meta = libunity.Metadata() + with open(anim_path + ".meta", "w") as f: + f.write(str(meta)) + # Add metadata to guid map + guid_map[anim_path] = meta.guid + guid_map[meta.guid] = anim_path + def generateAnimations(anim_dir, guid_map): generateClearAnimation(args.gen_anim_dir, guid_map) + generateToggleAnimations(args.gen_anim_dir, generate_utils.getIndicator0Param(), guid_map) + generateToggleAnimations(args.gen_anim_dir, generate_utils.getIndicator1Param(), guid_map) + print("Generating letter animations", file=sys.stderr) parser = libunity.UnityParser() @@ -257,6 +302,8 @@ def generateFXController(anim: libunity.UnityAnimator) -> typing.Dict[int, libun anim.addParameter(generate_utils.getToggleParam(), bool) anim.addParameter(generate_utils.getSpeechNoiseEnableParam(), bool) anim.addParameter(generate_utils.getClearBoardParam(), 
bool) + anim.addParameter(generate_utils.getIndicator0Param(), bool) + anim.addParameter(generate_utils.getIndicator1Param(), bool) layers = {} for byte in range(0, generate_utils.BYTES_PER_CHAR): @@ -410,6 +457,16 @@ def generateFX(guid_map, gen_anim_dir): None, # No animation in the `off` state. generate_utils.getClearAnimationName() + ".anim", anim) + generateToggle(generate_utils.getIndicator0Param(), + gen_anim_dir, + generate_utils.getIndicator0Param() + "_Off.anim", + generate_utils.getIndicator0Param() + "_On.anim", + anim) + generateToggle(generate_utils.getIndicator1Param(), + gen_anim_dir, + generate_utils.getIndicator1Param() + "_Off.anim", + generate_utils.getIndicator1Param() + "_On.anim", + anim) return anim diff --git a/libunity.py b/libunity.py index 822c238..9380a6c 100644 --- a/libunity.py +++ b/libunity.py @@ -521,6 +521,11 @@ class UnityAnimator(): p0.sequence += p1.sequence a0.sequence += a1.sequence + for elm in p0.sequence: + elm.mapping['m_Controller'].mapping['fileID'] = ctrl0.anchor + for elm in a0.sequence: + elm.mapping['m_Controller'].mapping['fileID'] = ctrl0.anchor + return ctrl0 def merge(self, other): diff --git a/osc_ctrl.py b/osc_ctrl.py index be853dc..ea0c145 100644 --- a/osc_ctrl.py +++ b/osc_ctrl.py @@ -292,6 +292,14 @@ def clear(client, tx_state): tx_state.last_msg_encoded = [] +def indicateSpeech(client, is_speaking: bool): + addr = "/avatar/parameters/" + generate_utils.getIndicator0Param() + client.send_message(addr, is_speaking) + +def indicatePaging(client, is_paging: bool): + addr = "/avatar/parameters/" + generate_utils.getIndicator1Param() + client.send_message(addr, is_paging) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-i", default="127.0.0.1", help="OSC server IP") diff --git a/string_matcher.py b/string_matcher.py index 543b18f..461f180 100644 --- a/string_matcher.py +++ b/string_matcher.py @@ -78,7 +78,7 @@ def matchStrings(old_text: str, new_text: str, window_size = 3) -> 
str: for j in range(0, 1 + len(new_text) - window_size): new_slice = new_text[j:j + window_size] cur_d = editdistance.eval(old_slice, new_slice) - if cur_d <= best_match_d: + if cur_d < best_match_d: best_match_i = i best_match_j = j best_match_d = cur_d diff --git a/transcribe.py b/transcribe.py index 1aabf6f..9290bdc 100644 --- a/transcribe.py +++ b/transcribe.py @@ -70,9 +70,7 @@ class AudioState: # this to whatever they want. language = whisper.tokenizer.TO_LANGUAGE_CODE["english"] - # When the user says `over`, we stop displaying new transcriptions until - # they clear the board again. - display_paused = False + audio_paused = False osc_client = osc_ctrl.getClient() @@ -121,6 +119,10 @@ def recordAudio(audio_state): while audio_state.run_app: data = audio_state.stream.read(audio_state.CHUNK) + if audio_state.audio_paused: + time.sleep(0.1) + continue + audio_state.lock.acquire() audio_state.frames.append(data) max_frames = int(audio_state.RATE * audio_state.MAX_LENGTH_S / audio_state.CHUNK) @@ -199,7 +201,8 @@ def transcribe(audio_state, model, filename): result = None #for temp in (0.00, 0.05, 0.10, 0.15, 0.20): - for temp in (0.00, 0.05): + #for temp in (0.00, 0.05): + for temp in (0.00,): print("temp: {}".format(temp)) options = whisper.DecodingOptions(language = audio_state.language, beam_size = 5, temperature = temp) @@ -256,11 +259,6 @@ def transcribeAudio(audio_state, model): words = ''.join(c for c in text.lower() if (c.isalpha() or c == " ")).split() - if len(words) > 0: - if words[-1] == "over": - words = words[0:-1] - audio_state.display_paused = True - print("Transcription: {}".format(audio_state.text)) old_text = audio_state.text @@ -280,14 +278,12 @@ def transcribeAudio(audio_state, model): def sendAudio(audio_state): while audio_state.run_app == True: - if audio_state.display_paused: - time.sleep(0.1) - continue - audio_state.lock.acquire() text = audio_state.committed_text + " " + audio_state.text - 
osc_ctrl.sendMessageLazy(audio_state.osc_client, text, audio_state.tx_state) + is_paging = not osc_ctrl.sendMessageLazy(audio_state.osc_client, text, + audio_state.tx_state) + osc_ctrl.indicatePaging(audio_state.osc_client, is_paging) audio_state.lock.release() # Pace this out @@ -295,19 +291,31 @@ def sendAudio(audio_state): def readControllerInput(audio_state): session = steamvr.SessionState() + RECORD_STATE = 0 + PAUSE_STATE = 1 + state = PAUSE_STATE while audio_state.run_app == True: time.sleep(0.05) event = steamvr.pollButtonPress(session) if event == steamvr.EVENT_RISING_EDGE: - print("event get") - audio_state.lock.acquire() - resetAudioLocked(audio_state) - resetDisplayLocked(audio_state) - audio_state.drop_transcription = True - audio_state.display_paused = False - audio_state.lock.release() + if state == RECORD_STATE: + state = PAUSE_STATE + osc_ctrl.indicateSpeech(audio_state.osc_client, False) + + audio_state.audio_paused = True + elif state == PAUSE_STATE: + state = RECORD_STATE + osc_ctrl.indicateSpeech(audio_state.osc_client, True) + + audio_state.lock.acquire() + resetAudioLocked(audio_state) + resetDisplayLocked(audio_state) + audio_state.drop_transcription = True + audio_state.audio_paused = False + audio_state.lock.release() + def transcribeLoop(mic: str, language: str): audio_state = getMicStream(mic) @@ -337,14 +345,13 @@ def transcribeLoop(mic: str, language: str): controller_input_thd.daemon = True controller_input_thd.start() - print("Press enter or say 'Clear' to start a new message. Say 'Over' to " + - "pause the display (saying 'Clear' resets it again).") + print("Press enter to start a new message.") for line in sys.stdin: audio_state.lock.acquire() resetAudioLocked(audio_state) resetDisplayLocked(audio_state) audio_state.drop_transcription = True - audio_state.display_paused = False + audio_state.audio_paused = False audio_state.lock.release() if "exit" in line or "quit" in line: break -- cgit v1.2.3