summary | refs | log | tree | commit | diff | stats
path: root/Scripts
diff options
context:
space:
mode:
author: yum <yum.food.vr@gmail.com> 2023-08-31 17:11:11 -0700
committer: yum <yum.food.vr@gmail.com> 2023-08-31 17:17:01 -0700
commit: 3db4f81573d89f6ebefb5ec119c7d66affc1a4a0 (patch)
tree: 202672af81898cfdf559dcfa3a2d89341584f25c /Scripts
parent: 4fcf3e1e3ac8dcf510be96a84b81a688b1092869 (diff)
Bugfixes and tweaks
* Temporarily restore normal process priority. Working on adding a UI option to set STT priority.
* Give audio indicator phonemes a 1/3 chance to do nothing. Makes the result sound a little better, in my opinion.
* Quiet down the SteamVR thread when SteamVR isn't running.
* Fix use of `button_id` and `hand_id` in steamvr.py.
* Increase the amount of silence allowed before a transcript from 1 to 5 seconds. You want enough buffer to allow for a few full transcripts; otherwise you risk spuriously dropping audio.
* Enable background loading in audio metadata (required by the VRC SDK).
Diffstat (limited to 'Scripts')
-rw-r--r-- Scripts/osc_ctrl.py 2
-rw-r--r-- Scripts/steamvr.py 13
-rw-r--r-- Scripts/transcribe.py 25
3 files changed, 18 insertions, 22 deletions
diff --git a/Scripts/osc_ctrl.py b/Scripts/osc_ctrl.py
index 413e2ae..6fc706e 100644
--- a/Scripts/osc_ctrl.py
+++ b/Scripts/osc_ctrl.py
@@ -108,7 +108,7 @@ def pageMessage(osc_state: OscState, msg: str, estate: EmotesState) -> bool:
letter_i += 1
if len(sounds_to_make) > 0:
for i in range(5):
- if i+1 in sounds_to_make:
+ if i+1 in sounds_to_make and random.randint(1,3) != 1:
playAudio(osc_state, i+1, True)
else:
playAudio(osc_state, i+1, False)
diff --git a/Scripts/steamvr.py b/Scripts/steamvr.py
index e0b59e3..da07134 100644
--- a/Scripts/steamvr.py
+++ b/Scripts/steamvr.py
@@ -30,19 +30,22 @@ def pollButtonPress(
buttons["thumbstick"] = vr.k_EButton_Axis0
system = None
+ first = True
while not system:
try:
system = vr.init(vr.VRApplication_Background)
except Exception as e:
- print(f"Failed to start steamVR input thread: {repr(e)}", file=sys.stderr)
- time.sleep(5)
+ if first:
+ print(f"Failed to start steamVR input thread: {repr(e)}", file=sys.stderr)
+ first = False
+ time.sleep(1)
last_packet = 0
event_high = False
while True:
time.sleep(0.01)
- lh_idx = system.getTrackedDeviceIndexForControllerRole(hand_id)
+ lh_idx = system.getTrackedDeviceIndexForControllerRole(hands[hand])
#print("left hand device idx: {}".format(lh_idx))
got_state, state = system.getControllerState(lh_idx)
@@ -58,7 +61,7 @@ def pollButtonPress(
# click, not movement.
dead_zone_radius = 0.7
- button_mask = (1 << button_id)
+ button_mask = (1 << buttons[button])
ret = EVENT_NONE
if (state.ulButtonPressed & button_mask) != 0 and\
(state.rAxis[0].x**2 + state.rAxis[0].y**2 < dead_zone_radius**2):
@@ -77,7 +80,7 @@ if __name__ == "__main__":
while True:
time.sleep(0.1)
- event = pollButtonPress(session_state, hand_id = hands["left"], button_id = buttons["joystick"])
+ event = pollButtonPress(session_state)
if event == EVENT_RISING_EDGE:
print("rising edge")
elif event == EVENT_FALLING_EDGE:
diff --git a/Scripts/transcribe.py b/Scripts/transcribe.py
index cea2da0..c718f73 100644
--- a/Scripts/transcribe.py
+++ b/Scripts/transcribe.py
@@ -170,10 +170,6 @@ def onAudioFramesAvailable(
else:
audio_state.commit_fuzz_threshold = 1
- max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
- audio_state.CHUNK)
- if len(audio_state.frames) > max_frames:
- audio_state.frames = audio_state.frames[-1 * max_frames:]
if audio_state.drop_samples_till_i > 0:
# Caller wants us to keep this many *whisper* samples, assuming that
# we're getting one full frame every (1024 / 16KHz) seconds.
@@ -198,6 +194,11 @@ def onAudioFramesAvailable(
audio_state.frames[0] = b'00' * n_samples_to_drop + audio_state.frames[0][n_samples_to_drop * bytes_per_sample:]
audio_state.drop_samples_till_i = -1
+ max_frames = int(input_rate * audio_state.MAX_LENGTH_S /
+ audio_state.CHUNK)
+ if len(audio_state.frames) > max_frames:
+ audio_state.frames = audio_state.frames[-1 * max_frames:]
+
# Now enforce a minimum duration on frames. This reduces cases where the
# STT hallucinates random things. In the Whisper paper, they enforce a
# minimum audio buffer duration of 5.0 seconds, so I do the same here.
@@ -393,9 +394,9 @@ def transcribeAudio(audio_state,
if audio_state.enable_debug_mode:
print("no transcription, spin ({} seconds)".format(time.time() - last_transcribe_time))
last_transcribe_time = time.time()
- # Prevent audio buffer from holding more than 1 second of silence
+ # Prevent audio buffer from holding more than a few seconds of silence
# before real speech.
- audio_state.MAX_LENGTH_S = 1
+ audio_state.MAX_LENGTH_S = 5
continue
else:
audio_state.MAX_LENGTH_S = 300
@@ -731,14 +732,12 @@ def transcribeLoop(mic: str,
window_duration_s: int,
gpu_idx: int,
keyboard_hotkey: str,
- reset_on_toggle: bool,
- commit_fuzz_threshold: int):
+ reset_on_toggle: bool):
audio_state = getMicStream(mic)
audio_state.whisper_language = language
audio_state.language = langcodes.find(language).language
audio_state.MAX_LENGTH_S = window_duration_s
audio_state.reset_on_toggle = reset_on_toggle
- #audio_state.commit_fuzz_threshold = commit_fuzz_threshold
audio_state.enable_debug_mode = enable_debug_mode
audio_state.enable_profanity_filter = enable_profanity_filter
@@ -905,7 +904,6 @@ if __name__ == "__main__":
parser.add_argument("--gpu_idx", type=str, help="The index of the GPU device to use. On single GPU systems, use 0.")
parser.add_argument("--keybind", type=str, help="The keyboard hotkey to use to toggle transcription. For example, ctrl+shift+s")
parser.add_argument("--reset_on_toggle", type=int, help="Whether to reset (clear) the transcript every time that transcription is toggled on.")
- parser.add_argument("--commit_fuzz_threshold", type=int, help="The edit distance under which two consecutive transcripts are considered to match.")
parser.add_argument("--enable_debug_mode", type=int, help="If set to 1, print additional information to stdout while transcribing.")
args = parser.parse_args()
@@ -945,10 +943,6 @@ if __name__ == "__main__":
print("--gpu_idx required", file=sys.stderr)
sys.exit(1)
- if not args.commit_fuzz_threshold:
- print("--commit_fuzz_threshold required", file=sys.stderr)
- sys.exit(1)
-
args.gpu_idx = int(args.gpu_idx)
window_duration_s = 120
@@ -1027,6 +1021,5 @@ if __name__ == "__main__":
estate, window_duration_s,
args.gpu_idx,
args.keybind,
- args.reset_on_toggle,
- args.commit_fuzz_threshold)
+ args.reset_on_toggle)