summary refs log tree commit diff stats
path: root/curate
diff options
context:
space:
mode:
author yum <yum.food.vr@gmail.com> 2023-12-26 02:18:58 -0800
committer yum <yum.food.vr@gmail.com> 2023-12-26 02:18:58 -0800
commit a4c1870f724f18e98c33468b4d038dd1c742e4bd (patch)
tree 54721b70fd73997076199da0ea39e0ce5d2ef367 /curate
parent e773bf75a562a8ed5afe72642ed39ba196ffab75 (diff)
Add optional transcription & curation components
Add a transcribe button, which transcribes each .wav file using openai/whisper-large-v2, producing a corresponding .txt file. Also add a TUI tool for WSL. This tool lets you view transcripts and delete them with vi-like commands. Useful for cleaning data.
Diffstat (limited to 'curate')
-rw-r--r-- curate/Makefile 25
-rw-r--r-- curate/ui.cc 156
2 files changed, 181 insertions, 0 deletions
diff --git a/curate/Makefile b/curate/Makefile
new file mode 100644
index 0000000..d4e63dc
--- /dev/null
+++ b/curate/Makefile
@@ -0,0 +1,25 @@
+# Build settings: compile with g++ as C++20 at -O2 (-c = compile only, no
+# link step) and link against the curses library.
+CC=g++
+CFLAGS=-c -O2 -std=c++20
+LDFLAGS=-lcurses
+
+# Name of the executable and the sources it is built from.
+EXE=ui
+SRCS=ui.cc
+# Object list derived from SRCS by replacing the .cc suffix with .o.
+OBJS=$(SRCS:.cc=.o)
+# Empty, and not referenced by any rule in this Makefile.
+HDRS=
+
+# Default target: build the executable.
+.PHONY: all
+all: $(EXE)
+
+# Link every object file ($^) into the executable ($@).
+$(EXE): $(OBJS)
+ $(CC) $^ $(LDFLAGS) -o $@
+
+# Compile a source that has a same-named header, so the object is also
+# rebuilt when that header changes.
+%.o: %.cc %.h
+ $(CC) $(CFLAGS) $< -o $@
+
+# Compile a source that has no same-named header.
+%.o: %.cc
+ $(CC) $(CFLAGS) $< -o $@
+
+# Remove build outputs. '@' suppresses echoing of the command and -f makes
+# rm ignore files that do not exist.
+.PHONY: clean
+clean:
+ @rm -f $(OBJS) $(EXE)
+
diff --git a/curate/ui.cc b/curate/ui.cc
new file mode 100644
index 0000000..222bfe4
--- /dev/null
+++ b/curate/ui.cc
@@ -0,0 +1,156 @@
+#include <curses.h>
+#include <ncurses.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <cstdlib>
+#include <exception>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <sstream>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
+
+// Pair of path strings describing one sample, intended as
+// (audio .wav path, transcript .txt path).
+typedef std::pair<std::string, std::string> datapoint_t;
+
+// Number of transcript rows shown per page; the selection index is kept
+// below this value.
+const int PAGE_LINES = 40;
+// Maximum number of chars of each transcript kept for display.
+const int TRANSCRIPT_CHARS = 120;
+
+// Rescans `data_path` (non-recursively) for .wav files that have a sibling
+// .txt transcript with the same stem.
+//
+// Parameters:
+//   data_path   - directory to scan.
+//   datapoints  - cleared, then filled with (wav path, txt path) for every
+//                 matching pair.
+//   transcripts - cleared, then filled with txt path -> transcript text, with
+//                 '\n' and '\r' removed and truncated to TRANSCRIPT_CHARS.
+//
+// Prints a "Scanning for files" status line with printw(). Filesystem errors
+// are reported with printw() instead of being thrown, because an exception
+// escaping while curses is active risks leaving the terminal in curses mode.
+void getData(
+    const std::filesystem::path& data_path,
+    std::vector<datapoint_t> &datapoints,
+    std::map<std::string, std::string> &transcripts) {
+  datapoints.clear();
+  transcripts.clear();
+  printw("Scanning for files at %s\n", data_path.string().c_str());
+
+  std::error_code ec;
+  std::filesystem::directory_iterator it(data_path, ec);
+  if (ec) {
+    printw("Could not read %s: %s\n", data_path.string().c_str(),
+           ec.message().c_str());
+    return;
+  }
+
+  const std::filesystem::directory_iterator end;
+  for (; !ec && it != end; it.increment(ec)) {
+    // Per-entry errors only cause that entry to be skipped.
+    std::error_code entry_ec;
+    if (!it->is_regular_file(entry_ec)) {
+      continue;
+    }
+
+    const std::filesystem::path& wav_file = it->path();
+    if (wav_file.extension() != ".wav") {
+      continue;
+    }
+
+    // replace_extension() modifies the path it is called on, so apply it to
+    // a copy; otherwise the .wav path would be lost and the datapoint would
+    // hold the .txt path twice.
+    std::filesystem::path txt_file = wav_file;
+    txt_file.replace_extension(".txt");
+    if (!std::filesystem::exists(txt_file, entry_ec)) {
+      continue;
+    }
+
+    std::ifstream file_stream(txt_file);
+    std::stringstream buffer;
+    buffer << file_stream.rdbuf();
+    std::string contents = buffer.str();
+    // Flatten the transcript onto one line so each entry occupies one row.
+    contents.erase(std::remove_if(contents.begin(), contents.end(),
+                                  [](char c) { return c == '\n' || c == '\r'; }),
+                   contents.end());
+    contents = contents.substr(0, TRANSCRIPT_CHARS);
+
+    datapoints.emplace_back(wav_file.string(), txt_file.string());
+    transcripts[txt_file.string()] = std::move(contents);
+  }
+
+  if (ec) {
+    printw("Stopped scanning %s: %s\n", data_path.string().c_str(),
+           ec.message().c_str());
+  }
+}
+
+
+// Entry point of the transcript curation TUI.
+//
+// Usage: ui [data_dir]
+//   When exactly one argument is given it is the directory to scan;
+//   otherwise the current working directory is used.
+//
+// Keys (a numeric prefix N may be typed before j, k and g):
+//   q            quit
+//   [N]j, [N]k   move the selection down / up by N rows (default 1)
+//   Ng           move the selection to row N of the current page
+//   PgDn, PgUp   next / previous page
+//   x            delete the selected transcript and its .wav file
+//   Esc          discard a pending numeric prefix
+// Any key other than a digit consumes the pending prefix, so a count typed
+// before x or a page key is not silently applied to a later j/k/g.
+//
+// Returns 0 on normal exit, or 1 if an exception ended the session (the
+// message is written to stderr after curses has been shut down).
+int main(int argc, char* argv[]) {
+  const std::filesystem::path data_path =
+      (argc == 2) ? std::filesystem::path(argv[1])
+                  : std::filesystem::current_path();
+
+  // Initialize ncurses
+  initscr();
+  cbreak();
+  noecho();
+  keypad(stdscr, TRUE);
+
+  int exit_code = 0;
+  std::string fatal_error;
+  // Everything between initscr() and endwin() runs inside this try block so
+  // that endwin() is reached even when something throws.
+  try {
+    int idx = 0;          // selected row within the current page
+    int page_offset = 0;  // zero-based page number
+
+    std::vector<datapoint_t> datapoints;
+    std::map<std::string, std::string> transcripts;
+
+    std::string digits;  // pending numeric prefix
+    std::string status;  // one-shot message shown below the list
+
+    while (true) {
+      clear();
+      // Rescan on every redraw so deletions show up immediately.
+      getData(data_path, datapoints, transcripts);
+
+      // Keep the page and the selection inside what actually exists.
+      const int total = static_cast<int>(transcripts.size());
+      const int last_page = (total > 0) ? (total - 1) / PAGE_LINES : 0;
+      page_offset = std::clamp(page_offset, 0, last_page);
+      const int first = page_offset * PAGE_LINES;
+      const int rows = std::min(PAGE_LINES, total - first);
+      idx = std::clamp(idx, 0, std::max(0, rows - 1));
+
+      auto row_it = std::next(transcripts.cbegin(), first);
+      for (int row = 0; row < rows; ++row, ++row_it) {
+        const char selector = (row == idx) ? '>' : ' ';
+        printw("%02d %c %s: %s\n", row, selector, row_it->first.c_str(),
+               row_it->second.c_str());
+      }
+      if (!status.empty()) {
+        printw("%s\n", status.c_str());
+        status.clear();
+      }
+      refresh();
+
+      const int ch = getch();
+      if (ch == 'q') {
+        break;
+      }
+      if (ch >= '0' && ch <= '9') {
+        digits += static_cast<char>(ch);
+        continue;
+      }
+
+      // Consume the pending prefix. The value saturates at PAGE_LINES: any
+      // larger count clamps to the same row, and saturating means a long
+      // digit string cannot overflow int.
+      const bool has_count = !digits.empty();
+      int count = 0;
+      for (const char d : digits) {
+        count = std::min(count * 10 + (d - '0'), PAGE_LINES);
+      }
+      digits.clear();
+
+      // idx and page_offset are re-clamped at the top of the next iteration.
+      switch (ch) {
+        case 'j':
+          idx += has_count ? count : 1;
+          break;
+        case 'k':
+          idx -= has_count ? count : 1;
+          break;
+        case 'g':
+          if (has_count) {
+            idx = count;
+          }
+          break;
+        case KEY_NPAGE:
+          ++page_offset;
+          break;
+        case KEY_PPAGE:
+          --page_offset;
+          break;
+        case 'x': {
+          if (rows == 0) {
+            break;  // nothing is selected
+          }
+          const std::filesystem::path txt_file(
+              std::next(transcripts.cbegin(), first + idx)->first);
+          std::filesystem::path wav_file = txt_file;
+          wav_file.replace_extension(".wav");
+          // Report failures on screen instead of throwing; the entry simply
+          // stays listed after the next scan.
+          for (const auto& victim : {txt_file, wav_file}) {
+            std::error_code ec;
+            std::filesystem::remove(victim, ec);
+            if (ec) {
+              status = "Could not delete " + victim.string() + ": " +
+                       ec.message();
+            }
+          }
+          break;
+        }
+        default:
+          // Includes Esc (27): the prefix has already been discarded above.
+          break;
+      }
+    }
+  } catch (const std::exception& e) {
+    fatal_error = e.what();
+    exit_code = 1;
+  }
+
+  // End ncurses mode
+  endwin();
+
+  if (exit_code != 0) {
+    fprintf(stderr, "ui: %s\n", fatal_error.c_str());
+  }
+  return exit_code;
+}
+