1 files changed, 92 insertions, 0 deletions
diff --git a/Scripts/obfuscate.py b/Scripts/obfuscate.py
new file mode 100644
index 0000000..8d01e10
--- /dev/null
+++ b/Scripts/obfuscate.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+# This module is used to implement obfuscation of TaSTT network
+# speech data. At a high level, TaSTT is simply streaming N bits of
+# arbitrary data to a shader via VRChat's parameter sync mechanism.
+#
+# It would be trivial to mine this data for speech information, since
+# we're sending unicode (or ASCII) characters to peers.
+#
+# To raise the cost for the casual data collector, we can obfuscate
+# this data using a one-time pad in cipher-block chaining mode.
+#
+# Making things interesting, encrypted data will arrive at the Unity
+# animator,  which processes them in 8 bit chunks. They are written
+# into contiguous blocks of the animator. Thus the shader can decrypt
+# the board by decrypting each block. This is thus stronger than
+# applying a one-time pad to each byte of the plaintext, since the
+# statistical distribution of individual letters is destroyed.
+# Obviously due to the lack of an initialization vector, the
+# distribution of phrases (blocks) is preserved.
+
+import math
+import os
+
+def genKey(n_bits = 128) -> bytearray:
+    return os.urandom(int(n_bits / 8))
+
+def saveKey(filename: str, key: str):
+    with open(filename, "wb") as f:
+        f.write(key)
+
+def loadKey(filename: str) -> bytearray:
+    with open(filename, "rb") as f:
+        return f.read()
+
+# Apply a symmetric cypher to `data` using cypher-block chaining.
+def obfuscate(data: bytearray, key: bytearray) -> str:
+    n_blocks = int(math.ceil(len(data) / len(key)))
+    # This is a misnomer. A true IV would be randomized, but we can't
+    # do that since the shader doesn't have access to it. We just use
+    # this to implement the "chaining" aspect of CBC.
+    iv = bytearray(b'\x00') * len(key)
+    result = bytearray()
+    for i in range(0, n_blocks):
+        block_begin = i * len(key)
+        block_end = (i + 1) * len(key)
+        block_plain = data[block_begin:block_end]
+        block_cypher = block_plain.copy()
+        for i in range(0, len(block_cypher)):
+            block_cypher[i] ^= iv[i]
+            block_cypher[i] ^= key[i]
+        result += block_cypher
+        iv = block_cypher
+    return result
+
+def deobfuscate(data: bytearray, key: bytearray) -> str:
+    n_blocks = int(math.ceil(len(data) / len(key)))
+    # This is a misnomer. A true IV would be randomized, but we can't
+    # do that since the shader doesn't have access to it. We just use
+    # this to implement the "chaining" aspect of CBC.
+    iv = bytearray(b'\x00') * len(key)
+    result = bytearray()
+    for i in range(0, n_blocks):
+        block_begin = i * len(key)
+        block_end = (i + 1) * len(key)
+        block_cypher = data[block_begin:block_end]
+        block_plain = block_cypher.copy()
+        for i in range(0, len(block_plain)):
+            block_plain[i] ^= key[i]
+            block_plain[i] ^= iv[i]
+        result += block_plain
+        iv = block_cypher
+    return result
+
+def test():
+    key = genKey()
+    saveKey("test.key", key)
+    new_key = loadKey("test.key")
+    os.remove("test.key")
+    assert(key == new_key)
+
+    plaintext_original = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+    plaintext_bytes = bytearray(plaintext_original, "utf-8")
+    cyphertext = obfuscate(plaintext_bytes, key)
+    assert(len(plaintext_bytes) == len(cyphertext))
+    plaintext_recovered = deobfuscate(cyphertext, key).decode("utf-8")
+    assert(plaintext_original == plaintext_recovered)
+    assert(plaintext_bytes != cyphertext)
+
+if __name__ == "__main__":
+    test()
+