Import FastTextPager repo

author: yum <yum.food.vr@gmail.com> 2025-07-23 22:39:45 -0700
committer: yum <yum.food.vr@gmail.com> 2025-07-23 22:39:45 -0700
commit: f6b93a20d754579008076e85f5c0a97e1bcbc258 (patch)
tree: 7288699d6f22e76c4f30636a37e94265b3ef7708 /tokenize_me.py
parent: f3782c200c9a2ec2b77708da67b4127a38465ad1 (diff)
parent: 043a447133695bfd2285a534b941db972873a692 (diff)
1 files changed, 28 insertions, 0 deletions
diff --git a/tokenize_me.py b/tokenize_me.py
new file mode 100644
index 0000000..83be290
--- /dev/null
+++ b/tokenize_me.py
@@ -0,0 +1,28 @@
+import argparse
+import sentencepiece as spm
+
+def get_tokenizer(model_path="./custom_unigram_tokenizer_65k/unigram.model"):
+    """Load the SentencePiece model at model_path, print its vocab size, and return the processor."""
+    print(f"Loading SentencePiece tokenizer from: {model_path}")
+    sp = spm.SentencePieceProcessor()
+    sp.load(model_path)
+    print(f"Successfully loaded SentencePiece model. Vocab size: {sp.get_piece_size()}")
+    return sp
+
+def parse_args():
+    """Parse the command line; the returned namespace carries the input string as `text`."""
+    cli = argparse.ArgumentParser(description="Tokenize a given string using a SentencePiece model.")
+    cli.add_argument("text", type=str, help="The string to tokenize.")
+    return cli.parse_args()
+
+args = parse_args()
+tok = get_tokenizer()
+# Show the input text both as subword pieces and as vocabulary IDs.
+tokens = tok.encode_as_pieces(args.text)
+print("Tokens:", tokens)
+token_ids = tok.encode_as_ids(args.text)
+print("Token IDs:", token_ids)
+
+# divmod by 256 yields (tid >> 8, tid & 0xFF): a (high part, low byte) pair per ID.
+byte_pairs = [divmod(tid, 256) for tid in token_ids]
+print("Token ID Byte Pairs:", byte_pairs)
author	yum <yum.food.vr@gmail.com>	2025-07-23 22:39:45 -0700
committer	yum <yum.food.vr@gmail.com>	2025-07-23 22:39:45 -0700
commit	f6b93a20d754579008076e85f5c0a97e1bcbc258 (patch)
tree	7288699d6f22e76c4f30636a37e94265b3ef7708 /tokenize_me.py
parent	f3782c200c9a2ec2b77708da67b4127a38465ad1 (diff)
parent	043a447133695bfd2285a534b941db972873a692 (diff)