summaryrefslogtreecommitdiffstats
path: root/bpe_dump.py
diff options
context:
space:
mode:
authoryum <yum.food.vr@gmail.com>2025-05-11 22:22:48 -0700
committeryum <yum.food.vr@gmail.com>2025-05-11 22:23:25 -0700
commit0c54e1fc74fe7677a0d4fef1c147c6e886d182db (patch)
tree8e862e95fe56b5960a53b4dd30465371d8afc47b /bpe_dump.py
code bomb
Diffstat (limited to 'bpe_dump.py')
-rw-r--r--bpe_dump.py61
1 files changed, 61 insertions, 0 deletions
diff --git a/bpe_dump.py b/bpe_dump.py
new file mode 100644
index 0000000..74ee08f
--- /dev/null
+++ b/bpe_dump.py
@@ -0,0 +1,61 @@
+import math
+import sentencepiece as spm
+
+def get_tokenizer():
+    """Return a SentencePieceProcessor loaded from the fixed relative unigram model path."""
+    processor = spm.SentencePieceProcessor()
+    processor.load("./custom_unigram_tokenizer_65k/unigram.model")
+    return processor
+
+# Dump every piece of the tokenizer vocabulary and print summary statistics:
+# non-ASCII piece count, a piece-length histogram, the average piece length,
+# and the total length with each piece rounded up to a multiple of 4.
+tokenizer = get_tokenizer()
+# Queried once: the size is loop-invariant and is also the percentage
+# denominator below.
+vocab_size = tokenizer.get_piece_size()
+
+print(f"vocabulary size: {vocab_size}")
+# Sentencepiece uses U+2581 (lower one eighth block) to indicate a space before
+# a subword.
+sp_space = chr(9601)
+# Special pieces that receive the same marker-to-space substitution as pieces
+# that start with the marker.
+special_pieces = ("[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]")
+tokens_with_non_ascii = set()
+subword_len_histo = dict()
+# The sum of the lengths of each subword in the vocabulary. These are rounded
+# up to 4 characters.
+vocab_len_4c_quantized = 0
+
+for i in range(vocab_size):
+    k = tokenizer.id_to_piece(i)
+    v = i
+    print(f" Original token ({v}): {repr(k)} ({' '.join(str(ord(k_c)) for k_c in k)})")
+    # A piece counts as non-ASCII if any character other than the space marker
+    # lies outside the 7-bit range.
+    if any(ord(k_c) > 127 and k_c != sp_space for k_c in k):
+        tokens_with_non_ascii.add(k)
+    # Only pieces that begin with the marker, and the special pieces, have the
+    # marker rewritten to a plain space; every other piece is kept verbatim.
+    if k.startswith(sp_space) or k in special_pieces:
+        k_processed = k.replace(sp_space, ' ')
+    else:
+        k_processed = k
+
+    current_len = len(k_processed)
+    subword_len_histo[current_len] = subword_len_histo.get(current_len, 0) + 1
+
+    # math.ceil returns an int, so the running total stays an integer
+    # character count instead of being printed with a trailing ".0".
+    vocab_len_4c_quantized += math.ceil(current_len / 4) * 4
+    print(f" {v}: {k_processed}")
+
+# An empty vocabulary has no meaningful percentage; report 0 rather than
+# dividing by zero (the average below is guarded the same way).
+non_ascii_pct = 100 * len(tokens_with_non_ascii) / vocab_size if vocab_size else 0.0
+print(f"Num tokens with non-ascii: {len(tokens_with_non_ascii)} ({non_ascii_pct:.2f}%)")
+
+print("Subword length histogram:")
+avg_subword_len = 0
+total_pieces_for_avg = 0
+# Histogram keys are unique lengths, so sorting the items orders by length.
+for k_len, v_count in sorted(subword_len_histo.items()):
+    avg_subword_len += k_len * v_count
+    total_pieces_for_avg += v_count
+    print(f" {k_len}: {v_count}")
+
+if total_pieces_for_avg > 0:
+    avg_subword_len /= total_pieces_for_avg
+    print(f"Average subword length: {avg_subword_len:.4f}")
+else:
+    print("Average subword length: N/A (no pieces analyzed)")
+
+print(f"Sum of all subword lengths, quantized to 4 character chunks: {vocab_len_4c_quantized}")