diff options
Diffstat (limited to 'bpe_dump.py')
| -rw-r--r-- | bpe_dump.py | 61 |
1 files changed, 61 insertions, 0 deletions
"""Dump a SentencePiece vocabulary and print summary statistics about it.

For every piece in the model this script prints the raw piece (with its code
points) and a display form, then reports how many pieces contain non-ASCII
characters, a histogram of piece lengths, the average piece length, and the
total vocabulary length when every piece is rounded up to a multiple of four
characters.
"""
import math
from collections import Counter
from typing import Dict, Iterable, NamedTuple, Optional, Set

DEFAULT_MODEL_PATH = "./custom_unigram_tokenizer_65k/unigram.model"

# SentencePiece uses U+2581 (lower one eighth block) to indicate a space before
# a subword.
SP_SPACE = chr(9601)

# Piece lengths are rounded up to a multiple of this many characters when
# computing the quantized vocabulary length.
QUANTUM = 4


class VocabStats(NamedTuple):
    """Aggregate statistics over a collection of vocabulary pieces."""

    # Pieces containing at least one non-ASCII character other than U+2581.
    non_ascii: Set[str]
    # Maps piece length (in characters) to the number of pieces of that length.
    length_histogram: Dict[int, int]
    # Sum of all piece lengths, each rounded up to a multiple of QUANTUM.
    quantized_length: int


def get_tokenizer(model_path=DEFAULT_MODEL_PATH):
    """Load and return the SentencePiece processor stored at *model_path*."""
    # Imported lazily so the pure analysis helpers in this module can be used
    # (and tested) without sentencepiece being installed.
    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load(model_path)
    return sp


def has_non_ascii(piece: str) -> bool:
    """Return True if *piece* has a non-ASCII character other than U+2581."""
    return any(ord(ch) > 127 and ch != SP_SPACE for ch in piece)


def display_piece(piece: str) -> str:
    """Return the printable form of *piece*.

    Pieces that begin with the U+2581 word-boundary marker have every marker
    replaced by a plain space; all other pieces are returned unchanged.
    """
    if piece.startswith(SP_SPACE):
        return piece.replace(SP_SPACE, " ")
    return piece


def quantized_len(length: int, quantum: int = QUANTUM) -> int:
    """Round *length* up to the nearest multiple of *quantum*."""
    return math.ceil(length / quantum) * quantum


def percent(part: int, whole: int) -> float:
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is 0."""
    return 100 * part / whole if whole else 0.0


def average_length(length_histogram: Dict[int, int]) -> Optional[float]:
    """Return the mean piece length, or None if the histogram is empty."""
    total_pieces = sum(length_histogram.values())
    if total_pieces <= 0:
        return None
    weighted = sum(
        length * count for length, count in sorted(length_histogram.items())
    )
    return weighted / total_pieces


def summarize_pieces(pieces: Iterable[str]) -> VocabStats:
    """Compute non-ASCII, length-histogram and quantized-length statistics."""
    non_ascii = set()
    length_histogram = Counter()
    quantized_length = 0
    for piece in pieces:
        if has_non_ascii(piece):
            non_ascii.add(piece)
        current_len = len(display_piece(piece))
        length_histogram[current_len] += 1
        quantized_length += quantized_len(current_len)
    return VocabStats(non_ascii, length_histogram, quantized_length)


def main():
    """Load the tokenizer, dump every piece, and print the summary."""
    tokenizer = get_tokenizer()
    vocab_size = tokenizer.get_piece_size()
    print(f"vocabulary size: {vocab_size}")

    pieces = [tokenizer.id_to_piece(i) for i in range(vocab_size)]
    for piece_id, piece in enumerate(pieces):
        code_points = " ".join(str(ord(ch)) for ch in piece)
        print(f" Original token ({piece_id}): {piece!r} ({code_points})")
        print(f" {piece_id}: {display_piece(piece)}")

    stats = summarize_pieces(pieces)

    non_ascii_count = len(stats.non_ascii)
    # percent() guards against an empty vocabulary (division by zero).
    print(
        f"Num tokens with non-ascii: {non_ascii_count} "
        f"({percent(non_ascii_count, vocab_size):.2f}%)"
    )

    print("Subword length histogram:")
    for length, count in sorted(stats.length_histogram.items()):
        print(f" {length}: {count}")

    avg_subword_len = average_length(stats.length_histogram)
    if avg_subword_len is not None:
        print(f"Average subword length: {avg_subword_len:.4f}")
    else:
        print("Average subword length: N/A (no pieces analyzed)")

    print(
        "Sum of all subword lengths, quantized to 4 character chunks: "
        f"{stats.quantized_length}"
    )


if __name__ == "__main__":
    main()
