blob: 74ee08fdf8ce61b7556d0e1e3ad3c54a379e6a01 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
|
import math
import sentencepiece as spm
def get_tokenizer(model_path="./custom_unigram_tokenizer_65k/unigram.model"):
    """Load a SentencePiece model and return the processor.

    Args:
        model_path: Path of the SentencePiece model file to load. Defaults to
            the path this script originally hard-coded, so existing
            zero-argument calls behave exactly as before.

    Returns:
        The ``spm.SentencePieceProcessor`` instance after ``load`` has been
        called on it with ``model_path``.
    """
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)
    return sp
# Load the tokenizer once; the rest of the script inspects its vocabulary.
tokenizer = get_tokenizer()
print(f"vocabulary size: {tokenizer.get_piece_size()}")

# SentencePiece marks "a space precedes this subword" with U+2581
# (LOWER ONE EIGHTH BLOCK, code point 9601).
sp_space = "\u2581"

# Running total of the subword lengths in the vocabulary, where each length
# is rounded up to a multiple of 4 characters before being added.
vocab_len_4c_quantized = 0

# Maps a subword length to the number of vocabulary pieces of that length.
subword_len_histo = {}

# Pieces that contain at least one non-ASCII character other than U+2581.
tokens_with_non_ascii = set()
# Visit every piece of the vocabulary once: dump it, record whether it holds
# non-ASCII characters, and fold its length into the histogram and into the
# 4-character-quantized length total.
for piece_id in range(tokenizer.get_piece_size()):
    piece = tokenizer.id_to_piece(piece_id)
    code_points = " ".join(str(ord(ch)) for ch in piece)
    print(f" Original token ({piece_id}): {piece!r} ({code_points})")

    # The SentencePiece space marker is itself non-ASCII, so it is excluded
    # from the check; any other character above 127 flags the piece.
    if any(ord(ch) > 127 and ch != sp_space for ch in piece):
        tokens_with_non_ascii.add(piece)

    # Pieces that start with the space marker are displayed with every marker
    # replaced by a plain space; all other pieces are displayed verbatim.
    if piece.startswith(sp_space):
        shown = piece.replace(sp_space, ' ')
    else:
        shown = piece

    # The replacement is one character for one character, so the displayed
    # length equals the raw piece length.
    piece_len = len(shown)
    subword_len_histo[piece_len] = subword_len_histo.get(piece_len, 0) + 1

    # Round the length up to the next multiple of 4. The product is kept as a
    # float so the total printed at the end of the script is unchanged.
    vocab_len_4c_quantized += math.ceil(piece_len / 4) * 4.0
    print(f" {piece_id}: {shown}")
# Share of vocabulary pieces that contain non-ASCII characters. Guarded
# against an empty vocabulary, consistent with the guard on the average
# below, so an empty model reports 0.00% instead of raising ZeroDivisionError.
vocab_size = tokenizer.get_piece_size()
if vocab_size:
    non_ascii_pct = 100 * len(tokens_with_non_ascii) / vocab_size
else:
    non_ascii_pct = 0.0
# The percent sign belongs inside the parentheses, next to the number.
print(f"Num tokens with non-ascii: {len(tokens_with_non_ascii)} ({non_ascii_pct:.2f}%)")

# Histogram rows in ascending order of subword length.
print("Subword length histogram:")
for k_len, v_count in sorted(subword_len_histo.items()):
    print(f" {k_len}: {v_count}")

# Mean subword length, weighted by how many pieces have each length.
total_pieces_for_avg = sum(subword_len_histo.values())
avg_subword_len = 0
if total_pieces_for_avg > 0:
    total_chars = sum(length * count for length, count in subword_len_histo.items())
    avg_subword_len = total_chars / total_pieces_for_avg
    print(f"Average subword length: {avg_subword_len:.4f}")
else:
    print("Average subword length: N/A (no pieces analyzed)")

print(f"Sum of all subword lengths, quantized to 4 character chunks: {vocab_len_4c_quantized}")
|