from math import ceil, floor

from PIL import Image
from unidecode import unidecode
import sentencepiece as spm

IMG_RES = 512  # square image


def get_tokenizer():
    """Load and return the tokenizer whose vocabulary is baked into the LUT.

    The choice of backend is a hard-coded flag. With the flag at its current
    value (True) a SentencePiece model is loaded from a fixed relative path;
    otherwise a Tokenizers-library tokenizer is loaded from a fixed JSON file.

    NOTE(review): get_words() calls get_piece_size() and id_to_piece() on the
    returned object; confirm the Tokenizers-library object offers those
    methods before flipping the flag.
    """
    use_sentencepiece = True

    if use_sentencepiece:
        model_path = "./custom_unigram_tokenizer_65k/unigram.model"
        print(f"Loading SentencePiece tokenizer from: {model_path}")
        processor = spm.SentencePieceProcessor()
        processor.load(model_path)
        print(f"Successfully loaded SentencePiece model. Vocab size: {processor.get_piece_size()}")
        return processor

    # Alternative backend; imported lazily so the dependency is only needed
    # when this path is actually taken.
    from tokenizers import Tokenizer
    tokenizer_json = "./custom_wordpiece_tokenizer_65k/tokenizer.json"
    print(f"Loading Tokenizers library tokenizer from: {tokenizer_json}")
    return Tokenizer.from_file(tokenizer_json)


def get_words():
    """Return the sanitized vocabulary as a list indexed by token id.

    Each piece of the tokenizer's vocabulary is cleaned up as follows:
      * every non-ASCII character other than the SentencePiece space marker is
        passed through unidecode() (a message is printed for each one);
      * the SentencePiece space marker is replaced by a plain ' '.

    One extra entry, the empty string, is appended after the last real token.
    """
    tokenizer = get_tokenizer()
    vocab_size = tokenizer.get_piece_size()
    print(f"vocabulary size: {vocab_size}")

    # sp_space = sentencepiece space.
    # A special character sentencepiece uses to represent spaces before words.
    sp_space = chr(9601)

    words = []
    for token_id in range(vocab_size):
        piece = tokenizer.id_to_piece(token_id)

        fragments = []
        for ch in piece:
            # ASCII characters and the space marker pass through untouched.
            if ch == sp_space or ord(ch) <= 127:
                fragments.append(ch)
                continue
            # Dirty hack: convert non-ASCII characters to nearest ASCII
            # equivalent.
            ascii_ch = unidecode(ch)
            print(f" Resolved {ch} to {ascii_ch}")
            fragments.append(ascii_ch)

        # Replace sp_space with ' ' once the whole piece is assembled.
        words.append("".join(fragments).replace(sp_space, ' '))

    # Special word: empty string. SentencePiece doesn't support this natively.
    words.append('')
    return words


# Fold a flat index into a IMG_RESxIMG_RES box. Return the (x,y) coordinate of
# the folded index.
def fold_idx(flat_idx):
    """Map a flat integer pixel index to its (x, y) coordinate.

    Pixels are laid out row by row in rows of IMG_RES pixels, so x is the
    remainder and y the quotient of flat_idx divided by IMG_RES.
    """
    # Integer divmod avoids the float round-trip of floor(flat_idx / IMG_RES),
    # which stops being exact for very large indices.
    row, col = divmod(flat_idx, IMG_RES)
    return (col, row)


def unfold_idx(coord):
    """Inverse of fold_idx: map an (x, y) coordinate back to a flat index."""
    col, row = coord
    return col + row * IMG_RES


# Round-trip sanity checks; these run whenever the module is loaded.
assert unfold_idx(fold_idx(1533125)) == 1533125
assert unfold_idx(fold_idx(8538235)) == 8538235
assert fold_idx(unfold_idx((192, 235))) == (192, 235)
assert fold_idx(unfold_idx((83, 388))) == (83, 388)


def _quads_needed(word):
    """Return how many RGBA pixels hold `word` at four characters per pixel."""
    return (len(word) + 3) // 4


def generate_lut(words, filename, pretty=False):
    """Pack `words` into an IMG_RES x IMG_RES RGBA lookup texture and save it.

    Texture layout, in flat pixel indices (see fold_idx for the 2D mapping):
      * Header: pixel i (0 <= i < len(words)) describes word i. R, G, B hold
        the low, middle and high byte of the flat index where the word's
        character data starts; A holds the word's length in characters.
      * Data: starts at flat index len(words). Each pixel stores up to four
        characters as their code points in R, G, B, A. The last pixel of a
        word is padded with spaces. A zero-length word occupies no data
        pixels.
    Pixels that are never written stay opaque black.

    Args:
        words: sequence of strings, indexed by token id.
        filename: path the image is saved to.
        pretty: if true, force every pixel's alpha to 255 before saving so the
            image is fully opaque when viewed. This overwrites the lengths and
            every fourth character stored in the alpha channel, so the result
            is for inspection only.

    Raises:
        ValueError: if a word is longer than 255 characters, contains a
            character whose code point exceeds 255, or if the header plus data
            does not fit in the texture / in a 24-bit pointer.
    """
    header_len = len(words)
    quad_counts = [_quads_needed(word) for word in words]

    # Validate everything up front so a bad vocabulary fails with a clear
    # message instead of part-way through writing pixels.
    total_pixels = header_len + sum(quad_counts)
    # Pointers are stored in three 8-bit channels, hence the 24-bit ceiling.
    capacity = min(IMG_RES * IMG_RES, 1 << 24)
    if total_pixels > capacity:
        raise ValueError(
            f"LUT needs {total_pixels} pixels but only {capacity} are addressable"
        )
    for i, word in enumerate(words):
        # The length and each character are each stored in one 8-bit channel.
        if len(word) > 0xFF:
            raise ValueError(
                f"word {i} has {len(word)} characters; at most 255 fit in the header"
            )
        wide = next((c for c in word if ord(c) > 0xFF), None)
        if wide is not None:
            raise ValueError(
                f"word {i} contains {wide!r}, which does not fit in an 8-bit channel"
            )

    # Write the texture header.
    black = (0, 0, 0, 255)
    img = Image.new('RGBA', (IMG_RES, IMG_RES), black)
    pixel_data = img.load()

    # The header is `len(words)` slots long. Thus the actual LUT content starts
    # at the index `len(words)`.
    lut_ptr = header_len
    for i, (word, n_quads) in enumerate(zip(words, quad_counts)):
        # Get pointer to the actual word data.
        tok_ptr = lut_ptr
        tok_len = len(word)
        rgba = ((tok_ptr >> 0) & 0xFF,
                (tok_ptr >> 8) & 0xFF,
                (tok_ptr >> 16) & 0xFF,
                tok_len)
        print(f"Writing {rgba} to {i} / {fold_idx(i)}")
        idx_x, idx_y = fold_idx(i)
        pixel_data[idx_x, idx_y] = rgba

        for j in range(n_quads):
            quad_ptr = tok_ptr + j
            # ljust pads the final, possibly short, chunk with spaces.
            chunk = word[j * 4:j * 4 + 4].ljust(4)
            rgba = tuple(ord(c) for c in chunk)
            idx_x, idx_y = fold_idx(quad_ptr)
            print(f" Writing {rgba} to {quad_ptr} / {fold_idx(quad_ptr)}")
            pixel_data[idx_x, idx_y] = rgba

        # Advance the LUT ptr. Since we store 4 chars per pixel (RGBA), we
        # advance it by ceil(tok_len/4).
        lut_ptr += n_quads

    if pretty:
        for y in range(IMG_RES):
            for x in range(IMG_RES):
                red, green, blue, _ = pixel_data[x, y]
                pixel_data[x, y] = (red, green, blue, 255)

    print(f"Saving to {filename}")
    img.save(filename)


if __name__ == "__main__":
    words = get_words()
    generate_lut(words, "bpe_lut.png")