diff options
| author | yum <yum.food.vr@gmail.com> | 2025-07-23 22:39:45 -0700 |
|---|---|---|
| committer | yum <yum.food.vr@gmail.com> | 2025-07-23 22:39:45 -0700 |
| commit | f6b93a20d754579008076e85f5c0a97e1bcbc258 (patch) | |
| tree | 7288699d6f22e76c4f30636a37e94265b3ef7708 /tokenize_me.py | |
| parent | f3782c200c9a2ec2b77708da67b4127a38465ad1 (diff) | |
| parent | 043a447133695bfd2285a534b941db972873a692 (diff) | |
Import FastTextPager repo
Diffstat (limited to 'tokenize_me.py')
| -rw-r--r-- | tokenize_me.py | 28 |
1 files changed, 28 insertions, 0 deletions
"""Tokenize a string with a SentencePiece model and print the result.

Prints the token pieces, the token IDs, and every token ID split into a
(high byte, low byte) pair.
"""
import argparse

import sentencepiece as spm

# Model used when the caller does not supply a path.
DEFAULT_MODEL_PATH = "./custom_unigram_tokenizer_65k/unigram.model"


def get_tokenizer(model_path=DEFAULT_MODEL_PATH):
    """Load a SentencePiece model and return the processor.

    Args:
        model_path: Path to the SentencePiece ``.model`` file. Defaults to
            ``DEFAULT_MODEL_PATH``.

    Returns:
        The ``SentencePieceProcessor`` that ``model_path`` was loaded into.

    Load failures are not handled here; whatever ``load`` does for a
    missing or invalid model file reaches the caller unchanged.
    """
    print(f"Loading SentencePiece tokenizer from: {model_path}")
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)
    print(f"Successfully loaded SentencePiece model. Vocab size: {sp.get_piece_size()}")
    return sp


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        The parsed namespace, with ``text`` (the string to tokenize) and
        ``model`` (the model path).
    """
    parser = argparse.ArgumentParser(description="Tokenize a given string using a SentencePiece model.")
    parser.add_argument("text", type=str, help="The string to tokenize.")
    parser.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MODEL_PATH,
        help="Path to the SentencePiece model file (default: %(default)s).",
    )
    return parser.parse_args(argv)


def split_token_ids(token_ids):
    """Split each token ID into a ``(tid >> 8, tid & 0xFF)`` pair.

    The second element is always the low 8 bits. The first is everything
    above them, so it is a single byte only while the ID is below 65536.
    NOTE(review): the default model directory name suggests a 65k
    vocabulary, which would keep IDs in that range -- confirm.
    """
    return [(tid >> 8, tid & 0xFF) for tid in token_ids]


def main(argv=None):
    """Tokenize the text given on the command line and print the results."""
    args = parse_args(argv)
    tok = get_tokenizer(args.model)

    tokens = tok.encode_as_pieces(args.text)
    print("Tokens:", tokens)

    token_ids = tok.encode_as_ids(args.text)
    print("Token IDs:", token_ids)

    byte_pairs = split_token_ids(token_ids)
    print("Token ID Byte Pairs:", byte_pairs)


# Only run the CLI when executed as a script, so that importing this module
# does not parse arguments or load the model.
if __name__ == "__main__":
    main()
