Coverage for maze_dataset/tokenization/modular/fst_load.py: 33% (21 statements)
1"""to check if a tokenizer is one of our "approved" ones, look in an fst set we made with `rust_fst`
3this file handles the creation of this fst file, which we ship to the user
5this file relies on importing `get_all_tokenizers` and thus `MazeTokenizerModular`.
6as such, loading this file for validating a tokenizer is the separate `maze_dataset.tokenization.modular.fst_load`
7module, since we need to be able to import that from `maze_dataset.tokenization.modular.maze_tokenizer_modular` and
8we cannot circularly import
10thanks to https://github.com/rozbb for suggesting doing this instead of storing a whole bunch of hashes like we were doing before
12"""

from functools import cache
from pathlib import Path

from rust_fst import Set as FstSet  # type: ignore[import-untyped]

MMT_FST_PATH: Path = Path(__file__).parent / "MazeTokenizerModular_tested.fst"

@cache
def get_tokenizers_fst() -> FstSet:
    """(cached) load the tokenizers fst set from `MMT_FST_PATH`"""
    return FstSet(MMT_FST_PATH.as_posix())
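
# note: `FstSet.search(term, max_dist)` iterates over all keys in the set within
# `max_dist` Levenshtein distance of `term`, so passing `max_dist=0` performs an
# exact-membership lookup, e.g. (hypothetical tokenizer name):
#
#     hits: list[str] = list(get_tokenizers_fst().search("some-tokenizer-name", 0))
#     assert hits in (["some-tokenizer-name"], [])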

def check_tokenizer_in_fst(tokenizer_name: str, do_except: bool = False) -> bool:
    """check if a tokenizer name is in the fst set of tested tokenizers

    raises a `ValueError` listing the nearest matches (by edit distance) if
    `do_except` is `True` and the tokenizer is not found
    """
    search_0: list[str] = list(get_tokenizers_fst().search(tokenizer_name, 0))
    in_fst: bool = len(search_0) == 1 and search_0[0] == tokenizer_name

    if do_except and not in_fst:
        search_1: list[str] | None = None
        search_2: list[str] | None = None
        try:
            search_1 = list(get_tokenizers_fst().search(tokenizer_name, 1))
            search_2 = list(get_tokenizers_fst().search(tokenizer_name, 2))
        except Exception:  # noqa: BLE001, S110
            # the only thing that can fail here is looking up fuzzy matches for
            # the error message, so it's fine to ignore any errors
            pass

        err_msg: str = (
            f"Tokenizer `{tokenizer_name}` not found in the list of tested tokenizers, and {do_except = }."
            " We found the following matches based on edit distance:"
            f"\nedit dist 0 (should be empty?): {search_0}"
            + (f"\nedit dist 1: {search_1}" if search_1 is not None else "")
            + (f"\nedit dist 2: {search_2}" if search_2 is not None else "")
        )
        raise ValueError(err_msg)

    return in_fst
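
# hypothetical usage (the exact call site is an assumption, but per the module
# docstring, validation happens in `maze_dataset.tokenization.modular.maze_tokenizer_modular`,
# which can import this lightweight module without a circular import):
#
#     check_tokenizer_in_fst(tokenizer.name, do_except=True)  # raises if untested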