docs for maze-dataset v1.3.0

maze_dataset.tokenization.modular.fst_load

to check if a tokenizer is one of our "approved" ones, look in an fst set we made with rust_fst

this module loads that fst file, which we ship to the user, and checks tokenizer names against it

creating the fst file requires importing get_all_tokenizers and thus MazeTokenizerModular, so the creation code lives in a separate module. the loading code here in maze_dataset.tokenization.modular.fst_load can then be imported from maze_dataset.tokenization.modular.maze_tokenizer_modular without a circular import

thanks to https://github.com/rozbb for suggesting doing this instead of storing a whole bunch of hashes like we were doing before
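
For context on how such a set can be produced (the creation module itself is not shown on this page, and the save_tokenizers_fst wrapper below is a made-up name for illustration): rust_fst can write a set of keys to disk via Set.from_iter, which requires the keys in lexicographic order. A minimal sketch:

from pathlib import Path

from rust_fst import Set as FstSet  # type: ignore[import-untyped]


def save_tokenizers_fst(names: list[str], path: Path) -> FstSet:
	"""illustrative sketch: write the approved tokenizer names to an fst file

	rust_fst requires keys in lexicographic order, hence the sort
	"""
	return FstSet.from_iter(sorted(names), path=path.as_posix())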


 1"""to check if a tokenizer is one of our "approved" ones, look in an fst set we made with `rust_fst`
 2
 3this file handles the creation of this fst file, which we ship to the user
 4
 5this file relies on importing `get_all_tokenizers` and thus `MazeTokenizerModular`.
 6as such, loading this file for validating a tokenizer is the separate `maze_dataset.tokenization.modular.fst_load`
 7module, since we need to be able to import that from `maze_dataset.tokenization.modular.maze_tokenizer_modular` and
 8we cannot circularly import
 9
10thanks to https://github.com/rozbb for suggesting doing this instead of storing a whole bunch of hashes like we were doing before
11
12"""
13
14from functools import cache
15from pathlib import Path
16
17from rust_fst import Set as FstSet  # type: ignore[import-untyped]
18
19MMT_FST_PATH: Path = Path(__file__).parent / "MazeTokenizerModular_tested.fst"
20
21
22@cache
23def get_tokenizers_fst() -> FstSet:
24	"""(cached) load the tokenizers fst set from `MMT_FST_PATH`"""
25	return FstSet(MMT_FST_PATH.as_posix())
26
27
28def check_tokenizer_in_fst(tokenizer_name: str, do_except: bool = False) -> bool:
29	"""check if a tokenizer is in the fst set
30
31	prints nearest matches if `do_except` is `True` and the tokenizer is not found
32	"""
33	search_0: list[str] = list(get_tokenizers_fst().search(tokenizer_name, 0))
34	in_fst: bool = len(search_0) == 1 and search_0[0] == tokenizer_name
35
36	if do_except and not in_fst:
37		search_1: list[str] | None = None
38		search_2: list[str] | None = None
39		try:
40			search_1 = list(get_tokenizers_fst().search(tokenizer_name, 1))
41			search_2 = list(get_tokenizers_fst().search(tokenizer_name, 2))
42		except Exception:  # noqa: BLE001, S110
43			# the only thing failing here is getting possible match tokenizers, so it's fine to just ignore the errors
44			pass
45
46		err_msg: str = (
47			f"Tokenizer `{tokenizer_name}` not found in the list of tested tokenizers, and {do_except = }. We found the following matches based on edit distance:"
48			f"\nedit dist 0 (should be empty?): {search_0}"
49			+ (f"\nedit dist 1: {search_1}" if search_1 is not None else "")
50			+ (f"\nedit dist 2: {search_2}" if search_2 is not None else "")
51		)
52		raise ValueError(err_msg)
53
54	return in_fst
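
Both the exact check and the suggestions go through rust_fst's Levenshtein search: Set.search(term, max_dist) yields every key in the set within the given edit distance of term. A quick illustration with a throwaway set (keys and path made up for the example):

from rust_fst import Set as FstSet  # type: ignore[import-untyped]

# keys must already be in lexicographic order when the set is written
demo: FstSet = FstSet.from_iter(["alpha", "alpine", "beta"], path="/tmp/demo.fst")

print(list(demo.search("alpha", 0)))  # ['alpha'] -- exact match only
print(list(demo.search("alphq", 1)))  # ['alpha'] -- one substitution away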

MMT_FST_PATH: pathlib.Path = PosixPath('/home/miv/projects/mazes/maze-dataset/maze_dataset/tokenization/modular/MazeTokenizerModular_tested.fst')
@cache
def get_tokenizers_fst() -> rust_fst.set.Set:

(cached) load the tokenizers fst set from MMT_FST_PATH
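
Because of the functools.cache decorator, the fst file is read from disk only on the first call; every later call returns the same FstSet object:

s1 = get_tokenizers_fst()
s2 = get_tokenizers_fst()
assert s1 is s2  # cached: the file was only loaded once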

def check_tokenizer_in_fst(tokenizer_name: str, do_except: bool = False) -> bool:

check if a tokenizer is in the fst set

raises a ValueError listing the nearest matches if do_except is True and the tokenizer is not found
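
A short usage sketch (the tokenizer name here is deliberately fake; real names come from the MazeTokenizerModular instances that were tested):

from maze_dataset.tokenization.modular.fst_load import check_tokenizer_in_fst

fake_name: str = "MazeTokenizerModular-definitely-not-tested"  # made-up name

# quiet check: just returns False
assert not check_tokenizer_in_fst(fake_name)

# loud check: raises ValueError listing near matches at edit distance 1 and 2
try:
	check_tokenizer_in_fst(fake_name, do_except=True)
except ValueError as err:
	print(err)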