Coverage for maze_dataset/tokenization/modular/fst_load.py: 33%


1"""to check if a tokenizer is one of our "approved" ones, look in an fst set we made with `rust_fst` 

2 

3this file handles the creation of this fst file, which we ship to the user 

4 

5this file relies on importing `get_all_tokenizers` and thus `MazeTokenizerModular`. 

6as such, loading this file for validating a tokenizer is the separate `maze_dataset.tokenization.modular.fst_load` 

7module, since we need to be able to import that from `maze_dataset.tokenization.modular.maze_tokenizer_modular` and 

8we cannot circularly import 

9 

10thanks to https://github.com/rozbb for suggesting doing this instead of storing a whole bunch of hashes like we were doing before 

11 

12""" 

from functools import cache
from pathlib import Path

from rust_fst import Set as FstSet  # type: ignore[import-untyped]

MMT_FST_PATH: Path = Path(__file__).parent / "MazeTokenizerModular_tested.fst"


@cache
def get_tokenizers_fst() -> FstSet:
	"""(cached) load the tokenizers fst set from `MMT_FST_PATH`"""
	return FstSet(MMT_FST_PATH.as_posix())

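
# illustrative sketch only, not part of this module: the real creation code lives
# in a separate module, since it must import `get_all_tokenizers`. this just shows
# the `rust_fst` call involved; fst construction requires the keys in sorted order.
# `_example_write_fst` and its arguments are hypothetical names.
def _example_write_fst(tokenizer_names: list[str], path: Path) -> None:
	"""(hypothetical) write an fst set like `MazeTokenizerModular_tested.fst`"""
	FstSet.from_iter(sorted(tokenizer_names), path=path.as_posix())
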

def check_tokenizer_in_fst(tokenizer_name: str, do_except: bool = False) -> bool:
	"""check if a tokenizer is in the fst set

	raises a `ValueError` listing the nearest matches if `do_except` is `True`
	and the tokenizer is not found
	"""
	search_0: list[str] = list(get_tokenizers_fst().search(tokenizer_name, 0))
	in_fst: bool = len(search_0) == 1 and search_0[0] == tokenizer_name

	if do_except and not in_fst:
		search_1: list[str] | None = None
		search_2: list[str] | None = None
		try:
			search_1 = list(get_tokenizers_fst().search(tokenizer_name, 1))
			search_2 = list(get_tokenizers_fst().search(tokenizer_name, 2))
		except Exception:  # noqa: BLE001, S110
			# the only thing failing here is getting possible match tokenizers,
			# so it's fine to just ignore the errors
			pass

		err_msg: str = (
			f"Tokenizer `{tokenizer_name}` not found in the list of tested tokenizers, and {do_except = }. We found the following matches based on edit distance:"
			f"\nedit dist 0 (should be empty?): {search_0}"
			+ (f"\nedit dist 1: {search_1}" if search_1 is not None else "")
			+ (f"\nedit dist 2: {search_2}" if search_2 is not None else "")
		)
		raise ValueError(err_msg)

	return in_fst
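

# usage sketch: with `do_except=False` a miss simply returns `False`; the
# tokenizer name below is a hypothetical placeholder, not a real tested one
if __name__ == "__main__":
	import sys

	name: str = sys.argv[1] if len(sys.argv) > 1 else "MazeTokenizerModular-hypothetical"
	print(f"{name!r} in tested set: {check_tokenizer_in_fst(name)}")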