docs for muutils v0.8.10
View Source on GitHub

muutils.jsonlines

utilities for reading and writing jsonlines files, including gzip support


 1"utilities for reading and writing jsonlines files, including gzip support"
 2
 3from __future__ import annotations
 4
 5import gzip
 6import json
 7from typing import Callable, Sequence
 8
 9from muutils.json_serialize import JSONitem
10
# file-name suffixes that mark a path as gzip-compressed
_GZIP_EXTENSIONS: tuple[str, ...] = (".gz", ".gzip")


def _file_is_gzip(path: str) -> bool:
    """Return whether `path` ends with one of the suffixes in `_GZIP_EXTENSIONS`.

    Only the name is inspected (case-sensitively); the file itself is never opened.
    `path` is passed through `str()` before the comparison.
    """
    # `str.endswith` accepts a tuple of suffixes, so no explicit loop is needed
    return str(path).endswith(_GZIP_EXTENSIONS)
16
17
def _get_opener(
    path: str,
    use_gzip: bool | None = None,
) -> Callable:
    """Pick the function used to open `path`.

    When `use_gzip` is `None` the choice is made from the file name via
    `_file_is_gzip(path)`; otherwise the truthiness of `use_gzip` decides.
    Returns `gzip.open` when gzip is selected and the builtin `open` otherwise.
    """
    # `None` means "not specified": fall back to the file-name check
    wants_gzip = _file_is_gzip(path) if use_gzip is None else use_gzip

    # the ignore below is for what appears to be another mypy bug
    # https://github.com/python/mypy/issues/10740
    return gzip.open if wants_gzip else open  # type: ignore
28
29
def jsonl_load(
    path: str,
    /,
    *,
    use_gzip: bool | None = None,
) -> list[JSONitem]:
    """Load a jsonlines file into a list, one parsed JSON value per line.

    The file is opened in text mode as UTF-8 with the opener chosen by
    `_get_opener(path, use_gzip)`. Lines are parsed in file order with `json.loads`.
    Whitespace-only lines are skipped.

    Raises `json.JSONDecodeError` if a non-blank line is not valid JSON.
    """
    opener: Callable = _get_opener(path, use_gzip)

    with opener(path, "rt", encoding="UTF-8") as f:
        # a whitespace-only line holds no JSON value; skipping it keeps a stray blank
        # line (e.g. an extra trailing newline) from aborting the whole load
        return [json.loads(line) for line in f if line.strip()]
44
45
def jsonl_load_log(
    path: str,
    /,
    *,
    use_gzip: bool | None = None,
) -> list[dict]:
    """Load a jsonlines file with `jsonl_load` and require every item to be a dict.

    `path` and `use_gzip` are forwarded to `jsonl_load` unchanged.

    Raises `AssertionError` for the first item that is not a dict; the message
    names the item's index, the file, and the offending value.
    """
    data: list[JSONitem] = jsonl_load(path, use_gzip=use_gzip)
    for idx, item in enumerate(data):
        if not isinstance(item, dict):
            # raised explicitly instead of via `assert` so the check is not
            # stripped when Python runs with `-O`; the exception type is unchanged
            raise AssertionError(
                f"item {idx = } from file {path} is not a dict: {type(item) = }\t{item = }"
            )

    # mypy complains that we are returning a list[JSONitem] but the function signature says list[dict]
    # it can't figure out that every item was checked to be a dict above
    return data  # type: ignore
61
62
def jsonl_write(
    path: str,
    items: Sequence[JSONitem],
    use_gzip: bool | None = None,
    gzip_compresslevel: int = 2,
) -> None:
    """Write `items` to `path` as jsonlines: one `json.dumps(item)` per line.

    The file is opened in text mode as UTF-8 with the opener chosen by
    `_get_opener(path, use_gzip)`. `gzip_compresslevel` is passed to `gzip.open`
    as `compresslevel` whenever the gzip opener is the one selected, and is
    unused otherwise.
    """
    opener: Callable = _get_opener(path, use_gzip)

    opener_kwargs: dict = {}
    # key on the opener actually selected, not on the raw `use_gzip` argument:
    # with `use_gzip=None` and a gzip file name the opener is `gzip.open`, and
    # checking `use_gzip` alone would silently drop the compression level
    if opener is gzip.open:
        opener_kwargs["compresslevel"] = gzip_compresslevel

    with opener(path, "wt", encoding="UTF-8", **opener_kwargs) as f:
        f.writelines(json.dumps(item) + "\n" for item in items)

def jsonl_load( path: str, /, *, use_gzip: bool | None = None) -> list[typing.Union[bool, int, float, str, NoneType, typing.List[typing.Union[bool, int, float, str, NoneType, typing.List[typing.Any], typing.Dict[str, typing.Any]]], typing.Dict[str, typing.Union[bool, int, float, str, NoneType, typing.List[typing.Any], typing.Dict[str, typing.Any]]]]]:
def jsonl_load(
    path: str,
    /,
    *,
    use_gzip: bool | None = None,
) -> list[JSONitem]:
    """Load a jsonlines file into a list, one parsed JSON value per line.

    The file is opened in text mode as UTF-8 with the opener chosen by
    `_get_opener(path, use_gzip)`. Lines are parsed in file order with `json.loads`.
    Whitespace-only lines are skipped.

    Raises `json.JSONDecodeError` if a non-blank line is not valid JSON.
    """
    opener: Callable = _get_opener(path, use_gzip)

    with opener(path, "rt", encoding="UTF-8") as f:
        # a whitespace-only line holds no JSON value; skipping it keeps a stray blank
        # line (e.g. an extra trailing newline) from aborting the whole load
        return [json.loads(line) for line in f if line.strip()]
def jsonl_load_log(path: str, /, *, use_gzip: bool | None = None) -> list[dict]:
def jsonl_load_log(
    path: str,
    /,
    *,
    use_gzip: bool | None = None,
) -> list[dict]:
    """Load a jsonlines file with `jsonl_load` and require every item to be a dict.

    `path` and `use_gzip` are forwarded to `jsonl_load` unchanged.

    Raises `AssertionError` for the first item that is not a dict; the message
    names the item's index, the file, and the offending value.
    """
    data: list[JSONitem] = jsonl_load(path, use_gzip=use_gzip)
    for idx, item in enumerate(data):
        if not isinstance(item, dict):
            # raised explicitly instead of via `assert` so the check is not
            # stripped when Python runs with `-O`; the exception type is unchanged
            raise AssertionError(
                f"item {idx = } from file {path} is not a dict: {type(item) = }\t{item = }"
            )

    # mypy complains that we are returning a list[JSONitem] but the function signature says list[dict]
    # it can't figure out that every item was checked to be a dict above
    return data  # type: ignore
def jsonl_write( path: str, items: Sequence[Union[bool, int, float, str, NoneType, List[Union[bool, int, float, str, NoneType, List[Any], Dict[str, Any]]], Dict[str, Union[bool, int, float, str, NoneType, List[Any], Dict[str, Any]]]]], use_gzip: bool | None = None, gzip_compresslevel: int = 2) -> None:
def jsonl_write(
    path: str,
    items: Sequence[JSONitem],
    use_gzip: bool | None = None,
    gzip_compresslevel: int = 2,
) -> None:
    """Write `items` to `path` as jsonlines: one `json.dumps(item)` per line.

    The file is opened in text mode as UTF-8 with the opener chosen by
    `_get_opener(path, use_gzip)`. `gzip_compresslevel` is passed to `gzip.open`
    as `compresslevel` whenever the gzip opener is the one selected, and is
    unused otherwise.
    """
    opener: Callable = _get_opener(path, use_gzip)

    opener_kwargs: dict = {}
    # key on the opener actually selected, not on the raw `use_gzip` argument:
    # with `use_gzip=None` and a gzip file name the opener is `gzip.open`, and
    # checking `use_gzip` alone would silently drop the compression level
    if opener is gzip.open:
        opener_kwargs["compresslevel"] = gzip_compresslevel

    with opener(path, "wt", encoding="UTF-8", **opener_kwargs) as f:
        f.writelines(json.dumps(item) + "\n" for item in items)