Coverage for muutils/jsonlines.py: 0%

32 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2025-04-04 03:33 -0600

1"utilities for reading and writing jsonlines files, including gzip support" 

2 

3from __future__ import annotations 

4 

5import gzip 

6import json 

7from typing import Callable, Sequence 

8 

9from muutils.json_serialize import JSONitem 

10 

# File-name suffixes that mark a path as gzip-compressed.
_GZIP_EXTENSIONS: tuple = (".gz", ".gzip")


def _file_is_gzip(path: str) -> bool:
    """Return True when `path` ends with one of the suffixes in `_GZIP_EXTENSIONS`.

    This is a case-sensitive suffix match on `str(path)`; the file itself is
    never opened or inspected.
    """
    # `str.endswith` accepts a tuple of suffixes, so no explicit loop is needed
    return str(path).endswith(_GZIP_EXTENSIONS)

16 

17 

def _get_opener(
    path: str,
    use_gzip: bool | None = None,
) -> Callable:
    """Pick the callable used to open `path`: `gzip.open` or the builtin `open`.

    When `use_gzip` is `None` the decision is delegated to `_file_is_gzip(path)`;
    any other value is used as given (by truthiness).
    """
    gzip_wanted = _file_is_gzip(path) if use_gzip is None else use_gzip

    # appears to be another mypy bug
    # https://github.com/python/mypy/issues/10740
    if gzip_wanted:
        return gzip.open  # type: ignore
    return open  # type: ignore

28 

29 

def jsonl_load(
    path: str,
    /,
    *,
    use_gzip: bool | None = None,
) -> list[JSONitem]:
    """Read a jsonlines file and return its parsed lines, in file order.

    The file is opened as UTF-8 text with the opener chosen by
    `_get_opener(path, use_gzip)`, and every line is passed to `json.loads`.
    """
    open_fn: Callable = _get_opener(path, use_gzip)

    with open_fn(path, "rt", encoding="UTF-8") as handle:
        return [json.loads(raw_line) for raw_line in handle]

44 

45 

def jsonl_load_log(
    path: str,
    /,
    *,
    use_gzip: bool | None = None,
) -> list[dict]:
    """Load a jsonlines file with `jsonl_load` and require every item to be a dict.

    `path` and `use_gzip` are forwarded unchanged to `jsonl_load`.

    Raises `AssertionError` for the first parsed item that is not a `dict`.
    """
    data: list[JSONitem] = jsonl_load(path, use_gzip=use_gzip)
    for idx, item in enumerate(data):
        # explicit raise rather than an `assert` statement, so the check is not
        # stripped when running under `python -O`; the exception type is kept as
        # `AssertionError` so existing callers see the same error
        if not isinstance(item, dict):
            raise AssertionError(
                f"item {idx = } from file {path} is not a dict: {type(item) = }\t{item = }"
            )

    # mypy complains that we are returning a list[JSONitem] but the function signature says list[dict]
    # it can't figure out that we have checked that all items are dicts
    return data  # type: ignore

61 

62 

def jsonl_write(
    path: str,
    items: Sequence[JSONitem],
    use_gzip: bool | None = None,
    gzip_compresslevel: int = 2,
) -> None:
    """Write `items` to `path` as jsonlines: one `json.dumps` result per line, UTF-8 text.

    The opener comes from `_get_opener(path, use_gzip)`. Whenever that opener is
    `gzip.open`, `gzip_compresslevel` is passed to it as `compresslevel`; this
    includes the case where `use_gzip` is left as `None` and gzip was selected
    from the path.
    """
    opener: Callable = _get_opener(path, use_gzip)

    # Key the extra kwarg on the opener actually selected, not on the raw
    # `use_gzip` argument: with `use_gzip=None` the opener may still be
    # `gzip.open`, and `gzip_compresslevel` must not be silently dropped.
    # The builtin `open` does not accept `compresslevel`, so it gets no extras.
    opener_kwargs: dict = dict()
    if opener is gzip.open:
        opener_kwargs = dict(compresslevel=gzip_compresslevel)

    with opener(path, "wt", encoding="UTF-8", **opener_kwargs) as f:
        for item in items:
            f.write(json.dumps(item) + "\n")