Coverage for tests / unit / test_jsonlines.py: 100%

95 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-18 02:51 -0700

1from __future__ import annotations 

2 

3import gzip 

4import json 

5from pathlib import Path 

6 

7import pytest 

8 

9from muutils.json_serialize import JSONitem 

10from muutils.jsonlines import jsonl_load, jsonl_load_log, jsonl_write 

11 

12TEMP_PATH: Path = Path("tests/_temp/jsonl") 

13 

14 

def test_jsonl_load():
    """Hand-write a jsonlines file, read it with ``jsonl_load``, and compare."""
    TEMP_PATH.mkdir(parents=True, exist_ok=True)
    target = TEMP_PATH / "test_load.jsonl"

    records = [
        {"id": 1, "name": "Alice", "value": 42.5},
        {"id": 2, "name": "Bob", "value": 17.3},
        {"id": 3, "name": "Charlie", "value": None},
        {"list": [1, 2, 3], "nested": {"a": 1, "b": 2}},
    ]

    # The file is produced with plain ``json.dumps`` (one document per line)
    # rather than ``jsonl_write``, so this test exercises the loader only.
    with open(target, "w", encoding="UTF-8") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in records)

    result = jsonl_load(str(target))

    # Whole-payload equality first, then a few targeted spot checks.
    assert result == records
    assert len(result) == 4

    first = result[0]
    assert isinstance(first, dict)
    assert first["name"] == "Alice"  # ty: ignore[invalid-argument-type]

    last = result[3]
    assert isinstance(last, dict)
    nested = last["nested"]  # ty: ignore[invalid-argument-type]
    assert isinstance(nested, dict)
    assert nested["b"] == 2

49 

50 

def test_jsonl_write():
    """Write records with ``jsonl_write`` and inspect the raw file line by line."""
    TEMP_PATH.mkdir(parents=True, exist_ok=True)
    target = TEMP_PATH / "test_write.jsonl"

    records: list[JSONitem] = [
        {"id": 1, "status": "active"},
        {"id": 2, "status": "inactive"},
        {"id": 3, "status": "pending", "metadata": {"priority": "high"}},
    ]

    jsonl_write(str(target), records)

    # Read the file back without going through the library's loader.
    with open(target, "r", encoding="UTF-8") as handle:
        raw_lines = handle.readlines()

    assert len(raw_lines) == 3

    # Every line must be newline-terminated and decode to the matching record.
    for raw, expected in zip(raw_lines, records):
        assert raw.endswith("\n")
        decoded = json.loads(raw)
        assert decoded == expected

    # Spot-check the first and last lines explicitly.
    first_decoded = json.loads(raw_lines[0])
    assert first_decoded == {"id": 1, "status": "active"}
    last_decoded = json.loads(raw_lines[2])
    assert last_decoded["metadata"]["priority"] == "high"

84 

85 

def test_gzip_support():
    """Check gzip handling via ``.gz`` / ``.gzip`` suffixes and via explicit ``use_gzip``."""
    TEMP_PATH.mkdir(parents=True, exist_ok=True)

    records: list[JSONitem] = [
        {"compressed": True, "value": 123},
        {"compressed": True, "value": 456},
    ]

    # --- ".gz" suffix ---
    gz_path = TEMP_PATH / "test_gzip.jsonl.gz"
    jsonl_write(str(gz_path), records)

    # Opening with the gzip module directly confirms the file is really compressed.
    with gzip.open(gz_path, "rt", encoding="UTF-8") as handle:
        gz_lines = handle.readlines()
    assert len(gz_lines) == 2

    from_gz = jsonl_load(str(gz_path))
    assert from_gz == records

    # --- ".gzip" suffix ---
    gzip_path = TEMP_PATH / "test_gzip2.jsonl.gzip"
    jsonl_write(str(gzip_path), records)
    from_gzip = jsonl_load(str(gzip_path))
    assert from_gzip == records

    # --- explicit use_gzip on a path with no gzip suffix ---
    explicit_path = TEMP_PATH / "test_explicit.jsonl"
    jsonl_write(str(explicit_path), records, use_gzip=True)

    with gzip.open(explicit_path, "rt", encoding="UTF-8") as handle:
        explicit_lines = handle.readlines()
    assert len(explicit_lines) == 2

    from_explicit = jsonl_load(str(explicit_path), use_gzip=True)
    assert from_explicit == records

128 

129 

def test_jsonl_load_log():
    """Check ``jsonl_load_log`` accepts all-dict files and rejects non-dict lines."""
    TEMP_PATH.mkdir(parents=True, exist_ok=True)

    # --- accepted: every entry is a dict ---
    good_path = TEMP_PATH / "test_log_valid.jsonl"
    good_entries: list[JSONitem] = [
        {"level": "INFO", "message": "Starting process"},
        {"level": "WARNING", "message": "Low memory"},
        {"level": "ERROR", "message": "Connection failed"},
    ]

    jsonl_write(str(good_path), good_entries)
    parsed = jsonl_load_log(str(good_path))

    assert parsed == good_entries
    assert all(isinstance(entry, dict) for entry in parsed)

    # --- rejected: the item at index 1 is not a dict (a string, then a list) ---
    rejected_cases: list[tuple[str, list[JSONitem]]] = [
        (
            "test_log_invalid.jsonl",
            [
                {"level": "INFO", "message": "Valid entry"},
                "not a dict",
                {"level": "ERROR", "message": "Another valid entry"},
            ],
        ),
        (
            "test_log_list.jsonl",
            [
                {"level": "INFO"},
                [1, 2, 3],
            ],
        ),
    ]

    for filename, payload in rejected_cases:
        bad_path = TEMP_PATH / filename
        jsonl_write(str(bad_path), payload)

        with pytest.raises(AssertionError) as exc_info:
            jsonl_load_log(str(bad_path))

        # The assertion message should name the offending index and the problem.
        message = str(exc_info.value)
        assert "idx = 1" in message
        assert "is not a dict" in message

182 

183 

def test_gzip_compresslevel():
    """Check that ``gzip_compresslevel`` is accepted and the data still round-trips."""
    TEMP_PATH.mkdir(parents=True, exist_ok=True)
    target = TEMP_PATH / "test_compresslevel.jsonl.gz"

    records: list[JSONitem] = [{"value": idx, "data": "content"} for idx in range(10)]

    # Write with level 1, then level 9, to the same file; each write must read back intact.
    for level in (1, 9):
        jsonl_write(str(target), records, gzip_compresslevel=level)
        round_tripped = jsonl_load(str(target))
        assert round_tripped == records