Coverage for muutils/web/bundle_html.py: 90%

147 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2025-06-06 11:28 -0600

1""" 

2Inline / bundle external assets (CSS, JS, SVG, PNG) into an HTML document. 

3 

4Default mode uses **zero external dependencies** and a few well-targeted 

5regular expressions. If you install *beautifulsoup4* you can enable the 

6far more robust BS4 mode by passing `InlineConfig(use_bs4=True)`. 

7""" 

8 

9from __future__ import annotations 

10 

11import base64 

12import re 

13import urllib.request 

14import warnings 

15from dataclasses import dataclass, field 

16from pathlib import Path 

17from typing import Final, Literal 

18 

19# bs4 import deferred to avoid an unconditional dependency. 

20 

21# constants 

22# --------------------------------------------------------------------- 

23 

# File extensions the inliner recognises.
AssetExt = Literal[".css", ".js", ".svg", ".png"]

# MIME type associated with each supported extension
# (looked up when an asset is turned into a `data:` URI).
MIME_BY_EXT: Final[dict[AssetExt, str]] = {
    ".css": "text/css",
    ".js": "application/javascript",
    ".svg": "image/svg+xml",
    ".png": "image/png",
}

# Extensions that are inlined unless the caller narrows the set.
DEFAULT_ALLOWED_EXTENSIONS: Final[set[AssetExt]] = {".png", ".svg", ".js", ".css"}

# Default mapping *tag -> attribute holding the asset reference*.
DEFAULT_TAG_ATTR: Final[dict[str, str]] = dict(
    link="href",  # <link rel="stylesheet" href="...">
    script="src",  # <script src="..."></script>
    img="src",  # <img src="...">
    use="xlink:href",  # <use xlink:href="sprite.svg#id">
)

41 

42# Configuration 

43# --------------------------------------------------------------------- 

44 

45 

@dataclass
class InlineConfig:
    """Options controlling which assets get inlined and how.

    # Parameters
     - `allowed_extensions : set[AssetExt]`
        Only assets whose file extension is in this set are inlined.
        (default: a fresh copy of `DEFAULT_ALLOWED_EXTENSIONS`)
     - `tag_attr : dict[str, str]`
        Maps each tag name to the attribute carrying the asset reference.
        (default: a fresh copy of `DEFAULT_TAG_ATTR`)
     - `max_bytes : int`
        Size limit; assets bigger than this are left untouched.
        (default: `128 * 1024`)
     - `local : bool`
        Whether assets on the local filesystem may be inlined.
        (default: `True`)
     - `remote : bool`
        Whether http/https assets may be fetched and inlined.
        (default: `False`)
     - `include_filename_comments : bool`
        Wrap each replacement in `<!-- begin '...' -->` /
        `<!-- end '...' -->` comments.
        (default: `True`)
     - `use_bs4 : bool`
        Parse with BeautifulSoup instead of regular expressions.
        (default: `False`)
    """

    # Each instance gets its own copy so the module-level defaults are
    # never mutated through a config object.
    allowed_extensions: set[AssetExt] = field(
        default_factory=lambda: DEFAULT_ALLOWED_EXTENSIONS.copy()
    )
    tag_attr: dict[str, str] = field(default_factory=lambda: DEFAULT_TAG_ATTR.copy())
    max_bytes: int = 131072  # 128 KiB
    local: bool = True
    remote: bool = False
    include_filename_comments: bool = True
    use_bs4: bool = False

77 

78 

79# Low-level helpers 

80# --------------------------------------------------------------------- 

81 

82 

83def _is_remote(url: str) -> bool: 

84 """Return *True* if *url* starts with http:// or https://.""" 

85 return url.lower().startswith(("http://", "https://")) 

86 

87 

def _fetch_bytes(src: str, base: Path) -> bytes:
    """Load the raw bytes of the asset referenced by *src*.

    # Parameters
     - `src : str`
        Either an http(s) URL or a filesystem path.
     - `base : Path`
        Directory against which a non-remote *src* is resolved
        (unused for remote sources).

    # Returns
     - `bytes`
        The full content of the asset.
    """
    if not _is_remote(src):
        # local asset: resolve against the base directory
        return (base / src).read_bytes()

    with urllib.request.urlopen(src) as response:
        payload: bytes = response.read()
    return payload

94 

95 

96def _decode_text(buf: bytes) -> str: 

97 """Decode *buf* as UTF-8, falling back to replacement.""" 

98 try: 

99 return buf.decode() 

100 except UnicodeDecodeError: 

101 return buf.decode("utf-8", "replace") 

102 

103 

104# Regex-based implementation (no deps) 

105# --------------------------------------------------------------------- 

106 

107 

108def _apply_indent(html: str, start: int, replacement: str) -> str: 

109 """Indent *replacement* to match the line that starts at *start*.""" 

110 line_start: int = html.rfind("\n", 0, start) + 1 

111 indent: str = html[line_start:start] 

112 return "\n".join(indent + line for line in replacement.splitlines()) 

113 

114 

def _inline_with_regex(html: str, base: Path, cfg: InlineConfig) -> str:
    """Inline assets using pure-regex parsing (no third-party libs).

    For every *tag -> attribute* pair in `cfg.tag_attr`, each matching tag
    whose reference passes the extension / local / remote / size checks is
    rewritten:

    - `.css` / `.js` assets replace the whole tag with an inline
      `<style>` / `<script>` element holding the decoded file content;
    - other allowed assets (`.svg`, `.png`) keep the tag and have the
      attribute value replaced by a base64 `data:` URI.

    # Parameters
     - `html : str`
        Raw HTML text.
     - `base : Path`
        Directory used to resolve relative asset paths.
     - `cfg : InlineConfig`
        Inlining options.

    # Returns
     - `str`
        HTML with the permitted assets inlined. Assets that cannot be
        fetched produce a warning and are left untouched; assets larger
        than `cfg.max_bytes` are left untouched silently.
    """
    tag: str
    attr: str
    for tag, attr in cfg.tag_attr.items():
        # Tag and attribute names come from configuration; escape them so
        # they are matched literally instead of being read as regex syntax.
        pattern: str = (
            rf"<{re.escape(tag)}\b[^>]*\s{re.escape(attr)}"
            r"\s*=\s*['\"]([^'\"]+)['\"][^>]*>"
        )
        if tag == "script":
            # only external scripts with an empty body are candidates
            pattern += r"\s*</script>"

        matches: list[re.Match[str]] = list(re.finditer(pattern, html, re.IGNORECASE))
        m: re.Match[str]
        # Walk backwards so that splicing never shifts the offsets of the
        # matches still to be processed.
        for m in reversed(matches):
            raw_src: str = m.group(1)  # may contain ?query or #fragment
            clean_src: str = re.split(r"[?#]", raw_src, maxsplit=1)[0]  # file path only
            ext: str = Path(clean_src).suffix.lower()

            if ext not in cfg.allowed_extensions:
                continue
            if _is_remote(clean_src) and not cfg.remote:
                continue
            if not _is_remote(clean_src) and not cfg.local:
                continue

            try:
                data: bytes = _fetch_bytes(clean_src, base)
            except Exception as err:
                # best effort: a missing/unreadable asset must not abort the run
                warnings.warn(f"skip '{raw_src}': {err}")
                continue

            if len(data) > cfg.max_bytes:
                continue

            # build replacement
            replacement: str
            if ext in {".css", ".js"}:
                tag_name: str = "style" if ext == ".css" else "script"
                replacement = f"<{tag_name}>\n{_decode_text(data)}\n</{tag_name}>"
            else:  # .svg or .png
                b64: str = base64.b64encode(data).decode()
                # TYPING: we checked earlier, ext is for sure in MIME_BY_EXT
                data_uri: str = f"data:{MIME_BY_EXT[ext]};base64,{b64}"  # type: ignore[index]
                # Splice by the captured group's span rather than by text
                # search: the same string may also occur earlier in the tag
                # (e.g. in an `alt` attribute) and must not be touched.
                whole: str = m.group(0)
                value_start: int = m.start(1) - m.start()
                value_end: int = m.end(1) - m.start()
                replacement = whole[:value_start] + data_uri + whole[value_end:]

            if cfg.include_filename_comments:
                replacement = f"<!-- begin '{clean_src}' -->\n{replacement}\n<!-- end '{clean_src}' -->"

            replacement = _apply_indent(html, m.start(), replacement)
            html = html[: m.start()] + replacement + html[m.end() :]

    return html

171 

172 

173# BeautifulSoup-based implementation (optional) 

174# --------------------------------------------------------------------- 

175 

176 

def _inline_with_bs4(html: str, base: Path, cfg: InlineConfig) -> str:
    """Inline assets using BeautifulSoup when available.

    Every tag named in `cfg.tag_attr` is inspected; when its reference
    passes the extension / local / remote / size checks it is rewritten:

    - `.css` / `.js` assets: the tag is replaced by a new `<style>` /
      `<script>` element holding the decoded file content;
    - other allowed assets (`.svg`, `.png`): the attribute value is
      replaced by a base64 `data:` URI.

    # Parameters
     - `html : str`
        Raw HTML text.
     - `base : Path`
        Directory used to resolve relative asset paths.
     - `cfg : InlineConfig`
        Inlining options.

    # Returns
     - `str`
        The serialized, modified document.

    # Raises
     - `RuntimeError`
        If *beautifulsoup4* is not installed.
    """
    try:
        from bs4 import BeautifulSoup, Comment, Tag
    except ModuleNotFoundError as exc:  # pragma: no cover
        raise RuntimeError("BeautifulSoup requested but not installed") from exc

    soup: BeautifulSoup = BeautifulSoup(html, "html.parser")

    tag: Tag  # TYPING: mypy types find_all() results as PageElement (Tag is a subclass)
    # Snapshot the result list: the tree is modified while iterating.
    for tag in list(soup.find_all(list(cfg.tag_attr))):  # type: ignore[assignment]
        attr: str = cfg.tag_attr[tag.name]
        attr_value = tag.get(attr)
        # The value may be missing, empty, or (for multi-valued attributes)
        # a list; only a non-empty string can be an asset reference.
        if not isinstance(attr_value, str) or not attr_value:
            continue
        src_full: str = attr_value

        # `maxsplit` by keyword: passing it positionally is deprecated
        clean_src: str = re.split(r"[?#]", src_full, maxsplit=1)[0]
        ext: str = Path(clean_src).suffix.lower()

        if ext not in cfg.allowed_extensions:
            continue
        if _is_remote(clean_src) and not cfg.remote:
            continue
        if not _is_remote(clean_src) and not cfg.local:
            continue

        try:
            data: bytes = _fetch_bytes(clean_src, base)
        except Exception as err:
            # best effort: a missing/unreadable asset must not abort the run
            warnings.warn(f"skip '{src_full}': {err}")
            continue

        if len(data) > cfg.max_bytes:
            continue

        if ext in {".css", ".js"}:
            new_tag: Tag = soup.new_tag("style" if ext == ".css" else "script")
            new_tag.string = _decode_text(data)
            if cfg.include_filename_comments:
                # place the markers around the old tag, then swap it out
                tag.insert_before(Comment(f" begin '{src_full}' "))
                tag.insert_after(Comment(f" end '{src_full}' "))
            tag.replace_with(new_tag)
        else:  # .svg or .png
            b64: str = base64.b64encode(data).decode()
            # we are sure ext is in MIME_BY_EXT, so ignore type error
            tag[attr] = f"data:{MIME_BY_EXT[ext]};base64,{b64}"  # type: ignore[index]
            if cfg.include_filename_comments:
                tag.insert_before(Comment(f" begin '{src_full}' "))
                tag.insert_after(Comment(f" end '{src_full}' "))

    return str(soup)

229 

230 

231# Public API 

232# --------------------------------------------------------------------- 

233 

234 

def inline_html_assets(
    html: str,
    *,
    base_path: Path,
    config: InlineConfig | None = None,
    prettify: bool = False,  # kept for API compatibility (ignored in regex mode)
) -> str:
    """Return *html* with its permitted external assets embedded.

    # Parameters
     - `html : str`
        HTML source text.
     - `base_path : Path`
        Directory against which relative asset paths are resolved.
     - `config : InlineConfig | None`
        Inlining options; a default `InlineConfig()` is used when omitted.
        (default: `None`)
     - `prettify : bool`
        Pretty-print the result. Only has an effect when the config
        selects BS4 mode.
        (default: `False`)

    # Returns
     - `str`
        The rewritten HTML.
    """
    cfg: InlineConfig = config if config else InlineConfig()

    if not cfg.use_bs4:
        # dependency-free path; `prettify` is intentionally ignored here
        return _inline_with_regex(html, base_path, cfg)

    result: str = _inline_with_bs4(html, base_path, cfg)
    if prettify:
        # imported lazily so bs4 stays an optional dependency
        from bs4 import BeautifulSoup

        # TYPING: .prettify() returns str when no encoding is given
        result = str(BeautifulSoup(result, "html.parser").prettify())
    return result

270 

271 

def inline_html_file(
    html_path: Path,
    output_path: Path,
    base_path: Path | None = None,
    config: InlineConfig | None = None,
    prettify: bool = False,
) -> Path:
    """Read *html_path*, inline its assets, and write the result.

    Both the input and the output file are handled as UTF-8, matching the
    way inlined text assets are decoded, so the result does not depend on
    the platform's locale encoding.

    # Parameters
     - `html_path : Path`
        Source HTML file.
     - `output_path : Path`
        Destination path to write the modified HTML.
     - `base_path : Path | None`
        Directory used to resolve relative asset paths.
        (default: `None` -> use `html_path.parent`)
     - `config : InlineConfig | None`
        Inlining options.
        (default: `None` -> use `InlineConfig()`)
     - `prettify : bool`
        Pretty-print when `use_bs4=True`.
        (default: `False`)

    # Returns
     - `Path`
        Path actually written.
    """
    if base_path is None:
        base_path = html_path.parent
    html_raw: str = html_path.read_text(encoding="utf-8")
    html_new: str = inline_html_assets(
        html_raw,
        base_path=base_path,
        config=config,
        prettify=prettify,
    )
    # Falls back to overwriting the input when a falsy `output_path`
    # (e.g. `None`) is passed despite the annotation.
    dest: Path = output_path or html_path
    dest.write_text(html_new, encoding="utf-8")
    return dest

314 

315 

316# CLI 

317# --------------------------------------------------------------------- 

318 

if __name__ == "__main__":
    # Command-line entry point: build an `InlineConfig` from the arguments
    # and run `inline_html_file` on the given input.
    import argparse

    parser: argparse.ArgumentParser = argparse.ArgumentParser(
        description="Inline / bundle CSS, JS, SVG, PNG assets. "
        "Uses regex parsing by default; pass --bs4 to require BeautifulSoup."
    )
    parser.add_argument("html", type=Path, help="input HTML file")
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        help="output file",
        required=True,
    )
    parser.add_argument(
        "--source-dir",
        type=Path,
        default=None,
        help="base directory for relative asset paths (defaults to the HTML file's directory)",
    )
    parser.add_argument("--remote", action="store_true", help="allow remote URLs")
    parser.add_argument("--bs4", action="store_true", help="use BeautifulSoup parser")
    parser.add_argument(
        "--prettify",
        action="store_true",
        help="pretty-print the output (only effective with --bs4)",
    )
    parser.add_argument(
        "--max-bytes", type=int, default=128 * 1024, help="size limit per asset"
    )
    parser.add_argument(
        "--ext",
        nargs="+",
        default=sorted(DEFAULT_ALLOWED_EXTENSIONS),
        help="extensions to inline",
    )
    parser.add_argument(
        "--tag-attr",
        type=str,
        default=None,
        help='override tag->attr map. format: "tag1=attr1,tag2=attr2"',
    )
    parser.add_argument(
        "--no-comments",
        dest="comments",
        action="store_false",
        help="do not surround inlined assets with begin/end comments",
    )
    args: argparse.Namespace = parser.parse_args()

    tag_attr: dict[str, str]
    if args.tag_attr:
        tag_attr = {}
        for item in args.tag_attr.split(","):
            tag_part, sep, attr_part = item.partition("=")
            tag_part, attr_part = tag_part.strip(), attr_part.strip()
            # Reject malformed entries with a usage error instead of an
            # unpacking traceback.
            if not sep or not tag_part or not attr_part or "=" in attr_part:
                parser.error(
                    f"invalid --tag-attr entry {item!r}; expected 'tag=attr'"
                )
            tag_attr[tag_part] = attr_part
    else:
        tag_attr = dict(DEFAULT_TAG_ATTR)

    # Extensions are compared against lower-cased `Path.suffix` values, which
    # always start with a dot; normalize so `css` or `.CSS` also work.
    extensions: set[str] = {
        ext.lower() if ext.startswith(".") else f".{ext.lower()}" for ext in args.ext
    }

    cfg: InlineConfig = InlineConfig(
        allowed_extensions=extensions,  # type: ignore[arg-type]
        tag_attr=tag_attr,
        max_bytes=args.max_bytes,
        remote=args.remote,
        include_filename_comments=args.comments,
        use_bs4=args.bs4,
    )

    inline_html_file(
        args.html,
        output_path=args.output,
        base_path=args.source_dir,
        config=cfg,
        prettify=args.prettify,
    )

388 )