Coverage for muutils/web/bundle_html.py: 90%
147 statements
« prev ^ index » next coverage.py v7.6.1, created at 2025-06-06 11:28 -0600
« prev ^ index » next coverage.py v7.6.1, created at 2025-06-06 11:28 -0600
1"""
2Inline / bundle external assets (CSS, JS, SVG, PNG) into an HTML document.
4Default mode uses **zero external dependencies** and a few well-targeted
5regular expressions. If you install *beautifulsoup4* you can enable the
6far more robust BS4 mode by passing `InlineConfig(use_bs4=True)`.
7"""
9from __future__ import annotations
11import base64
12import re
13import urllib.request
14import warnings
15from dataclasses import dataclass, field
16from pathlib import Path
17from typing import Final, Literal
19# bs4 import deferred to avoid an unconditional dependency.
21# constants
22# ---------------------------------------------------------------------
# The closed set of file suffixes this module knows how to inline.
AssetExt = Literal[".css", ".js", ".svg", ".png"]

# MIME type placed in the `data:` URI when an asset is embedded as base64.
MIME_BY_EXT: Final[dict[AssetExt, str]] = {
    ".css": "text/css",
    ".js": "application/javascript",
    ".svg": "image/svg+xml",
    ".png": "image/png",
}

# Default *tag -> attribute* map: which attribute of which tag holds the
# reference to an external asset. Insertion order is the processing order.
DEFAULT_TAG_ATTR: Final[dict[str, str]] = dict(
    link="href",  # <link rel="stylesheet" href="...">
    script="src",  # <script src="..."></script>
    img="src",  # <img src="...">
    use="xlink:href",  # <use xlink:href="sprite.svg#id">
)

# Suffixes that are inlined unless the caller narrows or widens the set.
DEFAULT_ALLOWED_EXTENSIONS: Final[set[AssetExt]] = {
    ".png",
    ".svg",
    ".js",
    ".css",
}
42# Configuration
43# ---------------------------------------------------------------------
@dataclass
class InlineConfig:
    """Options that decide which assets are inlined and how.

    # Parameters
     - `allowed_extensions : set[AssetExt]`
        File suffixes (with leading dot) eligible for inlining.
        (default: a fresh copy of `DEFAULT_ALLOWED_EXTENSIONS`)
     - `tag_attr : dict[str, str]`
        Maps a tag name to the attribute carrying its asset reference.
        (default: a fresh copy of `DEFAULT_TAG_ATTR`)
     - `max_bytes : int`
        Size ceiling; an asset with more bytes than this is left untouched.
        (default: `128 * 1024`)
     - `local : bool`
        Permit references that are not http/https URLs.
        (default: `True`)
     - `remote : bool`
        Permit http/https references.
        (default: `False`)
     - `include_filename_comments : bool`
        Wrap each replacement in `<!-- begin '...' -->` /
        `<!-- end '...' -->` markers.
        (default: `True`)
     - `use_bs4 : bool`
        Use the BeautifulSoup implementation instead of the regex one.
        (default: `False`)
    """

    # factories (not shared literals) so instances never alias the module defaults
    allowed_extensions: set[AssetExt] = field(
        default_factory=lambda: {*DEFAULT_ALLOWED_EXTENSIONS}
    )
    tag_attr: dict[str, str] = field(default_factory=lambda: {**DEFAULT_TAG_ATTR})
    max_bytes: int = 128 * 1024
    local: bool = True
    remote: bool = False
    include_filename_comments: bool = True
    use_bs4: bool = False
79# Low-level helpers
80# ---------------------------------------------------------------------
83def _is_remote(url: str) -> bool:
84 """Return *True* if *url* starts with http:// or https://."""
85 return url.lower().startswith(("http://", "https://"))
def _fetch_bytes(src: str, base: Path) -> bytes:
    """Fetch *src* (local or remote) and return its raw bytes.

    # Parameters
     - `src : str`
        Asset reference as written in the HTML, without query or fragment.
     - `base : Path`
        Directory that relative local references are resolved against.

    # Returns
     - `bytes`
        The complete content of the asset.

    # Raises
     - `FileNotFoundError` (or another `OSError`) when a local asset cannot
       be read; whatever `urllib.request.urlopen` raises for a remote one.
    """
    if _is_remote(src):
        with urllib.request.urlopen(src) as resp:
            return resp.read()

    # function-scope import keeps this block self-contained
    from urllib.parse import unquote

    path: Path = base / src
    try:
        return path.read_bytes()
    except FileNotFoundError:
        # URLs in HTML are commonly percent-encoded ("my%20file.css" refers to
        # the file "my file.css"). The literal name is tried first so a file
        # that really contains "%20" keeps working; only then the decoded one.
        decoded: Path = base / unquote(src)
        if decoded == path or not decoded.is_file():
            raise  # surface the error for the path exactly as referenced
        return decoded.read_bytes()
96def _decode_text(buf: bytes) -> str:
97 """Decode *buf* as UTF-8, falling back to replacement."""
98 try:
99 return buf.decode()
100 except UnicodeDecodeError:
101 return buf.decode("utf-8", "replace")
104# Regex-based implementation (no deps)
105# ---------------------------------------------------------------------
108def _apply_indent(html: str, start: int, replacement: str) -> str:
109 """Indent *replacement* to match the line that starts at *start*."""
110 line_start: int = html.rfind("\n", 0, start) + 1
111 indent: str = html[line_start:start]
112 return "\n".join(indent + line for line in replacement.splitlines())
def _inline_with_regex(html: str, base: Path, cfg: InlineConfig) -> str:
    """Inline assets using pure-regex parsing (no third-party libs).

    For every *tag -> attribute* pair in `cfg.tag_attr` the document is
    scanned for matching tags. A reference is inlined only if its suffix is
    in `cfg.allowed_extensions`, its origin (local / remote) is enabled in
    *cfg*, it can be fetched, and it is no larger than `cfg.max_bytes`;
    otherwise the tag is left untouched (a fetch failure emits a warning).

    - `.css` / `.js` assets replace the whole tag with a bare `<style>` /
      `<script>` element holding the decoded text. Attributes of the original
      tag are not carried over.
    - any other allowed suffix keeps the tag and swaps the attribute value
      for a base64 `data:` URI (query string and fragment are dropped).

    # Parameters
     - `html : str`
        Source document.
     - `base : Path`
        Directory used to resolve relative local references.
     - `cfg : InlineConfig`
        Inlining options.

    # Returns
     - `str`
        The document with all permitted assets inlined.
    """
    # function-scope import: only needed for the MIME fallback below
    import mimetypes

    tag: str
    attr: str
    for tag, attr in cfg.tag_attr.items():
        # tag/attr come from user configuration: escape them so characters
        # with regex meaning are matched literally
        opening: str = (
            rf"<{re.escape(tag)}\b[^>]*\s{re.escape(attr)}"
            rf"\s*=\s*['\"]([^'\"]+)['\"][^>]*>"
        )
        # an external <script> is only replaced together with its (empty) body
        pattern: str = opening + r"\s*</script>" if tag == "script" else opening

        # walk matches back-to-front so earlier offsets stay valid while
        # `html` is being rewritten
        m: re.Match[str]
        for m in reversed(list(re.finditer(pattern, html, re.IGNORECASE))):
            raw_src: str = m.group(1)  # may contain ?query / #fragment
            clean_src: str = re.split(r"[?#]", raw_src, maxsplit=1)[0]  # path only
            ext: str = Path(clean_src).suffix.lower()

            if ext not in cfg.allowed_extensions:
                continue
            origin_allowed: bool = cfg.remote if _is_remote(clean_src) else cfg.local
            if not origin_allowed:
                continue

            try:
                data: bytes = _fetch_bytes(clean_src, base)
            except Exception as err:
                # deliberate best-effort: one unreadable asset must not abort
                # the whole document
                warnings.warn(f"skip '{raw_src}': {err}")
                continue

            if len(data) > cfg.max_bytes:
                continue

            # build replacement
            replacement: str
            if ext in {".css", ".js"}:
                tag_name: str = "style" if ext == ".css" else "script"
                replacement = f"<{tag_name}>\n{_decode_text(data)}\n</{tag_name}>"
            else:  # binary / image-like asset -> data URI
                # `allowed_extensions` is user-configurable, so `ext` is not
                # guaranteed to be a key of MIME_BY_EXT: fall back to the
                # stdlib table, then to a generic type, instead of KeyError
                mime: str = (
                    MIME_BY_EXT.get(ext)  # type: ignore[call-overload]
                    or mimetypes.guess_type(clean_src)[0]
                    or "application/octet-stream"
                )
                b64: str = base64.b64encode(data).decode()
                data_uri: str = f"data:{mime};base64,{b64}"
                # splice by the captured group's own span; a textual
                # `.replace(raw_src, ...)` could hit an earlier attribute
                # that happens to contain the same text (e.g. `alt`)
                whole: str = m.group(0)
                rel_start: int = m.start(1) - m.start()
                rel_end: int = m.end(1) - m.start()
                replacement = whole[:rel_start] + data_uri + whole[rel_end:]

            if cfg.include_filename_comments:
                replacement = f"<!-- begin '{clean_src}' -->\n{replacement}\n<!-- end '{clean_src}' -->"

            replacement = _apply_indent(html, m.start(), replacement)
            html = html[: m.start()] + replacement + html[m.end() :]

    return html
173# BeautifulSoup-based implementation (optional)
174# ---------------------------------------------------------------------
def _inline_with_bs4(html: str, base: Path, cfg: InlineConfig) -> str:
    """Inline assets using BeautifulSoup when available.

    Applies the same filters as the regex implementation (allowed suffix,
    local/remote permission, successful fetch, `cfg.max_bytes`). `.css` /
    `.js` assets replace the tag with a new `<style>` / `<script>` element;
    any other allowed suffix keeps the tag and rewrites the attribute to a
    base64 `data:` URI.

    # Parameters
     - `html : str`
        Source document.
     - `base : Path`
        Directory used to resolve relative local references.
     - `cfg : InlineConfig`
        Inlining options.

    # Returns
     - `str`
        The document as serialised by BeautifulSoup (`str(soup)`).

    # Raises
     - `RuntimeError`
        If *beautifulsoup4* is not installed.
    """
    # function-scope import: only needed for the MIME fallback below
    import mimetypes

    try:
        from bs4 import BeautifulSoup, Comment, Tag
    except ModuleNotFoundError as exc:  # pragma: no cover
        raise RuntimeError("BeautifulSoup requested but not installed") from exc

    soup: BeautifulSoup = BeautifulSoup(html, "html.parser")

    tag: Tag  # TYPING: find_all() is typed as returning PageElement; the name filter yields Tag objects
    for tag in list(soup.find_all(list(cfg.tag_attr))):  # type: ignore[assignment]
        attr: str = cfg.tag_attr[tag.name]
        src_full = tag.get(attr)
        # bs4 may hand back None or a list for multi-valued attributes;
        # only a non-empty string can be an asset reference
        if not isinstance(src_full, str) or not src_full:
            continue

        # `maxsplit` by keyword: passing it positionally is deprecated since
        # Python 3.13, and this matches the regex implementation
        clean_src: str = re.split(r"[?#]", src_full, maxsplit=1)[0]
        ext: str = Path(clean_src).suffix.lower()

        if ext not in cfg.allowed_extensions:
            continue
        origin_allowed: bool = cfg.remote if _is_remote(clean_src) else cfg.local
        if not origin_allowed:
            continue

        try:
            data: bytes = _fetch_bytes(clean_src, base)
        except Exception as err:
            # deliberate best-effort: skip the asset, keep the document
            warnings.warn(f"skip '{src_full}': {err}")
            continue

        if len(data) > cfg.max_bytes:
            continue

        node: Tag  # the element that ends up in the tree for this asset
        if ext in {".css", ".js"}:
            node = soup.new_tag("style" if ext == ".css" else "script")
            node.string = _decode_text(data)
            tag.replace_with(node)
        else:  # binary / image-like asset -> data URI
            # `allowed_extensions` is user-configurable, so `ext` may be
            # missing from MIME_BY_EXT: fall back instead of raising KeyError
            mime: str = (
                MIME_BY_EXT.get(ext)  # type: ignore[call-overload]
                or mimetypes.guess_type(clean_src)[0]
                or "application/octet-stream"
            )
            b64: str = base64.b64encode(data).decode()
            tag[attr] = f"data:{mime};base64,{b64}"
            node = tag

        if cfg.include_filename_comments:
            node.insert_before(Comment(f" begin '{src_full}' "))
            node.insert_after(Comment(f" end '{src_full}' "))

    return str(soup)
231# Public API
232# ---------------------------------------------------------------------
def inline_html_assets(
    html: str,
    *,
    base_path: Path,
    config: InlineConfig | None = None,
    prettify: bool = False,  # kept for API compatibility (ignored in regex mode)
) -> str:
    """Return *html* with its permitted external assets embedded.

    # Parameters
     - `html : str`
        The HTML text to process.
     - `base_path : Path`
        Directory that relative asset references are resolved against.
     - `config : InlineConfig | None`
        Inlining options; a default `InlineConfig()` is used when falsy.
        (default: `None`)
     - `prettify : bool`
        Re-format the result with BeautifulSoup's `prettify()`. Has an
        effect only when `config.use_bs4` is set.
        (default: `False`)

    # Returns
     - `str`
        The rewritten HTML.
    """
    options: InlineConfig = config if config else InlineConfig()

    # dependency-free path: nothing else to do, `prettify` does not apply
    if not options.use_bs4:
        return _inline_with_regex(html, base_path, options)

    bundled: str = _inline_with_bs4(html, base_path, options)
    if not prettify:
        return bundled

    # lazy import to avoid unconditional dependency
    from bs4 import BeautifulSoup

    # TYPING: .prettify() returns str if no encoding is set
    return str(BeautifulSoup(bundled, "html.parser").prettify())
def inline_html_file(
    html_path: Path,
    output_path: Path,
    base_path: Path | None = None,
    config: InlineConfig | None = None,
    prettify: bool = False,
    encoding: str = "utf-8",
) -> Path:
    """Read *html_path*, inline its assets, and write the result.

    # Parameters
     - `html_path : Path`
        Source HTML file.
     - `output_path : Path`
        Destination path to write the modified HTML.
        A falsy value (e.g. `None`) overwrites *html_path* in place.
     - `base_path : Path | None`
        Directory used to resolve relative asset paths.
        (default: `None` -> use `html_path.parent`)
     - `config : InlineConfig | None`
        Inlining options.
        (default: `None` -> use `InlineConfig()`)
     - `prettify : bool`
        Pretty-print when `use_bs4=True`.
        (default: `False`)
     - `encoding : str`
        Text encoding used to read *html_path* and to write the output.
        Stated explicitly so the result does not depend on the platform's
        locale encoding; text assets are decoded as UTF-8 as well.
        (default: `"utf-8"`)

    # Returns
     - `Path`
        Path actually written.
    """
    if base_path is None:
        base_path = html_path.parent

    html_raw: str = html_path.read_text(encoding=encoding)
    html_new: str = inline_html_assets(
        html_raw,
        base_path=base_path,
        config=config,
        prettify=prettify,
    )

    dest: Path = output_path or html_path
    dest.write_text(html_new, encoding=encoding)
    return dest
316# CLI
317# ---------------------------------------------------------------------
if __name__ == "__main__":
    # Command-line entry point: parse options, build an InlineConfig from
    # them, and run `inline_html_file` once.
    import argparse

    parser: argparse.ArgumentParser = argparse.ArgumentParser(
        description="Inline / bundle CSS, JS, SVG, PNG assets. "
        "Uses regex parsing by default; pass --bs4 to require BeautifulSoup."
    )
    parser.add_argument("html", type=Path, help="input HTML file")
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        help="output file",
        required=True,
    )
    parser.add_argument(
        "--source-dir",
        type=Path,
        default=None,
        help="base directory for relative asset paths (defaults to the HTML file's directory)",
    )
    parser.add_argument("--remote", action="store_true", help="allow remote URLs")
    parser.add_argument("--bs4", action="store_true", help="use BeautifulSoup parser")
    parser.add_argument(
        "--prettify",
        action="store_true",
        help="pretty-print with BeautifulSoup (only effective with --bs4)",
    )
    parser.add_argument(
        "--max-bytes", type=int, default=128 * 1024, help="size limit per asset"
    )
    parser.add_argument(
        "--ext",
        nargs="+",
        # sorted: set iteration order is not stable between runs
        default=sorted(DEFAULT_ALLOWED_EXTENSIONS),
        help="extensions to inline",
    )
    parser.add_argument(
        "--tag-attr",
        type=str,
        default=None,
        help='override tag->attr map. format: "tag1=attr1,tag2=attr2"',
    )
    parser.add_argument(
        "--no-comments",
        dest="comments",
        action="store_false",
        help="do not surround inlined assets with begin/end comments",
    )
    args: argparse.Namespace = parser.parse_args()

    tag_attr: dict[str, str]
    if args.tag_attr:
        tag_attr = {}
        for item in args.tag_attr.split(","):
            if not item.strip():
                continue  # tolerate a stray / trailing comma
            # partition (not split) so a malformed entry yields a clean usage
            # error instead of an unpacking traceback
            name, sep, value = item.partition("=")
            name, value = name.strip(), value.strip()
            if not sep or not name or not value:
                parser.error(
                    f"invalid --tag-attr entry {item!r}; expected 'tag=attr'"
                )
            tag_attr[name] = value
    else:
        tag_attr = dict(DEFAULT_TAG_ATTR)

    # Suffixes are compared against `Path(...).suffix.lower()`, which always
    # starts with a dot and is lower-case; normalise so `--ext css PNG` works.
    allowed_ext: set[str] = {
        e.lower() if e.startswith(".") else f".{e.lower()}" for e in args.ext
    }

    cfg: InlineConfig = InlineConfig(
        allowed_extensions=allowed_ext,  # type: ignore[arg-type]
        tag_attr=tag_attr,
        max_bytes=args.max_bytes,
        remote=args.remote,
        include_filename_comments=args.comments,
        use_bs4=args.bs4,
    )

    inline_html_file(
        args.html,
        output_path=args.output,
        base_path=args.source_dir,
        config=cfg,
        prettify=args.prettify,
    )