Extract HTML and resources from an Apple .webarchive file using Python's plistlib
#!/usr/bin/env python3
"""
webarchive_to_html.py

Extract HTML and resources from an Apple .webarchive file using Python's plistlib,
then rewrite common resource links for better offline viewing.

Usage:
    python webarchive_to_html.py input.webarchive
    python webarchive_to_html.py input.webarchive -o output_dir
"""
from __future__ import annotations

import argparse
import html
import mimetypes
import plistlib
import re
from pathlib import Path
from urllib.parse import urlparse


def safe_name(name: str, fallback: str = "resource") -> str:
    """Return a filesystem-safe filename."""
    bad_chars = '<>:"/\\|?*'
    cleaned = "".join("-" if c in bad_chars else c for c in name).strip()
    return cleaned or fallback
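
# Examples for safe_name (hypothetical inputs):
#   safe_name('fonts/Inter:400.woff2')  ->  'fonts-Inter-400.woff2'
#   safe_name('   ')                    ->  'resource'  (fallback kicks in)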


def uniquify_path(fp: Path) -> Path:
    """Return a non-conflicting path by appending a numeric suffix if needed."""
    if not fp.exists():
        return fp
    stem = fp.stem
    suffix = fp.suffix
    parent = fp.parent
    i = 2
    while True:
        candidate = parent / f"{stem}_{i}{suffix}"
        if not candidate.exists():
            return candidate
        i += 1
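
# Example for uniquify_path: if resources/logo.png already exists, successive
# writes get resources/logo_2.png, resources/logo_3.png, and so on.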


def guess_extension(mime_type: str | None, url: str | None = None) -> str:
    """Guess a file extension from MIME type or URL."""
    if mime_type:
        ext = mimetypes.guess_extension(mime_type)
        if ext:
            return ext
    if url:
        path = urlparse(url).path
        suffix = Path(path).suffix
        if path and suffix:
            return suffix
    return ""


def dump_resource(
    resource: dict,
    out_dir: Path,
    prefix: str,
) -> tuple[Path | None, str, str]:
    """Write one archived resource to disk and return path, original URL, and MIME type."""
    data = resource.get("WebResourceData")
    url = resource.get("WebResourceURL", "") or ""
    mime_type = resource.get("WebResourceMIMEType", "") or ""
    if not data:
        return None, url, mime_type
    response = resource.get("WebResourceResponse", {})
    suggested_name = ""
    if isinstance(response, dict):
        suggested_name = response.get("NSURLResponseSuggestedFilename", "") or ""
    name_from_url = Path(urlparse(url).path).name if url else ""
    base_name = suggested_name or name_from_url or prefix
    base_name = safe_name(base_name, fallback=prefix)
    if not Path(base_name).suffix:
        base_name += guess_extension(mime_type, url)
    fp = uniquify_path(out_dir / base_name)
    fp.write_bytes(data)
    return fp, url, mime_type


def extract_main_html(archive: dict) -> str:
    """Return decoded text of the main HTML resource."""
    main = archive["WebMainResource"]
    data = main["WebResourceData"]
    encoding = main.get("WebResourceTextEncodingName") or "utf-8"
    try:
        return data.decode(encoding, errors="replace")
    except LookupError:
        return data.decode("utf-8", errors="replace")


def candidate_urls(url: str) -> list[str]:
    """Return possible URL variants as they may appear in HTML or CSS."""
    parsed = urlparse(url)
    variants: list[str] = []
    if url:
        variants.append(url)
    if parsed.scheme and parsed.netloc:
        variants.append(f"//{parsed.netloc}{parsed.path}")
        variants.append(parsed.path)
        if parsed.query:
            variants.append(f"{parsed.path}?{parsed.query}")
            variants.append(f"//{parsed.netloc}{parsed.path}?{parsed.query}")
    escaped_variants = []
    for item in variants:
        if "&" in item:
            escaped_variants.append(html.escape(item, quote=True))
    variants.extend(escaped_variants)
    seen = set()
    out = []
    for item in variants:
        if item and item not in seen:
            seen.add(item)
            out.append(item)
    return out
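
# Example: candidate_urls("https://example.com/css/site.css") returns
#   ["https://example.com/css/site.css", "//example.com/css/site.css", "/css/site.css"]
# so protocol-relative and root-relative references also resolve.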


def build_lookup(url_map: dict[str, str]) -> dict[str, str]:
    """Expand URL map into a lookup table with normalized candidate forms."""
    lookup: dict[str, str] = {}
    for original_url, local_path in url_map.items():
        for candidate in candidate_urls(original_url):
            lookup[candidate] = local_path
    return lookup


def _apply_css_url_rewrites(text: str, lookup: dict[str, str]) -> str:
    """Helper to rewrite CSS url(...) references."""
    def replace_css_url(match: re.Match) -> str:
        quote = match.group(1) or ""
        value = match.group(2)
        local_path = lookup.get(value)
        if local_path:
            return f"url({quote}{local_path}{quote})"
        return match.group(0)

    return re.sub(
        r'url\(\s*(["\']?)(.*?)\1\s*\)',
        replace_css_url,
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
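
# Example: with lookup {"/img/bg.png": "resources/bg.png"}, the declaration
#   background: url('/img/bg.png')  becomes  background: url('resources/bg.png')
# Unmatched url(...) values are left untouched.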


def rewrite_html_refs(html_text: str, url_map: dict[str, str]) -> str:
    """Rewrite common HTML and inline-CSS references to local extracted files."""
    lookup = build_lookup(url_map)

    def replace_attr(match: re.Match) -> str:
        attr = match.group(1)
        quote = match.group(2)
        value = match.group(3)
        local_path = lookup.get(value)
        if local_path:
            return f"{attr}={quote}{local_path}{quote}"
        return match.group(0)

    def replace_srcset(match: re.Match) -> str:
        attr = match.group(1)
        quote = match.group(2)
        value = match.group(3)
        parts = []
        for item in value.split(","):
            item = item.strip()
            if not item:
                continue
            bits = item.split()
            url = bits[0]
            rest = " ".join(bits[1:])
            mapped = lookup.get(url, url)
            parts.append(f"{mapped} {rest}".strip())
        return f'{attr}={quote}{", ".join(parts)}{quote}'

    html_text = re.sub(
        r'\b(href|src)\s*=\s*(["\'])(.*?)\2',
        replace_attr,
        html_text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    html_text = re.sub(
        r'\b(srcset)\s*=\s*(["\'])(.*?)\2',
        replace_srcset,
        html_text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    return _apply_css_url_rewrites(html_text, lookup)
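
# Example: with url_map {"https://example.com/a.png": "resources/a.png"},
#   <img src="https://example.com/a.png">        ->  <img src="resources/a.png">
#   <img srcset="https://example.com/a.png 2x">  ->  <img srcset="resources/a.png 2x">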


def rewrite_css_refs(css_text: str, url_map: dict[str, str]) -> str:
    """Rewrite CSS url(...) references to local extracted files."""
    lookup = build_lookup(url_map)
    return _apply_css_url_rewrites(css_text, lookup)


def rewrite_saved_css_files(css_files: list[Path], url_map: dict[str, str]) -> None:
    """Rewrite extracted CSS files in place."""
    # CSS files live in resources/, so local refs to peer resources are just filenames.
    adjusted_map = {
        url: Path(local_path).name if local_path.startswith("resources/") else local_path
        for url, local_path in url_map.items()
    }
    for css_fp in css_files:
        try:
            # errors="replace" keeps going on mis-declared encodings instead of bailing out.
            css_text = css_fp.read_text(encoding="utf-8", errors="replace")
        except OSError as e:
            print(f"[WARN] failed to read CSS {css_fp.name}: {e}")
            continue
        rewritten = rewrite_css_refs(css_text, adjusted_map)
        css_fp.write_text(rewritten, encoding="utf-8")
        print(f"[OK] rewritten CSS: {css_fp.name}")


def process_webarchive(src: Path, out_dir: Path) -> None:
    """Parse .webarchive and write extracted files."""
    with src.open("rb") as f:
        archive = plistlib.load(f)
    out_dir.mkdir(parents=True, exist_ok=True)

    html_text = extract_main_html(archive)
    main_resource = archive.get("WebMainResource", {})
    main_url = main_resource.get("WebResourceURL", "")
    main_mime = main_resource.get("WebResourceMIMEType", "")
    print(f" URL : {main_url}")
    print(f" MIME: {main_mime}")

    subresources = archive.get("WebSubresources", [])
    print(f"[INFO] subresources: {len(subresources)}")
    assets_dir = out_dir / "resources"
    assets_dir.mkdir(exist_ok=True)

    url_map: dict[str, str] = {}
    css_files: list[Path] = []
    for idx, resource in enumerate(subresources, start=1):
        fp, url, mime_type = dump_resource(resource, assets_dir, prefix=f"resource_{idx:04d}")
        if fp:
            rel_path = fp.relative_to(out_dir).as_posix()
            print(f"[OK] resource: {fp.name}")
            if url:
                url_map[url] = rel_path
            if mime_type == "text/css" or fp.suffix.lower() == ".css":
                css_files.append(fp)

    html_text = rewrite_html_refs(html_text, url_map)
    html_fp = out_dir / "main.html"
    html_fp.write_text(html_text, encoding="utf-8")
    print(f"[OK] main HTML: {html_fp}")

    rewrite_saved_css_files(css_files, url_map)

    subframes = archive.get("WebSubframeArchives", [])
    print(f"[INFO] subframe archives: {len(subframes)}")
    for idx, frame_archive in enumerate(subframes, start=1):
        frame_dir = out_dir / f"subframe_{idx:02d}"
        frame_dir.mkdir(exist_ok=True)
        try:
            frame_html = extract_main_html(frame_archive)
            frame_fp = frame_dir / "main.html"
            frame_fp.write_text(frame_html, encoding="utf-8")
            print(f"[OK] subframe HTML: {frame_fp}")
        except Exception as e:
            print(f"[WARN] failed subframe {idx}: {e}")


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Extract HTML and resources from Apple .webarchive files."
    )
    parser.add_argument("src", type=Path, help="Input .webarchive file")
    parser.add_argument(
        "-o",
        "--out",
        type=Path,
        help="Output directory (default: <input_stem>_extracted)",
    )
    return parser


def print_format_notes() -> None:
    """Print notes about the .webarchive structure and script output."""
    instructions = """
    plistlib parses the container, but .webarchive itself is usually shaped like this:

        {
            "WebMainResource": {...},
            "WebSubresources": [...],
            "WebSubframeArchives": [...]
        }

    The most important fields inside a resource are often:

        resource["WebResourceData"]              # bytes
        resource["WebResourceURL"]               # original URL
        resource["WebResourceMIMEType"]          # MIME type
        resource["WebResourceTextEncodingName"]  # text encoding, if relevant

    Run it like this:

        python webarchive_to_html.py page.webarchive

    or

        python webarchive_to_html.py page.webarchive -o out_dir

    That will produce:

        main.html
        resources/  for CSS, JS, images, etc.
        optional subframe_*/main.html
    """
    print(instructions)


def main() -> None:
    parser = build_parser()
    args = parser.parse_args()
    src = args.src
    if not src.is_file():
        raise SystemExit(f"Input file not found: {src}")
    out_dir = args.out or src.with_name(f"{src.stem}_extracted")
    process_webarchive(src, out_dir)


if __name__ == "__main__":
    main()
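
The script also works when imported. A minimal sketch of driving the same pipeline from Python, assuming the file is saved as webarchive_to_html.py next to your code (the input and output names below are just examples):

    from pathlib import Path

    from webarchive_to_html import process_webarchive

    # Writes main.html plus a resources/ folder (and subframe_*/ dirs, if any)
    # into page_extracted/.
    process_webarchive(Path("page.webarchive"), Path("page_extracted"))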