Extract HTML and resources from an Apple .webarchive file using Python's plistlib
#!/usr/bin/env python3
"""
webarchive_to_html.py

Extract HTML and resources from an Apple .webarchive file using Python's plistlib,
then rewrite common resource links for better offline viewing.

Usage:
    python webarchive_to_html.py input.webarchive
    python webarchive_to_html.py input.webarchive -o output_dir
"""
from __future__ import annotations
import argparse
import html
import mimetypes
import plistlib
import re
from pathlib import Path
from urllib.parse import urlparse


def safe_name(name: str, fallback: str = "resource") -> str:
    """Return a filesystem-safe filename."""
    bad_chars = '<>:"/\\|?*'
    cleaned = "".join("-" if c in bad_chars else c for c in name).strip()
    return cleaned or fallback
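
# Illustrative only: each reserved character is replaced with '-', so
# safe_name('a<b>:c.png') returns 'a-b--c.png'.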


def uniquify_path(fp: Path) -> Path:
    """Return a non-conflicting path by appending a numeric suffix if needed."""
    if not fp.exists():
        return fp
    stem = fp.stem
    suffix = fp.suffix
    parent = fp.parent
    i = 2
    while True:
        candidate = parent / f"{stem}_{i}{suffix}"
        if not candidate.exists():
            return candidate
        i += 1
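
# Illustrative only: if 'out/style.css' already exists, the next call yields
# 'out/style_2.css', then 'out/style_3.css', and so on.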


def guess_extension(mime_type: str | None, url: str | None = None) -> str:
    """Guess a file extension from MIME type or URL."""
    if mime_type:
        ext = mimetypes.guess_extension(mime_type)
        if ext:
            return ext
    if url:
        path = urlparse(url).path
        suffix = Path(path).suffix
        if path and suffix:
            return suffix
    return ""


def dump_resource(
    resource: dict,
    out_dir: Path,
    prefix: str,
) -> tuple[Path | None, str, str]:
    """Write one archived resource to disk and return path, original URL, and MIME type."""
    data = resource.get("WebResourceData")
    url = resource.get("WebResourceURL", "") or ""
    mime_type = resource.get("WebResourceMIMEType", "") or ""
    if not data:
        return None, url, mime_type
    response = resource.get("WebResourceResponse", {})
    suggested_name = ""
    if isinstance(response, dict):
        suggested_name = response.get("NSURLResponseSuggestedFilename", "") or ""
    name_from_url = Path(urlparse(url).path).name if url else ""
    base_name = suggested_name or name_from_url or prefix
    base_name = safe_name(base_name, fallback=prefix)
    if not Path(base_name).suffix:
        base_name += guess_extension(mime_type, url)
    fp = uniquify_path(out_dir / base_name)
    fp.write_bytes(data)
    return fp, url, mime_type
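
# Naming priority above: the server-suggested filename, then the URL basename,
# then the numbered prefix; an extension is appended only when none is present.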


def extract_main_html(archive: dict) -> str:
    """Return decoded text of the main HTML resource."""
    main = archive["WebMainResource"]
    data = main["WebResourceData"]
    encoding = main.get("WebResourceTextEncodingName") or "utf-8"
    try:
        return data.decode(encoding, errors="replace")
    except LookupError:
        return data.decode("utf-8", errors="replace")


def candidate_urls(url: str) -> list[str]:
    """Return possible URL variants as they may appear in HTML or CSS."""
    parsed = urlparse(url)
    variants: list[str] = []
    if url:
        variants.append(url)
    if parsed.scheme and parsed.netloc:
        variants.append(f"//{parsed.netloc}{parsed.path}")
        variants.append(parsed.path)
        if parsed.query:
            variants.append(f"{parsed.path}?{parsed.query}")
            variants.append(f"//{parsed.netloc}{parsed.path}?{parsed.query}")
    escaped_variants = []
    for item in variants:
        if "&" in item:
            escaped_variants.append(html.escape(item, quote=True))
    variants.extend(escaped_variants)
    seen = set()
    out = []
    for item in variants:
        if item and item not in seen:
            seen.add(item)
            out.append(item)
    return out
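
# Illustrative only: candidate_urls("https://example.com/a.css?v=1") returns
# the full URL plus "//example.com/a.css", "/a.css", "/a.css?v=1", and
# "//example.com/a.css?v=1"; HTML-escaped copies are added when '&' appears.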


def build_lookup(url_map: dict[str, str]) -> dict[str, str]:
    """Expand URL map into a lookup table with normalized candidate forms."""
    lookup: dict[str, str] = {}
    for original_url, local_path in url_map.items():
        for candidate in candidate_urls(original_url):
            lookup[candidate] = local_path
    return lookup


def _apply_css_url_rewrites(text: str, lookup: dict[str, str]) -> str:
    """Helper to rewrite CSS url(...) references."""

    def replace_css_url(match: re.Match) -> str:
        quote = match.group(1) or ""
        value = match.group(2)
        local_path = lookup.get(value)
        if local_path:
            return f"url({quote}{local_path}{quote})"
        return match.group(0)

    return re.sub(
        r'url\(\s*(["\']?)(.*?)\1\s*\)',
        replace_css_url,
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
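
# Illustrative only: with {"https://example.com/bg.png": "resources/bg.png"}
# expanded into the lookup, url("https://example.com/bg.png") is rewritten to
# url("resources/bg.png"); unmatched references are left untouched.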


def rewrite_html_refs(html_text: str, url_map: dict[str, str]) -> str:
    """Rewrite common HTML and inline-CSS references to local extracted files."""
    lookup = build_lookup(url_map)

    def replace_attr(match: re.Match) -> str:
        attr = match.group(1)
        quote = match.group(2)
        value = match.group(3)
        local_path = lookup.get(value)
        if local_path:
            return f"{attr}={quote}{local_path}{quote}"
        return match.group(0)

    def replace_srcset(match: re.Match) -> str:
        attr = match.group(1)
        quote = match.group(2)
        value = match.group(3)
        parts = []
        for item in value.split(","):
            item = item.strip()
            if not item:
                continue
            bits = item.split()
            url = bits[0]
            rest = " ".join(bits[1:])
            mapped = lookup.get(url, url)
            parts.append(f"{mapped} {rest}".strip())
        return f'{attr}={quote}{", ".join(parts)}{quote}'

    html_text = re.sub(
        r'\b(href|src)\s*=\s*(["\'])(.*?)\2',
        replace_attr,
        html_text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    html_text = re.sub(
        r'\b(srcset)\s*=\s*(["\'])(.*?)\2',
        replace_srcset,
        html_text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    return _apply_css_url_rewrites(html_text, lookup)
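
# Illustrative only: srcset entries are rewritten per candidate, so
# srcset="https://example.com/a.png 1x, https://example.com/b.png 2x" becomes
# srcset="resources/a.png 1x, resources/b.png 2x" when both URLs are mapped.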


def rewrite_css_refs(css_text: str, url_map: dict[str, str]) -> str:
    """Rewrite CSS url(...) references to local extracted files."""
    lookup = build_lookup(url_map)
    return _apply_css_url_rewrites(css_text, lookup)


def rewrite_saved_css_files(css_files: list[Path], url_map: dict[str, str]) -> None:
    """Rewrite extracted CSS files in place."""
    # CSS files live in resources/, so local refs to peer resources are just filenames.
    adjusted_map = {
        url: Path(local_path).name if local_path.startswith("resources/") else local_path
        for url, local_path in url_map.items()
    }
    for css_fp in css_files:
        try:
            # errors="replace" means decoding cannot fail; only I/O errors remain.
            css_text = css_fp.read_text(encoding="utf-8", errors="replace")
        except OSError as e:
            print(f"[WARN] failed to read CSS {css_fp.name}: {e}")
            continue
        rewritten = rewrite_css_refs(css_text, adjusted_map)
        css_fp.write_text(rewritten, encoding="utf-8")
        print(f"[OK] rewrote CSS: {css_fp.name}")


def process_webarchive(src: Path, out_dir: Path) -> None:
    """Parse .webarchive and write extracted files."""
    with src.open("rb") as f:
        archive = plistlib.load(f)
    out_dir.mkdir(parents=True, exist_ok=True)
    html_text = extract_main_html(archive)
    main_resource = archive.get("WebMainResource", {})
    main_url = main_resource.get("WebResourceURL", "")
    main_mime = main_resource.get("WebResourceMIMEType", "")
    print(f" URL : {main_url}")
    print(f" MIME: {main_mime}")
    subresources = archive.get("WebSubresources", [])
    print(f"[INFO] subresources: {len(subresources)}")
    assets_dir = out_dir / "resources"
    assets_dir.mkdir(exist_ok=True)
    url_map: dict[str, str] = {}
    css_files: list[Path] = []
    for idx, resource in enumerate(subresources, start=1):
        fp, url, mime_type = dump_resource(resource, assets_dir, prefix=f"resource_{idx:04d}")
        if fp:
            rel_path = fp.relative_to(out_dir).as_posix()
            print(f"[OK] resource: {fp.name}")
            if url:
                url_map[url] = rel_path
            if mime_type == "text/css" or fp.suffix.lower() == ".css":
                css_files.append(fp)
    html_text = rewrite_html_refs(html_text, url_map)
    html_fp = out_dir / "main.html"
    html_fp.write_text(html_text, encoding="utf-8")
    print(f"[OK] main HTML: {html_fp}")
    rewrite_saved_css_files(css_files, url_map)
    subframes = archive.get("WebSubframeArchives", [])
    print(f"[INFO] subframe archives: {len(subframes)}")
    for idx, frame_archive in enumerate(subframes, start=1):
        frame_dir = out_dir / f"subframe_{idx:02d}"
        frame_dir.mkdir(exist_ok=True)
        try:
            frame_html = extract_main_html(frame_archive)
            frame_fp = frame_dir / "main.html"
            frame_fp.write_text(frame_html, encoding="utf-8")
            print(f"[OK] subframe HTML: {frame_fp}")
        except Exception as e:
            print(f"[WARN] failed subframe {idx}: {e}")


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Extract HTML and resources from Apple .webarchive files."
    )
    parser.add_argument("src", type=Path, help="Input .webarchive file")
    parser.add_argument(
        "-o",
        "--out",
        type=Path,
        help="Output directory (default: <input_stem>_extracted)",
    )
    return parser


def print_format_notes() -> None:
    """Print notes about the .webarchive structure and script output."""
    instructions = """
plistlib parses the container, but .webarchive itself is usually shaped like this:

    {
        "WebMainResource": {...},
        "WebSubresources": [...],
        "WebSubframeArchives": [...]
    }

The most important fields inside a resource are often:

    resource["WebResourceData"]              # bytes
    resource["WebResourceURL"]               # original URL
    resource["WebResourceMIMEType"]          # MIME type
    resource["WebResourceTextEncodingName"]  # text encoding, if relevant

Run it like this:

    python webarchive_to_html.py page.webarchive

or

    python webarchive_to_html.py page.webarchive -o out_dir

That will produce:

    main.html
    resources/ for CSS, JS, images, etc.
    optional subframe_*/main.html
"""
    print(instructions)


def main() -> None:
    parser = build_parser()
    args = parser.parse_args()
    src = args.src
    if not src.is_file():
        raise SystemExit(f"Input file not found: {src}")
    out_dir = args.out or src.with_name(f"{src.stem}_extracted")
    process_webarchive(src, out_dir)


if __name__ == "__main__":
    main()
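
# Programmatic use (a minimal sketch; the paths below are hypothetical):
#
#     from pathlib import Path
#     from webarchive_to_html import process_webarchive
#
#     process_webarchive(Path("page.webarchive"), Path("page_extracted"))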