Extract HTML and resources from an Apple .webarchive file using Python's plistlib
#!/usr/bin/env python3
"""
webarchive_to_html.py

Extract HTML and resources from an Apple .webarchive file using Python's plistlib,
then rewrite common resource links for better offline viewing.

Usage:
    python webarchive_to_html.py input.webarchive
    python webarchive_to_html.py input.webarchive -o output_dir
"""
from __future__ import annotations

import argparse
import html
import mimetypes
import plistlib
import re
from pathlib import Path
from urllib.parse import urlparse


def safe_name(name: str, fallback: str = "resource") -> str:
    """Return a filesystem-safe filename."""
    bad_chars = '<>:"/\\|?*'
    cleaned = "".join("-" if c in bad_chars else c for c in name).strip()
    return cleaned or fallback
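
# Examples for safe_name (hypothetical inputs):
#   safe_name('fonts/Inter:400.woff2')  ->  'fonts-Inter-400.woff2'
#   safe_name('   ')                    ->  'resource'  (fallback kicks in)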


def uniquify_path(fp: Path) -> Path:
    """Return a non-conflicting path by appending a numeric suffix if needed."""
    if not fp.exists():
        return fp
    stem = fp.stem
    suffix = fp.suffix
    parent = fp.parent
    i = 2
    while True:
        candidate = parent / f"{stem}_{i}{suffix}"
        if not candidate.exists():
            return candidate
        i += 1
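
# Example for uniquify_path: if resources/logo.png already exists, successive
# writes get resources/logo_2.png, resources/logo_3.png, and so on.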


def guess_extension(mime_type: str | None, url: str | None = None) -> str:
    """Guess a file extension from MIME type or URL."""
    if mime_type:
        ext = mimetypes.guess_extension(mime_type)
        if ext:
            return ext
    if url:
        path = urlparse(url).path
        suffix = Path(path).suffix
        if path and suffix:
            return suffix
    return ""


def dump_resource(
    resource: dict,
    out_dir: Path,
    prefix: str,
) -> tuple[Path | None, str, str]:
    """Write one archived resource to disk and return path, original URL, and MIME type."""
    data = resource.get("WebResourceData")
    url = resource.get("WebResourceURL", "") or ""
    mime_type = resource.get("WebResourceMIMEType", "") or ""
    if not data:
        return None, url, mime_type
    response = resource.get("WebResourceResponse", {})
    suggested_name = ""
    if isinstance(response, dict):
        suggested_name = response.get("NSURLResponseSuggestedFilename", "") or ""
    name_from_url = Path(urlparse(url).path).name if url else ""
    base_name = suggested_name or name_from_url or prefix
    base_name = safe_name(base_name, fallback=prefix)
    if not Path(base_name).suffix:
        base_name += guess_extension(mime_type, url)
    fp = uniquify_path(out_dir / base_name)
    fp.write_bytes(data)
    return fp, url, mime_type


def extract_main_html(archive: dict) -> str:
    """Return decoded text of the main HTML resource."""
    main = archive["WebMainResource"]
    data = main["WebResourceData"]
    encoding = main.get("WebResourceTextEncodingName") or "utf-8"
    try:
        return data.decode(encoding, errors="replace")
    except LookupError:
        return data.decode("utf-8", errors="replace")


def candidate_urls(url: str) -> list[str]:
    """Return possible URL variants as they may appear in HTML or CSS."""
    parsed = urlparse(url)
    variants: list[str] = []
    if url:
        variants.append(url)
    if parsed.scheme and parsed.netloc:
        variants.append(f"//{parsed.netloc}{parsed.path}")
        variants.append(parsed.path)
        if parsed.query:
            variants.append(f"{parsed.path}?{parsed.query}")
            variants.append(f"//{parsed.netloc}{parsed.path}?{parsed.query}")
    escaped_variants = []
    for item in variants:
        if "&" in item:
            escaped_variants.append(html.escape(item, quote=True))
    variants.extend(escaped_variants)
    seen = set()
    out = []
    for item in variants:
        if item and item not in seen:
            seen.add(item)
            out.append(item)
    return out
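
# Example: candidate_urls("https://example.com/css/site.css") returns
#   ["https://example.com/css/site.css", "//example.com/css/site.css", "/css/site.css"]
# so protocol-relative and root-relative references also resolve.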


def build_lookup(url_map: dict[str, str]) -> dict[str, str]:
    """Expand URL map into a lookup table with normalized candidate forms."""
    lookup: dict[str, str] = {}
    for original_url, local_path in url_map.items():
        for candidate in candidate_urls(original_url):
            lookup[candidate] = local_path
    return lookup


def _apply_css_url_rewrites(text: str, lookup: dict[str, str]) -> str:
    """Helper to rewrite CSS url(...) references."""
    def replace_css_url(match: re.Match) -> str:
        quote = match.group(1) or ""
        value = match.group(2)
        local_path = lookup.get(value)
        if local_path:
            return f"url({quote}{local_path}{quote})"
        return match.group(0)

    return re.sub(
        r'url\(\s*(["\']?)(.*?)\1\s*\)',
        replace_css_url,
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
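
# Example: with lookup {"/img/bg.png": "resources/bg.png"}, the declaration
#   background: url('/img/bg.png')  becomes  background: url('resources/bg.png')
# Unmatched url(...) values are left untouched.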


def rewrite_html_refs(html_text: str, url_map: dict[str, str]) -> str:
    """Rewrite common HTML and inline-CSS references to local extracted files."""
    lookup = build_lookup(url_map)

    def replace_attr(match: re.Match) -> str:
        attr = match.group(1)
        quote = match.group(2)
        value = match.group(3)
        local_path = lookup.get(value)
        if local_path:
            return f"{attr}={quote}{local_path}{quote}"
        return match.group(0)

    def replace_srcset(match: re.Match) -> str:
        attr = match.group(1)
        quote = match.group(2)
        value = match.group(3)
        parts = []
        for item in value.split(","):
            item = item.strip()
            if not item:
                continue
            bits = item.split()
            url = bits[0]
            rest = " ".join(bits[1:])
            mapped = lookup.get(url, url)
            parts.append(f"{mapped} {rest}".strip())
        return f'{attr}={quote}{", ".join(parts)}{quote}'

    html_text = re.sub(
        r'\b(href|src)\s*=\s*(["\'])(.*?)\2',
        replace_attr,
        html_text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    html_text = re.sub(
        r'\b(srcset)\s*=\s*(["\'])(.*?)\2',
        replace_srcset,
        html_text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    return _apply_css_url_rewrites(html_text, lookup)
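
# Example: with url_map {"https://example.com/a.png": "resources/a.png"},
#   <img src="https://example.com/a.png">        ->  <img src="resources/a.png">
#   <img srcset="https://example.com/a.png 2x">  ->  <img srcset="resources/a.png 2x">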


def rewrite_css_refs(css_text: str, url_map: dict[str, str]) -> str:
    """Rewrite CSS url(...) references to local extracted files."""
    lookup = build_lookup(url_map)
    return _apply_css_url_rewrites(css_text, lookup)


def rewrite_saved_css_files(css_files: list[Path], url_map: dict[str, str]) -> None:
    """Rewrite extracted CSS files in place."""
    # CSS files live in resources/, so local refs to peer resources are just filenames.
    adjusted_map = {
        url: Path(local_path).name if local_path.startswith("resources/") else local_path
        for url, local_path in url_map.items()
    }
    for css_fp in css_files:
        try:
            # errors="replace" keeps going on mis-declared encodings instead of bailing out.
            css_text = css_fp.read_text(encoding="utf-8", errors="replace")
        except OSError as e:
            print(f"[WARN] failed to read CSS {css_fp.name}: {e}")
            continue
        rewritten = rewrite_css_refs(css_text, adjusted_map)
        css_fp.write_text(rewritten, encoding="utf-8")
        print(f"[OK] rewritten CSS: {css_fp.name}")


def process_webarchive(src: Path, out_dir: Path) -> None:
    """Parse .webarchive and write extracted files."""
    with src.open("rb") as f:
        archive = plistlib.load(f)
    out_dir.mkdir(parents=True, exist_ok=True)

    html_text = extract_main_html(archive)
    main_resource = archive.get("WebMainResource", {})
    main_url = main_resource.get("WebResourceURL", "")
    main_mime = main_resource.get("WebResourceMIMEType", "")
    print(f" URL : {main_url}")
    print(f" MIME: {main_mime}")

    subresources = archive.get("WebSubresources", [])
    print(f"[INFO] subresources: {len(subresources)}")
    assets_dir = out_dir / "resources"
    assets_dir.mkdir(exist_ok=True)

    url_map: dict[str, str] = {}
    css_files: list[Path] = []
    for idx, resource in enumerate(subresources, start=1):
        fp, url, mime_type = dump_resource(resource, assets_dir, prefix=f"resource_{idx:04d}")
        if fp:
            rel_path = fp.relative_to(out_dir).as_posix()
            print(f"[OK] resource: {fp.name}")
            if url:
                url_map[url] = rel_path
            if mime_type == "text/css" or fp.suffix.lower() == ".css":
                css_files.append(fp)

    html_text = rewrite_html_refs(html_text, url_map)
    html_fp = out_dir / "main.html"
    html_fp.write_text(html_text, encoding="utf-8")
    print(f"[OK] main HTML: {html_fp}")

    rewrite_saved_css_files(css_files, url_map)

    subframes = archive.get("WebSubframeArchives", [])
    print(f"[INFO] subframe archives: {len(subframes)}")
    for idx, frame_archive in enumerate(subframes, start=1):
        frame_dir = out_dir / f"subframe_{idx:02d}"
        frame_dir.mkdir(exist_ok=True)
        try:
            frame_html = extract_main_html(frame_archive)
            frame_fp = frame_dir / "main.html"
            frame_fp.write_text(frame_html, encoding="utf-8")
            print(f"[OK] subframe HTML: {frame_fp}")
        except Exception as e:
            print(f"[WARN] failed subframe {idx}: {e}")


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Extract HTML and resources from Apple .webarchive files."
    )
    parser.add_argument("src", type=Path, help="Input .webarchive file")
    parser.add_argument(
        "-o",
        "--out",
        type=Path,
        help="Output directory (default: <input_stem>_extracted)",
    )
    return parser


def print_format_notes() -> None:
    """Print notes about the .webarchive structure and script output."""
    instructions = """
    plistlib parses the container, but .webarchive itself is usually shaped like this:

        {
            "WebMainResource": {...},
            "WebSubresources": [...],
            "WebSubframeArchives": [...]
        }

    The most important fields inside a resource are often:

        resource["WebResourceData"]              # bytes
        resource["WebResourceURL"]               # original URL
        resource["WebResourceMIMEType"]          # MIME type
        resource["WebResourceTextEncodingName"]  # text encoding, if relevant

    Run it like this:

        python webarchive_to_html.py page.webarchive

    or

        python webarchive_to_html.py page.webarchive -o out_dir

    That will produce:

        main.html
        resources/  for CSS, JS, images, etc.
        optional subframe_*/main.html
    """
    print(instructions)


def main() -> None:
    parser = build_parser()
    args = parser.parse_args()
    src = args.src
    if not src.is_file():
        raise SystemExit(f"Input file not found: {src}")
    out_dir = args.out or src.with_name(f"{src.stem}_extracted")
    process_webarchive(src, out_dir)


if __name__ == "__main__":
    main()
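
The script also works when imported. A minimal sketch of driving the same pipeline from Python, assuming the file is saved as webarchive_to_html.py next to your code (the input and output names below are just examples):

    from pathlib import Path

    from webarchive_to_html import process_webarchive

    # Writes main.html plus a resources/ folder (and subframe_*/ dirs, if any)
    # into page_extracted/.
    process_webarchive(Path("page.webarchive"), Path("page_extracted"))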