Skip to content

Instantly share code, notes, and snippets.

@tmo1
Created May 14, 2026 17:53
Show Gist options
  • Select an option

  • Save tmo1/eb3160b02eb03dd20ef80885421ad043 to your computer and use it in GitHub Desktop.

Select an option

Save tmo1/eb3160b02eb03dd20ef80885421ad043 to your computer and use it in GitHub Desktop.
List and tally the unique TLDs of 'From:' addresses of messages in a tree of MH mailboxes
#! /usr/bin/python3
# Copyright (C) 2026 Thomas More
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
List and tally the unique TLDs of 'From:' addresses of messages in a tree of MH mailboxes
Inspired by this comment: https://old.reddit.com/r/mxroute/comments/1tc6x5i/getting_slammed_with_spam/olmimk8/
Usage: find_tlds.py /path/to/MH_mailbox
"""
import os
import sys
import email
from email.utils import parseaddr
def tld_from_address(address):
    """Return the casefolded top-level domain of a bare e-mail address.

    ``address`` is the address part only (e.g. ``user@example.com``), without
    a display name.

    Returns ``None`` when the address contains no "@" or when the last
    dot-separated label of the domain is empty (e.g. ``user@example.``).
    """
    # Split on the LAST "@": a quoted local part may itself contain "@"
    # (e.g. '"a@b"@example.com'), so only the text after the final "@" is
    # the domain.
    _, at_sign, domain = address.rpartition('@')
    if not at_sign:
        return None
    tld = domain.split('.')[-1].casefold()
    return tld or None


def scan_mailboxes(top):
    """Walk the directory tree at ``top`` and collect the TLDs of 'From:' addresses.

    Only files whose names consist solely of digits are examined, since MH
    message files are typically numbered.

    Returns a tuple ``(tlds, good_messages, bad_messages)``: the set of unique
    casefolded TLDs, the number of messages that yielded a TLD, and the number
    of messages that did not (missing or unparsable 'From:' header).
    """
    tlds, good_messages, bad_messages = set(), 0, 0
    for root, _dirs, files in os.walk(top):
        for filename in files:
            # MH message files are typically numbered
            if not filename.isdigit():
                continue
            filepath = os.path.join(root, filename)
            # We read the message file in binary mode since standard UTF-8
            # decoding can fail on some messages
            with open(filepath, 'rb') as f:
                msg = email.message_from_binary_file(f)
            from_header = msg['From']
            # A message without a 'From:' header yields None, and a header
            # holding raw non-ASCII bytes may come back as an
            # email.header.Header object rather than a str, so normalize
            # before handing it to parseaddr().
            #
            # If we parse with "strict=False", we'll process more emails, but
            # some "From:" headers will be parsed incorrectly, e.g., if the
            # real name part contains a "@":
            # https://github.com/python/cpython/issues/78336
            # https://kalunite.net/parsing-email-addresses-in-python.html
            sender = '' if from_header is None else parseaddr(str(from_header))[1]
            tld = tld_from_address(sender)
            if tld is None:
                # Debugging aid: uncomment to list the rejected headers.
                # print(f"{filepath}\tBad sender address: '{from_header}'")
                bad_messages += 1
            else:
                tlds.add(tld)
                good_messages += 1
    return tlds, good_messages, bad_messages


def main():
    """Scan the MH mailbox tree named on the command line and print a summary."""
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} /path/to/MH_mailbox")
    top = sys.argv[1]
    # os.walk() silently yields nothing for a nonexistent path, which would
    # look like an empty mailbox; report the mistake instead.
    if not os.path.isdir(top):
        sys.exit(f"{sys.argv[0]}: not a directory: {top}")
    tlds, good_messages, bad_messages = scan_mailboxes(top)
    print(f"Total messages scanned:\t\t{good_messages + bad_messages}\nMessages with valid senders:\t{good_messages}\nTotal unique TLDs:\t\t{len(tlds)}\n\n{tlds}")


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment