Skip to content

Instantly share code, notes, and snippets.

@jgusta
Last active May 8, 2026 00:31
Show Gist options
  • Select an option

  • Save jgusta/cbd18595031c5d55251f0facc4f34790 to your computer and use it in GitHub Desktop.

Select an option

Save jgusta/cbd18595031c5d55251f0facc4f34790 to your computer and use it in GitHub Desktop.
Scrub text file of AI (non-human) characters and glyphs
#!/usr/bin/env bash
# This script opens a file and scrubs it of known AI-specific characters.
# By AI-specific I mean characters that no one with an American keyboard
# would ever type. Examples are the em dash, fancy quotation marks,
# unusual line endings, arrows, ellipsis, zero-width spaces, copyright
# symbol, Apple key symbols, etc. Anything that is not replaced and is not
# in the list of human characters is removed.
#
# USAGE
# clean_ai_text.sh filename.txt
#
# OUTPUT
# Outputs the same file with `.fixed` prepended to the extension:
# if the input is filename.txt, the output will be filename.fixed.txt
# (an input with no extension gets `.fixed.txt` appended instead).
#
set -euo pipefail

# Require the input path. Without this guard `set -u` aborts on "$1" with a
# bare "unbound variable" message that does not tell the user what to pass.
in="${1:-}"
if [[ -z "$in" ]]; then
  printf 'Usage: %s filename.txt\n' "${0##*/}" >&2
  exit 2
fi

# Marker inserted in front of the extension of the output file name.
addext="fixed"

# Split the input path into its directory prefix (kept with the trailing
# slash so it can be glued back on) and the bare file name.
dir=""
base="$in"
if [[ "$in" == */* ]]; then
  dir="${in%/*}/"
  base="${in##*/}"
fi

# Build the output path:
#   name.ext   -> name.fixed.ext   (everything after the FIRST dot counts as
#                                   the extension, so a.tar.gz -> a.fixed.tar.gz)
#   name       -> name.fixed.txt   (no dot in the file name)
if [[ "$base" == *.* ]]; then
  name="${base%%.*}"
  ext="${base#*.}"
  out="${dir}${name}.${addext}.${ext}"
else
  out="${dir}${base}.${addext}.txt"
fi
perl_script="$(cat <<'PERL'
# Quotes
s/[\x{201C}\x{201D}]/\x{22}/g; # “ ” -> double quote
s/[\x{2018}\x{2019}]/\x{27}/g; # ‘ ’ -> apostrophe
s/\x{2032}/\x{27}/g; # prime -> apostrophe
s/\x{2033}/\x{22}/g; # double prime -> double quote
s/\x{2039}/</g; # single left angle quote -> <
s/\x{203A}/>/g; # single right angle quote -> >
s/\x{00AB}/\x{22}/g; # left guillemet -> double quote
s/\x{00BB}/\x{22}/g; # right guillemet -> double quote
# Dashes and hyphens
s/\x{2014}/--/g; # em dash -> --
s/\x{2013}/-/g; # en dash -> -
s/\x{2010}/-/g; # hyphen -> -
s/\x{2011}/-/g; # non-breaking hyphen -> -
s/\x{2012}/-/g; # figure dash -> -
s/\x{2015}/--/g; # horizontal bar -> --
s/\x{2212}/-/g; # minus sign -> -
s/\x{00AD}//g; # soft hyphen -> deleted
# Spaces and invisible characters
s/\x{00A0}/ /g; # non-breaking space -> space
s/\x{2002}/ /g; # en space -> space
s/\x{2003}/ /g; # em space -> space
s/\x{2009}/ /g; # thin space -> space
s/\x{200A}/ /g; # hair space -> space
s/\x{200B}//g; # zero-width space -> deleted
s/\x{200C}//g; # zero-width non-joiner -> deleted
s/\x{200D}//g; # zero-width joiner -> deleted
s/\x{2060}//g; # word joiner -> deleted
s/\x{FEFF}//g; # BOM / zero-width no-break space -> deleted
# Line and paragraph separators
s/\x{2028}/\n/g; # line separator -> newline
s/\x{2029}/\n\n/g; # paragraph separator -> two newlines
# Arrows
s/\x{2192}/->/g; # right arrow -> ->
s/\x{2190}/<-/g; # left arrow -> <-
s/\x{2191}/^/g; # up arrow -> ^
s/\x{2193}/v/g; # down arrow -> v
s/\x{21D2}/=>/g; # right double arrow -> =>
s/\x{21D0}/<=/g; # left double arrow -> <=
# Common prose/math symbols
s/\x{2026}/.../g; # ellipsis -> ...
s/\x{00B7}/-/g; # middle dot -> -
s/\x{2022}/-/g; # bullet -> -
s/\x{2044}/\//g; # fraction slash -> /
s/\x{00D7}/x/g; # multiplication sign -> x
s/\x{00B1}/+\/-/g; # plus-minus -> +/-
s/\x{2248}/~/g; # approximately equal -> ~
s/\x{2260}/!=/g; # not equal -> !=
s/\x{2264}/<=/g; # less-than-or-equal -> <=
s/\x{2265}/>=/g; # greater-than-or-equal -> >=
s/\x{00BC}/1\/4/g; # one quarter -> 1/4
s/\x{00BD}/1\/2/g; # one half -> 1/2
s/\x{00BE}/3\/4/g; # three quarters -> 3/4
s/\x{00B0}/ degrees/g; # degree sign -> degrees
# Trademark/copyright
s/\x{00A9}/(c)/g; # copyright -> (c)
s/\x{00AE}/(R)/g; # registered trademark -> (R)
s/\x{2122}/TM/g; # trademark -> TM
# Mac keyboard symbols
s/\x{2303}/Control/g; # control -> Control
s/\x{2318}/Command/g; # command -> Command
s/\x{2325}/Option/g; # option -> Option
s/\x{21E7}/Shift/g; # shift -> Shift
s/\x{23CE}/Return/g; # return -> Return
s/\x{232B}/Delete/g; # delete -> Delete
# Delete everything else that is not standard printable ASCII,
# while preserving tab, newline, and carriage return.
s/[^\x09\x0A\x0D\x20-\x7E]//g;
PERL
)"
# Refuse to continue when the input cannot be read. Given a missing file as
# an argument, `perl -p` only prints a warning and still exits 0, which would
# leave an empty output file behind and report success.
if [[ ! -r "$in" || -d "$in" ]]; then
  printf 'error: cannot read input file: %s\n' "$in" >&2
  exit 1
fi

# Run the scrubber over the input and write the cleaned text to "$out".
# The input is supplied on stdin rather than as a perl argument: the implicit
# <> loop of `-p` opens its arguments with 2-argument open, so a file name
# beginning with '>' or ending with '|' would be treated as a redirection or
# a command instead of a plain path.
# -Mutf8 lets the program text contain UTF-8; -CSDA makes the standard
# streams (including the redirected stdin) decode and encode as UTF-8.
perl -Mutf8 -CSDA -pe "$perl_script" < "$in" > "$out"
printf 'written to %s\n' "$out"
# Copyright (c) 2026 @jgusta
# Licensed under the MIT License
# The MIT License (MIT)
# Copyright (c) github.com/jgusta
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment