Last active
May 8, 2026 00:31
-
-
Save jgusta/cbd18595031c5d55251f0facc4f34790 to your computer and use it in GitHub Desktop.
Scrub text file of AI (non-human) characters and glyphs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| # This script opens a file and scrubs it of known AI-specific characters. | |
| # By AI-specific I mean characters that no one with an American keyboard | |
| # would ever type. Examples are the em dash, fancy quotation marks, | |
| # unusual line endings, arrows, ellipsis, zero-width spaces, copyright | |
| # symbol, Apple key symbols, etc. Any character that is neither replaced | |
| # nor in the list of human characters is removed. | |
| # | |
| # USAGE | |
| # clean_ai_text.sh filename.txt | |
| # | |
| # OUTPUT | |
| # Outputs the same file with `.fixed` prepended to the extension | |
| # if the input is filename.txt the output will be filename.fixed.txt | |
| # | |
set -euo pipefail

# Fail with a usage message when no file is given; otherwise `set -u` aborts on
# "$1" with a bare "unbound variable" error that does not explain what to pass.
if (( $# < 1 )); then
  printf 'usage: %s filename.txt\n' "${0##*/}" >&2
  exit 2
fi

in="$1"

# Stop before any output file is created if the input cannot be read.
if [[ -d "$in" || ! -r "$in" ]]; then
  printf '%s: cannot read input file: %s\n' "${0##*/}" "$in" >&2
  exit 1
fi

# Suffix inserted between the file name and its extension.
addext="fixed"

# Split the path into an optional directory prefix (kept with its trailing
# slash) and the final component, so dots in directory names are ignored.
dir=""
base="$in"
if [[ "$in" == */* ]]; then
  dir="${in%/*}/"
  base="${in##*/}"
fi

# Build the output path by splitting the file name at its FIRST dot:
#   notes.txt      -> notes.fixed.txt
#   backup.tar.gz  -> backup.fixed.tar.gz
# A name without any dot gets ".fixed.txt" appended.
if [[ "$base" == *.* ]]; then
  name="${base%%.*}"
  ext="${base#*.}"
  out="${dir}${name}.${addext}.${ext}"
else
  out="${dir}${base}.${addext}.txt"
fi
| perl_script="$(cat <<'PERL' | |
| # Quotes | |
| s/[\x{201C}\x{201D}]/\x{22}/g; # “ ” -> double quote | |
| s/[\x{2018}\x{2019}]/\x{27}/g; # ‘ ’ -> apostrophe | |
| s/\x{2032}/\x{27}/g; # prime -> apostrophe | |
| s/\x{2033}/\x{22}/g; # double prime -> double quote | |
| s/\x{2039}/</g; # single left angle quote -> < | |
| s/\x{203A}/>/g; # single right angle quote -> > | |
| s/\x{00AB}/\x{22}/g; # left guillemet -> double quote | |
| s/\x{00BB}/\x{22}/g; # right guillemet -> double quote | |
| # Dashes and hyphens | |
| s/\x{2014}/--/g; # em dash -> -- | |
| s/\x{2013}/-/g; # en dash -> - | |
| s/\x{2010}/-/g; # hyphen -> - | |
| s/\x{2011}/-/g; # non-breaking hyphen -> - | |
| s/\x{2012}/-/g; # figure dash -> - | |
| s/\x{2015}/--/g; # horizontal bar -> -- | |
| s/\x{2212}/-/g; # minus sign -> - | |
| s/\x{00AD}//g; # soft hyphen -> deleted | |
| # Spaces and invisible characters | |
| s/\x{00A0}/ /g; # non-breaking space -> space | |
| s/\x{2002}/ /g; # en space -> space | |
| s/\x{2003}/ /g; # em space -> space | |
| s/\x{2009}/ /g; # thin space -> space | |
| s/\x{200A}/ /g; # hair space -> space | |
| s/\x{200B}//g; # zero-width space -> deleted | |
| s/\x{200C}//g; # zero-width non-joiner -> deleted | |
| s/\x{200D}//g; # zero-width joiner -> deleted | |
| s/\x{2060}//g; # word joiner -> deleted | |
| s/\x{FEFF}//g; # BOM / zero-width no-break space -> deleted | |
| # Line and paragraph separators | |
| s/\x{2028}/\n/g; # line separator -> newline | |
| s/\x{2029}/\n\n/g; # paragraph separator -> two newlines | |
| # Arrows | |
| s/\x{2192}/->/g; # right arrow -> -> | |
| s/\x{2190}/<-/g; # left arrow -> <- | |
| s/\x{2191}/^/g; # up arrow -> ^ | |
| s/\x{2193}/v/g; # down arrow -> v | |
| s/\x{21D2}/=>/g; # right double arrow -> => | |
| s/\x{21D0}/<=/g; # left double arrow -> <= | |
| # Common prose/math symbols | |
| s/\x{2026}/.../g; # ellipsis -> ... | |
| s/\x{00B7}/-/g; # middle dot -> - | |
| s/\x{2022}/-/g; # bullet -> - | |
| s/\x{2044}/\//g; # fraction slash -> / | |
| s/\x{00D7}/x/g; # multiplication sign -> x | |
| s/\x{00B1}/+\/-/g; # plus-minus -> +/- | |
| s/\x{2248}/~/g; # approximately equal -> ~ | |
| s/\x{2260}/!=/g; # not equal -> != | |
| s/\x{2264}/<=/g; # less-than-or-equal -> <= | |
| s/\x{2265}/>=/g; # greater-than-or-equal -> >= | |
| s/\x{00BC}/1\/4/g; # one quarter -> 1/4 | |
| s/\x{00BD}/1\/2/g; # one half -> 1/2 | |
| s/\x{00BE}/3\/4/g; # three quarters -> 3/4 | |
| s/\x{00B0}/ degrees/g; # degree sign -> degrees | |
| # Trademark/copyright | |
| s/\x{00A9}/(c)/g; # copyright -> (c) | |
| s/\x{00AE}/(R)/g; # registered trademark -> (R) | |
| s/\x{2122}/TM/g; # trademark -> TM | |
| # Mac keyboard symbols | |
| s/\x{2303}/Control/g; # control -> Control | |
| s/\x{2318}/Command/g; # command -> Command | |
| s/\x{2325}/Option/g; # option -> Option | |
| s/\x{21E7}/Shift/g; # shift -> Shift | |
| s/\x{23CE}/Return/g; # return -> Return | |
| s/\x{232B}/Delete/g; # delete -> Delete | |
| # Delete everything else that is not standard printable ASCII, | |
| # while preserving tab, newline, and carriage return. | |
| s/[^\x09\x0A\x0D\x20-\x7E]//g; | |
| PERL | |
| )" | |
# Run the substitutions over the input and write the cleaned copy to "$out".
#
# The input is supplied on stdin rather than as a Perl argument. With -p, Perl
# opens argument file names itself, so a name starting with "-" is parsed as a
# switch and names such as ">x" or "cmd |" are interpreted instead of read.
# Perl also merely warns about an argument file it cannot open and carries on,
# which would leave an empty output reported as written. With the redirection
# the shell opens "$in" first; if that fails the command is not run and "$out"
# is never created.
#
# -Mutf8 : treat the program text as UTF-8
# -CSD   : decode/encode stdin, stdout and default file handles as UTF-8
perl -Mutf8 -CSD -pe "$perl_script" < "$in" > "$out"
printf 'written to %s\n' "$out"
| # Copyright (c) 2026 @jgusta | |
| # Licensed under the MIT License | |
| # The MIT License (MIT) | |
| # Copyright (c) github.com/jgusta | |
| # Permission is hereby granted, free of charge, to any person obtaining a copy | |
| # of this software and associated documentation files (the "Software"), to deal | |
| # in the Software without restriction, including without limitation the rights | |
| # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| # copies of the Software, and to permit persons to whom the Software is | |
| # furnished to do so, subject to the following conditions: | |
| # The above copyright notice and this permission notice shall be included in all | |
| # copies or substantial portions of the Software. | |
| # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| # SOFTWARE. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment