Skip to content

Instantly share code, notes, and snippets.

@DMontgomery40
Last active January 30, 2026 15:00
Show Gist options
  • Select an option

  • Save DMontgomery40/ec5300e88b1866401e5d1efa1db5ffc0 to your computer and use it in GitHub Desktop.

Select an option

Save DMontgomery40/ec5300e88b1866401e5d1efa1db5ffc0 to your computer and use it in GitHub Desktop.
DOCX ↔ PDF round-trip normalization

normalize_docx.py

Deterministic DOCX normalization to eliminate layout drift during DOCX ↔ PDF round-trips.

What it does

  1. Character encoding — NFC-normalizes UTF-8, strips NBSP/ZWSP/soft hyphens and C0/C1 control chars
  2. Ligature decomposition — Replaces the ligatures ﬀ, ﬁ, ﬂ, ﬃ, ﬄ (U+FB00–U+FB04) with their ASCII equivalents (ff, fi, fl, ffi, ffl), disables <w:ligatures>
  3. Whitespace semantics — Converts multi-space runs to tabs, adds deterministic tab stops
  4. Style inlining — Flattens run/paragraph formatting to direct <w:rPr>/<w:pPr> properties
  5. Layout stabilization — Ensures <w:spacing> and <w:jc> are explicit

Usage

pip install python-docx
python normalize_docx.py input.docx output.normalized.docx

Why

Word's style inheritance and implicit formatting cause non-deterministic PDF exports. This script forces explicit, PDF-stable constructs so that an export → reimport round-trip usually produces a near-identical layout.

# -*- coding: utf-8 -*-
"""
normalize_docx.py — Deterministic DOCX normalization to minimize layout drift.
Usage:
python normalize_docx.py input.docx output.normalized.docx
"""
import sys
import re
import unicodedata
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
# Common typographic ligatures → ASCII sequences
TYPO_LIGATURES = {
    "\uFB00": "ff",   # LATIN SMALL LIGATURE FF
    "\uFB01": "fi",   # LATIN SMALL LIGATURE FI
    "\uFB02": "fl",   # LATIN SMALL LIGATURE FL
    "\uFB03": "ffi",  # LATIN SMALL LIGATURE FFI
    "\uFB04": "ffl",  # LATIN SMALL LIGATURE FFL
    "\uFB05": "st",   # LATIN SMALL LIGATURE LONG S T
    "\uFB06": "st",   # LATIN SMALL LIGATURE ST
}

# Invisible characters to remove entirely
STRIP_CHARS = [
    "\u200B",  # ZWSP
    "\u00AD",  # soft hyphen
]

# Characters rewritten as a plain space. NBSP separates words, so deleting it
# would fuse its neighbours ("10<NBSP>kg" -> "10kg") and change the text.
SPACE_CHARS = [
    "\u00A0",  # NBSP
]

# All C0/C1 controls except TAB, LF, CR
CONTROL_EXCEPTIONS = {"\t", "\n", "\r"}
CONTROL_CHAR_RE = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
MULTISPACE_RE = re.compile(r" {2,}")

# One translation table built from the constants above, so every
# per-character rewrite happens in a single pass over the text.
# A value of None deletes the character.
_CLEAN_TABLE = str.maketrans({
    **TYPO_LIGATURES,
    **dict.fromkeys(STRIP_CHARS),
    **dict.fromkeys(SPACE_CHARS, " "),
    **{
        chr(cp): None
        for cp in range(0xA0)  # CONTROL_CHAR_RE only matches below U+00A0
        if CONTROL_CHAR_RE.fullmatch(chr(cp))
        and chr(cp) not in CONTROL_EXCEPTIONS
    },
})


def clean_text(t: str) -> str:
    """Return *t* with its characters normalized for stable layout.

    Steps, in order:
      1. NFC-normalize the string.
      2. Replace the ligatures in TYPO_LIGATURES with ASCII letters, delete
         the characters in STRIP_CHARS, turn the characters in SPACE_CHARS
         into a plain space, and delete C0/C1 control characters other than
         TAB, LF and CR.
      3. Collapse every run of two or more spaces into a single TAB.

    Falsy input (empty string or None) is returned unchanged.
    """
    if not t:
        return t
    t = unicodedata.normalize("NFC", t)
    t = t.translate(_CLEAN_TABLE)
    # Runs of spaces are treated as manual alignment and become one TAB.
    return MULTISPACE_RE.sub("\t", t)
def ensure_tabs(paragraph, interval_twips=720, count=15):
    """Give a tab-containing paragraph a fixed set of explicit left tab stops.

    Paragraphs whose text holds no TAB are left untouched. Otherwise any
    existing custom tab stops are discarded and replaced by ``count`` left
    stops spaced ``interval_twips`` apart (defaults: every 0.5" up to 7.5").

    Args:
        paragraph: python-docx paragraph to modify in place.
        interval_twips: distance between consecutive stops; 720 twips = 0.5".
        count: number of stops to write.
    """
    if "\t" not in paragraph.text:
        return
    pPr = paragraph._p.get_or_add_pPr()
    # <w:pPr> children form an ordered sequence; get_or_add_tabs() inserts
    # <w:tabs> at its proper place instead of after spacing/ind/jc.
    tabs = pPr.get_or_add_tabs()
    # clear existing for determinism
    for stale in list(tabs):
        tabs.remove(stale)
    for i in range(1, count + 1):
        tab = OxmlElement('w:tab')
        tab.set(qn('w:val'), 'left')
        tab.set(qn('w:pos'), str(i * interval_twips))
        tabs.append(tab)
def disable_ligatures_defaults(doc):
    """Switch ligatures off in the document-wide default run properties.

    Ensures ``w:styles/w:docDefaults/w:rPrDefault/w:rPr`` exists in the
    styles part and sets ``<w14:ligatures w14:val="none"/>`` inside it.

    The ligature setting is a Word 2010 extension, so it lives in the ``w14``
    namespace rather than the main ``w`` namespace; a ``w:ligatures`` element
    is not part of WordprocessingML.

    NOTE(review): consumers older than Word 2010 rely on ``mc:Ignorable``
    listing ``w14`` on the part's root element; this function does not add it.

    Args:
        doc: python-docx ``Document`` to modify in place.
    """
    w14_ns = "http://schemas.microsoft.com/office/word/2010/wordml"
    lig_tag = "{%s}ligatures" % w14_ns
    # w14 siblings that have to come after <w14:ligatures> inside <w:rPr>.
    later_tags = {
        "{%s}%s" % (w14_ns, name)
        for name in ("numForm", "numSpacing", "stylisticSets", "cntxtAlts")
    }

    styles = doc.styles.element
    docDefaults = styles.find(qn('w:docDefaults'))
    if docDefaults is None:
        docDefaults = OxmlElement('w:docDefaults')
        # docDefaults must be the first child of <w:styles>.
        styles.insert(0, docDefaults)
    rPrDefault = docDefaults.find(qn('w:rPrDefault'))
    if rPrDefault is None:
        rPrDefault = OxmlElement('w:rPrDefault')
        # rPrDefault precedes pPrDefault.
        docDefaults.insert(0, rPrDefault)
    rPr = rPrDefault.find(qn('w:rPr'))
    if rPr is None:
        rPr = OxmlElement('w:rPr')
        rPrDefault.append(rPr)

    lig = rPr.find(lig_tag)
    if lig is None:
        lig = rPr.makeelement(lig_tag, nsmap={"w14": w14_ns})
        for child in rPr:
            if child.tag in later_tags:
                child.addprevious(lig)
                break
        else:
            rPr.append(lig)
    lig.set("{%s}val" % w14_ns, "none")
def inline_run_format(run):
    """Make a run's directly-set formatting fully explicit in its ``<w:rPr>``.

    Only properties python-docx reports on the run itself are handled:

    * font name: missing ``w:ascii`` / ``w:hAnsi`` / ``w:cs`` attributes of
      ``<w:rFonts>`` are filled in; attributes already present are kept.
    * font size: ``<w:sz>`` / ``<w:szCs>`` are filled in (half-points) when
      absent; an existing ``<w:szCs>`` is kept.
    * bold / italic: written as ``w:val="1"``.
    * underline: a plain single underline is written as ``w:val="single"``;
      any other underline style is left exactly as it is.
    * ligatures: ``<w14:ligatures w14:val="none"/>`` is set on the run. The
      setting is a Word 2010 extension and lives in the ``w14`` namespace.

    Args:
        run: python-docx run to modify in place.
    """
    w14_ns = "http://schemas.microsoft.com/office/word/2010/wordml"
    rPr = run._r.get_or_add_rPr()

    def child(tag):
        # Existing <tag> child of rPr, created and appended when missing.
        el = rPr.find(qn(tag))
        if el is None:
            el = OxmlElement(tag)
            rPr.append(el)
        return el

    # Preserve explicit font name without clobbering a distinct hAnsi/cs font
    font_name = run.font.name
    if font_name:
        rFonts = child('w:rFonts')
        for attr in ('w:ascii', 'w:hAnsi', 'w:cs'):
            if rFonts.get(qn(attr)) is None:
                rFonts.set(qn(attr), font_name)

    # Preserve explicit size
    size = run.font.size
    if size:
        half_points = str(round(size.pt * 2))  # round, don't truncate
        for tag in ('w:sz', 'w:szCs'):
            el = child(tag)
            if el.get(qn('w:val')) is None:
                el.set(qn('w:val'), half_points)

    # Basic styles
    if run.bold:
        child('w:b').set(qn('w:val'), '1')
    if run.italic:
        child('w:i').set(qn('w:val'), '1')
    # run.underline is True only for a single underline; other truthy values
    # are specific styles (double, wavy, ...) that must not be overwritten.
    if run.underline is True:
        child('w:u').set(qn('w:val'), 'single')

    # Disable ligatures at run level
    lig_tag = "{%s}ligatures" % w14_ns
    lig = rPr.find(lig_tag)
    if lig is None:
        lig = rPr.makeelement(lig_tag, nsmap={"w14": w14_ns})
        # w14 siblings that have to come after <w14:ligatures>.
        later_tags = {
            "{%s}%s" % (w14_ns, name)
            for name in ("numForm", "numSpacing", "stylisticSets", "cntxtAlts")
        }
        for existing in rPr:
            if existing.tag in later_tags:
                existing.addprevious(lig)
                break
        else:
            rPr.append(lig)
    lig.set("{%s}val" % w14_ns, "none")
def inline_para_format(paragraph):
    """Make a paragraph's spacing and alignment elements explicit.

    Ensures ``<w:pPr>`` contains a ``<w:spacing>`` element (created empty when
    missing) and, when the paragraph has a directly-set alignment, writes the
    matching ``<w:jc w:val="...">``. An alignment value this function does not
    recognize is left untouched rather than being forced to ``left``.

    Args:
        paragraph: python-docx paragraph to modify in place.
    """
    pPr = paragraph._p.get_or_add_pPr()
    # <w:pPr> children form an ordered sequence; the get_or_add_* helpers
    # insert at the proper position instead of appending last.
    pPr.get_or_add_spacing()

    alignment = paragraph.alignment
    if alignment is None:
        return
    # WD_PARAGRAPH_ALIGNMENT integer value → w:jc value
    jc_by_value = {
        0: 'left',
        1: 'center',
        2: 'right',
        3: 'both',
        4: 'distribute',
        5: 'mediumKashida',
        7: 'highKashida',
        8: 'lowKashida',
        9: 'thaiDistribute',
    }
    jc_val = jc_by_value.get(int(alignment))
    if jc_val is None:
        return
    pPr.get_or_add_jc().set(qn('w:val'), jc_val)
def _normalize_paragraph(paragraph):
    """Clean run text, then make run and paragraph formatting explicit."""
    for run in paragraph.runs:
        original = run.text
        if original:
            cleaned = clean_text(original)
            # Assigning run.text rebuilds the run's content, so only do it
            # when the text actually changed.
            if cleaned != original:
                run.text = cleaned
        inline_run_format(run)
    ensure_tabs(paragraph)
    inline_para_format(paragraph)


def _normalize_table(table, seen_cells):
    """Normalize every paragraph in *table*, descending into nested tables."""
    for row in table.rows:
        for cell in row.cells:
            # A merged cell is reported once per grid column it spans.
            if cell._tc in seen_cells:
                continue
            seen_cells.add(cell._tc)
            for paragraph in cell.paragraphs:
                _normalize_paragraph(paragraph)
            for nested in cell.tables:
                _normalize_table(nested, seen_cells)


def normalize_docx(input_path, output_path):
    """Normalize the DOCX at *input_path* and save it to *output_path*.

    Sets the document-default ligature setting, then normalizes every body
    paragraph and every table paragraph (nested tables included): run text
    is cleaned, run and paragraph formatting is made explicit, and
    paragraphs containing tabs get fixed tab stops.

    Args:
        input_path: path of the DOCX file to read.
        output_path: path the normalized DOCX is written to.
    """
    doc = Document(input_path)
    disable_ligatures_defaults(doc)
    # paragraphs
    for paragraph in doc.paragraphs:
        _normalize_paragraph(paragraph)
    # tables
    seen_cells = set()
    for table in doc.tables:
        _normalize_table(table, seen_cells)
    doc.save(output_path)
def main(argv=None):
    """Command-line entry point.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv``. Exactly two arguments are expected after the
            program name: the input path and the output path.

    Exits with status 2 and a usage message on stderr when the argument
    count is wrong.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 3:
        print(
            "Usage: python normalize_docx.py input.docx output.normalized.docx",
            file=sys.stderr,
        )
        sys.exit(2)
    normalize_docx(args[1], args[2])
    print(f"Normalized DOCX written to: {args[2]}")


if __name__ == "__main__":
    main()

DOCX ↔ PDF Round-Trip Normalization Prompt

For use with GitHub Copilot, Claude, or other AI coding assistants


Objective: Eliminate DOCX ↔ PDF round-trip layout drift through deterministic normalization.

Normalization Pipeline:

  1. Character encoding: Canonicalize all text to NFC-normalized UTF-8. Strip:

    • U+00A0 (NBSP), U+200B (ZWSP), U+00AD (soft hyphen)
    • All C0/C1 control characters except U+0009 (tab), U+000A (LF), U+000D (CR)
  2. Glyph decomposition: Replace typographic ligatures (ﬀ, ﬁ, ﬂ, ﬃ, ﬄ) with constituent ASCII sequences. Disable <w:ligatures> in document settings.

  3. Whitespace semantics: Replace space-based alignment with structural equivalents:

    • Multiple spaces → tab stops (<w:tabs>)
    • Manual line spacing → paragraph properties (<w:spacing>)
    • Centered/right-aligned via spaces → <w:jc> justification
  4. Style flattening: Collapse style inheritance to terminal states:

    • Resolve all <w:basedOn> chains
    • Inline computed styles as direct formatting (<w:rPr>, <w:pPr>)
    • Permitted base styles: Normal, Heading1-6 only
  5. Layout primitives: Use only PDF-stable constructs:

    • <w:spacing> (before/after/line)
    • <w:tabs> (explicit tab stops)
    • <w:tbl> (simple tables, no nested/merged cells unless source contains them)

Constraints:

  • Preserve original text verbatim (no rewrites, no semantic changes)
  • Preserve font-family and font-size declarations
  • No content additions (sections, keywords, metadata)

Validation criterion: hash(layout(export_pdf(doc))) == hash(layout(reimport_pdf(doc)))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment