|
# -*- coding: utf-8 -*- |
|
""" |
|
normalize_docx.py — Deterministic DOCX normalization to minimize layout drift. |
|
|
|
Usage: |
|
python normalize_docx.py input.docx output.normalized.docx |
|
""" |
|
import sys |
|
import re |
|
import unicodedata |
|
from docx import Document |
|
from docx.oxml import OxmlElement |
|
from docx.oxml.ns import qn |
|
|
|
# Common typographic ligatures → ASCII sequences
TYPO_LIGATURES = {
    "\uFB00": "ff",   # ff
    "\uFB01": "fi",   # fi
    "\uFB02": "fl",   # fl
    "\uFB03": "ffi",  # ffi
    "\uFB04": "ffl",  # ffl
}

# Invisible characters that are removed entirely
STRIP_CHARS = [
    "\u200B",  # ZWSP
    "\u00AD",  # soft hyphen
]

# Visible special spaces that are replaced by a plain ASCII space.
# NBSP separates words, so deleting it (as an invisible character would be)
# glues the neighbouring words together.
SPACE_CHARS = [
    "\u00A0",  # NBSP
]

# All C0/C1 controls except TAB, LF, CR
CONTROL_EXCEPTIONS = {"\t", "\n", "\r"}
CONTROL_CHAR_RE = re.compile(r"[\u0000-\u001F\u007F-\u009F]")
MULTISPACE_RE = re.compile(r" {2,}")


def _build_clean_table():
    """Build the single-pass ``str.translate`` table used by ``clean_text``.

    The table is derived from the module constants above, so they stay the
    single source of truth. It is computed once at import time; later
    mutation of those constants is not picked up.
    """
    table = {ord(lig): plain for lig, plain in TYPO_LIGATURES.items()}
    table.update((ord(ch), None) for ch in STRIP_CHARS)
    table.update((ord(ch), " ") for ch in SPACE_CHARS)
    # C0/C1 controls all live below U+00A0.
    for code_point in range(0xA0):
        ch = chr(code_point)
        if CONTROL_CHAR_RE.fullmatch(ch) and ch not in CONTROL_EXCEPTIONS:
            table[code_point] = None
    return table


_CLEAN_TABLE = _build_clean_table()


def clean_text(t: str) -> str:
    """Return *t* normalized for stable layout.

    Steps, in order:

    1. Unicode NFC normalization.
    2. One translation pass that expands the ligatures in ``TYPO_LIGATURES``,
       deletes ``STRIP_CHARS`` and every C0/C1 control character except
       TAB, LF and CR, and turns ``SPACE_CHARS`` (NBSP) into a plain space.
    3. Every run of two or more spaces is replaced by a single TAB.

    Falsy input (``""`` or ``None``) is returned unchanged.
    """
    if not t:
        return t
    t = unicodedata.normalize("NFC", t)
    t = t.translate(_CLEAN_TABLE)
    # Multi-space → TAB (runs after NBSP → space so mixed runs collapse too)
    return MULTISPACE_RE.sub("\t", t)
|
|
|
|
|
def ensure_tabs(paragraph, interval_twips=720, stop_count=15):
    """Give a tab-containing paragraph a fixed, evenly spaced set of tab stops.

    Paragraphs whose text contains no TAB are left untouched. Otherwise every
    existing tab stop is discarded (for determinism) and ``stop_count``
    left-aligned stops are added, one every ``interval_twips`` twips. The
    defaults give a stop every 0.5" (720 twips) up to 7.5".

    The stops are written through python-docx's ``TabStops`` API rather than
    by appending a raw ``w:tabs`` element: ``w:pPr`` children must follow the
    schema order, and appending ``w:tabs`` last places it after elements
    such as ``w:spacing`` and ``w:jc``, which it must precede.

    Args:
        paragraph: a python-docx ``Paragraph``.
        interval_twips: distance between consecutive stops, in twips.
        stop_count: number of stops to create.
    """
    if "\t" not in paragraph.text:
        return

    # Local import keeps this function self-contained.
    from docx.shared import Twips

    tab_stops = paragraph.paragraph_format.tab_stops
    # clear existing for determinism
    tab_stops.clear_all()
    for index in range(1, stop_count + 1):
        # add_tab_stop defaults to a left-aligned stop with no leader
        tab_stops.add_tab_stop(Twips(index * interval_twips))
|
|
|
|
|
def disable_ligatures_defaults(doc):
    """Turn ligatures off in the document-wide default run properties.

    Ensures ``w:styles/w:docDefaults/w:rPrDefault/w:rPr`` exists and sets
    ``<w14:ligatures w14:val="none"/>`` inside it.

    Notes:
        * WordprocessingML has no ``w:ligatures`` element. The ligature
          setting lives in the Word 2010 extension namespace (``w14``) and
          takes enumerated values such as ``none`` or ``standard``, not
          ``0``/``1``. Any ``w:ligatures`` element left by an earlier run of
          this script is removed.
        * ``w:docDefaults`` must be the first child of ``w:styles`` and
          ``w:rPrDefault`` the first child of ``w:docDefaults``, so newly
          created elements are inserted at index 0 rather than appended.

    Args:
        doc: a python-docx ``Document``.
    """
    w14_ns = "http://schemas.microsoft.com/office/word/2010/wordml"
    lig_tag = "{%s}ligatures" % w14_ns

    styles = doc.styles.element

    docDefaults = styles.find(qn('w:docDefaults'))
    if docDefaults is None:
        docDefaults = OxmlElement('w:docDefaults')
        styles.insert(0, docDefaults)

    rPrDefault = docDefaults.find(qn('w:rPrDefault'))
    if rPrDefault is None:
        rPrDefault = OxmlElement('w:rPrDefault')
        docDefaults.insert(0, rPrDefault)

    rPr = rPrDefault.find(qn('w:rPr'))
    if rPr is None:
        rPr = OxmlElement('w:rPr')
        rPrDefault.append(rPr)

    # Drop the invalid main-namespace element if a previous pass wrote one.
    for bogus in rPr.findall(qn('w:ligatures')):
        rPr.remove(bogus)

    lig = rPr.find(lig_tag)
    if lig is None:
        lig = rPr.makeelement(lig_tag, nsmap={"w14": w14_ns})
        # w14:ligatures precedes the other w14 typography settings and
        # w:rPrChange; insert before the first of those if one is present.
        later_tags = {
            "{%s}%s" % (w14_ns, name)
            for name in ("numForm", "numSpacing", "stylisticSets", "cntxtAlts")
        }
        later_tags.add(qn('w:rPrChange'))
        anchor = next((child for child in rPr if child.tag in later_tags), None)
        if anchor is None:
            rPr.append(lig)
        else:
            anchor.addprevious(lig)
    lig.set("{%s}val" % w14_ns, "none")
|
|
|
|
|
def inline_run_format(run):
    """Pin a run's direct formatting and disable ligatures on it.

    Only formatting set directly on the run is considered; the values come
    from ``run.font.name``, ``run.font.size``, ``run.bold``, ``run.italic``
    and ``run.underline``.

    * Font name: copied to the ``w:ascii``, ``w:hAnsi`` and ``w:cs``
      attributes of ``w:rFonts``.
    * Size: written to both ``w:sz`` and ``w:szCs`` in half-points.
    * Bold / italic: ``w:b`` / ``w:i`` get ``w:val="1"``.
    * Underline: only a plain single underline is (re)written. Any other
      underline style (double, dotted, wave, ...) is left exactly as it is;
      previously it was overwritten with ``single``.
    * Ligatures: ``<w14:ligatures w14:val="none"/>`` is set. There is no
      ``w:ligatures`` element in WordprocessingML; any such element left by
      an earlier pass is removed.

    New child elements are inserted in schema order rather than appended to
    the end of ``w:rPr``.

    Args:
        run: a python-docx ``Run``.
    """
    w14_ns = "http://schemas.microsoft.com/office/word/2010/wordml"
    lig_tag = "{%s}ligatures" % w14_ns

    rPr = run._r.get_or_add_rPr()
    font = run.font

    # Preserve explicit font name
    font_name = font.name
    if font_name:
        rFonts = rPr.get_or_add_rFonts()
        for attr in ('w:ascii', 'w:hAnsi', 'w:cs'):
            rFonts.set(qn(attr), font_name)

    # Preserve explicit size
    size = font.size
    if size:
        half_points = str(round(size.pt * 2))  # w:sz is in half-points
        sz = rPr.get_or_add_sz()
        sz.set(qn('w:val'), half_points)
        szCs = rPr.find(qn('w:szCs'))
        if szCs is None:
            szCs = OxmlElement('w:szCs')
            # The schema puts w:szCs immediately after w:sz.
            sz.addnext(szCs)
        szCs.set(qn('w:val'), half_points)

    # Basic styles
    if run.bold:
        rPr.get_or_add_b().set(qn('w:val'), '1')
    if run.italic:
        rPr.get_or_add_i().set(qn('w:val'), '1')
    # run.underline is True only for a single underline; other styles come
    # back as WD_UNDERLINE members and must not be flattened to 'single'.
    if run.underline is True:
        rPr.get_or_add_u().set(qn('w:val'), 'single')

    # Disable ligatures at run level
    for bogus in rPr.findall(qn('w:ligatures')):
        rPr.remove(bogus)
    lig = rPr.find(lig_tag)
    if lig is None:
        lig = rPr.makeelement(lig_tag, nsmap={"w14": w14_ns})
        # w14:ligatures precedes the other w14 typography settings and
        # w:rPrChange; insert before the first of those if one is present.
        later_tags = {
            "{%s}%s" % (w14_ns, name)
            for name in ("numForm", "numSpacing", "stylisticSets", "cntxtAlts")
        }
        later_tags.add(qn('w:rPrChange'))
        anchor = next((child for child in rPr if child.tag in later_tags), None)
        if anchor is None:
            rPr.append(lig)
        else:
            anchor.addprevious(lig)
    lig.set("{%s}val" % w14_ns, "none")
|
|
|
|
|
def inline_para_format(paragraph):
    """Pin paragraph-level formatting directly on the paragraph.

    * Guarantees a ``w:spacing`` element exists in ``w:pPr`` (created empty
      when missing).
    * When the paragraph has a direct alignment of left, center, right,
      justify or distribute, rewrites ``w:jc`` with the canonical value.
      Any other alignment (e.g. the kashida or Thai justify modes) is left
      untouched; previously those were silently rewritten to ``left``.

    Elements are created through python-docx's ``get_or_add_*`` helpers so
    they land in schema order inside ``w:pPr`` instead of being appended
    after elements they must precede.

    Args:
        paragraph: a python-docx ``Paragraph``.
    """
    jc_by_alignment = {
        0: 'left',
        1: 'center',
        2: 'right',
        3: 'both',
        4: 'distribute',
    }

    pPr = paragraph._p.get_or_add_pPr()

    # Ensure <w:spacing/>
    pPr.get_or_add_spacing()

    # Alignment
    alignment = paragraph.alignment
    if alignment is None:
        return
    jc_val = jc_by_alignment.get(alignment)
    if jc_val is None:
        # Unmapped alignment: keep the existing w:jc rather than guess.
        return
    pPr.get_or_add_jc().set(qn('w:val'), jc_val)
|
|
|
|
|
def _normalize_paragraph(paragraph):
    """Clean run text and pin run, tab and paragraph formatting for one paragraph."""
    for run in paragraph.runs:
        original = run.text
        if original:
            cleaned = clean_text(original)
            # Assigning run.text replaces *all* run content, so only do it
            # when the text actually changed; untouched runs keep any
            # non-text children they carry.
            if cleaned != original:
                run.text = cleaned
        inline_run_format(run)
    ensure_tabs(paragraph)
    inline_para_format(paragraph)


def _normalize_table(table, seen_cells):
    """Normalize every paragraph in *table*, including tables nested in cells.

    ``seen_cells`` holds the underlying cell elements already processed.
    Merged cells are reported once per grid position by ``row.cells``; this
    keeps them from being processed repeatedly.
    """
    for row in table.rows:
        for cell in row.cells:
            tc = cell._tc
            if tc in seen_cells:
                continue
            seen_cells.add(tc)
            for paragraph in cell.paragraphs:
                _normalize_paragraph(paragraph)
            for nested in cell.tables:
                _normalize_table(nested, seen_cells)


def normalize_docx(input_path, output_path):
    """Normalize the DOCX at *input_path* and write it to *output_path*.

    Disables ligatures in the document defaults, then normalizes every body
    paragraph and every paragraph inside tables (including tables nested in
    table cells): run text is cleaned, run and paragraph formatting is
    pinned, and tab stops are regularized.

    Args:
        input_path: path of the DOCX file to read.
        output_path: path the normalized DOCX is saved to.
    """
    doc = Document(input_path)
    disable_ligatures_defaults(doc)

    # paragraphs
    for paragraph in doc.paragraphs:
        _normalize_paragraph(paragraph)

    # tables
    seen_cells = set()
    for table in doc.tables:
        _normalize_table(table, seen_cells)

    doc.save(output_path)
|
|
|
|
|
def main():
    """Command-line entry point.

    Expects exactly two arguments: the input DOCX path and the output path.
    With any other argument count, prints the usage line and exits with
    status 2.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python normalize_docx.py input.docx output.normalized.docx")
        sys.exit(2)

    source_path, target_path = cli_args
    normalize_docx(source_path, target_path)
    print(f"Normalized DOCX written to: {target_path}")


if __name__ == "__main__":
    main()