#!/usr/bin/env python3 # SPDX-License-Identifier: MIT """ style_audit.py — Financial formatting compliance checker for xlsx files. Audits an xlsx file (or an unpacked xlsx directory) and reports: 1. Style system integrity: count attributes match actual element counts 2. Color-role violations: formula cells with blue font, input cells with black font 3. Year-format violations: cells containing 4-digit years using comma-format 4. Percentage value violations: percentage-formatted cells with values > 1 (likely meant 0.08 not 8) 5. Style index out-of-range: s attribute exceeds cellXfs count 6. fills[0]/fills[1] presence check (OOXML spec requirement) Usage: python3 style_audit.py input.xlsx # audit a packed xlsx python3 style_audit.py /tmp/xlsx_work/ # audit an unpacked directory python3 style_audit.py input.xlsx --json # machine-readable output python3 style_audit.py input.xlsx --summary # counts only, no detail Exit code: 0 — no violations found 1 — violations detected (or file cannot be opened) """ import sys import os import zipfile import xml.etree.ElementTree as ET import json import re import tempfile import shutil NS = "http://schemas.openxmlformats.org/spreadsheetml/2006/main" NSP = f"{{{NS}}}" # Predefined style index semantics from minimal_xlsx template. # Maps cellXfs index -> (role, font_color_expectation, numFmt_type) # role: "input" = blue expected, "formula" = black/green expected, "header" = any, "any" = skip TEMPLATE_SLOT_ROLES = { 0: ("any", None, None), 1: ("input", "blue", "general"), 2: ("formula", "black", "general"), 3: ("formula", "green", "general"), 4: ("any", None, "general"), # header 5: ("input", "blue", "currency"), 6: ("formula", "black", "currency"), 7: ("input", "blue", "percent"), 8: ("formula", "black", "percent"), 9: ("input", "blue", "integer"), 10: ("formula", "black", "integer"), 11: ("input", "blue", "year"), 12: ("input", "blue", "general"), # highlight } # AARRGGBB values for each role color BLUE_RGB = "000000ff" BLACK_RGB = "00000000" GREEN_RGB = "00008000" RED_RGB = "00ff0000" # numFmtIds that represent percentage formats (built-in + common custom) PERCENT_FMT_IDS = {9, 10, 165, 170} # numFmtIds that use comma separator (would corrupt year display) COMMA_FMT_IDS = {3, 4, 167, 168} # #,##0 style — 4-digit years would show as 2,024 def _parse_styles(styles_xml: bytes) -> dict: """Parse styles.xml and return structured data.""" root = ET.fromstring(styles_xml) def find(tag): return root.find(f"{NSP}{tag}") # numFmts num_fmts = {} # id -> formatCode nf_elem = find("numFmts") if nf_elem is not None: declared_count = int(nf_elem.get("count", "0")) actual_count = len(nf_elem) for nf in nf_elem: fid = int(nf.get("numFmtId", "0")) num_fmts[fid] = nf.get("formatCode", "") else: declared_count = 0 actual_count = 0 # fonts — extract color and bold flag fonts = [] fonts_elem = find("fonts") fonts_declared = 0 if fonts_elem is not None: fonts_declared = int(fonts_elem.get("count", "0")) for font in fonts_elem: color_elem = font.find(f"{NSP}color") bold_elem = font.find(f"{NSP}b") if color_elem is not None: rgb = color_elem.get("rgb", "").lower() theme = color_elem.get("theme") else: rgb = "" theme = None fonts.append({ "rgb": rgb, "theme": theme, "bold": bold_elem is not None, }) # fills fills = [] fills_elem = find("fills") fills_declared = 0 if fills_elem is not None: fills_declared = int(fills_elem.get("count", "0")) for fill in fills_elem: pf = fill.find(f"{NSP}patternFill") pattern_type = pf.get("patternType", "") if pf is not None else "" fills.append({"patternType": pattern_type}) # cellXfs xfs = [] xfs_elem = find("cellXfs") xfs_declared = 0 if xfs_elem is not None: xfs_declared = int(xfs_elem.get("count", "0")) for xf in xfs_elem: xfs.append({ "numFmtId": int(xf.get("numFmtId", "0")), "fontId": int(xf.get("fontId", "0")), "fillId": int(xf.get("fillId", "0")), "borderId": int(xf.get("borderId", "0")), }) return { "num_fmts": num_fmts, "num_fmts_declared": declared_count, "num_fmts_actual": actual_count, "fonts": fonts, "fonts_declared": fonts_declared, "fonts_actual": len(fonts), "fills": fills, "fills_declared": fills_declared, "fills_actual": len(fills), "xfs": xfs, "xfs_declared": xfs_declared, "xfs_actual": len(xfs), } def _is_blue_font(font: dict) -> bool: return font["rgb"] == BLUE_RGB def _is_black_font(font: dict) -> bool: return font["rgb"] == BLACK_RGB or (font["rgb"] == "" and font["theme"] is not None) def _is_green_font(font: dict) -> bool: return font["rgb"] == GREEN_RGB def _fmt_is_percent(num_fmt_id: int, num_fmts: dict) -> bool: if num_fmt_id in PERCENT_FMT_IDS: return True fmt_code = num_fmts.get(num_fmt_id, "") return "%" in fmt_code def _fmt_is_comma(num_fmt_id: int, num_fmts: dict) -> bool: if num_fmt_id in COMMA_FMT_IDS: return True fmt_code = num_fmts.get(num_fmt_id, "") # formatCode has comma separator if it contains #,##0 but not a trailing , (scale) return "#,##" in fmt_code and not fmt_code.endswith(",") and not fmt_code.endswith(",\"M\"") and not fmt_code.endswith(",\"K\"") def _looks_like_year(value_text: str) -> bool: """True if value is a 4-digit year between 1900 and 2100.""" try: v = int(float(value_text)) return 1900 <= v <= 2100 except (ValueError, TypeError): return False def _audit(styles_xml: bytes, sheet_xmls: list[tuple[str, bytes]]) -> dict: """ Run all formatting compliance checks. Args: styles_xml: content of xl/styles.xml sheet_xmls: list of (sheet_name, xml_bytes) for each worksheet Returns: dict with violations and summary """ results = { "violations": [], "warnings": [], "summary": {}, } v = results["violations"] w = results["warnings"] styles = _parse_styles(styles_xml) fonts = styles["fonts"] xfs = styles["xfs"] num_fmts = styles["num_fmts"] # ── Check A: count attribute integrity ────────────────────────────────── if styles["fonts_declared"] != styles["fonts_actual"]: v.append({ "type": "count_mismatch", "element": "fonts", "declared": styles["fonts_declared"], "actual": styles["fonts_actual"], "fix": f"Update ", }) if styles["fills_declared"] != styles["fills_actual"]: v.append({ "type": "count_mismatch", "element": "fills", "declared": styles["fills_declared"], "actual": styles["fills_actual"], "fix": f"Update ", }) if styles["xfs_declared"] != styles["xfs_actual"]: v.append({ "type": "count_mismatch", "element": "cellXfs", "declared": styles["xfs_declared"], "actual": styles["xfs_actual"], "fix": f"Update ", }) # ── Check B: fills[0] and fills[1] presence ────────────────────────────── fills = styles["fills"] if len(fills) < 2: v.append({ "type": "missing_required_fills", "detail": "fills[0] (none) and fills[1] (gray125) are required by OOXML spec", "fix": "Prepend and ", }) else: if fills[0].get("patternType") != "none": v.append({ "type": "fills_0_corrupted", "detail": f"fills[0] patternType='{fills[0].get('patternType')}', must be 'none'", "fix": "Set fills[0] patternFill patternType to 'none'", }) if fills[1].get("patternType") != "gray125": v.append({ "type": "fills_1_corrupted", "detail": f"fills[1] patternType='{fills[1].get('patternType')}', must be 'gray125'", "fix": "Set fills[1] patternFill patternType to 'gray125'", }) # ── Check C: per-cell style violations ─────────────────────────────────── total_cells = 0 formula_cells = 0 input_cells = 0 for sheet_name, sheet_xml in sheet_xmls: ws = ET.fromstring(sheet_xml) for cell in ws.findall(f".//{NSP}c"): cell_ref = cell.get("r", "?") s_attr = cell.get("s") has_formula = cell.find(f"{NSP}f") is not None v_elem = cell.find(f"{NSP}v") value_text = v_elem.text if v_elem is not None else None total_cells += 1 # Skip cells with no style if s_attr is None: continue try: s_idx = int(s_attr) except ValueError: continue # Check C1: s index out of range if s_idx >= len(xfs): v.append({ "type": "style_index_out_of_range", "sheet": sheet_name, "cell": cell_ref, "s": s_idx, "cellXfs_count": len(xfs), "fix": f"s={s_idx} exceeds cellXfs count={len(xfs)}; add missing entries or lower s value", }) continue xf = xfs[s_idx] font_id = xf["fontId"] num_fmt_id = xf["numFmtId"] if font_id >= len(fonts): v.append({ "type": "font_index_out_of_range", "sheet": sheet_name, "cell": cell_ref, "fontId": font_id, "fonts_count": len(fonts), "fix": f"fontId={font_id} exceeds fonts count={len(fonts)}; add missing entries", }) continue font = fonts[font_id] # Check C2: color-role violation — formula cell with blue font if has_formula and _is_blue_font(font): formula_cells += 1 f_elem = cell.find(f"{NSP}f") formula_text = f_elem.text if f_elem is not None else "" v.append({ "type": "formula_cell_blue_font", "sheet": sheet_name, "cell": cell_ref, "s": s_idx, "formula": formula_text, "fix": "Formula cells must use black font (formula) or green font (cross-sheet ref). " "Use style index 2/6/8/10 (black) or 3/13 (green) instead.", }) # Check C3: color-role violation — non-formula cell with explicit black # (only flag if it looks like it should be an input — has a numeric value) if (not has_formula and _is_black_font(font) and value_text is not None and not font.get("bold") and num_fmt_id not in (0,) # skip general-format black (could be label) ): try: float(value_text) # It's a numeric value with black font — possible missing blue input marker w.append({ "type": "numeric_input_may_lack_blue", "sheet": sheet_name, "cell": cell_ref, "s": s_idx, "value": value_text, "note": "Hardcoded numeric value has black font — if this is a user-editable " "assumption, change to blue-font input style (e.g. s=1/5/7/9/11/12).", }) except (ValueError, TypeError): pass # Check C4: year value with comma-formatted numFmt if value_text and _looks_like_year(value_text) and _fmt_is_comma(num_fmt_id, num_fmts): v.append({ "type": "year_with_comma_format", "sheet": sheet_name, "cell": cell_ref, "s": s_idx, "value": value_text, "numFmtId": num_fmt_id, "fix": "Year values must use numFmtId=1 (format '0') to display as 2024 not 2,024. " "Use style index 11 or a custom xf with numFmtId=1.", }) # Check C5: percentage format with value > 1 (likely 8 instead of 0.08) if value_text and _fmt_is_percent(num_fmt_id, num_fmts): try: pct_val = float(value_text) if pct_val > 1.0: w.append({ "type": "percent_value_gt_1", "sheet": sheet_name, "cell": cell_ref, "s": s_idx, "value": value_text, "displayed_as": f"{pct_val * 100:.0f}%", "note": f"Value {value_text} with percentage format displays as {pct_val*100:.0f}%. " "If intended rate is ~{:.0f}%, store as {:.4f} instead.".format( pct_val, pct_val / 100 ), }) except (ValueError, TypeError): pass if has_formula: formula_cells += 1 elif value_text is not None: input_cells += 1 results["summary"] = { "total_cells_inspected": total_cells, "formula_cells": formula_cells, "input_cells": input_cells, "violations": len(v), "warnings": len(w), } return results def _load_from_xlsx(xlsx_path: str) -> tuple[bytes, list[tuple[str, bytes]]]: """Load styles.xml and all sheet XMLs from a packed xlsx file.""" with zipfile.ZipFile(xlsx_path, "r") as z: styles_xml = z.read("xl/styles.xml") # Get sheet name mapping wb_xml = z.read("xl/workbook.xml") wb = ET.fromstring(wb_xml) rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships" rels_xml = z.read("xl/_rels/workbook.xml.rels") rels = ET.fromstring(rels_xml) rid_to_name = {} for sheet in wb.findall(f".//{{{NS}}}sheet"): rid = sheet.get(f"{{{rel_ns}}}id", "") name = sheet.get("name", "") rid_to_name[rid] = name rid_to_path = {} for rel in rels: rid = rel.get("Id", "") target = rel.get("Target", "") if "worksheets" in target: if not target.startswith("xl/"): target = "xl/" + target rid_to_path[rid] = target sheet_xmls = [] for rid, name in rid_to_name.items(): path = rid_to_path.get(rid) if path and path in z.namelist(): sheet_xmls.append((name, z.read(path))) return styles_xml, sheet_xmls def _load_from_dir(unpacked_dir: str) -> tuple[bytes, list[tuple[str, bytes]]]: """Load styles.xml and all sheet XMLs from an unpacked directory.""" styles_path = os.path.join(unpacked_dir, "xl", "styles.xml") with open(styles_path, "rb") as f: styles_xml = f.read() # Get sheet names from workbook.xml wb_path = os.path.join(unpacked_dir, "xl", "workbook.xml") wb = ET.fromstring(open(wb_path, "rb").read()) rels_path = os.path.join(unpacked_dir, "xl", "_rels", "workbook.xml.rels") rels = ET.fromstring(open(rels_path, "rb").read()) rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships" rid_to_name = {} for sheet in wb.findall(f".//{{{NS}}}sheet"): rid = sheet.get(f"{{{rel_ns}}}id", "") name = sheet.get("name", "") rid_to_name[rid] = name rid_to_path = {} for rel in rels: rid = rel.get("Id", "") target = rel.get("Target", "") if "worksheets" in target: rid_to_path[rid] = target sheet_xmls = [] ws_dir = os.path.join(unpacked_dir, "xl", "worksheets") for rid, name in rid_to_name.items(): rel_path = rid_to_path.get(rid, "") # rel_path may be "worksheets/sheet1.xml" or absolute path if rel_path.startswith("worksheets/"): full = os.path.join(unpacked_dir, "xl", rel_path) else: full = os.path.join(unpacked_dir, "xl", "worksheets", os.path.basename(rel_path)) if os.path.exists(full): with open(full, "rb") as f: sheet_xmls.append((name, f.read())) return styles_xml, sheet_xmls def main() -> None: use_json = "--json" in sys.argv summary_only = "--summary" in sys.argv args_clean = [a for a in sys.argv[1:] if not a.startswith("--")] if not args_clean: print("Usage: style_audit.py [--json] [--summary]") sys.exit(1) target = args_clean[0] try: if os.path.isdir(target): styles_xml, sheet_xmls = _load_from_dir(target) elif target.endswith(".xlsx") or target.endswith(".xlsm"): styles_xml, sheet_xmls = _load_from_xlsx(target) else: print(f"ERROR: unrecognized target '{target}' — must be .xlsx file or unpacked directory") sys.exit(1) except Exception as e: print(f"ERROR loading file: {e}") sys.exit(1) results = _audit(styles_xml, sheet_xmls) if use_json: print(json.dumps(results, indent=2, ensure_ascii=False)) sys.exit(1 if results["summary"]["violations"] > 0 else 0) # Human-readable output s = results["summary"] print(f"Target : {target}") print(f"Cells : {s['total_cells_inspected']} inspected " f"({s['formula_cells']} formula, {s['input_cells']} input)") print(f"Violations : {s['violations']}") print(f"Warnings : {s['warnings']}") if not summary_only: if results["violations"]: print("\n── Violations (must fix) ──") for item in results["violations"]: t = item["type"] if t == "count_mismatch": print(f" [FAIL] {item['element']} count mismatch: declared={item['declared']}, " f"actual={item['actual']}") print(f" Fix: {item['fix']}") elif t == "missing_required_fills": print(f" [FAIL] {item['detail']}") print(f" Fix: {item['fix']}") elif t in ("fills_0_corrupted", "fills_1_corrupted"): print(f" [FAIL] {item['detail']}") print(f" Fix: {item['fix']}") elif t == "formula_cell_blue_font": print(f" [FAIL] [{item['sheet']}!{item['cell']}] formula cell has blue font " f"(role=input, but cell contains formula: {item.get('formula', '')})") print(f" Fix: {item['fix']}") elif t == "style_index_out_of_range": print(f" [FAIL] [{item['sheet']}!{item['cell']}] s={item['s']} but " f"cellXfs count={item['cellXfs_count']}") print(f" Fix: {item['fix']}") elif t == "font_index_out_of_range": print(f" [FAIL] [{item['sheet']}!{item['cell']}] fontId={item['fontId']} but " f"fonts count={item['fonts_count']}") print(f" Fix: {item['fix']}") elif t == "year_with_comma_format": print(f" [FAIL] [{item['sheet']}!{item['cell']}] year value {item['value']} " f"uses comma-format (numFmtId={item['numFmtId']}) — will display as " f"{int(float(item['value'])):,}") print(f" Fix: {item['fix']}") else: print(f" [FAIL] {item}") if results["warnings"] and not summary_only: print("\n── Warnings (review recommended) ──") for item in results["warnings"]: t = item["type"] if t == "numeric_input_may_lack_blue": print(f" [WARN] [{item['sheet']}!{item['cell']}] numeric value={item['value']} " f"has black font — if user-editable assumption, use blue-font input style") elif t == "percent_value_gt_1": print(f" [WARN] [{item['sheet']}!{item['cell']}] percent-format cell has " f"value={item['value']} (displays as {item['displayed_as']}) — " f"likely should be stored as decimal (e.g. 0.08 for 8%)") else: print(f" [WARN] {item}") print() if s["violations"] == 0: if s["warnings"] == 0: print("PASS — Financial formatting is compliant") else: print(f"PASS with WARN — {s['warnings']} warning(s) need review") else: print(f"FAIL — {s['violations']} violation(s) must be fixed before delivery") sys.exit(1) if __name__ == "__main__": main()