423 lines
17 KiB
Python
423 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
# SPDX-License-Identifier: MIT
|
|
"""
|
|
formula_check.py — Static formula validator for xlsx files.
|
|
|
|
Usage:
|
|
python3 formula_check.py <input.xlsx>
|
|
python3 formula_check.py <input.xlsx> --json # machine-readable output
|
|
python3 formula_check.py <input.xlsx> --report # standardized validation report (JSON)
|
|
python3 formula_check.py <input.xlsx> --report -o out # report to file
|
|
python3 formula_check.py <input.xlsx> --sheet Sales # limit to one sheet
|
|
python3 formula_check.py <input.xlsx> --summary # error counts only, no details
|
|
|
|
What it checks:
|
|
1. Error-value cells: <c t="e"><v>#REF!</v></c> — all 7 Excel error types
|
|
2. Broken cross-sheet references: formula references a sheet not in workbook.xml
|
|
3. Broken named-range references: formula references a name not in workbook.xml <definedNames>
|
|
4. Shared formula integrity: shared formula primary cell exists and has formula text
|
|
5. Missing <v> on t="e" cells (malformed XML)
|
|
|
|
Checks NOT performed (require dynamic recalculation):
|
|
- Runtime errors that only appear after formulas execute (#DIV/0! on empty denominator, etc.)
|
|
-> Use libreoffice_recalc.py + re-run formula_check.py for dynamic validation
|
|
|
|
Exit code:
|
|
0 — no errors found
|
|
1 — errors detected (or file cannot be opened)
|
|
"""
|
|
|
|
import sys
|
|
import zipfile
|
|
import xml.etree.ElementTree as ET
|
|
import re
|
|
import json
|
|
|
|
# OOXML SpreadsheetML namespace
|
|
NS = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
|
|
NSP = f"{{{NS}}}"
|
|
|
|
# All 7 standard Excel formula error types
|
|
EXCEL_ERRORS = {"#REF!", "#DIV/0!", "#VALUE!", "#NAME?", "#NULL!", "#NUM!", "#N/A"}
|
|
|
|
# Excel built-in function names (subset of common ones) — used for #NAME? heuristic
|
|
# Full list: https://support.microsoft.com/en-us/office/excel-functions-alphabetical
|
|
_BUILTIN_FUNCTIONS = {
|
|
"ABS", "AND", "AVERAGE", "AVERAGEIF", "AVERAGEIFS", "CEILING", "CHOOSE",
|
|
"COUNTA", "COUNTIF", "COUNTIFS", "COUNT", "DATE", "EDATE", "EOMONTH",
|
|
"FALSE", "FILTER", "FIND", "FLOOR", "IF", "IFERROR", "IFNA", "IFS",
|
|
"INDEX", "INDIRECT", "INT", "IRR", "ISBLANK", "ISERROR", "ISNA", "ISNUMBER",
|
|
"LARGE", "LEFT", "LEN", "LOOKUP", "LOWER", "MATCH", "MAX", "MID", "MIN",
|
|
"MOD", "MONTH", "NETWORKDAYS", "NOT", "NOW", "NPV", "OFFSET", "OR",
|
|
"PMT", "PV", "RAND", "RANK", "RIGHT", "ROUND", "ROUNDDOWN", "ROUNDUP",
|
|
"ROW", "ROWS", "SEARCH", "SMALL", "SORT", "SQRT", "SUBSTITUTE", "SUM",
|
|
"SUMIF", "SUMIFS", "SUMPRODUCT", "TEXT", "TODAY", "TRANSPOSE", "TRIM",
|
|
"TRUE", "UNIQUE", "UPPER", "VALUE", "VLOOKUP", "HLOOKUP", "XLOOKUP",
|
|
"XMATCH", "XNPV", "XIRR", "YEAR", "YEARFRAC",
|
|
}
|
|
|
|
|
|
def get_sheet_names(z: zipfile.ZipFile) -> dict[str, str]:
|
|
"""Return dict of {r:id -> sheet_name} from workbook.xml."""
|
|
wb_xml = z.read("xl/workbook.xml")
|
|
wb = ET.fromstring(wb_xml)
|
|
rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
|
sheets = {}
|
|
for sheet in wb.findall(f".//{NSP}sheet"):
|
|
name = sheet.get("name", "")
|
|
rid = sheet.get(f"{{{rel_ns}}}id", "")
|
|
sheets[rid] = name
|
|
return sheets
|
|
|
|
|
|
def get_defined_names(z: zipfile.ZipFile) -> set[str]:
|
|
"""Return set of named ranges defined in workbook.xml <definedNames>."""
|
|
wb_xml = z.read("xl/workbook.xml")
|
|
wb = ET.fromstring(wb_xml)
|
|
names = set()
|
|
for dn in wb.findall(f".//{NSP}definedName"):
|
|
n = dn.get("name", "")
|
|
if n:
|
|
names.add(n)
|
|
return names
|
|
|
|
|
|
def get_sheet_files(z: zipfile.ZipFile) -> dict[str, str]:
|
|
"""Return dict of {r:id -> xl/worksheets/sheetN.xml} from workbook.xml.rels."""
|
|
rels_xml = z.read("xl/_rels/workbook.xml.rels")
|
|
rels = ET.fromstring(rels_xml)
|
|
mapping = {}
|
|
for rel in rels:
|
|
rid = rel.get("Id", "")
|
|
target = rel.get("Target", "")
|
|
if "worksheets" in target:
|
|
# Target may be relative: "worksheets/sheet1.xml" -> "xl/worksheets/sheet1.xml"
|
|
if not target.startswith("xl/"):
|
|
target = "xl/" + target
|
|
mapping[rid] = target
|
|
return mapping
|
|
|
|
|
|
def extract_sheet_refs(formula: str) -> list[str]:
|
|
"""
|
|
Extract all sheet names referenced in a formula string.
|
|
|
|
Handles:
|
|
- 'Sheet Name'!A1 (quoted, may contain spaces)
|
|
- SheetName!A1 (unquoted, no spaces)
|
|
|
|
Returns a list of sheet name strings (may contain duplicates if the same
|
|
sheet is referenced multiple times in one formula).
|
|
"""
|
|
refs = []
|
|
# Quoted sheet names: 'Sheet Name'!
|
|
for m in re.finditer(r"'([^']+)'!", formula):
|
|
refs.append(m.group(1))
|
|
# Unquoted sheet names: SheetName! (not preceded by a single quote)
|
|
for m in re.finditer(r"(?<!')([A-Za-z_\u4e00-\u9fff][A-Za-z0-9_.·\u4e00-\u9fff]*)!", formula):
|
|
refs.append(m.group(1))
|
|
return refs
|
|
|
|
|
|
def extract_name_refs(formula: str) -> list[str]:
|
|
"""
|
|
Extract identifiers in a formula that could be named range references.
|
|
|
|
Heuristic: identifiers that:
|
|
- Are not preceded by a sheet reference (no "!" before them)
|
|
- Are not followed by "(" (which would make them function calls)
|
|
- Match the pattern of a name (letters/underscore start, alphanumeric/underscore body)
|
|
- Are not single-letter column references or row references
|
|
|
|
This is approximate. False positives are possible; false negatives are rare.
|
|
"""
|
|
names = []
|
|
# Remove quoted sheet references first to avoid false matches
|
|
formula_clean = re.sub(r"'[^']*'![A-Z$0-9:]+", "", formula)
|
|
formula_clean = re.sub(r"[A-Za-z_][A-Za-z0-9_.]*![A-Z$0-9:]+", "", formula_clean)
|
|
# Find identifiers not followed by "(" (not function calls)
|
|
for m in re.finditer(r"\b([A-Za-z_][A-Za-z0-9_]{2,})\b(?!\s*\()", formula_clean):
|
|
candidate = m.group(1)
|
|
# Exclude Excel cell references like A1, B10, AA100
|
|
if re.fullmatch(r"[A-Z]{1,3}[0-9]+", candidate):
|
|
continue
|
|
# Exclude built-in function names (they appear without parens sometimes in array formulas)
|
|
if candidate.upper() in _BUILTIN_FUNCTIONS:
|
|
continue
|
|
names.append(candidate)
|
|
return names
|
|
|
|
|
|
def check(xlsx_path: str, sheet_filter: str | None = None) -> dict:
|
|
"""
|
|
Run all static checks on the given xlsx file.
|
|
|
|
Args:
|
|
xlsx_path: path to the .xlsx file
|
|
sheet_filter: if provided, only check the sheet with this name
|
|
|
|
Returns:
|
|
A dict with keys:
|
|
file, sheets_checked, formula_count, shared_formula_ranges,
|
|
error_count, errors
|
|
"""
|
|
results = {
|
|
"file": xlsx_path,
|
|
"sheets_checked": [],
|
|
"formula_count": 0,
|
|
"shared_formula_ranges": 0, # number of shared formula definitions
|
|
"error_count": 0,
|
|
"errors": [],
|
|
}
|
|
|
|
try:
|
|
z = zipfile.ZipFile(xlsx_path, "r")
|
|
except (zipfile.BadZipFile, FileNotFoundError) as e:
|
|
results["errors"].append({"type": "file_error", "message": str(e)})
|
|
results["error_count"] = 1
|
|
return results
|
|
|
|
with z:
|
|
sheet_names = get_sheet_names(z)
|
|
sheet_files = get_sheet_files(z)
|
|
valid_sheet_names = set(sheet_names.values())
|
|
defined_names = get_defined_names(z)
|
|
|
|
for rid, sheet_name in sheet_names.items():
|
|
# Apply sheet filter if requested
|
|
if sheet_filter and sheet_name != sheet_filter:
|
|
continue
|
|
|
|
ws_file = sheet_files.get(rid)
|
|
if not ws_file or ws_file not in z.namelist():
|
|
continue
|
|
|
|
results["sheets_checked"].append(sheet_name)
|
|
ws_xml = z.read(ws_file)
|
|
ws = ET.fromstring(ws_xml)
|
|
|
|
# Track shared formula IDs seen on this sheet (si -> primary cell ref)
|
|
shared_primary: dict[str, str] = {}
|
|
|
|
for cell in ws.findall(f".//{NSP}c"):
|
|
cell_ref = cell.get("r", "?")
|
|
cell_type = cell.get("t", "n")
|
|
|
|
# ── Check 1: error-value cell ──────────────────────────────
|
|
if cell_type == "e":
|
|
v_elem = cell.find(f"{NSP}v")
|
|
if v_elem is None:
|
|
# Malformed: t="e" but no <v> — record as structural issue
|
|
results["errors"].append(
|
|
{
|
|
"type": "malformed_error_cell",
|
|
"sheet": sheet_name,
|
|
"cell": cell_ref,
|
|
"detail": "Cell has t='e' but no <v> child element",
|
|
}
|
|
)
|
|
results["error_count"] += 1
|
|
else:
|
|
error_val = v_elem.text or "#UNKNOWN"
|
|
f_elem = cell.find(f"{NSP}f")
|
|
results["errors"].append(
|
|
{
|
|
"type": "error_value",
|
|
"error": error_val,
|
|
"sheet": sheet_name,
|
|
"cell": cell_ref,
|
|
# Include formula text if present
|
|
"formula": f_elem.text if (f_elem is not None and f_elem.text) else None,
|
|
}
|
|
)
|
|
results["error_count"] += 1
|
|
|
|
# ── Check 2 & 3: formulas ──────────────────────────────────
|
|
f_elem = cell.find(f"{NSP}f")
|
|
if f_elem is None:
|
|
continue
|
|
|
|
f_type = f_elem.get("t", "") # "shared", "array", or "" for normal
|
|
f_si = f_elem.get("si") # shared formula group ID
|
|
|
|
# Count formulas:
|
|
# - Normal formulas: always count
|
|
# - Shared formula PRIMARY (has text + ref attribute): count once
|
|
# - Shared formula CONSUMER (si only, no text): do NOT count separately
|
|
# (they are covered by the primary's ref range)
|
|
if f_type == "shared" and f_elem.text is None:
|
|
# Consumer cell: skip formula counting and cross-ref checks
|
|
# (the primary cell already covers this formula)
|
|
continue
|
|
|
|
formula = f_elem.text or ""
|
|
|
|
if f_type == "shared" and f_elem.get("ref"):
|
|
results["shared_formula_ranges"] += 1
|
|
if f_si is not None:
|
|
shared_primary[f_si] = cell_ref
|
|
|
|
if formula:
|
|
results["formula_count"] += 1
|
|
|
|
# Check 2: cross-sheet references
|
|
for ref_sheet in extract_sheet_refs(formula):
|
|
if ref_sheet not in valid_sheet_names:
|
|
results["errors"].append(
|
|
{
|
|
"type": "broken_sheet_ref",
|
|
"sheet": sheet_name,
|
|
"cell": cell_ref,
|
|
"formula": formula,
|
|
"missing_sheet": ref_sheet,
|
|
"valid_sheets": sorted(valid_sheet_names),
|
|
}
|
|
)
|
|
results["error_count"] += 1
|
|
|
|
# Check 3: named range references
|
|
# Only flag if the name is not a built-in and not a sheet-prefixed ref
|
|
for name_ref in extract_name_refs(formula):
|
|
if name_ref not in defined_names:
|
|
results["errors"].append(
|
|
{
|
|
"type": "unknown_name_ref",
|
|
"sheet": sheet_name,
|
|
"cell": cell_ref,
|
|
"formula": formula,
|
|
"unknown_name": name_ref,
|
|
"defined_names": sorted(defined_names),
|
|
"note": "Heuristic check — verify manually if this is a false positive",
|
|
}
|
|
)
|
|
results["error_count"] += 1
|
|
|
|
return results
|
|
|
|
|
|
def build_report(results: dict) -> dict:
|
|
"""
|
|
Transform raw check() output into a standardized validation report.
|
|
|
|
Usage:
|
|
python3 formula_check.py <input.xlsx> --report # JSON report to stdout
|
|
python3 formula_check.py <input.xlsx> --report -o out # JSON report to file
|
|
"""
|
|
from collections import Counter
|
|
|
|
errors = results.get("errors", [])
|
|
error_types = [e.get("error", e.get("type", "unknown")) for e in errors]
|
|
|
|
return {
|
|
"status": "success" if results["error_count"] == 0 else "errors_found",
|
|
"file": results["file"],
|
|
"sheets_checked": results["sheets_checked"],
|
|
"total_formulas": results["formula_count"],
|
|
"total_errors": results["error_count"],
|
|
"shared_formula_ranges": results.get("shared_formula_ranges", 0),
|
|
"errors_by_type": dict(Counter(error_types)) if errors else {},
|
|
"errors": errors,
|
|
}
|
|
|
|
|
|
def main() -> None:
|
|
use_json = "--json" in sys.argv
|
|
use_report = "--report" in sys.argv
|
|
summary_only = "--summary" in sys.argv
|
|
output_file = None
|
|
sheet_filter = None
|
|
args_clean = []
|
|
|
|
i = 1
|
|
while i < len(sys.argv):
|
|
arg = sys.argv[i]
|
|
if arg == "--sheet" and i + 1 < len(sys.argv):
|
|
sheet_filter = sys.argv[i + 1]
|
|
i += 2
|
|
elif arg == "-o" and i + 1 < len(sys.argv):
|
|
output_file = sys.argv[i + 1]
|
|
i += 2
|
|
elif arg.startswith("--"):
|
|
i += 1 # skip flags already handled
|
|
else:
|
|
args_clean.append(arg)
|
|
i += 1
|
|
|
|
if not args_clean:
|
|
print("Usage: formula_check.py <input.xlsx> [--json] [--report [-o FILE]] [--sheet NAME] [--summary]")
|
|
sys.exit(1)
|
|
|
|
results = check(args_clean[0], sheet_filter=sheet_filter)
|
|
|
|
if use_report:
|
|
report = build_report(results)
|
|
output = json.dumps(report, indent=2, ensure_ascii=False)
|
|
if output_file:
|
|
with open(output_file, "w", encoding="utf-8") as f:
|
|
f.write(output + "\n")
|
|
else:
|
|
print(output)
|
|
sys.exit(1 if results["error_count"] > 0 else 0)
|
|
|
|
if use_json:
|
|
print(json.dumps(results, indent=2, ensure_ascii=False))
|
|
sys.exit(1 if results["error_count"] > 0 else 0)
|
|
|
|
# Human-readable output
|
|
sheets = ", ".join(results["sheets_checked"]) or "(none)"
|
|
if sheet_filter:
|
|
sheets = f"{sheet_filter} (filtered)"
|
|
|
|
print(f"File : {results['file']}")
|
|
print(f"Sheets : {sheets}")
|
|
print(f"Formulas checked : {results['formula_count']} distinct formula cells")
|
|
print(f"Shared formula ranges : {results['shared_formula_ranges']} ranges")
|
|
print(f"Errors found : {results['error_count']}")
|
|
|
|
if not summary_only and results["errors"]:
|
|
print("\n── Error Details ──")
|
|
for e in results["errors"]:
|
|
if e["type"] == "error_value":
|
|
formula_hint = f" (formula: {e['formula']})" if e.get("formula") else ""
|
|
print(f" [FAIL] [{e['sheet']}!{e['cell']}] contains {e['error']}{formula_hint}")
|
|
elif e["type"] == "broken_sheet_ref":
|
|
print(
|
|
f" [FAIL] [{e['sheet']}!{e['cell']}] references missing sheet "
|
|
f"'{e['missing_sheet']}'"
|
|
)
|
|
print(f" Formula: {e['formula']}")
|
|
print(f" Valid sheets: {e.get('valid_sheets', [])}")
|
|
elif e["type"] == "unknown_name_ref":
|
|
print(
|
|
f" [WARN] [{e['sheet']}!{e['cell']}] uses unknown name "
|
|
f"'{e['unknown_name']}' (heuristic — verify manually)"
|
|
)
|
|
print(f" Formula: {e['formula']}")
|
|
print(f" Defined names: {e.get('defined_names', [])}")
|
|
elif e["type"] == "malformed_error_cell":
|
|
print(f" [FAIL] [{e['sheet']}!{e['cell']}] malformed error cell: {e['detail']}")
|
|
elif e["type"] == "file_error":
|
|
print(f" [FAIL] File error: {e['message']}")
|
|
print()
|
|
|
|
if results["error_count"] == 0:
|
|
print("PASS — No formula errors detected")
|
|
else:
|
|
# Separate definitive failures from heuristic warnings
|
|
hard_errors = [e for e in results["errors"] if e["type"] != "unknown_name_ref"]
|
|
warnings = [e for e in results["errors"] if e["type"] == "unknown_name_ref"]
|
|
if hard_errors:
|
|
print(f"FAIL — {len(hard_errors)} error(s) must be fixed before delivery")
|
|
if warnings:
|
|
print(f"WARN — {len(warnings)} heuristic warning(s) require manual review")
|
|
sys.exit(1)
|
|
else:
|
|
# Only heuristic warnings — do not block delivery but alert
|
|
print(f"PASS with WARN — {len(warnings)} heuristic warning(s) require manual review")
|
|
# Exit 0: heuristic warnings alone do not block delivery
|
|
sys.exit(0)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|