Initial commit: add all skills files
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
422
minimax-xlsx/scripts/formula_check.py
Normal file
422
minimax-xlsx/scripts/formula_check.py
Normal file
@@ -0,0 +1,422 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: MIT
|
||||
"""
|
||||
formula_check.py — Static formula validator for xlsx files.
|
||||
|
||||
Usage:
|
||||
python3 formula_check.py <input.xlsx>
|
||||
python3 formula_check.py <input.xlsx> --json # machine-readable output
|
||||
python3 formula_check.py <input.xlsx> --report # standardized validation report (JSON)
|
||||
python3 formula_check.py <input.xlsx> --report -o out # report to file
|
||||
python3 formula_check.py <input.xlsx> --sheet Sales # limit to one sheet
|
||||
python3 formula_check.py <input.xlsx> --summary # error counts only, no details
|
||||
|
||||
What it checks:
|
||||
1. Error-value cells: <c t="e"><v>#REF!</v></c> — all 7 Excel error types
|
||||
2. Broken cross-sheet references: formula references a sheet not in workbook.xml
|
||||
3. Broken named-range references: formula references a name not in workbook.xml <definedNames>
|
||||
4. Shared formula integrity: shared formula primary cell exists and has formula text
|
||||
5. Missing <v> on t="e" cells (malformed XML)
|
||||
|
||||
Checks NOT performed (require dynamic recalculation):
|
||||
- Runtime errors that only appear after formulas execute (#DIV/0! on empty denominator, etc.)
|
||||
-> Use libreoffice_recalc.py + re-run formula_check.py for dynamic validation
|
||||
|
||||
Exit code:
|
||||
0 — no errors found
|
||||
1 — errors detected (or file cannot be opened)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import zipfile
|
||||
import xml.etree.ElementTree as ET
|
||||
import re
|
||||
import json
|
||||
|
||||
# OOXML SpreadsheetML namespace
|
||||
NS = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
|
||||
NSP = f"{{{NS}}}"
|
||||
|
||||
# All 7 standard Excel formula error types
|
||||
EXCEL_ERRORS = {"#REF!", "#DIV/0!", "#VALUE!", "#NAME?", "#NULL!", "#NUM!", "#N/A"}
|
||||
|
||||
# Excel built-in function names (subset of common ones) — used for #NAME? heuristic
|
||||
# Full list: https://support.microsoft.com/en-us/office/excel-functions-alphabetical
|
||||
_BUILTIN_FUNCTIONS = {
|
||||
"ABS", "AND", "AVERAGE", "AVERAGEIF", "AVERAGEIFS", "CEILING", "CHOOSE",
|
||||
"COUNTA", "COUNTIF", "COUNTIFS", "COUNT", "DATE", "EDATE", "EOMONTH",
|
||||
"FALSE", "FILTER", "FIND", "FLOOR", "IF", "IFERROR", "IFNA", "IFS",
|
||||
"INDEX", "INDIRECT", "INT", "IRR", "ISBLANK", "ISERROR", "ISNA", "ISNUMBER",
|
||||
"LARGE", "LEFT", "LEN", "LOOKUP", "LOWER", "MATCH", "MAX", "MID", "MIN",
|
||||
"MOD", "MONTH", "NETWORKDAYS", "NOT", "NOW", "NPV", "OFFSET", "OR",
|
||||
"PMT", "PV", "RAND", "RANK", "RIGHT", "ROUND", "ROUNDDOWN", "ROUNDUP",
|
||||
"ROW", "ROWS", "SEARCH", "SMALL", "SORT", "SQRT", "SUBSTITUTE", "SUM",
|
||||
"SUMIF", "SUMIFS", "SUMPRODUCT", "TEXT", "TODAY", "TRANSPOSE", "TRIM",
|
||||
"TRUE", "UNIQUE", "UPPER", "VALUE", "VLOOKUP", "HLOOKUP", "XLOOKUP",
|
||||
"XMATCH", "XNPV", "XIRR", "YEAR", "YEARFRAC",
|
||||
}
|
||||
|
||||
|
||||
def get_sheet_names(z: zipfile.ZipFile) -> dict[str, str]:
|
||||
"""Return dict of {r:id -> sheet_name} from workbook.xml."""
|
||||
wb_xml = z.read("xl/workbook.xml")
|
||||
wb = ET.fromstring(wb_xml)
|
||||
rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
||||
sheets = {}
|
||||
for sheet in wb.findall(f".//{NSP}sheet"):
|
||||
name = sheet.get("name", "")
|
||||
rid = sheet.get(f"{{{rel_ns}}}id", "")
|
||||
sheets[rid] = name
|
||||
return sheets
|
||||
|
||||
|
||||
def get_defined_names(z: zipfile.ZipFile) -> set[str]:
|
||||
"""Return set of named ranges defined in workbook.xml <definedNames>."""
|
||||
wb_xml = z.read("xl/workbook.xml")
|
||||
wb = ET.fromstring(wb_xml)
|
||||
names = set()
|
||||
for dn in wb.findall(f".//{NSP}definedName"):
|
||||
n = dn.get("name", "")
|
||||
if n:
|
||||
names.add(n)
|
||||
return names
|
||||
|
||||
|
||||
def get_sheet_files(z: zipfile.ZipFile) -> dict[str, str]:
|
||||
"""Return dict of {r:id -> xl/worksheets/sheetN.xml} from workbook.xml.rels."""
|
||||
rels_xml = z.read("xl/_rels/workbook.xml.rels")
|
||||
rels = ET.fromstring(rels_xml)
|
||||
mapping = {}
|
||||
for rel in rels:
|
||||
rid = rel.get("Id", "")
|
||||
target = rel.get("Target", "")
|
||||
if "worksheets" in target:
|
||||
# Target may be relative: "worksheets/sheet1.xml" -> "xl/worksheets/sheet1.xml"
|
||||
if not target.startswith("xl/"):
|
||||
target = "xl/" + target
|
||||
mapping[rid] = target
|
||||
return mapping
|
||||
|
||||
|
||||
def extract_sheet_refs(formula: str) -> list[str]:
|
||||
"""
|
||||
Extract all sheet names referenced in a formula string.
|
||||
|
||||
Handles:
|
||||
- 'Sheet Name'!A1 (quoted, may contain spaces)
|
||||
- SheetName!A1 (unquoted, no spaces)
|
||||
|
||||
Returns a list of sheet name strings (may contain duplicates if the same
|
||||
sheet is referenced multiple times in one formula).
|
||||
"""
|
||||
refs = []
|
||||
# Quoted sheet names: 'Sheet Name'!
|
||||
for m in re.finditer(r"'([^']+)'!", formula):
|
||||
refs.append(m.group(1))
|
||||
# Unquoted sheet names: SheetName! (not preceded by a single quote)
|
||||
for m in re.finditer(r"(?<!')([A-Za-z_\u4e00-\u9fff][A-Za-z0-9_.·\u4e00-\u9fff]*)!", formula):
|
||||
refs.append(m.group(1))
|
||||
return refs
|
||||
|
||||
|
||||
def extract_name_refs(formula: str) -> list[str]:
|
||||
"""
|
||||
Extract identifiers in a formula that could be named range references.
|
||||
|
||||
Heuristic: identifiers that:
|
||||
- Are not preceded by a sheet reference (no "!" before them)
|
||||
- Are not followed by "(" (which would make them function calls)
|
||||
- Match the pattern of a name (letters/underscore start, alphanumeric/underscore body)
|
||||
- Are not single-letter column references or row references
|
||||
|
||||
This is approximate. False positives are possible; false negatives are rare.
|
||||
"""
|
||||
names = []
|
||||
# Remove quoted sheet references first to avoid false matches
|
||||
formula_clean = re.sub(r"'[^']*'![A-Z$0-9:]+", "", formula)
|
||||
formula_clean = re.sub(r"[A-Za-z_][A-Za-z0-9_.]*![A-Z$0-9:]+", "", formula_clean)
|
||||
# Find identifiers not followed by "(" (not function calls)
|
||||
for m in re.finditer(r"\b([A-Za-z_][A-Za-z0-9_]{2,})\b(?!\s*\()", formula_clean):
|
||||
candidate = m.group(1)
|
||||
# Exclude Excel cell references like A1, B10, AA100
|
||||
if re.fullmatch(r"[A-Z]{1,3}[0-9]+", candidate):
|
||||
continue
|
||||
# Exclude built-in function names (they appear without parens sometimes in array formulas)
|
||||
if candidate.upper() in _BUILTIN_FUNCTIONS:
|
||||
continue
|
||||
names.append(candidate)
|
||||
return names
|
||||
|
||||
|
||||
def check(xlsx_path: str, sheet_filter: str | None = None) -> dict:
|
||||
"""
|
||||
Run all static checks on the given xlsx file.
|
||||
|
||||
Args:
|
||||
xlsx_path: path to the .xlsx file
|
||||
sheet_filter: if provided, only check the sheet with this name
|
||||
|
||||
Returns:
|
||||
A dict with keys:
|
||||
file, sheets_checked, formula_count, shared_formula_ranges,
|
||||
error_count, errors
|
||||
"""
|
||||
results = {
|
||||
"file": xlsx_path,
|
||||
"sheets_checked": [],
|
||||
"formula_count": 0,
|
||||
"shared_formula_ranges": 0, # number of shared formula definitions
|
||||
"error_count": 0,
|
||||
"errors": [],
|
||||
}
|
||||
|
||||
try:
|
||||
z = zipfile.ZipFile(xlsx_path, "r")
|
||||
except (zipfile.BadZipFile, FileNotFoundError) as e:
|
||||
results["errors"].append({"type": "file_error", "message": str(e)})
|
||||
results["error_count"] = 1
|
||||
return results
|
||||
|
||||
with z:
|
||||
sheet_names = get_sheet_names(z)
|
||||
sheet_files = get_sheet_files(z)
|
||||
valid_sheet_names = set(sheet_names.values())
|
||||
defined_names = get_defined_names(z)
|
||||
|
||||
for rid, sheet_name in sheet_names.items():
|
||||
# Apply sheet filter if requested
|
||||
if sheet_filter and sheet_name != sheet_filter:
|
||||
continue
|
||||
|
||||
ws_file = sheet_files.get(rid)
|
||||
if not ws_file or ws_file not in z.namelist():
|
||||
continue
|
||||
|
||||
results["sheets_checked"].append(sheet_name)
|
||||
ws_xml = z.read(ws_file)
|
||||
ws = ET.fromstring(ws_xml)
|
||||
|
||||
# Track shared formula IDs seen on this sheet (si -> primary cell ref)
|
||||
shared_primary: dict[str, str] = {}
|
||||
|
||||
for cell in ws.findall(f".//{NSP}c"):
|
||||
cell_ref = cell.get("r", "?")
|
||||
cell_type = cell.get("t", "n")
|
||||
|
||||
# ── Check 1: error-value cell ──────────────────────────────
|
||||
if cell_type == "e":
|
||||
v_elem = cell.find(f"{NSP}v")
|
||||
if v_elem is None:
|
||||
# Malformed: t="e" but no <v> — record as structural issue
|
||||
results["errors"].append(
|
||||
{
|
||||
"type": "malformed_error_cell",
|
||||
"sheet": sheet_name,
|
||||
"cell": cell_ref,
|
||||
"detail": "Cell has t='e' but no <v> child element",
|
||||
}
|
||||
)
|
||||
results["error_count"] += 1
|
||||
else:
|
||||
error_val = v_elem.text or "#UNKNOWN"
|
||||
f_elem = cell.find(f"{NSP}f")
|
||||
results["errors"].append(
|
||||
{
|
||||
"type": "error_value",
|
||||
"error": error_val,
|
||||
"sheet": sheet_name,
|
||||
"cell": cell_ref,
|
||||
# Include formula text if present
|
||||
"formula": f_elem.text if (f_elem is not None and f_elem.text) else None,
|
||||
}
|
||||
)
|
||||
results["error_count"] += 1
|
||||
|
||||
# ── Check 2 & 3: formulas ──────────────────────────────────
|
||||
f_elem = cell.find(f"{NSP}f")
|
||||
if f_elem is None:
|
||||
continue
|
||||
|
||||
f_type = f_elem.get("t", "") # "shared", "array", or "" for normal
|
||||
f_si = f_elem.get("si") # shared formula group ID
|
||||
|
||||
# Count formulas:
|
||||
# - Normal formulas: always count
|
||||
# - Shared formula PRIMARY (has text + ref attribute): count once
|
||||
# - Shared formula CONSUMER (si only, no text): do NOT count separately
|
||||
# (they are covered by the primary's ref range)
|
||||
if f_type == "shared" and f_elem.text is None:
|
||||
# Consumer cell: skip formula counting and cross-ref checks
|
||||
# (the primary cell already covers this formula)
|
||||
continue
|
||||
|
||||
formula = f_elem.text or ""
|
||||
|
||||
if f_type == "shared" and f_elem.get("ref"):
|
||||
results["shared_formula_ranges"] += 1
|
||||
if f_si is not None:
|
||||
shared_primary[f_si] = cell_ref
|
||||
|
||||
if formula:
|
||||
results["formula_count"] += 1
|
||||
|
||||
# Check 2: cross-sheet references
|
||||
for ref_sheet in extract_sheet_refs(formula):
|
||||
if ref_sheet not in valid_sheet_names:
|
||||
results["errors"].append(
|
||||
{
|
||||
"type": "broken_sheet_ref",
|
||||
"sheet": sheet_name,
|
||||
"cell": cell_ref,
|
||||
"formula": formula,
|
||||
"missing_sheet": ref_sheet,
|
||||
"valid_sheets": sorted(valid_sheet_names),
|
||||
}
|
||||
)
|
||||
results["error_count"] += 1
|
||||
|
||||
# Check 3: named range references
|
||||
# Only flag if the name is not a built-in and not a sheet-prefixed ref
|
||||
for name_ref in extract_name_refs(formula):
|
||||
if name_ref not in defined_names:
|
||||
results["errors"].append(
|
||||
{
|
||||
"type": "unknown_name_ref",
|
||||
"sheet": sheet_name,
|
||||
"cell": cell_ref,
|
||||
"formula": formula,
|
||||
"unknown_name": name_ref,
|
||||
"defined_names": sorted(defined_names),
|
||||
"note": "Heuristic check — verify manually if this is a false positive",
|
||||
}
|
||||
)
|
||||
results["error_count"] += 1
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def build_report(results: dict) -> dict:
|
||||
"""
|
||||
Transform raw check() output into a standardized validation report.
|
||||
|
||||
Usage:
|
||||
python3 formula_check.py <input.xlsx> --report # JSON report to stdout
|
||||
python3 formula_check.py <input.xlsx> --report -o out # JSON report to file
|
||||
"""
|
||||
from collections import Counter
|
||||
|
||||
errors = results.get("errors", [])
|
||||
error_types = [e.get("error", e.get("type", "unknown")) for e in errors]
|
||||
|
||||
return {
|
||||
"status": "success" if results["error_count"] == 0 else "errors_found",
|
||||
"file": results["file"],
|
||||
"sheets_checked": results["sheets_checked"],
|
||||
"total_formulas": results["formula_count"],
|
||||
"total_errors": results["error_count"],
|
||||
"shared_formula_ranges": results.get("shared_formula_ranges", 0),
|
||||
"errors_by_type": dict(Counter(error_types)) if errors else {},
|
||||
"errors": errors,
|
||||
}
|
||||
|
||||
|
||||
def main() -> None:
|
||||
use_json = "--json" in sys.argv
|
||||
use_report = "--report" in sys.argv
|
||||
summary_only = "--summary" in sys.argv
|
||||
output_file = None
|
||||
sheet_filter = None
|
||||
args_clean = []
|
||||
|
||||
i = 1
|
||||
while i < len(sys.argv):
|
||||
arg = sys.argv[i]
|
||||
if arg == "--sheet" and i + 1 < len(sys.argv):
|
||||
sheet_filter = sys.argv[i + 1]
|
||||
i += 2
|
||||
elif arg == "-o" and i + 1 < len(sys.argv):
|
||||
output_file = sys.argv[i + 1]
|
||||
i += 2
|
||||
elif arg.startswith("--"):
|
||||
i += 1 # skip flags already handled
|
||||
else:
|
||||
args_clean.append(arg)
|
||||
i += 1
|
||||
|
||||
if not args_clean:
|
||||
print("Usage: formula_check.py <input.xlsx> [--json] [--report [-o FILE]] [--sheet NAME] [--summary]")
|
||||
sys.exit(1)
|
||||
|
||||
results = check(args_clean[0], sheet_filter=sheet_filter)
|
||||
|
||||
if use_report:
|
||||
report = build_report(results)
|
||||
output = json.dumps(report, indent=2, ensure_ascii=False)
|
||||
if output_file:
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
f.write(output + "\n")
|
||||
else:
|
||||
print(output)
|
||||
sys.exit(1 if results["error_count"] > 0 else 0)
|
||||
|
||||
if use_json:
|
||||
print(json.dumps(results, indent=2, ensure_ascii=False))
|
||||
sys.exit(1 if results["error_count"] > 0 else 0)
|
||||
|
||||
# Human-readable output
|
||||
sheets = ", ".join(results["sheets_checked"]) or "(none)"
|
||||
if sheet_filter:
|
||||
sheets = f"{sheet_filter} (filtered)"
|
||||
|
||||
print(f"File : {results['file']}")
|
||||
print(f"Sheets : {sheets}")
|
||||
print(f"Formulas checked : {results['formula_count']} distinct formula cells")
|
||||
print(f"Shared formula ranges : {results['shared_formula_ranges']} ranges")
|
||||
print(f"Errors found : {results['error_count']}")
|
||||
|
||||
if not summary_only and results["errors"]:
|
||||
print("\n── Error Details ──")
|
||||
for e in results["errors"]:
|
||||
if e["type"] == "error_value":
|
||||
formula_hint = f" (formula: {e['formula']})" if e.get("formula") else ""
|
||||
print(f" [FAIL] [{e['sheet']}!{e['cell']}] contains {e['error']}{formula_hint}")
|
||||
elif e["type"] == "broken_sheet_ref":
|
||||
print(
|
||||
f" [FAIL] [{e['sheet']}!{e['cell']}] references missing sheet "
|
||||
f"'{e['missing_sheet']}'"
|
||||
)
|
||||
print(f" Formula: {e['formula']}")
|
||||
print(f" Valid sheets: {e.get('valid_sheets', [])}")
|
||||
elif e["type"] == "unknown_name_ref":
|
||||
print(
|
||||
f" [WARN] [{e['sheet']}!{e['cell']}] uses unknown name "
|
||||
f"'{e['unknown_name']}' (heuristic — verify manually)"
|
||||
)
|
||||
print(f" Formula: {e['formula']}")
|
||||
print(f" Defined names: {e.get('defined_names', [])}")
|
||||
elif e["type"] == "malformed_error_cell":
|
||||
print(f" [FAIL] [{e['sheet']}!{e['cell']}] malformed error cell: {e['detail']}")
|
||||
elif e["type"] == "file_error":
|
||||
print(f" [FAIL] File error: {e['message']}")
|
||||
print()
|
||||
|
||||
if results["error_count"] == 0:
|
||||
print("PASS — No formula errors detected")
|
||||
else:
|
||||
# Separate definitive failures from heuristic warnings
|
||||
hard_errors = [e for e in results["errors"] if e["type"] != "unknown_name_ref"]
|
||||
warnings = [e for e in results["errors"] if e["type"] == "unknown_name_ref"]
|
||||
if hard_errors:
|
||||
print(f"FAIL — {len(hard_errors)} error(s) must be fixed before delivery")
|
||||
if warnings:
|
||||
print(f"WARN — {len(warnings)} heuristic warning(s) require manual review")
|
||||
sys.exit(1)
|
||||
else:
|
||||
# Only heuristic warnings — do not block delivery but alert
|
||||
print(f"PASS with WARN — {len(warnings)} heuristic warning(s) require manual review")
|
||||
# Exit 0: heuristic warnings alone do not block delivery
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user