Initial commit: add all skills files

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-10 16:52:49 +08:00
commit 6487becf60
396 changed files with 108871 additions and 0 deletions

View File

@@ -0,0 +1,422 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
"""
formula_check.py — Static formula validator for xlsx files.
Usage:
python3 formula_check.py <input.xlsx>
python3 formula_check.py <input.xlsx> --json # machine-readable output
python3 formula_check.py <input.xlsx> --report # standardized validation report (JSON)
python3 formula_check.py <input.xlsx> --report -o out # report to file
python3 formula_check.py <input.xlsx> --sheet Sales # limit to one sheet
python3 formula_check.py <input.xlsx> --summary # error counts only, no details
What it checks:
1. Error-value cells: <c t="e"><v>#REF!</v></c> — all 7 Excel error types
2. Broken cross-sheet references: formula references a sheet not in workbook.xml
3. Broken named-range references: formula references a name not in workbook.xml <definedNames>
4. Shared formula integrity: shared formula primary cell exists and has formula text
5. Missing <v> on t="e" cells (malformed XML)
Checks NOT performed (require dynamic recalculation):
- Runtime errors that only appear after formulas execute (#DIV/0! on empty denominator, etc.)
-> Use libreoffice_recalc.py + re-run formula_check.py for dynamic validation
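Exit code:
0 — no errors found (in human-readable mode, also returned when the only findings are heuristic unknown-name warnings)
1 — errors detected (or file cannot be opened); with --json / --report any finding, including heuristic warnings, yields 1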
"""
import sys
import zipfile
import xml.etree.ElementTree as ET
import re
import json
# OOXML SpreadsheetML main namespace URI (workbook.xml and worksheet parts use it).
NS = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
# The same URI in ElementTree's "{uri}" form, so tags can be written as f"{NSP}tag".
NSP = f"{{{NS}}}"
# All 7 standard Excel formula error types.
# NOTE(review): not referenced by any function visible in this file — confirm
# whether other code relies on it before removing.
EXCEL_ERRORS = {"#REF!", "#DIV/0!", "#VALUE!", "#NAME?", "#NULL!", "#NUM!", "#N/A"}
# Excel built-in function names (subset of common ones). extract_name_refs()
# compares candidates against this set (upper-cased) so that a function name
# appearing without parentheses is not reported as an unknown named range.
# Full list: https://support.microsoft.com/en-us/office/excel-functions-alphabetical
_BUILTIN_FUNCTIONS = {
    "ABS", "AND", "AVERAGE", "AVERAGEIF", "AVERAGEIFS", "CEILING", "CHOOSE",
    "COUNTA", "COUNTIF", "COUNTIFS", "COUNT", "DATE", "EDATE", "EOMONTH",
    "FALSE", "FILTER", "FIND", "FLOOR", "IF", "IFERROR", "IFNA", "IFS",
    "INDEX", "INDIRECT", "INT", "IRR", "ISBLANK", "ISERROR", "ISNA", "ISNUMBER",
    "LARGE", "LEFT", "LEN", "LOOKUP", "LOWER", "MATCH", "MAX", "MID", "MIN",
    "MOD", "MONTH", "NETWORKDAYS", "NOT", "NOW", "NPV", "OFFSET", "OR",
    "PMT", "PV", "RAND", "RANK", "RIGHT", "ROUND", "ROUNDDOWN", "ROUNDUP",
    "ROW", "ROWS", "SEARCH", "SMALL", "SORT", "SQRT", "SUBSTITUTE", "SUM",
    "SUMIF", "SUMIFS", "SUMPRODUCT", "TEXT", "TODAY", "TRANSPOSE", "TRIM",
    "TRUE", "UNIQUE", "UPPER", "VALUE", "VLOOKUP", "HLOOKUP", "XLOOKUP",
    "XMATCH", "XNPV", "XIRR", "YEAR", "YEARFRAC",
}
def get_sheet_names(z: zipfile.ZipFile) -> dict[str, str]:
    """Map every sheet's relationship id to its display name.

    Reads ``xl/workbook.xml`` from the open archive and collects each
    ``<sheet>`` element's ``r:id`` and ``name`` attributes.

    Args:
        z: an open xlsx archive.

    Returns:
        ``{r:id -> sheet name}``. A missing attribute is read as ``""``; if
        two sheets share an id, the later one wins.
    """
    rel_ns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
    # Qualified attribute name for r:id, built once instead of per element.
    rid_attr = f"{{{rel_ns}}}id"
    workbook = ET.fromstring(z.read("xl/workbook.xml"))
    return {
        node.get(rid_attr, ""): node.get("name", "")
        for node in workbook.findall(f".//{NSP}sheet")
    }
def get_defined_names(z: zipfile.ZipFile) -> set[str]:
    """Collect the names declared by ``<definedName>`` elements in workbook.xml.

    Args:
        z: an open xlsx archive.

    Returns:
        The set of non-empty ``name`` attribute values found anywhere in
        ``xl/workbook.xml``.
    """
    workbook = ET.fromstring(z.read("xl/workbook.xml"))
    declared = (
        node.get("name", "") for node in workbook.findall(f".//{NSP}definedName")
    )
    # Elements without a usable name attribute contribute nothing.
    return {label for label in declared if label}
def get_sheet_files(z: zipfile.ZipFile) -> dict[str, str]:
    """Return dict of {r:id -> archive member path} for worksheet relationships.

    Reads ``xl/_rels/workbook.xml.rels`` and keeps every relationship whose
    ``Target`` contains ``"worksheets"``. Targets are normalised to the member
    name used inside the zip archive:

    - ``"worksheets/sheet1.xml"``      -> ``"xl/worksheets/sheet1.xml"``
      (relative to the workbook part)
    - ``"/xl/worksheets/sheet1.xml"``  -> ``"xl/worksheets/sheet1.xml"``
      (absolute part name; zip member names carry no leading slash)
    - ``"xl/worksheets/sheet1.xml"``   -> kept unchanged

    Args:
        z: an open xlsx archive.

    Returns:
        Mapping of relationship id to the worksheet's path inside the archive.
    """
    rels_root = ET.fromstring(z.read("xl/_rels/workbook.xml.rels"))
    mapping: dict[str, str] = {}
    for rel in rels_root:
        target = rel.get("Target", "")
        if "worksheets" not in target:
            continue
        if target.startswith("/"):
            # Absolute target: already rooted at the package, so only the
            # leading slash has to go. Prefixing "xl/" here would produce
            # "xl//xl/..." which matches no archive member.
            target = target.lstrip("/")
        elif not target.startswith("xl/"):
            # Relative target: resolve against the workbook part's folder.
            target = "xl/" + target
        mapping[rel.get("Id", "")] = target
    return mapping
# One left-to-right tokeniser for the three constructs that matter here.
# Alternatives are tried at each position in this order:
#   1. a double-quoted string literal ("" is an escaped quote) — consumed and ignored
#   2. a quoted sheet prefix  'Sheet Name'!   ('' is an escaped apostrophe) -> group 1
#   3. an unquoted sheet prefix  SheetName!                                 -> group 2
_SHEET_REF_TOKEN_RE = re.compile(
    r'"(?:[^"]|"")*"'
    r"|'((?:[^']|'')+)'!"
    r"|(?<!')([A-Za-z_\u4e00-\u9fff][A-Za-z0-9_.·\u4e00-\u9fff]*)!"
)


def extract_sheet_refs(formula: str) -> list[str]:
    """
    Extract all sheet names referenced in a formula string.

    Handles:
    - 'Sheet Name'!A1 (quoted, may contain spaces; a doubled apostrophe
      inside the quotes is unescaped, so 'Bob''s Sheet'! yields "Bob's Sheet")
    - SheetName!A1 (unquoted, no spaces)

    Text inside double-quoted string literals is skipped, so a formula such
    as IF(A1>0,"Done!","") does not report "Done" as a sheet.

    Args:
        formula: formula text as stored in the <f> element.

    Returns:
        A list of sheet name strings: all quoted references first, then all
        unquoted ones, each group in order of appearance. May contain
        duplicates if the same sheet is referenced multiple times.
    """
    quoted: list[str] = []
    unquoted: list[str] = []
    for token in _SHEET_REF_TOKEN_RE.finditer(formula):
        if token.group(1) is not None:
            quoted.append(token.group(1).replace("''", "'"))
        elif token.group(2) is not None:
            unquoted.append(token.group(2))
        # Otherwise the token was a string literal: nothing to report.
    return quoted + unquoted
# Spans of a formula that can never be a workbook-level name reference.
# Alternatives are tried at each position in this order:
#   1. double-quoted string literal ("" is an escaped quote)
#   2. quoted sheet prefix plus the operand that follows it ('Sheet'!A1, 'Sheet'!MyName)
#   3. unquoted sheet prefix plus the operand that follows it (Sheet1!A1:B2)
#   4. error constants (#REF!, #N/A, ...)
#   5. LAMBDA/LET parameter placeholders stored as _xlpm.<name>
#   6. storage prefixes of newer functions (_xlfn., _xlws., ...), leaving the
#      function name itself in place
_NON_NAME_TOKEN_RE = re.compile(
    r'"(?:[^"]|"")*"'
    r"|'(?:[^']|'')*'![A-Za-z0-9_.$:]*"
    r"|[A-Za-z_\u4e00-\u9fff][A-Za-z0-9_.·\u4e00-\u9fff]*![A-Za-z0-9_.$:]*"
    r"|#(?:REF!|DIV/0!|VALUE!|NAME\?|NULL!|NUM!|N/A)"
    r"|\b_xlpm\.[A-Za-z_][A-Za-z0-9_.]*"
    r"|\b_xl[a-z]+\."
)


def extract_name_refs(formula: str) -> list[str]:
    """
    Extract identifiers in a formula that could be named range references.

    Heuristic: identifiers that:
    - Are not inside a string literal
    - Are not preceded by a sheet reference (no "!" before them)
    - Are not followed by "(" (which would make them function calls)
    - Match the pattern of a name (letter/underscore start, alphanumeric/
      underscore body, at least three characters)
    - Are not cell references such as A1, B10, AA100
    - Are not error constants or _xlfn./_xlws./_xlpm. storage prefixes
    - Are not in _BUILTIN_FUNCTIONS (compared upper-cased)

    This is approximate. False positives are possible (for example
    structured table references); false negatives are rare.

    Args:
        formula: formula text as stored in the <f> element.

    Returns:
        Candidate names in order of appearance; duplicates are kept.
    """
    # Replace with a space (not "") so the neighbours of a removed span are
    # never glued together into a new identifier.
    formula_clean = _NON_NAME_TOKEN_RE.sub(" ", formula)
    names = []
    # Find identifiers not followed by "(" (not function calls)
    for m in re.finditer(r"\b([A-Za-z_][A-Za-z0-9_]{2,})\b(?!\s*\()", formula_clean):
        candidate = m.group(1)
        # Exclude Excel cell references like A1, B10, AA100
        if re.fullmatch(r"[A-Z]{1,3}[0-9]+", candidate):
            continue
        # Exclude built-in function names (they can appear without parens,
        # e.g. TRUE / FALSE)
        if candidate.upper() in _BUILTIN_FUNCTIONS:
            continue
        names.append(candidate)
    return names
def check(xlsx_path: str, sheet_filter: str | None = None) -> dict:
    """
    Run all static checks on the given xlsx file.

    Checks performed per sheet:
      1. cells typed as error (t="e"), including ones missing their <v> child
      2. formulas referencing a sheet that is not declared in workbook.xml
      3. formulas using an identifier that is not a defined name (heuristic)
      4. shared-formula groups whose cells carry no formula text and for
         which no cell in the sheet supplies the formula

    Sheet names and defined names are compared case-insensitively, which is
    how Excel resolves them.

    Args:
        xlsx_path: path to the .xlsx file
        sheet_filter: if provided, only check the sheet with this exact name

    Returns:
        A dict with keys:
            file, sheets_checked, formula_count, shared_formula_ranges,
            error_count, errors
        Each entry of ``errors`` has a ``type`` of ``file_error``,
        ``malformed_error_cell``, ``error_value``, ``broken_sheet_ref``,
        ``unknown_name_ref`` or ``broken_shared_formula``. Problems opening
        or parsing the archive are reported as ``file_error`` entries, not
        raised.
    """
    results = {
        "file": xlsx_path,
        "sheets_checked": [],
        "formula_count": 0,
        "shared_formula_ranges": 0,  # number of shared formula definitions
        "error_count": 0,
        "errors": [],
    }

    def add_error(entry: dict) -> None:
        # Keep the list and the counter in step.
        results["errors"].append(entry)
        results["error_count"] += 1

    try:
        z = zipfile.ZipFile(xlsx_path, "r")
    except (zipfile.BadZipFile, OSError) as e:
        # OSError also covers FileNotFoundError, PermissionError and
        # IsADirectoryError.
        add_error({"type": "file_error", "message": str(e)})
        return results

    with z:
        try:
            sheet_names = get_sheet_names(z)
            sheet_files = get_sheet_files(z)
            defined_names = get_defined_names(z)
        except (KeyError, ET.ParseError) as e:
            # KeyError: a required part is missing from the archive;
            # ParseError: the part exists but is not well-formed XML.
            add_error({"type": "file_error", "message": f"Not a readable xlsx workbook: {e}"})
            return results

        valid_sheet_names = set(sheet_names.values())
        sorted_sheet_names = sorted(valid_sheet_names)
        sorted_defined_names = sorted(defined_names)
        # Case-folded lookup keys for case-insensitive comparison.
        sheet_keys = {name.casefold() for name in valid_sheet_names}
        name_keys = {name.casefold() for name in defined_names}
        # Built once: namelist() returns a fresh list on every call.
        archive_members = set(z.namelist())

        for rid, sheet_name in sheet_names.items():
            # Apply sheet filter if requested
            if sheet_filter and sheet_name != sheet_filter:
                continue
            ws_file = sheet_files.get(rid)
            if not ws_file or ws_file not in archive_members:
                continue
            try:
                ws = ET.fromstring(z.read(ws_file))
            except ET.ParseError as e:
                add_error({"type": "file_error", "message": f"{ws_file}: {e}"})
                continue
            results["sheets_checked"].append(sheet_name)

            # Shared formula bookkeeping for this sheet:
            #   shared_primary: si -> first cell that carries the formula text
            #   shared_consumers: si -> cells that only point at the group
            shared_primary: dict[str, str] = {}
            shared_consumers: dict[str | None, list[str]] = {}

            for cell in ws.findall(f".//{NSP}c"):
                cell_ref = cell.get("r", "?")
                cell_type = cell.get("t", "n")
                f_elem = cell.find(f"{NSP}f")

                # ── Check 1: error-value cell ──────────────────────────────
                if cell_type == "e":
                    v_elem = cell.find(f"{NSP}v")
                    if v_elem is None:
                        # Malformed: t="e" but no <v> — record as structural issue
                        add_error(
                            {
                                "type": "malformed_error_cell",
                                "sheet": sheet_name,
                                "cell": cell_ref,
                                "detail": "Cell has t='e' but no <v> child element",
                            }
                        )
                    else:
                        add_error(
                            {
                                "type": "error_value",
                                "error": v_elem.text or "#UNKNOWN",
                                "sheet": sheet_name,
                                "cell": cell_ref,
                                # Include formula text if present
                                "formula": f_elem.text if (f_elem is not None and f_elem.text) else None,
                            }
                        )

                # ── Check 2 & 3: formulas ──────────────────────────────────
                if f_elem is None:
                    continue
                f_type = f_elem.get("t", "")  # "shared", "array", or "" for normal
                f_si = f_elem.get("si")  # shared formula group ID

                # Shared formula CONSUMER (no text): not counted and not
                # scanned, because the text-bearing cell of the group covers
                # it. Remembered so Check 4 can verify that cell exists.
                if f_type == "shared" and f_elem.text is None:
                    shared_consumers.setdefault(f_si, []).append(cell_ref)
                    continue

                formula = f_elem.text or ""
                if f_type == "shared":
                    if f_elem.get("ref"):
                        results["shared_formula_ranges"] += 1
                    if f_si is not None:
                        shared_primary.setdefault(f_si, cell_ref)
                if not formula:
                    continue
                results["formula_count"] += 1

                # Check 2: cross-sheet references
                for ref_sheet in extract_sheet_refs(formula):
                    if ref_sheet.casefold() not in sheet_keys:
                        add_error(
                            {
                                "type": "broken_sheet_ref",
                                "sheet": sheet_name,
                                "cell": cell_ref,
                                "formula": formula,
                                "missing_sheet": ref_sheet,
                                "valid_sheets": list(sorted_sheet_names),
                            }
                        )

                # Check 3: named range references
                # Only flag if the name is not a built-in and not a sheet-prefixed ref
                for name_ref in extract_name_refs(formula):
                    if name_ref.casefold() not in name_keys:
                        add_error(
                            {
                                "type": "unknown_name_ref",
                                "sheet": sheet_name,
                                "cell": cell_ref,
                                "formula": formula,
                                "unknown_name": name_ref,
                                "defined_names": list(sorted_defined_names),
                                "note": "Heuristic check — verify manually if this is a false positive",
                            }
                        )

            # ── Check 4: shared formula integrity ──────────────────────────
            # Done after the whole sheet is read so cell order cannot cause a
            # false alarm. One finding per group, not one per cell.
            for si, consumer_cells in shared_consumers.items():
                if si is not None and si in shared_primary:
                    continue
                add_error(
                    {
                        "type": "broken_shared_formula",
                        "sheet": sheet_name,
                        "cell": consumer_cells[0],
                        "shared_index": si,
                        "affected_cells": len(consumer_cells),
                        "detail": (
                            "Shared formula cell has no formula text and no cell "
                            "in this sheet defines the formula for its group"
                        ),
                    }
                )
    return results
def build_report(results: dict) -> dict:
    """
    Reshape the dict returned by check() into the standardized report layout.

    Findings are tallied under their "error" value when that key is present
    (e.g. "#REF!"), otherwise under their "type" (or "unknown" if neither
    exists).

    Args:
        results: output of check(); "file", "sheets_checked",
            "formula_count" and "error_count" are required keys.

    Returns:
        Dict with keys status, file, sheets_checked, total_formulas,
        total_errors, shared_formula_ranges, errors_by_type, errors.
        status is "success" only when error_count is 0.
    """
    from collections import Counter

    findings = results.get("errors", [])
    tally = Counter()
    for finding in findings:
        if "error" in finding:
            label = finding["error"]
        else:
            label = finding.get("type", "unknown")
        tally[label] += 1

    if results["error_count"] == 0:
        status = "success"
    else:
        status = "errors_found"

    report = {"status": status}
    report["file"] = results["file"]
    report["sheets_checked"] = results["sheets_checked"]
    report["total_formulas"] = results["formula_count"]
    report["total_errors"] = results["error_count"]
    report["shared_formula_ranges"] = results.get("shared_formula_ranges", 0)
    report["errors_by_type"] = dict(tally)
    report["errors"] = findings
    return report
def main() -> None:
    """Command-line entry point.

    Parses sys.argv, runs check() on the first positional argument and emits
    the result in one of three forms:

    - ``--report``: build_report() as JSON, to stdout or to the ``-o`` file
    - ``--json``: the raw check() dict as JSON on stdout
    - otherwise: a human-readable summary (``--summary`` hides the details)

    Exit status: for ``--report`` and ``--json``, 1 whenever error_count is
    non-zero. In human-readable mode, 1 only if at least one finding is not
    a heuristic ``unknown_name_ref`` warning. Bad usage (no input file, or
    ``--sheet`` / ``-o`` without a value) prints the usage line and exits 1.
    """
    usage = "Usage: formula_check.py <input.xlsx> [--json] [--report [-o FILE]] [--sheet NAME] [--summary]"
    use_json = "--json" in sys.argv
    use_report = "--report" in sys.argv
    summary_only = "--summary" in sys.argv
    output_file = None
    sheet_filter = None
    args_clean = []
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ("--sheet", "-o"):
            if i + 1 >= len(sys.argv):
                # Previously a dangling --sheet was silently dropped and a
                # dangling -o was treated as the input file name.
                print(f"Error: {arg} requires a value")
                print(usage)
                sys.exit(1)
            if arg == "--sheet":
                sheet_filter = sys.argv[i + 1]
            else:
                output_file = sys.argv[i + 1]
            i += 2
        elif arg.startswith("--"):
            i += 1  # skip flags already handled
        else:
            args_clean.append(arg)
            i += 1
    if not args_clean:
        print(usage)
        sys.exit(1)

    results = check(args_clean[0], sheet_filter=sheet_filter)

    if use_report:
        report = build_report(results)
        output = json.dumps(report, indent=2, ensure_ascii=False)
        if output_file:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(output + "\n")
        else:
            print(output)
        sys.exit(1 if results["error_count"] > 0 else 0)

    if use_json:
        print(json.dumps(results, indent=2, ensure_ascii=False))
        sys.exit(1 if results["error_count"] > 0 else 0)

    # Human-readable output
    checked = results["sheets_checked"]
    sheets = ", ".join(checked) or "(none)"
    if sheet_filter:
        if sheet_filter in checked:
            sheets = f"{sheet_filter} (filtered)"
        else:
            # Do not claim a sheet was checked when nothing matched the filter.
            sheets = f"(none) — requested sheet '{sheet_filter}' was not checked"
    print(f"File : {results['file']}")
    print(f"Sheets : {sheets}")
    print(f"Formulas checked : {results['formula_count']} distinct formula cells")
    print(f"Shared formula ranges : {results['shared_formula_ranges']} ranges")
    print(f"Errors found : {results['error_count']}")

    if not summary_only and results["errors"]:
        print("\n── Error Details ──")
        for e in results["errors"]:
            if e["type"] == "error_value":
                formula_hint = f" (formula: {e['formula']})" if e.get("formula") else ""
                print(f" [FAIL] [{e['sheet']}!{e['cell']}] contains {e['error']}{formula_hint}")
            elif e["type"] == "broken_sheet_ref":
                print(
                    f" [FAIL] [{e['sheet']}!{e['cell']}] references missing sheet "
                    f"'{e['missing_sheet']}'"
                )
                print(f" Formula: {e['formula']}")
                print(f" Valid sheets: {e.get('valid_sheets', [])}")
            elif e["type"] == "unknown_name_ref":
                print(
                    f" [WARN] [{e['sheet']}!{e['cell']}] uses unknown name "
                    f"'{e['unknown_name']}' (heuristic — verify manually)"
                )
                print(f" Formula: {e['formula']}")
                print(f" Defined names: {e.get('defined_names', [])}")
            elif e["type"] == "malformed_error_cell":
                print(f" [FAIL] [{e['sheet']}!{e['cell']}] malformed error cell: {e['detail']}")
            elif e["type"] == "file_error":
                print(f" [FAIL] File error: {e['message']}")
            else:
                # Any other finding still counts towards the FAIL total below,
                # so it must not be invisible in the details.
                where = f"[{e['sheet']}!{e['cell']}] " if ("sheet" in e and "cell" in e) else ""
                explanation = e.get("detail") or e.get("message") or ""
                print(f" [FAIL] {where}{e['type']}: {explanation}")
        print()

    if results["error_count"] == 0:
        print("PASS — No formula errors detected")
    else:
        # Separate definitive failures from heuristic warnings
        hard_errors = [e for e in results["errors"] if e["type"] != "unknown_name_ref"]
        warnings = [e for e in results["errors"] if e["type"] == "unknown_name_ref"]
        if hard_errors:
            print(f"FAIL — {len(hard_errors)} error(s) must be fixed before delivery")
            if warnings:
                print(f"WARN — {len(warnings)} heuristic warning(s) require manual review")
            sys.exit(1)
        else:
            # Only heuristic warnings — do not block delivery but alert
            print(f"PASS with WARN — {len(warnings)} heuristic warning(s) require manual review")
            # Exit 0: heuristic warnings alone do not block delivery
            sys.exit(0)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()