#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
"""
xlsx_unpack.py — Unpack an xlsx file into a working directory for XML editing.

Usage:
    python3 xlsx_unpack.py <input.xlsx> <output_dir>

What it does:
  1. Unzips the xlsx (which is a ZIP archive)
  2. Pretty-prints all XML and .rels files for readability
  3. Prints a summary of key files to edit

WARNING: an existing <output_dir> is deleted and recreated from scratch.
"""

import os
import shutil
import sys
import xml.dom.minidom
import zipfile
from typing import List, Optional

# File suffixes that are treated as XML and pretty-printed after extraction.
_XML_SUFFIXES = (".xml", ".rels")

# Sibling node types that mark a whitespace-only text node as layout
# (indentation between markup) rather than as character data.
_STRUCTURAL_NODE_TYPES = (
    xml.dom.Node.ELEMENT_NODE,
    xml.dom.Node.COMMENT_NODE,
    xml.dom.Node.PROCESSING_INSTRUCTION_NODE,
)

# The only characters XML itself defines as whitespace.
_XML_WHITESPACE = " \t\r\n"


def _is_within(path: str, real_root: str) -> bool:
    """Return True if *path* resolves to *real_root* or to something below it.

    *real_root* must already be resolved with ``os.path.realpath``.
    ``os.path.join(real_root, "")`` appends exactly one trailing separator,
    which also behaves correctly when the root is the filesystem root.
    """
    real_path = os.path.realpath(path)
    return real_path == real_root or real_path.startswith(
        os.path.join(real_root, "")
    )


def _drop_layout_whitespace(document) -> None:
    """Remove whitespace-only text nodes that sit between markup nodes.

    Such nodes are indentation left by an earlier pretty-print and would
    otherwise be re-emitted as blank lines. A whitespace-only text node whose
    parent has no element/comment/PI children (e.g. ``<t> </t>``) is real
    character data and is left untouched.
    """
    pending = [document]
    while pending:
        parent = pending.pop()
        children = list(parent.childNodes)
        has_structure = any(
            child.nodeType in _STRUCTURAL_NODE_TYPES for child in children
        )
        for child in children:
            if child.nodeType == child.ELEMENT_NODE:
                pending.append(child)
            elif (
                has_structure
                and child.nodeType == child.TEXT_NODE
                and not child.data.strip(_XML_WHITESPACE)
            ):
                parent.removeChild(child)


def _format_xml(content: bytes) -> Optional[str]:
    """Return *content* pretty-printed, or None if it cannot be processed."""
    try:
        dom = xml.dom.minidom.parseString(content)
        try:
            _drop_layout_whitespace(dom)
            pretty = dom.toprettyxml(indent=" ", encoding="utf-8").decode("utf-8")
        finally:
            dom.unlink()
    except Exception:
        # Deliberate best-effort: any file that cannot be parsed or serialized
        # is reported to the caller as "not formatted" instead of aborting.
        return None
    return pretty if pretty.endswith("\n") else pretty + "\n"


def pretty_print_xml(content: bytes) -> str:
    """Pretty-print XML bytes. Returns original content on parse failure.

    Layout whitespace between elements is normalized, but text content is
    preserved exactly — including blank lines inside multi-line strings.
    On failure the input is returned decoded as UTF-8 with undecodable bytes
    replaced.
    """
    formatted = _format_xml(content)
    if formatted is None:
        return content.decode("utf-8", errors="replace")
    return formatted


def _prepare_output_dir(output_dir: str) -> None:
    """Create an empty *output_dir*, deleting a previous directory if present.

    Exits with status 1 if the path exists but is a file or a symlink, since
    ``shutil.rmtree`` cannot remove those.
    """
    if os.path.lexists(output_dir):
        if os.path.islink(output_dir) or not os.path.isdir(output_dir):
            print(
                f"ERROR: Output path exists and is not a directory: {output_dir}",
                file=sys.stderr,
            )
            sys.exit(1)
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)


def _extract_archive(xlsx_path: str, output_dir: str) -> None:
    """Extract every member of *xlsx_path* into *output_dir*.

    Exits with status 1 (after removing *output_dir*) if the file is not a
    valid ZIP archive or if any member would land outside *output_dir*.
    """
    real_root = os.path.realpath(output_dir)
    try:
        with zipfile.ZipFile(xlsx_path, "r") as archive:
            # Validate member paths to prevent zip-slip (path traversal) attacks
            for member in archive.namelist():
                if not _is_within(os.path.join(output_dir, member), real_root):
                    print(
                        f"ERROR: Zip entry '{member}' would escape target directory (path traversal blocked)",
                        file=sys.stderr,
                    )
                    shutil.rmtree(output_dir, ignore_errors=True)
                    sys.exit(1)
            archive.extractall(output_dir)
    except zipfile.BadZipFile:
        shutil.rmtree(output_dir, ignore_errors=True)
        print(f"ERROR: '{xlsx_path}' is not a valid ZIP/xlsx file", file=sys.stderr)
        sys.exit(1)


def _pretty_print_tree(output_dir: str) -> int:
    """Pretty-print every XML/.rels file below *output_dir* in place.

    Files that cannot be parsed are left byte-identical (rewriting them from a
    lossy decode could corrupt them) and reported on stderr.

    Returns the number of files that were actually reformatted.
    """
    formatted_count = 0
    for dirpath, _, filenames in os.walk(output_dir):
        for fname in filenames:
            if not fname.endswith(_XML_SUFFIXES):
                continue
            fpath = os.path.join(dirpath, fname)
            with open(fpath, "rb") as f:
                raw = f.read()
            pretty = _format_xml(raw)
            if pretty is None:
                print(
                    f"WARNING: could not parse '{fpath}' as XML; left unchanged",
                    file=sys.stderr,
                )
                continue
            with open(fpath, "w", encoding="utf-8") as f:
                f.write(pretty)
            formatted_count += 1
    return formatted_count


def _relative_posix_paths(output_dir: str) -> List[str]:
    """Return all file paths below *output_dir*, relative and '/'-separated."""
    paths = []
    for dirpath, _, filenames in os.walk(output_dir):
        for fname in filenames:
            rel = os.path.relpath(os.path.join(dirpath, fname), output_dir)
            # Package part names always use '/', so normalize the native
            # separator before comparing against them.
            paths.append(rel.replace(os.sep, "/"))
    return paths


def _print_summary(output_dir: str) -> None:
    """Print the key files grouped by category plus high-risk content warnings."""
    categories = {
        "Package root": ["[Content_Types].xml", "_rels/.rels"],
        "Workbook": ["xl/workbook.xml", "xl/_rels/workbook.xml.rels"],
        "Styles & Strings": ["xl/styles.xml", "xl/sharedStrings.xml"],
        "Worksheets": sorted(
            rel
            for rel in _relative_posix_paths(output_dir)
            if rel.startswith("xl/worksheets/") and rel.endswith(".xml")
        ),
    }

    print("Key files to inspect/edit:")
    for category, files in categories.items():
        if not files:
            continue
        print(f"\n [{category}]")
        for f in files:
            full = os.path.join(output_dir, f)
            if os.path.isfile(full):
                size = os.path.getsize(full)
                print(f" {f} ({size:,} bytes)")
            else:
                print(f" {f} (not found)")

    # Warn about high-risk files present
    risky = {
        "xl/vbaProject.bin": "VBA macros — DO NOT modify",
        "xl/pivotTables": "Pivot tables — update source ranges carefully if shifting rows",
        "xl/charts": "Charts — update data ranges if shifting rows",
    }
    print("\n [High-risk content detected:]")
    found_any = False
    for path, warning in risky.items():
        if os.path.exists(os.path.join(output_dir, path)):
            print(f" ⚠️ {path} — {warning}")
            found_any = True
    if not found_any:
        print(" ✓ None (safe to edit)")


def unpack(xlsx_path: str, output_dir: str) -> None:
    """Unpack *xlsx_path* into *output_dir* and pretty-print its XML parts.

    Args:
        xlsx_path: Path to the .xlsx/.xlsm file to unpack. Any other extension
            only triggers a warning.
        output_dir: Target directory. An existing directory is deleted and
            recreated.

    Exits the process with status 1 (``SystemExit``) when the input is
    missing, is located inside *output_dir*, is not a valid ZIP archive,
    contains a path-traversal entry, or when *output_dir* exists but is not a
    plain directory.
    """
    if not os.path.isfile(xlsx_path):
        print(f"ERROR: File not found: {xlsx_path}", file=sys.stderr)
        sys.exit(1)
    if not xlsx_path.lower().endswith((".xlsx", ".xlsm")):
        print(
            f"WARNING: '{xlsx_path}' does not have an .xlsx/.xlsm extension",
            file=sys.stderr,
        )
    # The output directory is wiped below; refuse to run if that would delete
    # the very file we are about to read.
    if _is_within(xlsx_path, os.path.realpath(output_dir)):
        print(
            f"ERROR: Input file '{xlsx_path}' is inside output directory "
            f"'{output_dir}', which would be deleted",
            file=sys.stderr,
        )
        sys.exit(1)

    _prepare_output_dir(output_dir)
    _extract_archive(xlsx_path, output_dir)

    # Pretty-print XML and .rels files
    xml_count = _pretty_print_tree(output_dir)
    print(f"Unpacked '{xlsx_path}' → '{output_dir}'")
    print(f"Pretty-printed {xml_count} XML/rels files\n")

    # Print key files grouped by category
    _print_summary(output_dir)


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: ``xlsx_unpack.py <input.xlsx> <output_dir>``."""
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 2:
        print("Usage: xlsx_unpack.py <input.xlsx> <output_dir>", file=sys.stderr)
        sys.exit(1)
    unpack(args[0], args[1])


if __name__ == "__main__":
    main()