131 lines
4.7 KiB
Python
131 lines
4.7 KiB
Python
#!/usr/bin/env python3
|
|
# SPDX-License-Identifier: MIT
|
|
"""
|
|
xlsx_unpack.py — Unpack an xlsx file into a working directory for XML editing.
|
|
|
|
Usage:
|
|
python3 xlsx_unpack.py <input.xlsx> <output_dir>
|
|
|
|
What it does:
|
|
1. Unzips the xlsx (which is a ZIP archive)
|
|
2. Pretty-prints all XML and .rels files for readability
|
|
3. Prints a summary of key files to edit
|
|
"""
|
|
|
|
import sys
|
|
import zipfile
|
|
import os
|
|
import shutil
|
|
import xml.dom.minidom
|
|
|
|
|
|
def pretty_print_xml(content: bytes) -> str:
|
|
"""Pretty-print XML bytes. Returns original content on parse failure."""
|
|
try:
|
|
dom = xml.dom.minidom.parseString(content)
|
|
pretty = dom.toprettyxml(indent=" ", encoding="utf-8").decode("utf-8")
|
|
# Remove the extra blank lines toprettyxml adds
|
|
lines = [line for line in pretty.splitlines() if line.strip()]
|
|
return "\n".join(lines) + "\n"
|
|
except Exception:
|
|
return content.decode("utf-8", errors="replace")
|
|
|
|
|
|
def unpack(xlsx_path: str, output_dir: str) -> None:
|
|
if not os.path.isfile(xlsx_path):
|
|
print(f"ERROR: File not found: {xlsx_path}", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
if not xlsx_path.lower().endswith((".xlsx", ".xlsm")):
|
|
print(f"WARNING: '{xlsx_path}' does not have an .xlsx/.xlsm extension", file=sys.stderr)
|
|
|
|
if os.path.exists(output_dir):
|
|
shutil.rmtree(output_dir)
|
|
os.makedirs(output_dir)
|
|
|
|
try:
|
|
with zipfile.ZipFile(xlsx_path, "r") as z:
|
|
# Validate member paths to prevent zip-slip (path traversal) attacks
|
|
for member in z.namelist():
|
|
member_path = os.path.realpath(os.path.join(output_dir, member))
|
|
if not member_path.startswith(os.path.realpath(output_dir) + os.sep) and member_path != os.path.realpath(output_dir):
|
|
print(f"ERROR: Zip entry '{member}' would escape target directory (path traversal blocked)", file=sys.stderr)
|
|
shutil.rmtree(output_dir, ignore_errors=True)
|
|
sys.exit(1)
|
|
z.extractall(output_dir)
|
|
except zipfile.BadZipFile:
|
|
shutil.rmtree(output_dir, ignore_errors=True)
|
|
print(f"ERROR: '{xlsx_path}' is not a valid ZIP/xlsx file", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
# Pretty-print XML and .rels files
|
|
xml_count = 0
|
|
for dirpath, _, filenames in os.walk(output_dir):
|
|
for fname in filenames:
|
|
if fname.endswith(".xml") or fname.endswith(".rels"):
|
|
fpath = os.path.join(dirpath, fname)
|
|
with open(fpath, "rb") as f:
|
|
raw = f.read()
|
|
pretty = pretty_print_xml(raw)
|
|
with open(fpath, "w", encoding="utf-8") as f:
|
|
f.write(pretty)
|
|
xml_count += 1
|
|
|
|
print(f"Unpacked '{xlsx_path}' → '{output_dir}'")
|
|
print(f"Pretty-printed {xml_count} XML/rels files\n")
|
|
|
|
# Print key files grouped by category
|
|
categories = {
|
|
"Package root": ["[Content_Types].xml", "_rels/.rels"],
|
|
"Workbook": ["xl/workbook.xml", "xl/_rels/workbook.xml.rels"],
|
|
"Styles & Strings": ["xl/styles.xml", "xl/sharedStrings.xml"],
|
|
"Worksheets": [],
|
|
}
|
|
|
|
all_files = []
|
|
for dirpath, _, filenames in os.walk(output_dir):
|
|
for fname in filenames:
|
|
rel = os.path.relpath(os.path.join(dirpath, fname), output_dir)
|
|
all_files.append(rel)
|
|
|
|
# Collect worksheets
|
|
for rel in sorted(all_files):
|
|
if rel.startswith("xl/worksheets/") and rel.endswith(".xml"):
|
|
categories["Worksheets"].append(rel)
|
|
|
|
print("Key files to inspect/edit:")
|
|
for category, files in categories.items():
|
|
if not files:
|
|
continue
|
|
print(f"\n [{category}]")
|
|
for f in files:
|
|
full = os.path.join(output_dir, f)
|
|
if os.path.isfile(full):
|
|
size = os.path.getsize(full)
|
|
print(f" {f} ({size:,} bytes)")
|
|
else:
|
|
print(f" {f} (not found)")
|
|
|
|
# Warn about high-risk files present
|
|
risky = {
|
|
"xl/vbaProject.bin": "VBA macros — DO NOT modify",
|
|
"xl/pivotTables": "Pivot tables — update source ranges carefully if shifting rows",
|
|
"xl/charts": "Charts — update data ranges if shifting rows",
|
|
}
|
|
print("\n [High-risk content detected:]")
|
|
found_any = False
|
|
for path, warning in risky.items():
|
|
full = os.path.join(output_dir, path)
|
|
if os.path.exists(full):
|
|
print(f" ⚠️ {path} — {warning}")
|
|
found_any = True
|
|
if not found_any:
|
|
print(" ✓ None (safe to edit)")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if len(sys.argv) != 3:
|
|
print("Usage: xlsx_unpack.py <input.xlsx> <output_dir>")
|
|
sys.exit(1)
|
|
unpack(sys.argv[1], sys.argv[2])
|