201 lines
6.3 KiB
Python
201 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
fill_inspect.py — Inspect form fields in an existing PDF.
|
|
|
|
Usage:
|
|
python3 fill_inspect.py --input form.pdf
|
|
python3 fill_inspect.py --input form.pdf --out fields.json
|
|
|
|
Outputs a JSON summary of every fillable field: name, type, current value,
|
|
allowed values (for checkboxes / dropdowns), and page number.
|
|
|
|
Exit codes: 0 success, 1 bad args / file not found, 2 dep missing, 3 read error
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
import importlib.util
|
|
import os
|
|
|
|
|
|
|
|
|
|
def ensure_deps():
    """Make sure ``pypdf`` is importable, installing it with pip if needed.

    Does nothing when ``pypdf`` can already be found. Otherwise ``pip install``
    is run under the current interpreter. If the install cannot be started or
    fails, a JSON error object is printed to stderr and the process exits with
    status 2 (the "dep missing" code listed in the module docstring).
    """
    if importlib.util.find_spec("pypdf") is not None:
        return

    import subprocess

    cmd = [sys.executable, "-m", "pip", "install", "--break-system-packages", "-q", "pypdf"]
    try:
        subprocess.check_call(cmd)
    except (subprocess.CalledProcessError, OSError) as exc:
        print(
            json.dumps({"status": "error", "error": f"Could not install pypdf: {exc}"}),
            file=sys.stderr,
        )
        sys.exit(2)

    # The package appeared after interpreter start-up; drop the finders'
    # cached directory listings so the import that follows can see it.
    importlib.invalidate_caches()
|
|
|
|
|
|
# pypdf is imported only after ensure_deps() has run, because ensure_deps()
# installs the package when it cannot be found.
ensure_deps()
from pypdf import PdfReader
from pypdf.generic import ArrayObject, DictionaryObject, NameObject, TextStringObject
|
|
|
|
|
|
# ── Field type resolution ──────────────────────────────────────────────────────
|
|
def _field_type(field) -> str:
|
|
ft = field.get("/FT")
|
|
if ft is None:
|
|
return "unknown"
|
|
ft = str(ft)
|
|
if ft == "/Tx":
|
|
return "text"
|
|
if ft == "/Btn":
|
|
ff = int(field.get("/Ff", 0))
|
|
return "radio" if ff & (1 << 15) else "checkbox"
|
|
if ft == "/Ch":
|
|
ff = int(field.get("/Ff", 0))
|
|
return "dropdown" if ff & (1 << 17) else "listbox"
|
|
if ft == "/Sig":
|
|
return "signature"
|
|
return "unknown"
|
|
|
|
|
|
def _field_value(field) -> str | None:
|
|
v = field.get("/V")
|
|
return str(v) if v is not None else None
|
|
|
|
|
|
def _field_options(field, ftype: str) -> dict:
|
|
extra = {}
|
|
if ftype in ("checkbox",):
|
|
ap = field.get("/AP")
|
|
if ap and "/N" in ap:
|
|
states = [str(k) for k in ap["/N"]]
|
|
extra["states"] = states
|
|
checked = next((s for s in states if s != "/Off"), None)
|
|
if checked:
|
|
extra["checked_value"] = checked
|
|
if ftype in ("dropdown", "listbox"):
|
|
opt = field.get("/Opt")
|
|
if opt:
|
|
choices = []
|
|
for item in opt:
|
|
if isinstance(item, (list, ArrayObject)) and len(item) >= 2:
|
|
choices.append({"value": str(item[0]), "label": str(item[1])})
|
|
else:
|
|
choices.append({"value": str(item), "label": str(item)})
|
|
extra["choices"] = choices
|
|
if ftype == "radio":
|
|
kids = field.get("/Kids")
|
|
if kids:
|
|
values = []
|
|
for kid in kids:
|
|
ap = kid.get("/AP")
|
|
if ap and "/N" in ap:
|
|
for k in ap["/N"]:
|
|
if str(k) != "/Off":
|
|
values.append(str(k))
|
|
extra["radio_values"] = values
|
|
return extra
|
|
|
|
|
|
def _walk_fields(fields, page_map: dict, parent_name: str = "") -> list:
    """Recursively collect all leaf fields.

    Args:
        fields: Iterable of field objects exposing ``.get``.
        page_map: Mapping of page object id to 1-based page number.
        parent_name: Dotted name of the enclosing field group, if any.

    Returns:
        A list of dicts with ``"name"``, ``"type"`` and ``"value"`` keys, plus
        whatever ``_field_options`` adds and, when a page reference is found,
        ``"page"`` (``"?"`` if the reference is not in *page_map*). Fields
        whose type resolves to ``"unknown"`` are omitted.
    """

    def page_of(node):
        """Return the page for *node*'s /P reference, or None if it has none."""
        if not hasattr(node, "get"):
            return None
        p_ref = node.get("/P")
        if p_ref and hasattr(p_ref, "idnum"):
            return page_map.get(p_ref.idnum, "?")
        return None

    result = []
    for field in fields:
        name = str(field.get("/T", ""))
        full = f"{parent_name}.{name}" if parent_name else name

        kids = field.get("/Kids")
        # Kids that have /T are sub-fields (groups), not widget annotations
        if kids:
            named_kids = [k for k in kids if "/T" in k]
            if named_kids:
                result.extend(_walk_fields(named_kids, page_map, full))
                continue

        ftype = _field_type(field)
        if ftype == "unknown":
            continue

        entry = {
            "name": full,
            "type": ftype,
            "value": _field_value(field),
        }
        entry.update(_field_options(field, ftype))

        # Page lookup via /P indirect reference. When the field has none of
        # its own, use the first kid that carries one.
        page = page_of(field)
        if page is None:
            for kid in kids or []:
                page = page_of(kid)
                if page is not None:
                    break
        if page is not None:
            entry["page"] = page

        result.append(entry)
    return result
|
|
|
|
|
|
def inspect(pdf_path: str) -> dict:
    """Read a PDF and summarize its AcroForm fields.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        On success, a dict with ``"status": "ok"``, ``"has_fields"``,
        ``"field_count"`` and ``"fields"``; a ``"note"`` is added when the
        document has no ``/AcroForm`` or no ``/Fields`` entry. If opening or
        reading the document raises, ``{"status": "error", "error": <message>}``
        is returned instead.
    """
    try:
        reader = PdfReader(pdf_path)

        # Build page-number lookup: {object_id: 1-based page number}
        page_map = {}
        for number, page in enumerate(reader.pages, start=1):
            if hasattr(page, "indirect_reference") and page.indirect_reference:
                page_map[page.indirect_reference.idnum] = number

        acroform = reader.trailer.get("/Root", {}).get("/AcroForm")
        if acroform is None or "/Fields" not in acroform:
            fields = None
        else:
            fields = _walk_fields(list(acroform["/Fields"]), page_map)
    except Exception as e:
        # Deliberately broad: this is the reporting boundary, and failures can
        # surface while pages or fields are being read, not only when the
        # reader is constructed.
        return {"status": "error", "error": str(e)}

    if fields is None:
        return {
            "status": "ok",
            "has_fields": False,
            "field_count": 0,
            "fields": [],
            "note": "This PDF has no fillable form fields.",
        }

    return {
        "status": "ok",
        "has_fields": bool(fields),
        "field_count": len(fields),
        "fields": fields,
    }
|
|
|
|
|
|
def main():
    """Command-line entry point: inspect a PDF and print its form fields.

    Parses ``--input`` (required) and ``--out`` (optional), prints the JSON
    result to stdout and, if ``--out`` is given, also writes it to that file
    as UTF-8. When fields were found, a one-line-per-field summary is printed
    to stderr.

    Exits with status 1 if the input path does not exist and with status 3 if
    the PDF could not be read.
    """
    parser = argparse.ArgumentParser(description="Inspect PDF form fields")
    parser.add_argument("--input", required=True, help="PDF file to inspect")
    parser.add_argument("--out", default="", help="Write JSON to file (optional)")
    args = parser.parse_args()

    if not os.path.exists(args.input):
        print(json.dumps({"status": "error", "error": f"File not found: {args.input}"}),
              file=sys.stderr)
        sys.exit(1)

    result = inspect(args.input)

    output = json.dumps(result, indent=2, ensure_ascii=False)

    if args.out:
        # The JSON may contain non-ASCII text (ensure_ascii=False), so the
        # file encoding is fixed rather than left to the locale default.
        with open(args.out, "w", encoding="utf-8") as out_file:
            out_file.write(output)

    print(output)

    if result["status"] != "ok":
        # The error JSON has already been emitted; signal the read failure.
        sys.exit(3)

    # Human-readable summary
    if result["has_fields"]:
        print(f"\n── Fields in {args.input} ──────────────────────────────",
              file=sys.stderr)
        for entry in result["fields"]:
            pg = f" p.{entry['page']}" if "page" in entry else ""
            val = f" = {entry['value']}" if entry.get("value") else ""
            extra = ""
            if "choices" in entry:
                # Show at most four choices, with an ellipsis if there are more.
                choices = entry["choices"]
                shown = ", ".join(c["value"] for c in choices[:4])
                more = "…" if len(choices) > 4 else ""
                extra = f" [{shown}{more}]"
            elif "states" in entry:
                extra = f" {entry['states']}"
            print(f" {entry['type']:12} {entry['name']}{pg}{val}{extra}", file=sys.stderr)
        print("", file=sys.stderr)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
|