#!/usr/bin/env python3
"""
reformat_parse.py — Convert an existing document into content.json,
then hand off to the CREATE pipeline (render_body.py).

Supported input formats:
  .md / .markdown / .txt — Markdown / plain text
  .pdf                   — Extract text from existing PDF (layout preserved as best-effort)
  .json                  — Pass-through if already content.json format

Usage:
  python3 reformat_parse.py --input doc.md    --out content.json
  python3 reformat_parse.py --input old.pdf   --out content.json
  python3 reformat_parse.py --input data.json --out content.json

Then pipe into the CREATE pipeline:
  python3 render_body.py --tokens tokens.json --content content.json --out body.pdf

Or use make.sh reformat which does both steps:
  bash make.sh reformat --input doc.md --type report --title "My Report" --out output.pdf

Exit codes: 0 success, 1 bad args / unsupported format, 2 dep missing, 3 parse error
"""

import argparse
import importlib.util
import json
import os
import re
import subprocess
import sys
import traceback
from pathlib import Path


# Extensions routed to the Markdown parser, and the full set main() accepts.
_TEXT_EXTS = (".md", ".txt", ".markdown")
_SUPPORTED_EXTS = _TEXT_EXTS + (".pdf", ".json")


def ensure_deps():
    """
    Install pypdf with pip when it is not importable.

    Called lazily from parse_pdf() so that Markdown / JSON conversions never
    shell out to pip. Raises subprocess.CalledProcessError if pip fails.
    """
    missing = []
    if importlib.util.find_spec("pypdf") is None:
        missing.append("pypdf")
    if missing:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--break-system-packages", "-q"] + missing
        )
        # The package appeared while this interpreter was running; drop the
        # import system's cached directory listings so it can be found.
        importlib.invalidate_caches()


# ── Markdown / plain text parser ───────────────────────────────────────────────

# Up to six hashes are recognised; levels deeper than 3 are clamped to h3.
_HEADING_RE = re.compile(r'^(#{1,6})\s+(.*)')
_BULLET_RE = re.compile(r'^[-*+]\s+')
_ORDERED_RE = re.compile(r'^\d+\.\s+')
_QUOTE_RE = re.compile(r'^>\s*')
# A separator row must contain at least one dash, so a data row made only of
# empty cells ("| | |") is not discarded. The trailing pipe is optional.
_TABLE_SEP_RE = re.compile(r'^\|(?=[^-]*-)[-:| ]+\|?$')
_RULE_RE = re.compile(r'^[-*_]{3,}$')


def parse_markdown(text: str) -> list:
    """
    Convert Markdown to content.json blocks.

    Supports: # headings (levels 4-6 become h3), $$ math $$, fenced code,
    > blockquotes (→ callout), bullet and ordered lists, | tables |,
    horizontal rules (→ spacer) and plain paragraphs. Inline markers are
    removed from every text field via _md_inline(), except code and math.

    Returns a list of block dicts, each with a "type" key.
    """
    blocks = []
    lines = text.splitlines()
    para_buf = []

    def flush_para():
        # Emit the accumulated paragraph lines (if any) as one body block.
        joined = " ".join(para_buf).strip()
        if joined:
            blocks.append({"type": "body", "text": _md_inline(joined)})
        para_buf.clear()

    i = 0
    while i < len(lines):
        stripped = lines[i].strip()

        # Blank line — flush paragraph buffer
        if not stripped:
            flush_para()
            i += 1
            continue

        # ATX headings: # .. ######
        m = _HEADING_RE.match(stripped)
        if m:
            flush_para()
            level = min(len(m.group(1)), 3)
            blocks.append({"type": f"h{level}", "text": _md_inline(m.group(2))})
            i += 1
            continue

        # Display math block: $$expr$$ on one line, or opening $$ ... closing $$
        if stripped.startswith("$$"):
            flush_para()
            inline_expr = stripped[2:].rstrip("$").strip()
            if inline_expr:
                # Single-line: $$E = mc^2$$
                blocks.append({"type": "math", "text": inline_expr})
                i += 1
            else:
                # Multi-line: opening $$ alone, expression lines, closing $$
                math_lines = []
                i += 1
                while i < len(lines) and lines[i].strip() != "$$":
                    math_lines.append(lines[i])
                    i += 1
                if i < len(lines):
                    i += 1  # skip closing $$
                blocks.append({"type": "math", "text": "\n".join(math_lines).strip()})
            continue

        # Fenced code block: ``` or ~~~ (content is kept verbatim)
        if stripped.startswith(("```", "~~~")):
            flush_para()
            fence = stripped[:3]
            code_lines = []
            i += 1
            while i < len(lines) and not lines[i].strip().startswith(fence):
                code_lines.append(lines[i])
                i += 1
            if i < len(lines):
                i += 1  # skip closing fence
            blocks.append({"type": "code", "text": "\n".join(code_lines)})
            continue

        # Blockquote → callout (one callout per quoted line)
        if stripped.startswith(">"):
            flush_para()
            blocks.append({"type": "callout", "text": _md_inline(_QUOTE_RE.sub('', stripped))})
            i += 1
            continue

        # Unordered bullet: -, *, +
        if _BULLET_RE.match(stripped):
            flush_para()
            blocks.append({"type": "bullet", "text": _md_inline(_BULLET_RE.sub('', stripped))})
            i += 1
            continue

        # Ordered list: 1. 2. etc. → numbered
        if _ORDERED_RE.match(stripped):
            flush_para()
            blocks.append({"type": "numbered", "text": _md_inline(_ORDERED_RE.sub('', stripped))})
            i += 1
            continue

        # Table: | col | col |
        if stripped.startswith("|"):
            flush_para()
            table_lines = []
            while i < len(lines) and lines[i].strip().startswith("|"):
                table_lines.append(lines[i].strip())
                i += 1
            # Drop separator rows (|---|---|), then split the rest into cells.
            parsed = [
                [_md_inline(cell.strip()) for cell in row.strip("|").split("|")]
                for row in table_lines
                if not _TABLE_SEP_RE.match(row)
            ]
            if len(parsed) >= 2:
                blocks.append({
                    "type": "table",
                    "headers": parsed[0],
                    "rows": parsed[1:],
                })
            elif len(parsed) == 1:
                # Single row — treat as paragraph
                blocks.append({"type": "body", "text": " | ".join(parsed[0])})
            continue

        # Horizontal rule → spacer
        if _RULE_RE.match(stripped):
            flush_para()
            blocks.append({"type": "spacer", "pt": 16})
            i += 1
            continue

        # Plain text → accumulate into paragraph
        para_buf.append(stripped)
        i += 1

    flush_para()
    return blocks


_CODE_SPAN_RE = re.compile(r'`(.+?)`')
_CODE_SLOT_RE = re.compile(r'\x00(\d+)\x00')
_LINK_RE = re.compile(r'\[([^\]]+)\]\([^)]+\)')
# Delimiters must hug non-space text, so "2 * 3 * 4" and "2 ** 8" survive.
_BOLD_STAR_RE = re.compile(r'\*\*(?=\S)(.+?)(?<=\S)\*\*')
_ITALIC_STAR_RE = re.compile(r'\*(?=\S)(.+?)(?<=\S)\*')
# Underscore emphasis is not allowed inside a word, so snake_case is untouched.
_BOLD_UNDER_RE = re.compile(r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)')
_ITALIC_UNDER_RE = re.compile(r'(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)')


def _md_inline(text: str) -> str:
    """
    Strip inline Markdown markers and return the bare text.

    Removes **bold** / __bold__, *italic* / _italic_ and `code` delimiters and
    reduces [label](url) links to their label. The content of code spans is
    left exactly as written.

    NOTE(review): no markup tags are emitted here; confirm against
    render_body.py whether bold/italic/code tags are expected downstream.
    """
    # NUL is used as the placeholder delimiter below, so it must not occur in input.
    text = text.replace("\x00", "")

    # Park code spans first so emphasis rules cannot rewrite their content.
    code_spans = []

    def stash(match):
        code_spans.append(match.group(1))
        return f"\x00{len(code_spans) - 1}\x00"

    text = _CODE_SPAN_RE.sub(stash, text)

    # Links go before emphasis so underscores/asterisks in URLs are discarded
    # with the URL instead of pairing up with markers in the visible text.
    text = _LINK_RE.sub(r'\1', text)
    for pattern in (_BOLD_STAR_RE, _BOLD_UNDER_RE, _ITALIC_STAR_RE, _ITALIC_UNDER_RE):
        text = pattern.sub(r'\1', text)

    return _CODE_SLOT_RE.sub(lambda m: code_spans[int(m.group(1))], text)


# ── PDF text extractor ─────────────────────────────────────────────────────────

def parse_pdf(pdf_path: str) -> list:
    """
    Extract text from an existing PDF and convert to content.json blocks.

    Best-effort: the text of every page is concatenated and handed to
    parse_plain(); no font or layout information is used. Installs pypdf on
    first use if it is missing (see ensure_deps()).
    """
    ensure_deps()
    from pypdf import PdfReader

    reader = PdfReader(pdf_path)
    page_texts = []
    for page in reader.pages:
        extracted = page.extract_text()
        if extracted:
            page_texts.append(extracted.strip())

    # Most PDFs lose formatting — treat the result as plain text.
    return parse_plain("\n\n".join(page_texts))


_WORD_RE = re.compile(r"[^\W\d_]+(?:['’][^\W\d_]+)?")
_PLAIN_BULLET_RE = re.compile(r'^[-•*]\s+')
_PLAIN_HEADING_RE = re.compile(r'^[A-Z][^.!?]*$')


def _title_case(text: str) -> str:
    """Capitalize each word, treating an apostrophe as part of the word."""
    # str.title() would turn "USER'S" into "User'S".
    return _WORD_RE.sub(lambda m: m.group(0).capitalize(), text)


def parse_plain(text: str) -> list:
    """
    Heuristic plain-text parser.

    Paragraphs are separated by blank lines. A paragraph whose first line
    starts with "- ", "• " or "* " becomes bullet blocks. A single short line
    (< 80 chars) becomes an h1 when it is ALL-CAPS (converted to title case)
    or starts with a capital and contains no sentence punctuation (kept
    verbatim). Everything else becomes a body paragraph.
    """
    blocks = []
    for para in re.split(r'\n{2,}', text.strip()):
        para = para.strip()
        if not para:
            continue
        lines = para.splitlines()

        # Bullet lists — checked before headings so "- TODO" stays a bullet.
        if lines[0].startswith(("- ", "• ", "* ")):
            for line in lines:
                item = _PLAIN_BULLET_RE.sub('', line.strip())
                if item:
                    blocks.append({"type": "bullet", "text": item})
            continue

        # Single short line that looks like a heading
        if len(lines) == 1 and len(para) < 80:
            if para.isupper():
                blocks.append({"type": "h1", "text": _title_case(para)})
                continue
            if _PLAIN_HEADING_RE.match(para):
                # Already mixed case: keep it as written so acronyms survive.
                blocks.append({"type": "h1", "text": para})
                continue

        # Regular paragraph
        blocks.append({"type": "body", "text": " ".join(lines)})

    return blocks


# ── Pass-through validator ─────────────────────────────────────────────────────

VALID_TYPES = {"h1", "h2", "h3", "body", "bullet", "numbered", "callout", "table",
               "image", "code", "math", "divider", "caption", "pagebreak", "spacer"}


def validate_content_json(data: list) -> tuple[list, list]:
    """
    Return (valid_blocks, warnings).

    Non-dict entries are dropped with a warning. Dicts whose "type" is not in
    VALID_TYPES are kept but reported.
    """
    valid, warnings = [], []
    for i, block in enumerate(data):
        if not isinstance(block, dict):
            warnings.append(f"Block {i}: not a dict, skipped")
            continue
        btype = block.get("type")
        # isinstance guard: an unhashable type value (e.g. a list) would make
        # the set lookup raise TypeError.
        if not isinstance(btype, str) or btype not in VALID_TYPES:
            warnings.append(f"Block {i}: unknown type '{btype}', kept as-is")
        valid.append(block)
    return valid, warnings


# ── Dispatcher ─────────────────────────────────────────────────────────────────

def parse_file(input_path: str) -> tuple[list, list]:
    """
    Return (blocks, warnings) for the file at input_path.

    The parser is chosen from the file extension. Unsupported extensions and
    JSON without a block list yield ([], [message]) rather than raising.
    """
    ext = Path(input_path).suffix.lower()

    if ext in _TEXT_EXTS:
        # utf-8-sig drops a leading BOM, which would otherwise hide a first-line heading.
        with open(input_path, encoding="utf-8-sig", errors="replace") as f:
            return parse_markdown(f.read()), []

    if ext == ".pdf":
        blocks = parse_pdf(input_path)
        return blocks, ["PDF text extraction is best-effort — review content.json before rendering"]

    if ext == ".json":
        with open(input_path, encoding="utf-8-sig") as f:
            data = json.load(f)
        # Maybe it's a meta-wrapper {"content": [...]}
        if isinstance(data, dict):
            data = data.get("content")
        if isinstance(data, list):
            return validate_content_json(data)
        return [], ["JSON file does not contain a list of content blocks"]

    return [], [f"Unsupported file type: {ext}. Supported: .md .txt .pdf .json"]


# ── CLI ────────────────────────────────────────────────────────────────────────

class _ArgParser(argparse.ArgumentParser):
    """ArgumentParser that exits with status 1 on bad arguments."""

    def error(self, message):
        # argparse exits with 2 by default, which this tool reserves for
        # "dependency missing".
        self.print_usage(sys.stderr)
        self.exit(1, f"{self.prog}: error: {message}\n")


def _fail(exit_code: int, **payload) -> None:
    """Print a JSON error object to stderr and exit with exit_code."""
    print(json.dumps({"status": "error", **payload}), file=sys.stderr)
    sys.exit(exit_code)


def main():
    """Parse CLI arguments, convert the input and write content.json."""
    parser = _ArgParser(description="Parse a document into content.json")
    parser.add_argument("--input", required=True, help="Input file (.md, .txt, .pdf, .json)")
    parser.add_argument("--out", default="content.json", help="Output content.json path")
    args = parser.parse_args()

    if not os.path.exists(args.input):
        _fail(1, error=f"File not found: {args.input}")

    # Reject unsupported formats up front so they exit 1 rather than 3.
    ext = Path(args.input).suffix.lower()
    if ext not in _SUPPORTED_EXTS:
        _fail(1, error=f"Unsupported file type: {ext}. Supported: .md .txt .pdf .json")

    try:
        blocks, warnings = parse_file(args.input)
    except (subprocess.CalledProcessError, ImportError) as e:
        # pip could not install pypdf, or it is still not importable.
        _fail(2, error=f"Missing dependency: {e}")
    except Exception as e:
        _fail(3, error=str(e), trace=traceback.format_exc())

    if not blocks:
        _fail(3, error="No content blocks extracted", warnings=warnings)

    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(blocks, f, indent=2, ensure_ascii=False)

    result = {
        "status": "ok",
        "out": args.out,
        "block_count": len(blocks),
        "warnings": warnings,
    }
    print(json.dumps(result, indent=2))

    # Human-readable summary goes to stderr so stdout stays machine-readable.
    print(f"\n── Parsed {args.input} ─────────────────────────────────────", file=sys.stderr)
    print(f" Blocks : {len(blocks)}", file=sys.stderr)
    type_counts: dict = {}
    for b in blocks:
        # str() keeps the tally working for pass-through blocks with odd type values.
        key = str(b.get("type", "?"))
        type_counts[key] = type_counts.get(key, 0) + 1
    for t, n in sorted(type_counts.items()):
        print(f" {t:12} × {n}", file=sys.stderr)
    if warnings:
        print(" Warnings:", file=sys.stderr)
        for w in warnings:
            print(f" ⚠ {w}", file=sys.stderr)
    print(f"\n Next: bash make.sh run --content {args.out} --title '...' --type ...", file=sys.stderr)
    print("", file=sys.stderr)


if __name__ == "__main__":
    main()