initial commit

2026-03-16 19:54:53 -04:00
commit bfe0e01254
3341 changed files with 483939 additions and 0 deletions
--- a/.gemini/skills/bmad-distillator/scripts/analyze_sources.py
+++ b/.gemini/skills/bmad-distillator/scripts/analyze_sources.py
@@ -0,0 +1,300 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = []
+# ///
+"""Analyze source documents for the distillation generator.
+
+Enumerates files from paths/folders/globs, computes sizes and token estimates,
+detects document types from naming conventions, and suggests groupings for
+related documents (e.g., a brief paired with its discovery notes).
+
+Accepts: file paths, folder paths (scans recursively for .md/.txt/.yaml/.yml/.json),
+or glob patterns. Folder scans skip node_modules, .git, __pycache__, .venv, venv,
+.claude, .cursor, .vscode, and _bmad-output.
+
+Output JSON structure:
+  status: "ok" | "error"
+  files[]: path, filename, size_bytes, estimated_tokens, doc_type
+  summary: total_files, total_size_bytes, total_estimated_tokens
+  groups[]: group_key, files[] with role (primary/companion/standalone)
+    - Groups related docs by naming convention (e.g., brief + discovery-notes)
+  routing: recommendation ("single" | "fan-out"), reason
+    - single: ≤3 files AND ≤15K estimated tokens
+    - fan-out: >3 files OR >15K estimated tokens
+  split_prediction: prediction ("likely" | "unlikely"), reason, estimated_distillate_tokens
+    - Estimates distillate at ~1/3 source size; splits if >5K tokens
+"""
+
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+import re
+import sys
+from pathlib import Path
+
+# Extensions to include when scanning folders or expanding glob patterns.
+# Compared against the lowercased file suffix; files named explicitly on the
+# command line are accepted regardless of extension.
+INCLUDE_EXTENSIONS = {".md", ".txt", ".yaml", ".yml", ".json"}
+
+# Directories to skip when scanning folders.
+# Matched by exact directory name at any depth of the walk.
+SKIP_DIRS = {
+    "node_modules", ".git", "__pycache__", ".venv", "venv",
+    ".claude", "_bmad-output", ".cursor", ".vscode",
+}
+
+# Approximate chars per token for estimation.
+# Applied to the file size in bytes, so the estimate is rough.
+CHARS_PER_TOKEN = 4
+
+# Thresholds
+# Total estimated source tokens above which routing becomes "fan-out".
+SINGLE_COMPRESSOR_MAX_TOKENS = 15_000
+# Estimated distillate tokens above which a split is predicted as "likely".
+SINGLE_DISTILLATE_MAX_TOKENS = 5_000
+
+# Naming patterns for document type detection.
+# Each entry is (regex, doc_type). Patterns are searched (unanchored) against
+# the lowercased filename and the first match wins, so the order of this list
+# is significant.
+DOC_TYPE_PATTERNS = [
+    (r"discovery[_-]notes", "discovery-notes"),
+    (r"product[_-]brief", "product-brief"),
+    (r"research[_-]report", "research-report"),
+    (r"architecture", "architecture-doc"),
+    (r"prd", "prd"),
+    (r"distillate", "distillate"),
+    (r"changelog", "changelog"),
+    (r"readme", "readme"),
+    (r"spec", "specification"),
+    (r"requirements", "requirements"),
+    (r"design[_-]doc", "design-doc"),
+    (r"meeting[_-]notes", "meeting-notes"),
+    (r"brainstorm", "brainstorming"),
+    (r"interview", "interview-notes"),
+]
+
+# Patterns for grouping related documents.
+# Each entry is (companion filename regex, base filename template). The regex
+# is matched case-insensitively against the whole filename; the template
+# rebuilds the base document's filename from the captured stem (\1) and
+# extension (\2).
+GROUP_PATTERNS = [
+    # base document + discovery notes
+    (r"^(.+?)(?:-discovery-notes|-discovery_notes)\.(\w+)$", r"\1.\2"),
+    # base document + appendix
+    (r"^(.+?)(?:-appendix|-addendum)(?:-\w+)?\.(\w+)$", r"\1.\2"),
+    # base document + review/feedback
+    (r"^(.+?)(?:-review|-feedback)\.(\w+)$", r"\1.\2"),
+]
+
+
+def resolve_inputs(inputs: list[str]) -> list[Path]:
+    """Resolve input arguments to a flat, de-duplicated list of file paths.
+
+    Each input is handled as the first of the following that applies:
+
+    * an existing file: included as-is, whatever its extension;
+    * an existing directory: walked recursively, pruning SKIP_DIRS and keeping
+      only files whose suffix is in INCLUDE_EXTENSIONS;
+    * anything else: expanded as a glob pattern (``**`` recurses), keeping
+      only matching files whose suffix is in INCLUDE_EXTENSIONS.
+
+    Args:
+        inputs: File paths, folder paths, or glob patterns.
+
+    Returns:
+        Resolved paths in first-seen order with duplicates removed. Directory
+        contents are visited in sorted order, so the result does not depend
+        on the order in which the filesystem enumerates entries.
+    """
+    found: list[Path] = []
+    for inp in inputs:
+        path = Path(inp)
+        if path.is_file():
+            found.append(path.resolve())
+        elif path.is_dir():
+            for root, dirs, filenames in os.walk(path):
+                # Prune skipped directories in place so os.walk never descends
+                # into them, and sort the survivors: os.walk visits ``dirs``
+                # in the order left here, which is otherwise arbitrary.
+                dirs[:] = sorted(d for d in dirs if d not in SKIP_DIRS)
+                for fn in sorted(filenames):
+                    fp = Path(root) / fn
+                    if fp.suffix.lower() in INCLUDE_EXTENSIONS:
+                        found.append(fp.resolve())
+        else:
+            # Neither a file nor a directory: treat the argument as a glob.
+            for match in sorted(glob.glob(inp, recursive=True)):
+                mp = Path(match)
+                if mp.is_file() and mp.suffix.lower() in INCLUDE_EXTENSIONS:
+                    found.append(mp.resolve())
+    # dict keys keep insertion order, so this drops repeats while preserving
+    # the position of each path's first occurrence.
+    return list(dict.fromkeys(found))
+
+
+def detect_doc_type(filename: str) -> str:
+    """Classify a document by its filename.
+
+    The lowercased filename is searched against each DOC_TYPE_PATTERNS regex
+    in list order; the label of the first pattern that hits is returned, or
+    "unknown" when none do.
+    """
+    lowered = filename.lower()
+    hits = (
+        label
+        for regex, label in DOC_TYPE_PATTERNS
+        if re.search(regex, lowered)
+    )
+    return next(hits, "unknown")
+
+
+def suggest_groups(files: list[Path]) -> list[dict]:
+    """Suggest document groupings based on naming conventions.
+
+    A file whose name matches one of GROUP_PATTERNS (case-insensitively) is a
+    "companion". The pattern's template yields the filename of its base
+    document, which is also used as the group key. When a group is first
+    created, a file with that base filename is added as the "primary" if one
+    is present in ``files`` (a file in the companion's own directory is
+    preferred). Files that end up in no group are reported as single-file
+    "standalone" groups keyed by their own filename.
+
+    Assignment is tracked per path rather than per filename, so same-named
+    files in different directories are all reported instead of the later
+    ones being silently dropped.
+
+    Args:
+        files: Paths to group; order determines group and member order.
+
+    Returns:
+        A list of ``{"group_key": str, "files": [...]}`` dicts: companion
+        groups in creation order, then standalone groups in input order.
+        Each member is ``{"path": str, "filename": str, "role": str}`` with
+        role "primary", "companion" or "standalone".
+    """
+    # Index by filename so a companion can find its base document.
+    by_name: dict[str, list[Path]] = {}
+    for f in files:
+        by_name.setdefault(f.name, []).append(f)
+
+    groups: dict[str, list[dict]] = {}
+    assigned: set[Path] = set()
+
+    for f in files:
+        if f in assigned:
+            continue  # already claimed, e.g. as another file's primary
+
+        # Work out the base filename if this file is a companion.
+        base_name = None
+        for pattern, template in GROUP_PATTERNS:
+            m = re.match(pattern, f.name, re.IGNORECASE)
+            if m:
+                base_name = m.expand(template)
+                break
+        if base_name is None:
+            continue  # not a companion; may still be claimed as a primary
+
+        if base_name not in groups:
+            groups[base_name] = []
+            # The primary is attached only when the group is created.
+            candidates = [
+                c for c in by_name.get(base_name, []) if c not in assigned
+            ]
+            sibling = f.parent / base_name
+            if sibling in candidates:
+                primary = sibling
+            elif candidates:
+                primary = candidates[0]
+            else:
+                primary = None  # companion without a base among the inputs
+            if primary is not None:
+                groups[base_name].append({
+                    "path": str(primary),
+                    "filename": base_name,
+                    "role": "primary",
+                })
+                assigned.add(primary)
+
+        groups[base_name].append({
+            "path": str(f),
+            "filename": f.name,
+            "role": "companion",
+        })
+        assigned.add(f)
+
+    result = [
+        {"group_key": group_key, "files": members}
+        for group_key, members in groups.items()
+    ]
+    # Whatever was never claimed above stands alone.
+    result.extend(
+        {
+            "group_key": f.name,
+            "files": [{"path": str(f), "filename": f.name, "role": "standalone"}],
+        }
+        for f in files
+        if f not in assigned
+    )
+    return result
+
+
+def analyze(inputs: list[str], output_path: str | None = None) -> None:
+    """Build the source-analysis report and emit it as JSON.
+
+    Resolves ``inputs`` to files, records each file's size, token estimate
+    (size in bytes // CHARS_PER_TOKEN) and detected doc type, suggests
+    groupings, and derives the routing recommendation and split prediction.
+    If no files resolve, an error payload is emitted instead.
+
+    Args:
+        inputs: File paths, folder paths, or glob patterns.
+        output_path: Destination file for the JSON; stdout when None/empty.
+
+    Returns:
+        None. The report is handed to ``output_json``.
+    """
+    sources = resolve_inputs(inputs)
+
+    # Nothing to analyze: report the failure in-band and stop.
+    if not sources:
+        output_json(
+            {
+                "status": "error",
+                "error": "No readable files found from provided inputs",
+                "inputs": inputs,
+            },
+            output_path,
+        )
+        return
+
+    # Per-file details
+    entries = []
+    for src in sources:
+        n_bytes = src.stat().st_size
+        entries.append({
+            "path": str(src),
+            "filename": src.name,
+            "size_bytes": n_bytes,
+            "estimated_tokens": n_bytes // CHARS_PER_TOKEN,
+            "doc_type": detect_doc_type(src.name),
+        })
+
+    byte_total = sum(entry["size_bytes"] for entry in entries)
+    token_total = byte_total // CHARS_PER_TOKEN
+    grouping = suggest_groups(sources)
+
+    # Routing: fan out when there are too many files or too many tokens.
+    file_count = len(sources)
+    over_token_budget = token_total > SINGLE_COMPRESSOR_MAX_TOKENS
+    stats = f"{file_count} file(s), ~{token_total:,} estimated tokens — "
+    if file_count > 3 or over_token_budget:
+        recommendation = "fan-out"
+        # The token limit takes precedence in the message when both apply.
+        if over_token_budget:
+            cause = ">" + str(SINGLE_COMPRESSOR_MAX_TOKENS) + " tokens"
+        else:
+            cause = "> 3 files"
+        recommendation_reason = (
+            stats + "exceeds single compressor threshold " + f"({cause})"
+        )
+    else:
+        recommendation = "single"
+        recommendation_reason = stats + "within single compressor threshold"
+
+    # Split prediction: the distillate is assumed to be ~1/3 of the source.
+    distillate_estimate = token_total // 3
+    if distillate_estimate > SINGLE_DISTILLATE_MAX_TOKENS:
+        verdict, relation = "likely", "exceeds"
+    else:
+        verdict, relation = "unlikely", "within"
+    verdict_reason = (
+        f"Estimated distillate ~{distillate_estimate:,} tokens "
+        f"{relation} {SINGLE_DISTILLATE_MAX_TOKENS:,} threshold"
+    )
+
+    report = {
+        "status": "ok",
+        "files": entries,
+        "summary": {
+            "total_files": file_count,
+            "total_size_bytes": byte_total,
+            "total_estimated_tokens": token_total,
+        },
+        "groups": grouping,
+        "routing": {
+            "recommendation": recommendation,
+            "reason": recommendation_reason,
+        },
+        "split_prediction": {
+            "prediction": verdict,
+            "reason": verdict_reason,
+            "estimated_distillate_tokens": distillate_estimate,
+        },
+    }
+
+    output_json(report, output_path)
+
+
+def output_json(data: dict, output_path: str | None) -> None:
+    """Serialize ``data`` as indented JSON to a file or to stdout.
+
+    With no ``output_path`` the JSON is printed to stdout. Otherwise the
+    parent directory is created if needed, the JSON (plus a trailing newline)
+    is written to the file, and a confirmation goes to stderr so stdout
+    stays clean.
+    """
+    rendered = json.dumps(data, indent=2)
+    if not output_path:
+        print(rendered)
+        return
+
+    target = Path(output_path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_text(rendered + "\n")
+    print(f"Results written to {output_path}", file=sys.stderr)
+
+
+def main() -> None:
+    """Command-line entry point.
+
+    Parses the positional inputs and the optional ``-o/--output`` path, runs
+    the analysis, and exits with status 0. The exit status is 0 even when
+    the analysis emits an error payload.
+    """
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=__doc__,
+    )
+    cli.add_argument(
+        "inputs",
+        help="File paths, folder paths, or glob patterns to analyze",
+        nargs="+",
+    )
+    cli.add_argument(
+        "-o",
+        "--output",
+        help="Output JSON to file instead of stdout",
+    )
+    options = cli.parse_args()
+    analyze(options.inputs, options.output)
+    raise SystemExit(0)
+
+
+if __name__ == "__main__":
+    main()