#!/usr/bin/env python3
"""Deterministic pre-pass for prompt craft scanner.

Extracts metrics and flagged patterns from SKILL.md and prompt files
so the LLM scanner can work from compact data instead of reading raw files.

Covers:
- SKILL.md line count and section inventory
- Overview section size
- Inline data detection (tables, fenced code blocks)
- Defensive padding pattern grep
- Meta-explanation pattern grep
- Back-reference detection ("as described above")
- Config header and progression condition presence per prompt
- File-level token estimates (chars / 4 rough approximation)
"""

# /// script
# requires-python = ">=3.9"
# ///

from __future__ import annotations

import argparse
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path


# Phrases flagged as defensive padding, meta-explanation, or filler.
#
# Each entry is a (regex, category, label) tuple:
#   - regex:    pattern searched for in the file text
#   - category: 'defensive-padding', 'meta-explanation', or 'filler'
#   - label:    human-readable description reported with each hit
#
# scan_file_patterns() runs these through re.finditer() with no flags, so
# matching is case-sensitive. The leading [Xx] character classes are what let
# a phrase match with either a capitalised or a lowercase first letter; the
# rest of each phrase must appear exactly as written.
WASTE_PATTERNS = [
    (r'\b[Mm]ake sure (?:to|you)\b', 'defensive-padding', 'Defensive: "make sure to/you"'),
    (r"\b[Dd]on'?t forget (?:to|that)\b", 'defensive-padding', "Defensive: \"don't forget\""),
    (r'\b[Rr]emember (?:to|that)\b', 'defensive-padding', 'Defensive: "remember to/that"'),
    (r'\b[Bb]e sure to\b', 'defensive-padding', 'Defensive: "be sure to"'),
    (r'\b[Pp]lease ensure\b', 'defensive-padding', 'Defensive: "please ensure"'),
    (r'\b[Ii]t is important (?:to|that)\b', 'defensive-padding', 'Defensive: "it is important"'),
    (r'\b[Yy]ou are an AI\b', 'meta-explanation', 'Meta: "you are an AI"'),
    (r'\b[Aa]s a language model\b', 'meta-explanation', 'Meta: "as a language model"'),
    (r'\b[Aa]s an AI assistant\b', 'meta-explanation', 'Meta: "as an AI assistant"'),
    (r'\b[Tt]his (?:workflow|skill|process) is designed to\b', 'meta-explanation', 'Meta: "this workflow is designed to"'),
    (r'\b[Tt]he purpose of this (?:section|step) is\b', 'meta-explanation', 'Meta: "the purpose of this section is"'),
    (r"\b[Ll]et'?s (?:think about|begin|start)\b", 'filler', "Filler: \"let's think/begin\""),
    (r'\b[Nn]ow we(?:\'ll| will)\b', 'filler', "Filler: \"now we'll\""),
]

# Back-reference patterns (self-containment risk).
#
# Each entry is a (regex, label) tuple; the label is the description reported
# with each hit. scan_file_patterns() applies these with re.IGNORECASE, so the
# patterns are written in lowercase and still match any capitalisation
# (including the literal "SKILL" alternative in the last pattern).
BACKREF_PATTERNS = [
    (r'\bas described above\b', 'Back-reference: "as described above"'),
    (r'\bper the overview\b', 'Back-reference: "per the overview"'),
    (r'\bas mentioned (?:above|in|earlier)\b', 'Back-reference: "as mentioned above/in/earlier"'),
    (r'\bsee (?:above|the overview)\b', 'Back-reference: "see above/the overview"'),
    (r'\brefer to (?:the )?(?:above|overview|SKILL)\b', 'Back-reference: "refer to above/overview"'),
]


def count_tables(content: str) -> tuple[int, int]:
    """Tally markdown tables in *content* and the lines they occupy.

    A line is treated as a table row when its first non-whitespace character
    is a pipe (``|``). Every unbroken run of such rows counts as one table;
    any other line (including a blank one) ends the current run.

    Returns:
        A ``(tables, rows)`` pair: the number of runs and the total number of
        row lines across all of them.
    """
    row_start = re.compile(r'^\s*\|')
    tables = 0
    rows = 0
    previous_was_row = False
    for text in content.split('\n'):
        is_row = '|' in text and row_start.match(text) is not None
        if is_row:
            rows += 1
            # Only the first row of a run opens a new table.
            if not previous_was_row:
                tables += 1
        previous_was_row = is_row
    return tables, rows


def count_fenced_blocks(content: str) -> tuple[int, int]:
    """Tally fenced code blocks in *content* and the lines inside them.

    Any line whose stripped text starts with three backticks toggles the
    "inside a block" state. Fence lines themselves are never counted as body
    lines. A block that is opened but never closed still counts, and every
    line after its opening fence is counted as body.

    Returns:
        A ``(blocks, body_lines)`` pair.
    """
    blocks = 0
    body_lines = 0
    inside = False
    for raw in content.split('\n'):
        is_fence = raw.strip().startswith('```')
        if is_fence:
            inside = not inside
            if inside:
                # This fence opened a new block rather than closing one.
                blocks += 1
            continue
        if inside:
            body_lines += 1
    return blocks, body_lines


def extract_overview_size(content: str) -> int:
    """Return the number of lines in the ``## Overview`` section of *content*.

    The section starts on the line after the first ``## Overview`` heading and
    runs until the next level-2 heading (``## `` followed by anything else) or
    the end of the text. Deeper headings such as ``### Sub`` are counted as
    ordinary lines. A repeated ``## Overview`` heading inside the section is
    skipped: it is not counted and does not end the section.

    Returns 0 when there is no ``## Overview`` heading.
    """
    overview_heading = re.compile(r'^##\s+Overview\b')
    level_two_heading = re.compile(r'^##\s')
    remaining = iter(content.split('\n'))

    # Phase 1: consume lines up to and including the Overview heading.
    for text in remaining:
        if overview_heading.match(text):
            break
    else:
        return 0

    # Phase 2: count lines until the next level-2 heading.
    size = 0
    for text in remaining:
        if overview_heading.match(text):
            continue
        if level_two_heading.match(text):
            break
        size += 1
    return size


def _locate_matches(pattern: str, content: str, lines: list[str],
                    flags: int = 0) -> list[tuple[int, str]]:
    """Locate every match of *pattern* in *content*.

    Args:
        pattern: Regular expression to search for.
        content: Full file text.
        lines: ``content.split('\\n')``, used to look up the matched line.
        flags: ``re`` flags forwarded to ``re.finditer``.

    Returns:
        One ``(line_number, context)`` tuple per match, in match order.
        ``line_number`` is 1-based and refers to the line on which the match
        starts; ``context`` is that line stripped and cut to 100 characters.
    """
    located = []
    line_num = 1
    counted_up_to = 0
    for m in re.finditer(pattern, content, flags):
        # finditer yields matches left to right, so only the newlines between
        # the previous match start and this one need counting. This avoids
        # copying and rescanning the whole prefix for every match.
        line_num += content.count('\n', counted_up_to, m.start())
        counted_up_to = m.start()
        located.append((line_num, lines[line_num - 1].strip()[:100]))
    return located


def scan_file_patterns(filepath: Path, rel_path: str) -> dict:
    """Extract metrics and pattern matches from a single file.

    Args:
        filepath: File to read; decoded as UTF-8.
        rel_path: Name stored under the ``'file'`` key of the result.

    Returns:
        A dict with these keys:

        - ``file``: *rel_path* as given.
        - ``line_count``: ``len(content.split('\\n'))``. A file that ends with
          a newline therefore reports one more than its number of text lines.
        - ``token_estimate``: character count integer-divided by 4.
        - ``sections``: ``{'level', 'title', 'line'}`` for every ``##`` and
          ``###`` heading, in file order.
        - ``table_count`` / ``table_lines``: from ``count_tables``.
        - ``fenced_block_count`` / ``fenced_block_lines``: from
          ``count_fenced_blocks``.
        - ``waste_patterns``: hits for ``WASTE_PATTERNS`` (case-sensitive),
          grouped by pattern in list order, then by position.
        - ``back_references``: hits for ``BACKREF_PATTERNS``
          (case-insensitive), grouped the same way.
        - ``has_config_header``: whether the text contains the literal
          ``{communication_language}`` or ``{document_output_language}``.
        - ``has_progression``: whether the lowercased text contains any of
          the progression keywords as a substring.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    content = filepath.read_text(encoding='utf-8')
    lines = content.split('\n')

    # Section inventory: level-2 and level-3 headings only.
    sections = []
    for i, line in enumerate(lines, 1):
        m = re.match(r'^(#{2,3})\s+(.+)$', line)
        if m:
            sections.append({'level': len(m.group(1)), 'title': m.group(2).strip(), 'line': i})

    # Tables and code blocks
    table_count, table_lines = count_tables(content)
    block_count, block_lines = count_fenced_blocks(content)

    # Pattern matches (waste patterns are case-sensitive by design).
    waste_matches = [
        {'line': line_num, 'category': category, 'pattern': label, 'context': context}
        for pattern, category, label in WASTE_PATTERNS
        for line_num, context in _locate_matches(pattern, content, lines)
    ]
    backref_matches = [
        {'line': line_num, 'pattern': label, 'context': context}
        for pattern, label in BACKREF_PATTERNS
        for line_num, context in _locate_matches(pattern, content, lines, re.IGNORECASE)
    ]

    # Config header
    has_config_header = '{communication_language}' in content or '{document_output_language}' in content

    # Progression condition: lowercase once, then test every keyword against it.
    prog_keywords = ['progress', 'advance', 'move to', 'next stage',
                     'when complete', 'proceed to', 'transition', 'completion criteria']
    lowered = content.lower()
    has_progression = any(kw in lowered for kw in prog_keywords)

    return {
        'file': rel_path,
        'line_count': len(lines),
        # Token estimate (rough: chars / 4)
        'token_estimate': len(content) // 4,
        'sections': sections,
        'table_count': table_count,
        'table_lines': table_lines,
        'fenced_block_count': block_count,
        'fenced_block_lines': block_lines,
        'waste_patterns': waste_matches,
        'back_references': backref_matches,
        'has_config_header': has_config_header,
        'has_progression': has_progression,
    }


def scan_prompt_metrics(skill_path: Path) -> dict:
    """Extract metrics from all prompt-relevant files in a skill directory.

    Scans, in this order:

    1. ``SKILL.md`` at the skill root, if it is a regular file. Its record
       additionally carries ``overview_lines``.
    2. Every other ``*.md`` file directly in the skill root, sorted by path.
       These are the "prompts" counted under ``prompt_health``.
    3. ``.md`` / ``.json`` / ``.yaml`` / ``.yml`` files directly inside
       ``resources/``, if that is a directory. Only their line count and
       token estimate are recorded.

    A ``SKILL.md`` or ``resources`` entry of the wrong kind (for example a
    plain file named ``resources``) is treated as absent instead of aborting
    the scan.

    Args:
        skill_path: Skill directory to scan. Must be an existing directory.

    Returns:
        A JSON-serialisable report dict with scanner metadata, a
        ``skill_md_summary`` (all zeros when SKILL.md is absent),
        ``prompt_health``, ``aggregate`` totals, ``resource_sizes`` and the
        per-file records under ``files``.

    Raises:
        OSError: If a scanned file or directory cannot be read.
        UnicodeDecodeError: If a scanned file is not valid UTF-8.
    """
    files_data = []

    # SKILL.md: is_file() rather than exists(), so a directory with that
    # name is skipped instead of failing inside read_text().
    skill_md = skill_path / 'SKILL.md'
    skill_md_data = None
    if skill_md.is_file():
        skill_md_data = scan_file_patterns(skill_md, 'SKILL.md')
        content = skill_md.read_text(encoding='utf-8')
        skill_md_data['overview_lines'] = extract_overview_size(content)
        skill_md_data['is_skill_md'] = True
        files_data.append(skill_md_data)

    # Prompt files at skill root (non-SKILL.md .md files)
    for f in sorted(skill_path.iterdir()):
        if f.is_file() and f.suffix == '.md' and f.name != 'SKILL.md':
            data = scan_file_patterns(f, f.name)
            data['is_skill_md'] = False
            files_data.append(data)

    # Resources (just sizes, for progressive disclosure assessment).
    # is_dir() rather than exists(): iterdir() raises NotADirectoryError when
    # 'resources' happens to be a regular file.
    resources_dir = skill_path / 'resources'
    resource_sizes = {}
    if resources_dir.is_dir():
        for f in sorted(resources_dir.iterdir()):
            if f.is_file() and f.suffix in ('.md', '.json', '.yaml', '.yml'):
                content = f.read_text(encoding='utf-8')
                resource_sizes[f.name] = {
                    'lines': len(content.split('\n')),
                    'tokens': len(content) // 4,
                }

    # Aggregate stats
    prompt_files = [d for d in files_data if not d['is_skill_md']]
    total_waste = sum(len(d['waste_patterns']) for d in files_data)
    total_backrefs = sum(len(d['back_references']) for d in files_data)
    total_tokens = sum(d['token_estimate'] for d in files_data)
    prompts_with_config = sum(1 for d in prompt_files if d['has_config_header'])
    prompts_with_progression = sum(1 for d in prompt_files if d['has_progression'])

    return {
        'scanner': 'prompt-craft-prepass',
        'script': 'prepass-prompt-metrics.py',
        'version': '1.0.0',
        'skill_path': str(skill_path),
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'status': 'info',
        'skill_md_summary': {
            'line_count': skill_md_data['line_count'] if skill_md_data else 0,
            'token_estimate': skill_md_data['token_estimate'] if skill_md_data else 0,
            'overview_lines': skill_md_data.get('overview_lines', 0) if skill_md_data else 0,
            'table_count': skill_md_data['table_count'] if skill_md_data else 0,
            'table_lines': skill_md_data['table_lines'] if skill_md_data else 0,
            'fenced_block_count': skill_md_data['fenced_block_count'] if skill_md_data else 0,
            'fenced_block_lines': skill_md_data['fenced_block_lines'] if skill_md_data else 0,
            'section_count': len(skill_md_data['sections']) if skill_md_data else 0,
        },
        'prompt_health': {
            'total_prompts': len(prompt_files),
            'prompts_with_config_header': prompts_with_config,
            'prompts_with_progression': prompts_with_progression,
        },
        'aggregate': {
            'total_files_scanned': len(files_data),
            'total_token_estimate': total_tokens,
            'total_waste_patterns': total_waste,
            'total_back_references': total_backrefs,
        },
        'resource_sizes': resource_sizes,
        'files': files_data,
    }


def main() -> int:
    """Command-line entry point.

    Parses ``skill_path`` and the optional ``--output``/``-o`` argument, scans
    the skill directory, and emits the report as indented JSON, either to the
    output file (creating its parent directories) or to stdout.

    Returns:
        0 on success, 2 when ``skill_path`` is not a directory.
    """
    cli = argparse.ArgumentParser(
        description='Extract prompt craft metrics for LLM scanner pre-pass',
    )
    cli.add_argument('skill_path', type=Path, help='Path to the skill directory to scan')
    cli.add_argument('--output', '-o', type=Path, help='Write JSON output to file instead of stdout')
    opts = cli.parse_args()

    skill_dir = opts.skill_path
    if not skill_dir.is_dir():
        print(f"Error: {skill_dir} is not a directory", file=sys.stderr)
        return 2

    rendered = json.dumps(scan_prompt_metrics(skill_dir), indent=2)

    destination = opts.output
    if not destination:
        # No output file requested: the report itself goes to stdout.
        print(rendered)
        return 0

    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(rendered)
    # Status goes to stderr so stdout stays free of non-JSON text.
    print(f"Results written to {destination}", file=sys.stderr)
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())