#!/usr/bin/env python3
"""Validate agent-doc structure and agent-doc skill-suite consistency."""

from __future__ import annotations

import argparse
import re
from collections import deque
from dataclasses import dataclass, field
from pathlib import Path


# A backtick-quoted, single-line token ending in `.md`; group 1 is the quoted path text.
BACKTICK_MD_RE = re.compile(r"`([^`\n]+?\.md)`")
# A level-2 Markdown heading line (`## Title`); group 1 is the title without trailing whitespace.
H2_RE = re.compile(r"^##\s+(.+?)\s*$", re.MULTILINE)
# A `name:` line whose value consists only of ASCII letters, digits, and hyphens.
FRONTMATTER_NAME_RE = re.compile(r"^name:\s*([A-Za-z0-9-]+)\s*$", re.MULTILINE)
# A `default_prompt: "..."` line; the value must be double-quoted and contain no double quote.
DEFAULT_PROMPT_RE = re.compile(r'^\s*default_prompt:\s*"([^"]+)"\s*$', re.MULTILINE)
# A `- item` bullet line at any indentation; group 1 is the bullet text.
BULLET_RE = re.compile(r"^\s*-\s+(.+?)\s*$", re.MULTILINE)
# Same pattern as H2_RE; used by extract_section_text() to locate section boundaries.
SECTION_RE = re.compile(r"^##\s+(.+?)\s*$", re.MULTILINE)
# Case-insensitive whole-word match for phrases that signal a binding rule.
STRONG_RULE_RE = re.compile(r"\b(read and follow|must|required|do not|always)\b", re.IGNORECASE)

# H2 headings every rule doc must contain (checked in validate_doc_shape()).
RULE_REQUIRED_HEADINGS = {"Applies To", "Authority", "Checklist"}
# H2 headings a rule doc may contain that do not count as topic-specific rule sections.
RULE_OPTIONAL_HEADINGS = {"Does Not Apply To", "Edge Cases", "Failure Modes"}
# H2 headings every inventory doc must contain (checked in validate_doc_shape()).
INVENTORY_REQUIRED_HEADINGS = {
    "Applies To",
    "Maintenance Rules",
    "Reclassification Rules",
    "Classification Authority",
    "Entries",
    "Maintenance Checklist",
}


@dataclass
class ValidationReport:
    """Accumulator for validation findings, bucketed by severity.

    Each bucket is an ordered list of human-readable messages; the recorder
    methods append to exactly one bucket and never deduplicate.
    """

    # Findings that make validation fail.
    errors: list[str] = field(default_factory=list)
    # Findings about graph shape (depth, out-degree, reference density).
    structural_warnings: list[str] = field(default_factory=list)
    # Content-quality findings that do not fail validation.
    warnings: list[str] = field(default_factory=list)
    # Purely informational lines.
    notes: list[str] = field(default_factory=list)

    def error(self, message: str) -> None:
        """Record *message* as an error."""
        self.errors.append(message)

    def structural_warn(self, message: str) -> None:
        """Record *message* as a structural warning."""
        self.structural_warnings.append(message)

    def warn(self, message: str) -> None:
        """Record *message* as a regular warning."""
        self.warnings.append(message)

    def note(self, message: str) -> None:
        """Record *message* as an informational note."""
        self.notes.append(message)


def read_text(path: Path) -> str:
    """Return the full contents of *path*, decoded as UTF-8."""
    with path.open(encoding="utf-8") as handle:
        return handle.read()


def normalize_heading(heading: str) -> str:
    """Trim *heading* and collapse every internal whitespace run to one space."""
    # str.split() with no argument already discards leading/trailing whitespace.
    words = heading.split()
    return " ".join(words)


def normalize_phrase(value: str) -> str:
    """Lower-case *value* and collapse whitespace so phrases compare loosely."""
    lowered = value.lower()
    return " ".join(lowered.split())


def extract_h2_headings(text: str) -> set[str]:
    """Return the whitespace-normalized titles of every `## ` heading in *text*."""
    titles: set[str] = set()
    for heading_match in H2_RE.finditer(text):
        titles.add(normalize_heading(heading_match.group(1)))
    return titles


def extract_section_text(text: str, heading: str) -> str:
    """Return the stripped body of the first `## ` section titled *heading*.

    The body runs from the end of the heading line to the start of the next
    `## ` heading, or to the end of *text* for the last section. An empty
    string is returned when no section has that (normalized) title.
    """
    section_heads = list(SECTION_RE.finditer(text))
    # Pair each heading with the offset at which its section stops.
    section_ends = [later.start() for later in section_heads[1:]] + [len(text)]
    for head, stop in zip(section_heads, section_ends):
        if normalize_heading(head.group(1)) == heading:
            return text[head.end():stop].strip()
    return ""


def extract_section_bullets(text: str, heading: str) -> list[str]:
    """Return the text of each `- ` bullet found in the section titled *heading*.

    Bullets are returned in document order with surrounding whitespace removed;
    the list is empty when the section is missing or has no bullets.
    """
    body = extract_section_text(text, heading)
    # BULLET_RE has a single capture group, so findall() yields the bullet texts directly.
    return [item.strip() for item in BULLET_RE.findall(body)]


def has_anchor_token(value: str) -> bool:
    """Report whether *value* contains a backtick, slash, dot, or colon.

    validate_doc_shape() treats any of these characters as a sign that a short
    bullet still names something concrete.
    """
    return any(token in value for token in ("`", "/", ".", ":"))


# Phrases too generic to identify a concrete scope or source of truth.
# Built once at import time instead of on every call (is_vague_phrase() runs inside
# pairwise doc comparisons). Entries are already lower-case and single-spaced, which
# is the form normalize_phrase() produces.
_VAGUE_PHRASES = frozenset(
    {
        "related tasks",
        "relevant tasks",
        "appropriate tasks",
        "matching tasks",
        "relevant work",
        "related work",
        "relevant code",
        "related code",
        "the code",
        "the docs",
        "documentation",
        "repository",
        "repo",
        "maintainers",
        "owners",
        "owner",
    }
)


def is_vague_phrase(value: str) -> bool:
    """Report whether *value* is, in its entirety, one of the known vague phrases.

    The comparison ignores case and whitespace differences but is otherwise an
    exact match: a vague phrase embedded in a longer bullet does not count.
    """
    return normalize_phrase(value) in _VAGUE_PHRASES


def extract_strong_rule_lines(text: str) -> set[str]:
    """Return the normalized lines of *text* that contain strong-rule wording.

    Blank lines, lines starting with `#` (headings), and code-fence delimiter
    lines are ignored. Lines *inside* a fenced block are still scanned; only the
    fence markers themselves are skipped.
    """
    skipped_prefixes = ("#", "```")
    stripped_lines = (raw.strip() for raw in text.splitlines())
    return {
        normalize_phrase(candidate)
        for candidate in stripped_lines
        if candidate
        and not candidate.startswith(skipped_prefixes)
        and STRONG_RULE_RE.search(candidate)
    }


def extract_frontmatter_name(text: str) -> str | None:
    """Return the `name:` value from the leading `---` frontmatter block of *text*.

    The block must open with a line that is exactly `---` on the very first line
    and close with another line that is exactly `---`.

    Returns:
        The captured name, or None when there is no frontmatter block, the block
        is never closed, or it contains no `name:` line matching
        FRONTMATTER_NAME_RE.
    """
    # Work line by line so that a closing delimiter on the last line without a
    # trailing newline is still recognised, and so that an empty block
    # ("---\n---\n") closes immediately instead of running on into the body.
    lines = text.split("\n")
    if lines[0] != "---":
        return None
    try:
        closing_index = lines.index("---", 1)
    except ValueError:
        # Opening delimiter without a matching close: not a frontmatter block.
        return None
    frontmatter = "\n".join(lines[1:closing_index])
    match = FRONTMATTER_NAME_RE.search(frontmatter)
    return match.group(1) if match else None


def extract_default_prompt(text: str) -> str | None:
    """Return the double-quoted `default_prompt:` value in *text*, or None if absent.

    Only the first line matching DEFAULT_PROMPT_RE is considered.
    """
    found = DEFAULT_PROMPT_RE.search(text)
    if found is None:
        return None
    return found.group(1)


def md_reference_targets(text: str) -> list[Path]:
    """Return the agent-doc paths that *text* mentions inside backticks.

    A backtick-quoted `.md` token is kept when its file name is `AGENTS.md` or
    when any of its path components is `agent-docs`. Order of appearance is
    preserved and duplicates are not removed.
    """
    mentioned = (Path(raw.strip()) for raw in BACKTICK_MD_RE.findall(text))
    return [
        candidate
        for candidate in mentioned
        if candidate.name == "AGENTS.md" or "agent-docs" in candidate.parts
    ]


def agent_doc_files(repo_root: Path) -> list[Path]:
    """Return, sorted, every `*.md` file under *repo_root* that lives in an `agent-docs` path.

    A file qualifies when `agent-docs` is one of the components of its path
    relative to *repo_root*. The root-level `AGENTS.md` is never included.
    """
    top_level = Path(".")
    collected: list[Path] = []
    for candidate in repo_root.rglob("*.md"):
        rel = candidate.relative_to(repo_root)
        is_root_agents = rel.name == "AGENTS.md" and rel.parent == top_level
        if not is_root_agents and "agent-docs" in rel.parts:
            collected.append(candidate)
    collected.sort()
    return collected


def is_inventory_doc(headings: set[str]) -> bool:
    """Report whether *headings* mark a doc as an inventory doc rather than a rule doc.

    Either an `Entries` or a `Maintenance Rules` heading is enough.
    """
    inventory_markers = ("Entries", "Maintenance Rules")
    return any(marker in headings for marker in inventory_markers)


def validate_doc_shape(path: Path, report: ValidationReport, display_path: Path | None = None) -> None:
    """Check one agent doc's heading layout and the specificity of its scope/authority bullets.

    Args:
        path: File to read and validate.
        report: Collector that receives errors (missing headings, no topic
            section) and warnings (empty or vague bullets).
        display_path: Path to show in messages instead of *path*, if given.
    """
    text = read_text(path)
    headings = extract_h2_headings(text)
    label = str(display_path or path)

    if not is_inventory_doc(headings):
        absent = sorted(RULE_REQUIRED_HEADINGS - headings)
        if absent:
            report.error(
                f"{label}: rule doc is missing required headings: {', '.join(absent)}"
            )
        # A rule doc must carry at least one heading outside the boilerplate set.
        boilerplate = RULE_REQUIRED_HEADINGS | RULE_OPTIONAL_HEADINGS
        if headings <= boilerplate:
            report.error(
                f"{label}: rule doc needs at least one topic-specific rule section beyond scope, authority, and checklist headings"
            )
    else:
        absent = sorted(INVENTORY_REQUIRED_HEADINGS - headings)
        if absent:
            report.error(
                f"{label}: inventory doc is missing required headings: {', '.join(absent)}"
            )

    scope_bullets = extract_section_bullets(text, "Applies To")
    if not scope_bullets:
        report.warn(f"{label}: `Applies To` has no bullet entries")
    for scope in scope_bullets:
        if is_vague_phrase(scope):
            report.warn(f"{label}: vague `Applies To` bullet `{scope}` should be more specific")

    for source in extract_section_bullets(text, "Authority"):
        # Short bullets are acceptable only when they contain a path-like or code-like token.
        short_without_anchor = len(source) < 18 and not has_anchor_token(source)
        if is_vague_phrase(source) or short_without_anchor:
            report.warn(
                f"{label}: vague `Authority` bullet `{source}` should name a stronger source-of-truth anchor"
            )


def validate_repo_graph(repo_root: Path, report: ValidationReport, max_depth: int, max_out_degree: int) -> None:
    """Validate the AGENTS.md -> agent-docs reference graph under *repo_root*.

    The graph's nodes are the root `AGENTS.md` plus every file returned by
    agent_doc_files(); an edge exists when one node mentions another in
    backticks (targets are interpreted relative to the repository root).

    Reported as errors: missing root `AGENTS.md`, references that escape the
    repo or point at missing files, docs unreachable from the root, reference
    cycles reachable from the root, and per-doc shape errors.
    Reported as structural warnings: depth or out-degree above the given
    limits, and docs that look like secondary routers.
    Reported as warnings: per-doc bullet issues and scope/authority/rule overlap
    between docs.

    Args:
        repo_root: Repository root directory; may be relative.
        report: Collector that receives all findings.
        max_depth: Recommended maximum BFS depth from the root to any doc.
        max_out_degree: Recommended maximum number of child docs per node.
    """
    # Reference targets are resolved before being looked up among the nodes, so
    # the nodes must be built from a resolved root as well. Without this, a
    # relative or symlinked root silently drops every edge and reports every doc
    # as unreachable.
    repo_root = repo_root.resolve()
    root_agents = repo_root / "AGENTS.md"
    if not root_agents.exists():
        report.error(f"{root_agents}: missing root AGENTS.md")
        return

    docs = agent_doc_files(repo_root)
    nodes = [root_agents, *docs]
    node_set = set(nodes)
    # Read each node once; the text is reused for edge discovery and doc profiling.
    texts: dict[Path, str] = {node: read_text(node) for node in nodes}
    edges: dict[Path, set[Path]] = {node: set() for node in nodes}

    # --- Edge discovery -----------------------------------------------------
    for node in nodes:
        for target in md_reference_targets(texts[node]):
            resolved = (repo_root / target).resolve()
            try:
                resolved.relative_to(repo_root)
            except ValueError:
                report.error(
                    f"{node.relative_to(repo_root)}: referenced path escapes repo root: {target}"
                )
                continue
            if not resolved.exists():
                report.error(
                    f"{node.relative_to(repo_root)}: stale reference to missing file `{target.as_posix()}`"
                )
                continue
            # Existing files that are not graph nodes are valid references but add no edge.
            if resolved in node_set:
                edges[node].add(resolved)

    # --- Reachability and depth (breadth-first from the root) ----------------
    queue: deque[Path] = deque([root_agents])
    depths: dict[Path, int] = {root_agents: 0}
    while queue:
        current = queue.popleft()
        for child in sorted(edges[current]):
            if child not in depths:
                depths[child] = depths[current] + 1
                queue.append(child)

    for doc in docs:
        if doc not in depths:
            report.error(
                f"{doc.relative_to(repo_root)}: active child doc is not reachable from root AGENTS.md"
            )

    max_observed_depth = max(depths.values(), default=0)
    report.note(f"repo graph depth: {max_observed_depth}")
    if max_observed_depth > max_depth:
        report.structural_warn(
            f"repo graph depth {max_observed_depth} exceeds recommended maximum {max_depth}"
        )

    # --- Out-degree ----------------------------------------------------------
    for node, children in sorted(edges.items()):
        if len(children) > max_out_degree:
            report.structural_warn(
                f"{node.relative_to(repo_root)}: out-degree {len(children)} exceeds recommended maximum {max_out_degree}"
            )

    # --- Cycle detection (depth-first from the root) -------------------------
    visiting: set[Path] = set()  # nodes on the current DFS path
    visited: set[Path] = set()  # nodes whose subtree is fully explored
    stack: list[Path] = []  # current DFS path in order, for cycle reporting

    def dfs(node: Path) -> None:
        visiting.add(node)
        stack.append(node)
        for child in sorted(edges[node]):
            if child in visiting:
                # Back edge: the cycle is the path from `child` to here, closed by `child`.
                start = stack.index(child)
                cycle = stack[start:] + [child]
                cycle_text = " -> ".join(str(item.relative_to(repo_root)) for item in cycle)
                report.error(f"cycle detected: {cycle_text}")
                continue
            if child not in visited:
                dfs(child)
        stack.pop()
        visiting.remove(node)
        visited.add(node)

    dfs(root_agents)

    # --- Per-doc shape checks and overlap profiles ---------------------------
    applies_to_index: dict[str, list[Path]] = {}
    doc_profiles: dict[Path, dict[str, set[str]]] = {}
    for doc in docs:
        relative_doc = doc.relative_to(repo_root)
        validate_doc_shape(doc, report, relative_doc)
        doc_text = texts[doc]
        applies_to_bullets = {
            normalize_phrase(bullet) for bullet in extract_section_bullets(doc_text, "Applies To")
        }
        authority_bullets = {
            normalize_phrase(bullet) for bullet in extract_section_bullets(doc_text, "Authority")
        }
        doc_profiles[doc] = {
            "applies_to": applies_to_bullets,
            "authority": authority_bullets,
            "strong_rules": extract_strong_rule_lines(doc_text),
        }

        for bullet in applies_to_bullets:
            applies_to_index.setdefault(bullet, []).append(relative_doc)

        # A doc that mostly lists other docs and has few sections is acting as a router.
        nonempty_lines = [line for line in doc_text.splitlines() if line.strip()]
        reference_lines = [line for line in nonempty_lines if ".md" in line]
        if len(edges[doc]) >= 3 and nonempty_lines:
            reference_ratio = len(reference_lines) / len(nonempty_lines)
            if reference_ratio > 0.35 and len(extract_h2_headings(doc_text)) <= 4:
                report.structural_warn(
                    f"{relative_doc}: high reference density suggests secondary-router behavior"
                )

    # --- Identical scope bullets claimed by more than one doc ----------------
    for applies_to, owners in sorted(applies_to_index.items()):
        unique_owners = sorted({owner.as_posix() for owner in owners})
        if len(unique_owners) > 1 and not is_vague_phrase(applies_to):
            report.warn(
                f"shared `Applies To` bullet `{applies_to}` appears in multiple docs: {', '.join(unique_owners)}"
            )

    # --- Pairwise authority / rule overlap among docs sharing scope ----------
    sorted_docs = sorted(doc_profiles)
    for index, left_doc in enumerate(sorted_docs):
        left_profile = doc_profiles[left_doc]
        for right_doc in sorted_docs[index + 1 :]:
            right_profile = doc_profiles[right_doc]
            shared_applies = {
                value
                for value in left_profile["applies_to"] & right_profile["applies_to"]
                if not is_vague_phrase(value)
            }
            if not shared_applies:
                continue

            shared_authority = {
                value
                for value in left_profile["authority"] & right_profile["authority"]
                if value and not is_vague_phrase(value)
            }
            shared_rules = left_profile["strong_rules"] & right_profile["strong_rules"]

            # Shared authority is the stronger signal; report it and skip the rule check.
            if shared_authority:
                report.warn(
                    "possible duplicate authority between "
                    f"{left_doc.relative_to(repo_root)} and {right_doc.relative_to(repo_root)}: "
                    f"shared scope {', '.join(sorted(shared_applies))}; shared authority {', '.join(sorted(shared_authority))}"
                )
                continue

            if shared_rules:
                report.warn(
                    "possible overlapping rule ownership between "
                    f"{left_doc.relative_to(repo_root)} and {right_doc.relative_to(repo_root)}: "
                    f"shared scope {', '.join(sorted(shared_applies))}; shared strong rule signals detected"
                )


def validate_skill_suite(skill_suite_root: Path, report: ValidationReport) -> None:
    """Validate the `agent-docs*` skill directories directly under *skill_suite_root*.

    A skill directory is any immediate subdirectory whose name starts with
    `agent-docs` and that contains a `SKILL.md`. For each one this checks that:

    * the frontmatter `name` exists and equals the directory name;
    * `agents/openai.yaml`, when present, has a `default_prompt` that mentions
      `$<name>` (error) and is at most 220 characters long (warning);
    * `SKILL.md` carries no stale `bluecraft-agentic-docs` reference.

    In addition, every file in `agent-docs/references/` must be mentioned by
    file name in at least one `SKILL.md` or prompt file.

    Args:
        skill_suite_root: Directory holding the skill directories.
        report: Collector that receives all findings.
    """
    # is_dir() rather than exists(): a regular file here would otherwise make
    # iterdir() raise instead of producing a reportable error.
    if not skill_suite_root.is_dir():
        report.error(f"{skill_suite_root}: missing skill-suite root")
        return

    skill_dirs = sorted(
        path
        for path in skill_suite_root.iterdir()
        if path.is_dir() and path.name.startswith("agent-docs") and (path / "SKILL.md").exists()
    )
    if not skill_dirs:
        report.error(f"{skill_suite_root}: no agent-docs skill directories found")
        return

    max_prompt_length = 220
    skill_texts: list[str] = []
    # Each prompt file is read once here and reused by the shared-reference scan below.
    prompt_texts: list[str] = []

    for skill_dir in skill_dirs:
        skill_md = skill_dir / "SKILL.md"
        text = read_text(skill_md)
        skill_texts.append(text)

        skill_name = extract_frontmatter_name(text)
        if not skill_name:
            report.error(f"{skill_md}: missing frontmatter name")
        elif skill_name != skill_dir.name:
            report.error(f"{skill_md}: frontmatter name `{skill_name}` does not match directory `{skill_dir.name}`")

        # The checks below do not depend on a valid name (except the `$name`
        # mention), so a missing name must not hide them.
        prompt_file = skill_dir / "agents" / "openai.yaml"
        if prompt_file.exists():
            prompt_text = read_text(prompt_file)
            prompt_texts.append(prompt_text)
            default_prompt = extract_default_prompt(prompt_text)
            if not default_prompt:
                report.error(f"{prompt_file}: missing default_prompt")
            else:
                if skill_name and f"${skill_name}" not in default_prompt:
                    report.error(
                        f"{prompt_file}: default_prompt does not mention `${skill_name}`"
                    )
                if len(default_prompt) > max_prompt_length:
                    report.warn(
                        f"{prompt_file}: default_prompt length {len(default_prompt)} exceeds recommended maximum {max_prompt_length}"
                    )

        if "bluecraft-agentic-docs" in text:
            report.error(f"{skill_md}: stale reference to `bluecraft-agentic-docs`")

    refs_dir = skill_suite_root / "agent-docs" / "references"
    if refs_dir.is_dir():
        searchable_texts = [*skill_texts, *prompt_texts]
        for ref_file in sorted(path for path in refs_dir.iterdir() if path.is_file()):
            # A plain substring match on the file name counts as a reference.
            if not any(ref_file.name in text for text in searchable_texts):
                report.error(
                    f"{ref_file}: no skill file or prompt directly references this shared reference"
                )


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the validator.

    Both path options are optional at the parser level and default to None;
    the two limit options default to 3 (depth) and 7 (out-degree).
    """
    parser = argparse.ArgumentParser(
        description="Validate AGENTS.md / agent-docs structure and agent-docs skill-suite consistency."
    )

    path_options = (
        ("--repo-root", "Repository root containing AGENTS.md and agent-docs/"),
        ("--skill-suite-root", "Root containing the agent-docs skill directories"),
    )
    for flag, help_text in path_options:
        parser.add_argument(flag, type=Path, help=help_text)

    limit_options = (
        ("--max-depth", 3, "Recommended maximum root-to-doc routing depth"),
        ("--max-out-degree", 7, "Recommended maximum number of child-doc references from one doc"),
    )
    for flag, default_value, help_text in limit_options:
        parser.add_argument(flag, type=int, default=default_value, help=help_text)

    return parser


def main() -> int:
    """Run the requested validations, print every finding, and return the exit status.

    Returns:
        1 when at least one error was recorded, otherwise 0. Warnings alone do
        not fail the run.
    """
    parser = build_parser()
    args = parser.parse_args()
    if not (args.repo_root or args.skill_suite_root):
        parser.error("at least one of --repo-root or --skill-suite-root is required")

    report = ValidationReport()
    if args.repo_root:
        validate_repo_graph(args.repo_root.resolve(), report, args.max_depth, args.max_out_degree)
    if args.skill_suite_root:
        validate_skill_suite(args.skill_suite_root.resolve(), report)

    # Least severe first, so errors end up closest to the summary line.
    labelled_findings = (
        ("NOTE", report.notes),
        ("STRUCTURAL-WARNING", report.structural_warnings),
        ("WARNING", report.warnings),
        ("ERROR", report.errors),
    )
    for prefix, messages in labelled_findings:
        for message in messages:
            print(f"{prefix}: {message}")

    structural_count = len(report.structural_warnings)
    warning_count = len(report.warnings)

    if not report.errors:
        print(
            "Validation passed with "
            f"{structural_count} structural warning(s) and "
            f"{warning_count} warning(s)."
        )
        return 0

    print(
        "Validation failed with "
        f"{len(report.errors)} error(s), "
        f"{structural_count} structural warning(s), and "
        f"{warning_count} warning(s)."
    )
    return 1


# Script entry point: main()'s return value (0 or 1) becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())