#!/usr/bin/env python3
"""Validate Chapter 17 RAG guided-lab starter artifacts.

The validator is intentionally lightweight and dependency-free so students can run
it in a local Python environment before implementing a full RAG stack.
"""

from __future__ import annotations

import csv
import json
import sys
from pathlib import Path

# Directory one level above the folder that contains this script. Every
# validator below resolves its artifact file directly under this directory.
ROOT = Path(__file__).resolve().parents[1]

# Column names that source_inventory.csv must provide. validate_source_inventory
# compares this set against the keys of the first parsed row.
REQUIRED_SOURCE_COLUMNS = {
    "source_id",
    "source_uri",
    "owner",
    "sensitivity",
    "update_frequency",
    "format",
    "expected_document_count",
    "access_tags",
    "notes",
}

# Field names that chunk_schema.json must list under its "required_fields" key.
# validate_chunk_schema reports any name from this set that is absent there.
REQUIRED_SCHEMA_FIELDS = {
    "chunk_id",
    "document_id",
    "source_uri",
    "document_version",
    "section_path",
    "text",
    "token_count",
    "language",
    "access_tags",
    "embedding_model",
    "embedding_dimension",
    "chunking_strategy",
    "content_hash",
    "quality_status",
    "created_at",
}

# Column names that evaluation_questions.csv must provide.
# validate_evaluation_questions compares this set against the keys of the
# first parsed row.
REQUIRED_QUESTION_COLUMNS = {
    "question_id",
    "question",
    "expected_source_id",
    "expected_answer_type",
    "required_behavior",
    "risk_category",
}


def read_csv(path: Path) -> list[dict[str, str]]:
    """Load a UTF-8 CSV file as a list of row dictionaries keyed by header name.

    Args:
        path: Location of the CSV file; its first line is used as the header.

    Returns:
        One dict per data row, in file order. A row with fewer cells than the
        header gets "" for the missing trailing columns.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise become part of the first column name and make
    # that column look missing.
    with path.open(newline="", encoding="utf-8-sig") as f:
        # restval="" keeps every header-keyed value a str, as the return type
        # promises, so callers can use str methods on short rows safely.
        return list(csv.DictReader(f, restval=""))


def assert_columns(path: Path, rows: list[dict[str, str]], required: set[str]) -> None:
    """Check that parsed CSV rows exist and expose every required column.

    Only the keys of the first row are inspected.

    Args:
        path: File the rows came from; only its name is used in messages.
        rows: Parsed rows, one dict per data line.
        required: Column names that must be present.

    Raises:
        AssertionError: If ``rows`` is empty or a required column is absent.
    """
    if rows:
        header = set(rows[0])
        absent = sorted(required.difference(header))
        if absent:
            raise AssertionError(f"{path.name} is missing columns: {', '.join(absent)}")
        return
    raise AssertionError(f"{path.name} must contain at least one data row")


def validate_source_inventory() -> None:
    """Validate ``source_inventory.csv`` located under ``ROOT``.

    Checks, in order: required columns are present, at least five rows exist,
    the sensitivity column covers internal/confidential/restricted, and each
    row has an accepted ``source_uri`` prefix and non-blank ``access_tags``.

    Raises:
        AssertionError: On the first check that fails.
    """
    inventory_path = ROOT / "source_inventory.csv"
    entries = read_csv(inventory_path)
    assert_columns(inventory_path, entries, REQUIRED_SOURCE_COLUMNS)

    row_count = len(entries)
    if row_count < 5:
        raise AssertionError("source_inventory.csv should contain at least five representative sources")

    # Collect the normalised sensitivity labels that appear anywhere in the file.
    seen_levels = set()
    for row in entries:
        seen_levels.add(row["sensitivity"].strip().lower())
    expected_levels = {"internal", "confidential", "restricted"}
    if expected_levels - seen_levels:
        raise AssertionError("source inventory should include internal, confidential, and restricted examples")

    accepted_prefixes = ("oss://", "s3://", "https://", "file://")
    for row in entries:
        has_known_prefix = row["source_uri"].startswith(accepted_prefixes)
        if not has_known_prefix:
            raise AssertionError(f"source_uri should be resolvable or storage-like: {row['source_uri']}")
        if row["access_tags"].strip() == "":
            raise AssertionError(f"access_tags must not be empty for {row['source_id']}")


def validate_chunk_schema() -> None:
    """Validate ``chunk_schema.json`` located under ``ROOT``.

    The file's ``required_fields`` list must cover ``REQUIRED_SCHEMA_FIELDS``,
    and every listed required field must have a truthy entry under ``fields``.

    Raises:
        AssertionError: If a required field or a field description is missing.
    """
    schema_path = ROOT / "chunk_schema.json"
    raw_text = schema_path.read_text(encoding="utf-8")
    document = json.loads(raw_text)

    declared = set(document.get("required_fields", []))
    absent = sorted(REQUIRED_SCHEMA_FIELDS.difference(declared))
    if absent:
        raise AssertionError(f"chunk_schema.json missing required fields: {', '.join(absent)}")

    descriptions = document.get("fields", {})
    undocumented = sorted(name for name in declared if not descriptions.get(name))
    if undocumented:
        raise AssertionError(f"chunk_schema.json missing field descriptions: {', '.join(undocumented)}")


def validate_evaluation_questions() -> None:
    """Validate ``evaluation_questions.csv`` located under ``ROOT``.

    Checks, in order: required columns are present, at least ten rows exist,
    the risk categories include both ``prompt_injection`` and
    ``sensitive_data``, and at least one row requires ``require_grounding``.

    Raises:
        AssertionError: On the first check that fails.
    """
    path = ROOT / "evaluation_questions.csv"
    rows = read_csv(path)
    assert_columns(path, rows, REQUIRED_QUESTION_COLUMNS)
    if len(rows) < 10:
        raise AssertionError("evaluation_questions.csv should contain at least ten representative questions")
    risk_categories = {row["risk_category"].strip().lower() for row in rows}
    if "prompt_injection" not in risk_categories or "sensitive_data" not in risk_categories:
        raise AssertionError("evaluation questions must include prompt-injection and sensitive-data refusal cases")
    # Normalise required_behavior the same way as risk_category above, so stray
    # whitespace or casing in the CSV is treated consistently in both columns.
    # `or ""` tolerates a missing cell, which the CSV reader may report as None.
    behaviors = {(row["required_behavior"] or "").strip().lower() for row in rows}
    if "require_grounding" not in behaviors:
        raise AssertionError("evaluation questions must include a grounding/faithfulness test")


def main() -> int:
    """Run every artifact validator in sequence and report each success.

    Returns:
        0 once all validators have completed without raising.

    Raises:
        AssertionError: Propagated from the first validator that fails.
    """
    for check in (
        validate_source_inventory,
        validate_chunk_schema,
        validate_evaluation_questions,
    ):
        check()
        print(f"PASS {check.__name__}")

    print("PASS Chapter 17 RAG guided-lab artifacts are internally consistent")
    return 0


if __name__ == "__main__":
    # Turn every expected failure into a single "FAIL ..." line on stderr and
    # exit status 1, instead of a traceback.
    try:
        exit_code = main()
    except AssertionError as exc:
        # A validator rejected the content of an artifact.
        print(f"FAIL {exc}", file=sys.stderr)
        exit_code = 1
    except (OSError, UnicodeDecodeError, json.JSONDecodeError, csv.Error) as exc:
        # An artifact is missing, unreadable, or not parseable as UTF-8 CSV/JSON.
        print(f"FAIL {type(exc).__name__}: {exc}", file=sys.stderr)
        exit_code = 1
    raise SystemExit(exit_code)
