{
  "schema_name": "rag_policy_assistant_chunk_v1",
  "description": "Canonical chunk record for a permission-aware internal policy RAG assistant.",
  "required_fields": [
    "chunk_id",
    "document_id",
    "source_uri",
    "document_version",
    "section_path",
    "text",
    "token_count",
    "language",
    "access_tags",
    "embedding_model",
    "embedding_dimension",
    "chunking_strategy",
    "content_hash",
    "quality_status",
    "created_at"
  ],
  "fields": {
    "chunk_id": "Stable unique identifier for the chunk, normally document_id plus chunk sequence and version.",
    "document_id": "Stable identifier of the source document.",
    "source_uri": "Resolvable location used for citations and debugging.",
    "document_version": "Last modified timestamp, source version, or content version.",
    "section_path": "Document hierarchy used for parent context expansion.",
    "text": "Cleaned text that will be embedded and eligible for prompt assembly.",
    "token_count": "Approximate token count after cleaning.",
    "language": "BCP-47 language code such as en or uz.",
    "access_tags": "Role, tenant, geography, and sensitivity tags inherited from the source inventory.",
    "embedding_model": "Embedding model identifier used to generate the vector.",
    "embedding_dimension": "Vector dimension expected by the target index.",
    "chunking_strategy": "Versioned chunking recipe such as header-aware-v1.",
    "content_hash": "Hash of normalized text used for idempotency and duplicate detection.",
    "quality_status": "validated, quarantined, or rejected.",
    "created_at": "UTC timestamp when the chunk record was created."
  }
}
