From e6b865335c91d583655d97bd3cc483e8bd3f0d2c Mon Sep 17 00:00:00 2001 From: Taus Date: Fri, 19 Jun 2026 14:53:35 +0000 Subject: [PATCH 1/5] yeast: Move schema and YAML loader into yeast-schema crate For type checking rules, we need to be able to load schemas (so we know what to check against). However, since we can't have yeast-macros depending on yeast (where the schema-handling code currently lives) as this would introduce a circular dependency, we instead split the schema-related code into its own yeast-schema crate. --- Cargo.lock | 10 + Cargo.toml | 1 + .../tree_sitter_extractors_deps/defs.bzl | 31 + shared/yeast-schema/BUILD.bazel | 12 + shared/yeast-schema/Cargo.toml | 9 + shared/yeast-schema/src/lib.rs | 33 + shared/yeast-schema/src/node_types_yaml.rs | 762 +++++++++++++++++ shared/yeast-schema/src/schema.rs | 340 ++++++++ shared/yeast/BUILD.bazel | 4 +- shared/yeast/Cargo.toml | 1 + shared/yeast/src/lib.rs | 17 +- shared/yeast/src/node_types_yaml.rs | 777 +----------------- shared/yeast/src/schema.rs | 325 ++------ 13 files changed, 1275 insertions(+), 1047 deletions(-) create mode 100644 shared/yeast-schema/BUILD.bazel create mode 100644 shared/yeast-schema/Cargo.toml create mode 100644 shared/yeast-schema/src/lib.rs create mode 100644 shared/yeast-schema/src/node_types_yaml.rs create mode 100644 shared/yeast-schema/src/schema.rs diff --git a/Cargo.lock b/Cargo.lock index 4fab55a6444f..76043ec0a439 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3724,6 +3724,7 @@ dependencies = [ "tree-sitter-python", "tree-sitter-ruby", "yeast-macros", + "yeast-schema", ] [[package]] @@ -3735,6 +3736,15 @@ dependencies = [ "syn", ] +[[package]] +name = "yeast-schema" +version = "0.1.0" +dependencies = [ + "serde", + "serde_json", + "serde_yaml", +] + [[package]] name = "yoke" version = "0.8.0" diff --git a/Cargo.toml b/Cargo.toml index 62eb2e7e920c..9c15b486062b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "shared/tree-sitter-extractor", 
"shared/yeast", "shared/yeast-macros", + "shared/yeast-schema", "ruby/extractor", "unified/extractor", "unified/extractor/tree-sitter-swift", diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl b/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl index 11842460638f..7fbdfc4bbd4b 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl @@ -403,6 +403,13 @@ _NORMAL_DEPENDENCIES = { "syn": Label("@vendor_ts__syn-2.0.106//:syn"), }, }, + "shared/yeast-schema": { + _COMMON_CONDITION: { + "serde": Label("@vendor_ts__serde-1.0.228//:serde"), + "serde_json": Label("@vendor_ts__serde_json-1.0.145//:serde_json"), + "serde_yaml": Label("@vendor_ts__serde_yaml-0.9.34-deprecated//:serde_yaml"), + }, + }, "unified/extractor": { _COMMON_CONDITION: { "clap": Label("@vendor_ts__clap-4.5.48//:clap"), @@ -456,6 +463,10 @@ _NORMAL_ALIASES = { _COMMON_CONDITION: { }, }, + "shared/yeast-schema": { + _COMMON_CONDITION: { + }, + }, "unified/extractor": { _COMMON_CONDITION: { }, @@ -488,6 +499,8 @@ _NORMAL_DEV_DEPENDENCIES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -513,6 +526,8 @@ _NORMAL_DEV_ALIASES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -536,6 +551,8 @@ _PROC_MACRO_DEPENDENCIES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -559,6 +576,8 @@ _PROC_MACRO_ALIASES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -582,6 +601,8 @@ _PROC_MACRO_DEV_DEPENDENCIES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -607,6 +628,8 @@ 
_PROC_MACRO_DEV_ALIASES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -630,6 +653,8 @@ _BUILD_DEPENDENCIES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -657,6 +682,8 @@ _BUILD_ALIASES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -682,6 +709,8 @@ _BUILD_PROC_MACRO_DEPENDENCIES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { @@ -705,6 +734,8 @@ _BUILD_PROC_MACRO_ALIASES = { }, "shared/yeast-macros": { }, + "shared/yeast-schema": { + }, "unified/extractor": { }, "unified/extractor/tree-sitter-swift": { diff --git a/shared/yeast-schema/BUILD.bazel b/shared/yeast-schema/BUILD.bazel new file mode 100644 index 000000000000..85f008a1aa67 --- /dev/null +++ b/shared/yeast-schema/BUILD.bazel @@ -0,0 +1,12 @@ +load("@rules_rust//rust:defs.bzl", "rust_library") +load("//misc/bazel/3rdparty/tree_sitter_extractors_deps:defs.bzl", "aliases", "all_crate_deps") + +exports_files(["Cargo.toml"]) + +rust_library( + name = "yeast-schema", + srcs = glob(["src/**/*.rs"]), + aliases = aliases(), + visibility = ["//visibility:public"], + deps = all_crate_deps(), +) diff --git a/shared/yeast-schema/Cargo.toml b/shared/yeast-schema/Cargo.toml new file mode 100644 index 000000000000..4cf534d4f0ce --- /dev/null +++ b/shared/yeast-schema/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "yeast-schema" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +serde_yaml = "0.9" diff --git a/shared/yeast-schema/src/lib.rs b/shared/yeast-schema/src/lib.rs new file mode 100644 index 000000000000..8e15571c3558 --- /dev/null +++ b/shared/yeast-schema/src/lib.rs @@ -0,0 +1,33 @@ +//! 
Schema definitions and YAML/JSON node-types loaders for YEAST. +//! +//! This crate carries the parts of the YEAST framework that don't need +//! `tree-sitter`: the [`schema::Schema`] type and its associated +//! [`schema::NodeType`] / [`schema::FieldCardinality`] helpers, plus the +//! YAML and JSON conversion helpers in [`node_types_yaml`]. +//! +//! It exists so that both the runtime crate (`yeast`) and the +//! compile-time `rules!` proc macro (`yeast-macros`) can build against a +//! single source of truth without dragging tree-sitter (a heavy C-backed +//! dep) into the proc-macro toolchain. +//! +//! Tree-sitter-aware adapters — building a `Schema` from a +//! `tree_sitter::Language`, or loading a YAML schema on top of one — +//! live in `yeast::schema` and `yeast::node_types_yaml` respectively. + +pub mod node_types_yaml; +pub mod schema; + +/// Field IDs are stable `u16`s, matching tree-sitter's representation so a +/// schema built from a tree-sitter language can preserve the language's +/// existing IDs. +pub type FieldId = u16; + +/// Kind IDs are stable `u16`s. Like `FieldId`, this matches tree-sitter's +/// representation. +pub type KindId = u16; + +/// Sentinel field id used to mean "the implicit unfielded slot" (what the +/// tree-sitter docs call `children` and what YEAST surfaces in queries as +/// the bare `child:` field). Reserved to avoid clashing with real field +/// IDs allocated by `Schema::register_field`. +pub const CHILD_FIELD: u16 = u16::MAX; diff --git a/shared/yeast-schema/src/node_types_yaml.rs b/shared/yeast-schema/src/node_types_yaml.rs new file mode 100644 index 000000000000..5f6a3906f7cb --- /dev/null +++ b/shared/yeast-schema/src/node_types_yaml.rs @@ -0,0 +1,762 @@ +/// Converts a YAML node-types file to the tree-sitter `node-types.json` format. 
+/// +/// # YAML format +/// +/// ```yaml +/// supertypes: +/// _expression: +/// - assignment +/// - binary +/// +/// named: +/// assignment: +/// left: _lhs +/// right: _expression +/// identifier: +/// +/// unnamed: +/// - "+" +/// - "end" +/// ``` +/// +/// See the crate-level docs for the full format specification. +use std::collections::{BTreeMap, BTreeSet}; +use std::fmt::Write; + +use crate::CHILD_FIELD; +use serde::Deserialize; +use serde_json::json; + +/// Top-level YAML structure. +#[derive(Deserialize, Default)] +struct YamlNodeTypes { + #[serde(default)] + supertypes: BTreeMap>, + #[serde(default)] + named: BTreeMap>>, + #[serde(default)] + unnamed: Vec, +} + +/// A reference to a node type. Can be: +/// - a plain string (resolved by looking up named vs unnamed) +/// - a map `{unnamed: "name"}` to force unnamed interpretation +#[derive(Deserialize, Debug, Clone)] +#[serde(untagged)] +enum TypeRef { + Name(String), + Explicit { unnamed: String }, +} + +/// A field value: either a single type ref or a list of them. +#[derive(Deserialize, Debug, Clone)] +#[serde(untagged)] +enum TypeRefOrList { + Single(TypeRef), + List(Vec), +} + +impl TypeRefOrList { + fn into_vec(self) -> Vec { + match self { + TypeRefOrList::Single(t) => vec![t], + TypeRefOrList::List(v) => v, + } + } +} + +/// Parsed field name: base name + multiplicity markers. +struct FieldSpec { + name: Option, // None for $children + multiple: bool, + required: bool, +} + +fn parse_field_name(raw: &str) -> FieldSpec { + let is_children = + raw == "$children" || raw == "$children?" || raw == "$children*" || raw == "$children+"; + + let suffix = raw.chars().last().filter(|c| matches!(c, '?' 
| '*' | '+')); + + let (multiple, required) = match suffix { + Some('?') => (false, false), + Some('*') => (true, false), + Some('+') => (true, true), + _ => (false, true), // bare field name = required, single + }; + + let name = if is_children { + None + } else { + let base = raw.trim_end_matches(['?', '*', '+']); + Some(base.to_string()) + }; + + FieldSpec { + name, + multiple, + required, + } +} + +/// Resolve a TypeRef to a (type, named) pair, given the sets of known named +/// and unnamed types. +fn resolve_type_ref_pair( + type_ref: &TypeRef, + named_types: &BTreeSet, + unnamed_types: &BTreeSet, +) -> (String, bool) { + match type_ref { + TypeRef::Explicit { unnamed } => (unnamed.clone(), false), + TypeRef::Name(name) => { + let is_named = named_types.contains(name); + let is_unnamed = unnamed_types.contains(name); + if is_named && is_unnamed { + (name.clone(), true) + } else if is_unnamed { + (name.clone(), false) + } else { + (name.clone(), true) + } + } + } +} + +/// Resolve a TypeRef to a {type, named} JSON record, given the sets of known named +/// and unnamed types. +fn resolve_type_ref( + type_ref: &TypeRef, + named_types: &BTreeSet, + unnamed_types: &BTreeSet, +) -> serde_json::Value { + let (kind, named) = resolve_type_ref_pair(type_ref, named_types, unnamed_types); + json!({"type": kind, "named": named}) +} + +/// Convert YAML string to node-types JSON string. +pub fn convert(yaml_input: &str) -> Result { + let yaml: YamlNodeTypes = + serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; + + // Build the sets of known named and unnamed types for resolution. + let mut named_types = BTreeSet::new(); + for name in yaml.supertypes.keys() { + named_types.insert(name.clone()); + } + for name in yaml.named.keys() { + named_types.insert(name.clone()); + } + let unnamed_types: BTreeSet = yaml.unnamed.iter().cloned().collect(); + + let mut output = Vec::new(); + + // 1. 
Supertypes + for (name, members) in &yaml.supertypes { + let subtypes: Vec<_> = members + .iter() + .map(|m| resolve_type_ref(m, &named_types, &unnamed_types)) + .collect(); + output.push(json!({ + "type": name, + "named": true, + "subtypes": subtypes, + })); + } + + // 2. Named nodes + for (name, fields_opt) in &yaml.named { + let fields_map = match fields_opt { + None => { + // Leaf token: no fields, no children, no subtypes + output.push(json!({ + "type": name, + "named": true, + "fields": {}, + })); + continue; + } + Some(m) if m.is_empty() => { + output.push(json!({ + "type": name, + "named": true, + "fields": {}, + })); + continue; + } + Some(m) => m, + }; + + let mut json_fields = serde_json::Map::new(); + let mut json_children: Option = None; + + for (raw_field_name, type_refs) in fields_map { + let spec = parse_field_name(raw_field_name); + let types: Vec<_> = type_refs + .clone() + .into_vec() + .iter() + .map(|t| resolve_type_ref(t, &named_types, &unnamed_types)) + .collect(); + + // Cloning to make the borrow checker happy + let field_info = json!({ + "multiple": spec.multiple, + "required": spec.required, + "types": types, + }); + + if spec.name.is_none() { + // $children + json_children = Some(field_info); + } else { + json_fields.insert(spec.name.unwrap(), field_info); + } + } + + let mut entry = json!({ + "type": name, + "named": true, + "fields": json_fields, + }); + + if let Some(children) = json_children { + entry + .as_object_mut() + .unwrap() + .insert("children".to_string(), children); + } + + output.push(entry); + } + + // 3. Unnamed tokens + for name in &yaml.unnamed { + output.push(json!({ + "type": name, + "named": false, + })); + } + + serde_json::to_string_pretty(&output).map_err(|e| format!("Failed to serialize JSON: {e}")) +} + +/// Apply YAML node-type definitions to a mutable Schema. +/// Registers all types, fields, and allowed types from the YAML into the +/// schema. 
Public so callers can layer YAML node-types onto a Schema that +/// already has fields/kinds preregistered from another source (e.g. a +/// tree-sitter language). +pub fn extend_schema_from_yaml( + schema: &mut crate::schema::Schema, + yaml_input: &str, +) -> Result<(), String> { + let yaml: YamlNodeTypes = + serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; + apply_yaml_to_schema(&yaml, schema); + Ok(()) +} + +fn apply_yaml_to_schema( + yaml: &YamlNodeTypes, + schema: &mut crate::schema::Schema, +) { + // Register all supertypes as node kinds + for name in yaml.supertypes.keys() { + schema.register_kind(name); + } + + // Register named node kinds and their fields + for (name, fields_opt) in &yaml.named { + schema.register_kind(name); + if let Some(fields) = fields_opt { + for raw_field_name in fields.keys() { + let spec = parse_field_name(raw_field_name); + if let Some(field_name) = &spec.name { + schema.register_field(field_name); + } + } + } + } + + // Register unnamed tokens as node kinds + for name in &yaml.unnamed { + schema.register_unnamed_kind(name); + } + + let mut named_types = BTreeSet::new(); + for name in yaml.supertypes.keys() { + named_types.insert(name.clone()); + } + for name in yaml.named.keys() { + named_types.insert(name.clone()); + } + let unnamed_types: BTreeSet = yaml.unnamed.iter().cloned().collect(); + + for (supertype, members) in &yaml.supertypes { + let node_types = members + .iter() + .map(|m| { + let (kind, named) = resolve_type_ref_pair(m, &named_types, &unnamed_types); + crate::schema::NodeType { kind, named } + }) + .collect(); + schema.set_supertype_members(supertype, node_types); + } + + // Register allowed field child types for type checking. 
+ for (parent_kind, fields_opt) in &yaml.named { + let Some(fields) = fields_opt else { + continue; + }; + + for (raw_field_name, type_refs) in fields { + let spec = parse_field_name(raw_field_name); + let field_id = match &spec.name { + Some(name) => schema.register_field(name), + None => CHILD_FIELD, + }; + + let mut node_types = type_refs + .clone() + .into_vec() + .into_iter() + .map(|type_ref| { + let (kind, named) = resolve_type_ref_pair(&type_ref, &named_types, &unnamed_types); + crate::schema::NodeType { kind, named } + }) + .collect::>(); + node_types.sort_by(|a, b| a.kind.cmp(&b.kind).then(a.named.cmp(&b.named))); + node_types.dedup_by(|a, b| a.kind == b.kind && a.named == b.named); + schema.set_field_types(parent_kind, field_id, node_types); + schema.set_field_cardinality( + parent_kind, + field_id, + crate::schema::FieldCardinality { + multiple: spec.multiple, + required: spec.required, + }, + ); + } + } +} + +pub fn schema_from_yaml(yaml_input: &str) -> Result { + let mut schema = crate::schema::Schema::new(); + extend_schema_from_yaml(&mut schema, yaml_input)?; + Ok(schema) +} + +// --------------------------------------------------------------------------- +// JSON → YAML conversion +// --------------------------------------------------------------------------- + +/// JSON node-types structures (mirrors tree-sitter's format). +#[derive(Deserialize)] +struct JsonNodeInfo { + #[serde(rename = "type")] + kind: String, + named: bool, + #[serde(default)] + fields: BTreeMap, + children: Option, + #[serde(default)] + subtypes: Vec, +} + +#[derive(Deserialize)] +struct JsonNodeType { + #[serde(rename = "type")] + kind: String, + named: bool, +} + +#[derive(Deserialize)] +struct JsonFieldInfo { + multiple: bool, + required: bool, + types: Vec, +} + +/// Convert a tree-sitter node-types.json string to the YAML format. 
+pub fn convert_from_json(json_input: &str) -> Result { + let nodes: Vec = + serde_json::from_str(json_input).map_err(|e| format!("Failed to parse JSON: {e}"))?; + + // Collect all named and unnamed types for disambiguation decisions. + let mut all_named: BTreeSet = BTreeSet::new(); + let mut all_unnamed: BTreeSet = BTreeSet::new(); + for node in &nodes { + if node.named { + all_named.insert(node.kind.clone()); + } else { + all_unnamed.insert(node.kind.clone()); + } + } + + let mut supertypes: BTreeMap> = BTreeMap::new(); + let mut named: BTreeMap>> = BTreeMap::new(); + let mut unnamed: Vec = Vec::new(); + + for node in nodes { + if !node.named { + unnamed.push(node.kind); + continue; + } + + if !node.subtypes.is_empty() { + supertypes.insert(node.kind, node.subtypes); + continue; + } + + if node.fields.is_empty() && node.children.is_none() { + // Leaf token + named.insert(node.kind, None); + } else { + let mut fields = BTreeMap::new(); + for (name, info) in node.fields { + fields.insert(name, info); + } + if let Some(children) = node.children { + fields.insert("$children".to_string(), children); + } + named.insert(node.kind, Some(fields)); + } + } + + // Now emit YAML + let mut out = String::new(); + + // Supertypes + if !supertypes.is_empty() { + writeln!(out, "supertypes:").unwrap(); + for (name, members) in &supertypes { + writeln!(out, " {name}:").unwrap(); + for member in members { + let ref_str = format_type_ref(&member.kind, member.named, &all_named, &all_unnamed); + writeln!(out, " - {ref_str}").unwrap(); + } + } + writeln!(out).unwrap(); + } + + // Named + if !named.is_empty() { + writeln!(out, "named:").unwrap(); + for (name, fields_opt) in &named { + match fields_opt { + None => { + writeln!(out, " {name}:").unwrap(); + } + Some(fields) => { + writeln!(out, " {name}:").unwrap(); + for (field_name, info) in fields { + let suffix = field_suffix(info.multiple, info.required); + let yaml_name = if field_name == "$children" { + format!("$children{suffix}") + 
} else { + format!("{field_name}{suffix}") + }; + + let type_refs: Vec = info + .types + .iter() + .map(|t| format_type_ref(&t.kind, t.named, &all_named, &all_unnamed)) + .collect(); + + if type_refs.len() == 1 { + writeln!(out, " {yaml_name}: {}", type_refs[0]).unwrap(); + } else { + let list = type_refs + .iter() + .map(|s| s.as_str()) + .collect::>() + .join(", "); + writeln!(out, " {yaml_name}: [{list}]").unwrap(); + } + } + } + } + } + writeln!(out).unwrap(); + } + + // Unnamed + if !unnamed.is_empty() { + writeln!(out, "unnamed:").unwrap(); + for name in &unnamed { + writeln!(out, " - {}", force_quote(name)).unwrap(); + } + } + + Ok(out) +} + +fn field_suffix(multiple: bool, required: bool) -> &'static str { + match (multiple, required) { + (false, true) => "", + (false, false) => "?", + (true, true) => "+", + (true, false) => "*", + } +} + +/// Format a type reference for YAML output. Uses the disambiguation rule: +/// plain string if unambiguous, `{unnamed: name}` if the name exists as both +/// named and unnamed and we need the unnamed interpretation. +fn format_type_ref( + kind: &str, + named: bool, + all_named: &BTreeSet, + _all_unnamed: &BTreeSet, +) -> String { + if named { + quote_yaml(kind) + } else { + let is_also_named = all_named.contains(kind); + if is_also_named { + format!("{{unnamed: {}}}", force_quote(kind)) + } else { + force_quote(kind) + } + } +} + +/// Always wrap in double quotes. Used for unnamed node references so they're +/// visually distinct from named ones — YAML treats both forms as equivalent strings. +fn force_quote(s: &str) -> String { + format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")) +} + +/// Quote a YAML string value if it contains special characters or could be +/// misinterpreted. +fn quote_yaml(s: &str) -> String { + let needs_quoting = s.is_empty() + || s.contains(|c: char| { + matches!( + c, + ':' | '{' + | '}' + | '[' + | ']' + | ',' + | '&' + | '*' + | '#' + | '?' 
+ | '|' + | '-' + | '<' + | '>' + | '=' + | '!' + | '%' + | '@' + | '`' + | '"' + | '\'' + ) + }) + || s.starts_with(' ') + || s.ends_with(' ') + || s == "true" + || s == "false" + || s == "null" + || s == "yes" + || s == "no" + || s.parse::().is_ok(); + + if needs_quoting { + format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")) + } else { + s.to_string() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_conversion() { + let yaml = r#" +supertypes: + _expression: + - assignment + - binary + +named: + assignment: + left: _lhs + right: _expression + binary: + left: [_expression, _simple_numeric] + operator: ["!=", "+"] + right: _expression + argument_list: + $children*: [_expression, block_argument] + identifier: + +unnamed: + - "!=" + - "+" + - "end" +"#; + + let json_str = convert(yaml).unwrap(); + let result: Vec = serde_json::from_str(&json_str).unwrap(); + + // Check supertype + let expr = &result[0]; + assert_eq!(expr["type"], "_expression"); + assert_eq!(expr["named"], true); + assert_eq!(expr["subtypes"].as_array().unwrap().len(), 2); + + // Check assignment + let assign = result.iter().find(|n| n["type"] == "assignment").unwrap(); + assert_eq!(assign["fields"]["left"]["required"], true); + assert_eq!(assign["fields"]["left"]["multiple"], false); + assert_eq!(assign["fields"]["left"]["types"][0]["type"], "_lhs"); + assert_eq!(assign["fields"]["left"]["types"][0]["named"], true); + + // Check binary.operator — "!=" and "+" should resolve to unnamed + let binary = result.iter().find(|n| n["type"] == "binary").unwrap(); + let op_types = binary["fields"]["operator"]["types"].as_array().unwrap(); + assert_eq!(op_types[0]["type"], "!="); + assert_eq!(op_types[0]["named"], false); + assert_eq!(op_types[1]["type"], "+"); + assert_eq!(op_types[1]["named"], false); + + // Check argument_list has children, not a field + let arg_list = result + .iter() + .find(|n| n["type"] == "argument_list") + .unwrap(); + 
assert!(arg_list.get("children").is_some()); + assert_eq!(arg_list["children"]["multiple"], true); + assert_eq!(arg_list["children"]["required"], false); + + // Check identifier is a leaf + let ident = result.iter().find(|n| n["type"] == "identifier").unwrap(); + assert_eq!(ident["fields"].as_object().unwrap().len(), 0); + + // Check unnamed tokens + let end = result.iter().find(|n| n["type"] == "end").unwrap(); + assert_eq!(end["named"], false); + } + + #[test] + fn test_explicit_unnamed_disambiguation() { + let yaml = r#" +named: + foo: + field: [{unnamed: bar}] + +unnamed: + - bar +"#; + + let json_str = convert(yaml).unwrap(); + let result: Vec = serde_json::from_str(&json_str).unwrap(); + let foo = result.iter().find(|n| n["type"] == "foo").unwrap(); + assert_eq!(foo["fields"]["field"]["types"][0]["named"], false); + } + + #[test] + fn test_field_suffixes() { + let yaml = r#" +named: + test_node: + required_single: foo + optional_single?: foo + required_multiple+: foo + optional_multiple*: foo +"#; + + let json_str = convert(yaml).unwrap(); + let result: Vec = serde_json::from_str(&json_str).unwrap(); + let node = result.iter().find(|n| n["type"] == "test_node").unwrap(); + let fields = node["fields"].as_object().unwrap(); + + assert_eq!(fields["required_single"]["required"], true); + assert_eq!(fields["required_single"]["multiple"], false); + + assert_eq!(fields["optional_single"]["required"], false); + assert_eq!(fields["optional_single"]["multiple"], false); + + assert_eq!(fields["required_multiple"]["required"], true); + assert_eq!(fields["required_multiple"]["multiple"], true); + + assert_eq!(fields["optional_multiple"]["required"], false); + assert_eq!(fields["optional_multiple"]["multiple"], true); + } + + #[test] + fn test_json_to_yaml() { + let json = r#"[ + {"type": "_expression", "named": true, "subtypes": [ + {"type": "assignment", "named": true}, + {"type": "identifier", "named": true} + ]}, + {"type": "assignment", "named": true, "fields": { + 
"left": {"multiple": false, "required": true, "types": [ + {"type": "_expression", "named": true} + ]}, + "right": {"multiple": false, "required": false, "types": [ + {"type": "_expression", "named": true} + ]} + }, "children": { + "multiple": true, "required": false, "types": [ + {"type": "identifier", "named": true} + ] + }}, + {"type": "identifier", "named": true, "fields": {}}, + {"type": "=", "named": false}, + {"type": "end", "named": false} + ]"#; + + let yaml = convert_from_json(json).unwrap(); + + // Verify key structures are present + assert!(yaml.contains("supertypes:")); + assert!(yaml.contains("_expression:")); + assert!(yaml.contains("named:")); + assert!(yaml.contains("assignment:")); + assert!(yaml.contains("left:")); + assert!(yaml.contains("right?:")); + assert!(yaml.contains("$children*:")); + assert!(yaml.contains("identifier:")); + assert!(yaml.contains("unnamed:")); + assert!(yaml.contains("\"=\"")); + assert!(yaml.contains("end")); + } + + #[test] + fn test_round_trip() { + let yaml_input = r#" +supertypes: + _expression: + - assignment + - identifier + +named: + assignment: + left: _expression + right?: _expression + $children*: identifier + identifier: + +unnamed: + - "=" + - end +"#; + + // YAML → JSON → YAML + let json = convert(yaml_input).unwrap(); + let yaml_output = convert_from_json(&json).unwrap(); + // YAML → JSON again (should be identical) + let json2 = convert(&yaml_output).unwrap(); + + let v1: serde_json::Value = serde_json::from_str(&json).unwrap(); + let v2: serde_json::Value = serde_json::from_str(&json2).unwrap(); + assert_eq!(v1, v2); + } +} diff --git a/shared/yeast-schema/src/schema.rs b/shared/yeast-schema/src/schema.rs new file mode 100644 index 000000000000..4acd14377a4d --- /dev/null +++ b/shared/yeast-schema/src/schema.rs @@ -0,0 +1,340 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::{FieldId, KindId, CHILD_FIELD}; + +#[derive(Clone, Debug)] +pub struct NodeType { + pub kind: String, + pub named: 
bool, +} + +/// Multiplicity/optionality of a field declaration. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct FieldCardinality { + /// Whether the field may hold more than one child. + pub multiple: bool, + /// Whether at least one child must be present. + pub required: bool, +} + +/// A schema defining node kinds and field names for the output AST. +/// Built from a node-types.yml file, independent of any tree-sitter grammar. +/// +/// # Memory management +/// +/// `register_field`/`register_kind`/`register_unnamed_kind` (and their +/// `_with_id` siblings) use `Box::leak` to obtain `&'static str` names. This +/// is intentional: the `&'static str` names appear pervasively in `Node`, +/// `AstCursor`, query patterns, and the extractor's TRAP output, where +/// adding a lifetime would propagate widely. +/// +/// The leak is bounded by the number of distinct kind/field names registered. +/// Schemas are expected to be constructed once per process (e.g. at extractor +/// startup) and reused. Repeated construction in long-running processes will +/// leak memory unboundedly and should be avoided. 
+#[derive(Clone)] +pub struct Schema { + field_ids: BTreeMap, + field_names: BTreeMap, + next_field_id: FieldId, + kind_ids: BTreeMap, + unnamed_kind_ids: BTreeMap, + kind_names: BTreeMap, + next_kind_id: KindId, + field_types: BTreeMap<(String, FieldId), Vec>, + field_cardinalities: BTreeMap<(String, FieldId), FieldCardinality>, + supertypes: BTreeMap>, +} + +impl Default for Schema { + fn default() -> Self { + Self::new() + } +} + +impl Schema { + pub fn new() -> Self { + Self { + field_ids: BTreeMap::new(), + field_names: BTreeMap::new(), + next_field_id: 1, // 0 is reserved + kind_ids: BTreeMap::new(), + unnamed_kind_ids: BTreeMap::new(), + kind_names: BTreeMap::new(), + next_kind_id: 1, // 0 is reserved + field_types: BTreeMap::new(), + field_cardinalities: BTreeMap::new(), + supertypes: BTreeMap::new(), + } + } + + /// Register a field name, returning its ID. + /// If already registered, returns the existing ID. + pub fn register_field(&mut self, name: &str) -> FieldId { + if name == "child" { + return CHILD_FIELD; + } + if let Some(&id) = self.field_ids.get(name) { + return id; + } + let id = self.next_field_id; + assert!(id < CHILD_FIELD, "too many fields"); + self.next_field_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.field_ids.insert(name.to_string(), id); + self.field_names.insert(id, leaked); + id + } + + /// Register a field name with a specific ID, e.g. when importing IDs + /// from an external source like a tree-sitter language. If the name is + /// already registered (with any ID), nothing is changed and the + /// existing ID is returned. 
+ pub fn register_field_with_id(&mut self, name: &str, id: FieldId) -> FieldId { + if name == "child" { + return CHILD_FIELD; + } + if let Some(&existing) = self.field_ids.get(name) { + return existing; + } + assert!(id < CHILD_FIELD, "too many fields"); + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.field_ids.insert(name.to_string(), id); + self.field_names.insert(id, leaked); + if id >= self.next_field_id { + self.next_field_id = id + 1; + } + id + } + + /// Register a named node kind name, returning its ID. + /// If already registered, returns the existing ID. + pub fn register_kind(&mut self, name: &str) -> KindId { + if let Some(&id) = self.kind_ids.get(name) { + return id; + } + let id = self.next_kind_id; + self.next_kind_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + id + } + + /// Register a named node kind with a specific ID, e.g. when importing + /// IDs from a tree-sitter language. If the name is already registered, + /// nothing is changed and the existing ID is returned. + pub fn register_kind_with_id(&mut self, name: &str, id: KindId) -> KindId { + if let Some(&existing) = self.kind_ids.get(name) { + return existing; + } + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + if id >= self.next_kind_id { + self.next_kind_id = id + 1; + } + id + } + + /// Register an unnamed token kind (e.g. `"="`, `"end"`), returning its ID. + /// If already registered, returns the existing ID. 
+ pub fn register_unnamed_kind(&mut self, name: &str) -> KindId { + if let Some(&id) = self.unnamed_kind_ids.get(name) { + return id; + } + let id = self.next_kind_id; + self.next_kind_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.unnamed_kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + id + } + + /// Register an unnamed token kind with a specific ID. If the name is + /// already registered as unnamed, nothing is changed and the existing + /// ID is returned. + pub fn register_unnamed_kind_with_id(&mut self, name: &str, id: KindId) -> KindId { + if let Some(&existing) = self.unnamed_kind_ids.get(name) { + return existing; + } + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.unnamed_kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + if id >= self.next_kind_id { + self.next_kind_id = id + 1; + } + id + } + + /// Track a name for a kind ID without registering it as named or + /// unnamed. Useful when importing tree-sitter ID tables that may + /// contain duplicate IDs across the named/unnamed split. + pub fn record_kind_name(&mut self, id: KindId, name: &'static str) { + self.kind_names.entry(id).or_insert(name); + if id >= self.next_kind_id { + self.next_kind_id = id + 1; + } + } + + pub fn field_id_for_name(&self, name: &str) -> Option { + if name == "child" { + return Some(CHILD_FIELD); + } + self.field_ids.get(name).copied() + } + + pub fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> { + if id == CHILD_FIELD { + return Some("child"); + } + self.field_names.get(&id).copied() + } + + pub fn id_for_node_kind(&self, kind: &str) -> Option { + self.kind_ids.get(kind).copied() + } + + pub fn id_for_unnamed_node_kind(&self, kind: &str) -> Option { + self.unnamed_kind_ids.get(kind).copied() + } + + /// Has `kind` been registered as a named kind (concrete node or + /// supertype)? 
+    pub fn has_named_kind(&self, kind: &str) -> bool {
+        self.id_for_node_kind(kind).is_some()
+    }
+
+    /// Has `kind` been registered as an unnamed token kind?
+    pub fn has_unnamed_kind(&self, kind: &str) -> bool {
+        self.id_for_unnamed_node_kind(kind).is_some()
+    }
+
+    /// Is `field_name` declared as a field on `parent_kind`?
+    /// `field_name == None` checks for the implicit unfielded slot
+    /// (`$children`/`CHILD_FIELD`).
+    ///
+    /// A field counts as declared when allowed child types have been stored
+    /// for `(parent_kind, field)` via `set_field_types`. A field name that
+    /// was never registered yields `false`.
+    pub fn has_field(&self, parent_kind: &str, field_name: Option<&str>) -> bool {
+        let field_id = match field_name {
+            Some(name) => match self.field_id_for_name(name) {
+                Some(id) => id,
+                None => return false,
+            },
+            None => CHILD_FIELD,
+        };
+        self.field_types(parent_kind, field_id).is_some()
+    }
+
+    /// Look up the name recorded for a kind ID. Named and unnamed kinds
+    /// share this ID -> name table.
+    pub fn node_kind_for_id(&self, id: KindId) -> Option<&'static str> {
+        self.kind_names.get(&id).copied()
+    }
+
+    /// Store the node types allowed in field `field_id` of `parent_kind`,
+    /// replacing any list stored earlier for the same pair.
+    pub fn set_field_types(
+        &mut self,
+        parent_kind: &str,
+        field_id: FieldId,
+        node_types: Vec<NodeType>,
+    ) {
+        self.field_types
+            .insert((parent_kind.to_string(), field_id), node_types);
+    }
+
+    /// Returns the node types allowed in field `field_id` of `parent_kind`,
+    /// or `None` if nothing was stored for that pair.
+    pub fn field_types(
+        &self,
+        parent_kind: &str,
+        field_id: FieldId,
+    ) -> Option<&Vec<NodeType>> {
+        // The map is keyed by an owned `(String, FieldId)`, so the lookup
+        // builds a temporary key.
+        self.field_types
+            .get(&(parent_kind.to_string(), field_id))
+    }
+
+    /// Store the declared cardinality of field `field_id` on `parent_kind`,
+    /// replacing any value stored earlier for the same pair.
+    pub fn set_field_cardinality(
+        &mut self,
+        parent_kind: &str,
+        field_id: FieldId,
+        cardinality: FieldCardinality,
+    ) {
+        self.field_cardinalities
+            .insert((parent_kind.to_string(), field_id), cardinality);
+    }
+
+    /// Returns the declared cardinality for a field, if known.
+    pub fn field_cardinality(
+        &self,
+        parent_kind: &str,
+        field_id: FieldId,
+    ) -> Option<FieldCardinality> {
+        self.field_cardinalities
+            .get(&(parent_kind.to_string(), field_id))
+            .copied()
+    }
+
+    /// Returns an iterator over all `(field_id, field_name)` pairs that are
+    /// declared as required (`required: true`) for the given `parent_kind`.
+ pub fn required_fields_for_kind<'a>( + &'a self, + parent_kind: &'a str, + ) -> impl Iterator)> + 'a { + self.field_cardinalities + .iter() + .filter(move |((kind, _), card)| kind == parent_kind && card.required) + .map(move |((_, field_id), _)| { + let name = self.field_name_for_id(*field_id); + (*field_id, name) + }) + } + + pub fn set_supertype_members(&mut self, supertype: &str, node_types: Vec) { + self.supertypes.insert(supertype.to_string(), node_types); + } + + /// Returns the declared members of a supertype, if known. + pub fn supertype_members(&self, supertype: &str) -> Option<&Vec> { + self.supertypes.get(supertype) + } + + /// Is `kind` a known supertype (an abstract grouping)? + pub fn is_supertype(&self, kind: &str) -> bool { + self.supertypes.contains_key(kind) + } + + fn allows_node( + &self, + node_type: &NodeType, + node_kind: &str, + node_named: bool, + active: &mut BTreeSet, + ) -> bool { + if node_type.kind == node_kind && node_type.named == node_named { + return true; + } + + if !node_type.named { + return false; + } + + let Some(members) = self.supertypes.get(&node_type.kind) else { + return false; + }; + + if !active.insert(node_type.kind.clone()) { + return false; + } + + let matched = members + .iter() + .any(|member| self.allows_node(member, node_kind, node_named, active)); + active.remove(&node_type.kind); + matched + } + + pub fn node_matches_types( + &self, + node_kind: &str, + node_named: bool, + node_types: &[NodeType], + ) -> bool { + node_types.iter().any(|node_type| { + self.allows_node(node_type, node_kind, node_named, &mut BTreeSet::new()) + }) + } +} diff --git a/shared/yeast/BUILD.bazel b/shared/yeast/BUILD.bazel index fe0b01bb87bd..5217f20ec67d 100644 --- a/shared/yeast/BUILD.bazel +++ b/shared/yeast/BUILD.bazel @@ -14,5 +14,7 @@ rust_library( "//shared/yeast-macros", ], visibility = ["//visibility:public"], - deps = all_crate_deps(), + deps = all_crate_deps() + [ + "//shared/yeast-schema", + ], ) diff --git 
a/shared/yeast/Cargo.toml b/shared/yeast/Cargo.toml index 166887c324cf..518a0d1cefc2 100644 --- a/shared/yeast/Cargo.toml +++ b/shared/yeast/Cargo.toml @@ -10,6 +10,7 @@ serde_json = "1.0.108" serde_yaml = "0.9" tree-sitter = ">= 0.23.0" yeast-macros = { path = "../yeast-macros" } +yeast-schema = { path = "../yeast-schema" } tree-sitter-ruby = "0.23" tree-sitter-python = "0.23" diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index fdfe4dd0fb01..17c54166e863 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -43,8 +43,13 @@ impl From for usize { } /// Field and Kind ids are provided by tree-sitter -type FieldId = u16; -type KindId = u16; +type FieldId = yeast_schema::FieldId; +type KindId = yeast_schema::KindId; + +/// Sentinel field id used to mean "the implicit unfielded slot". +/// Re-exported from `yeast-schema` so the runtime and the schema share a +/// single value. +pub use yeast_schema::CHILD_FIELD; /// Trait for values that can be appended to a field's id list inside a /// `tree!`/`trees!`/`rule!` template (in `{expr}` placeholders). 
@@ -148,8 +153,6 @@ impl YeastSourceRange for &T { } } -pub const CHILD_FIELD: u16 = u16::MAX; - #[derive(Debug)] pub struct AstCursor<'a> { ast: &'a Ast, @@ -295,7 +298,7 @@ impl std::fmt::Debug for Ast { impl Ast { /// Construct an AST from a TS tree pub fn from_tree(language: tree_sitter::Language, tree: &tree_sitter::Tree) -> Self { - let schema = schema::Schema::from_language(&language); + let schema = schema::from_language(&language); Self::from_tree_with_schema(schema, tree, &language) } @@ -1220,7 +1223,7 @@ impl DesugaringConfig { pub fn build_schema(&self, language: &tree_sitter::Language) -> Result { match self.output_node_types_yaml { Some(yaml) => node_types_yaml::schema_from_yaml_with_language(yaml, language), - None => Ok(schema::Schema::from_language(language)), + None => Ok(schema::from_language(language)), } } } @@ -1234,7 +1237,7 @@ pub struct Runner<'a, C = ()> { impl<'a, C> Runner<'a, C> { /// Create a runner using the input grammar's schema for output. pub fn new(language: tree_sitter::Language, phases: &'a [Phase]) -> Self { - let schema = schema::Schema::from_language(&language); + let schema = schema::from_language(&language); Self { language, schema, diff --git a/shared/yeast/src/node_types_yaml.rs b/shared/yeast/src/node_types_yaml.rs index f4d9f2a1c427..7beb4bb25bed 100644 --- a/shared/yeast/src/node_types_yaml.rs +++ b/shared/yeast/src/node_types_yaml.rs @@ -1,767 +1,22 @@ -/// Converts a YAML node-types file to the tree-sitter `node-types.json` format. -/// -/// # YAML format -/// -/// ```yaml -/// supertypes: -/// _expression: -/// - assignment -/// - binary -/// -/// named: -/// assignment: -/// left: _lhs -/// right: _expression -/// identifier: -/// -/// unnamed: -/// - "+" -/// - "end" -/// ``` -/// -/// See the crate-level docs for the full format specification. 
-use std::collections::{BTreeMap, BTreeSet}; -use std::fmt::Write; - -use crate::CHILD_FIELD; -use serde::Deserialize; -use serde_json::json; - -/// Top-level YAML structure. -#[derive(Deserialize, Default)] -struct YamlNodeTypes { - #[serde(default)] - supertypes: BTreeMap>, - #[serde(default)] - named: BTreeMap>>, - #[serde(default)] - unnamed: Vec, -} - -/// A reference to a node type. Can be: -/// - a plain string (resolved by looking up named vs unnamed) -/// - a map `{unnamed: "name"}` to force unnamed interpretation -#[derive(Deserialize, Debug, Clone)] -#[serde(untagged)] -enum TypeRef { - Name(String), - Explicit { unnamed: String }, -} - -/// A field value: either a single type ref or a list of them. -#[derive(Deserialize, Debug, Clone)] -#[serde(untagged)] -enum TypeRefOrList { - Single(TypeRef), - List(Vec), -} - -impl TypeRefOrList { - fn into_vec(self) -> Vec { - match self { - TypeRefOrList::Single(t) => vec![t], - TypeRefOrList::List(v) => v, - } - } -} - -/// Parsed field name: base name + multiplicity markers. -struct FieldSpec { - name: Option, // None for $children - multiple: bool, - required: bool, -} - -fn parse_field_name(raw: &str) -> FieldSpec { - let is_children = - raw == "$children" || raw == "$children?" || raw == "$children*" || raw == "$children+"; - - let suffix = raw.chars().last().filter(|c| matches!(c, '?' | '*' | '+')); - - let (multiple, required) = match suffix { - Some('?') => (false, false), - Some('*') => (true, false), - Some('+') => (true, true), - _ => (false, true), // bare field name = required, single - }; - - let name = if is_children { - None - } else { - let base = raw.trim_end_matches(['?', '*', '+']); - Some(base.to_string()) - }; - - FieldSpec { - name, - multiple, - required, - } -} - -/// Resolve a TypeRef to a (type, named) pair, given the sets of known named -/// and unnamed types. 
-fn resolve_type_ref_pair( - type_ref: &TypeRef, - named_types: &BTreeSet, - unnamed_types: &BTreeSet, -) -> (String, bool) { - match type_ref { - TypeRef::Explicit { unnamed } => (unnamed.clone(), false), - TypeRef::Name(name) => { - let is_named = named_types.contains(name); - let is_unnamed = unnamed_types.contains(name); - if is_named && is_unnamed { - (name.clone(), true) - } else if is_unnamed { - (name.clone(), false) - } else { - (name.clone(), true) - } - } - } -} - -/// Resolve a TypeRef to a {type, named} JSON record, given the sets of known named -/// and unnamed types. -fn resolve_type_ref( - type_ref: &TypeRef, - named_types: &BTreeSet, - unnamed_types: &BTreeSet, -) -> serde_json::Value { - let (kind, named) = resolve_type_ref_pair(type_ref, named_types, unnamed_types); - json!({"type": kind, "named": named}) -} - -/// Convert YAML string to node-types JSON string. -pub fn convert(yaml_input: &str) -> Result { - let yaml: YamlNodeTypes = - serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; - - // Build the sets of known named and unnamed types for resolution. - let mut named_types = BTreeSet::new(); - for name in yaml.supertypes.keys() { - named_types.insert(name.clone()); - } - for name in yaml.named.keys() { - named_types.insert(name.clone()); - } - let unnamed_types: BTreeSet = yaml.unnamed.iter().cloned().collect(); - - let mut output = Vec::new(); - - // 1. Supertypes - for (name, members) in &yaml.supertypes { - let subtypes: Vec<_> = members - .iter() - .map(|m| resolve_type_ref(m, &named_types, &unnamed_types)) - .collect(); - output.push(json!({ - "type": name, - "named": true, - "subtypes": subtypes, - })); - } - - // 2. 
Named nodes - for (name, fields_opt) in &yaml.named { - let fields_map = match fields_opt { - None => { - // Leaf token: no fields, no children, no subtypes - output.push(json!({ - "type": name, - "named": true, - "fields": {}, - })); - continue; - } - Some(m) if m.is_empty() => { - output.push(json!({ - "type": name, - "named": true, - "fields": {}, - })); - continue; - } - Some(m) => m, - }; - - let mut json_fields = serde_json::Map::new(); - let mut json_children: Option = None; - - for (raw_field_name, type_refs) in fields_map { - let spec = parse_field_name(raw_field_name); - let types: Vec<_> = type_refs - .clone() - .into_vec() - .iter() - .map(|t| resolve_type_ref(t, &named_types, &unnamed_types)) - .collect(); - - // Cloning to make the borrow checker happy - let field_info = json!({ - "multiple": spec.multiple, - "required": spec.required, - "types": types, - }); - - if spec.name.is_none() { - // $children - json_children = Some(field_info); - } else { - json_fields.insert(spec.name.unwrap(), field_info); - } - } - - let mut entry = json!({ - "type": name, - "named": true, - "fields": json_fields, - }); - - if let Some(children) = json_children { - entry - .as_object_mut() - .unwrap() - .insert("children".to_string(), children); - } - - output.push(entry); - } - - // 3. Unnamed tokens - for name in &yaml.unnamed { - output.push(json!({ - "type": name, - "named": false, - })); - } - - serde_json::to_string_pretty(&output).map_err(|e| format!("Failed to serialize JSON: {e}")) -} - -/// Apply YAML node-type definitions to a mutable Schema. -/// Registers all types, fields, and allowed types from the YAML into the schema. 
-fn apply_yaml_to_schema(yaml: &YamlNodeTypes, schema: &mut crate::schema::Schema) { - // Register all supertypes as node kinds - for name in yaml.supertypes.keys() { - schema.register_kind(name); - } - - // Register named node kinds and their fields - for (name, fields_opt) in &yaml.named { - schema.register_kind(name); - if let Some(fields) = fields_opt { - for raw_field_name in fields.keys() { - let spec = parse_field_name(raw_field_name); - if let Some(field_name) = &spec.name { - schema.register_field(field_name); - } - } - } - } - - // Register unnamed tokens as node kinds - for name in &yaml.unnamed { - schema.register_unnamed_kind(name); - } - - let mut named_types = BTreeSet::new(); - for name in yaml.supertypes.keys() { - named_types.insert(name.clone()); - } - for name in yaml.named.keys() { - named_types.insert(name.clone()); - } - let unnamed_types: BTreeSet = yaml.unnamed.iter().cloned().collect(); - - for (supertype, members) in &yaml.supertypes { - let node_types = members - .iter() - .map(|m| { - let (kind, named) = resolve_type_ref_pair(m, &named_types, &unnamed_types); - crate::schema::NodeType { kind, named } - }) - .collect(); - schema.set_supertype_members(supertype, node_types); - } - - // Register allowed field child types for type checking. 
- for (parent_kind, fields_opt) in &yaml.named { - let Some(fields) = fields_opt else { - continue; - }; - - for (raw_field_name, type_refs) in fields { - let spec = parse_field_name(raw_field_name); - let field_id = match &spec.name { - Some(name) => schema.register_field(name), - None => CHILD_FIELD, - }; - - let mut node_types = type_refs - .clone() - .into_vec() - .into_iter() - .map(|type_ref| { - let (kind, named) = - resolve_type_ref_pair(&type_ref, &named_types, &unnamed_types); - crate::schema::NodeType { kind, named } - }) - .collect::>(); - node_types.sort_by(|a, b| a.kind.cmp(&b.kind).then(a.named.cmp(&b.named))); - node_types.dedup_by(|a, b| a.kind == b.kind && a.named == b.named); - schema.set_field_types(parent_kind, field_id, node_types); - schema.set_field_cardinality( - parent_kind, - field_id, - crate::schema::FieldCardinality { - multiple: spec.multiple, - required: spec.required, - }, - ); - } - } -} - -pub fn schema_from_yaml(yaml_input: &str) -> Result { - let yaml: YamlNodeTypes = - serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; - - let mut schema = crate::schema::Schema::new(); - apply_yaml_to_schema(&yaml, &mut schema); - - Ok(schema) -} - -/// Build a Schema from a YAML string, extending a tree-sitter Language. -/// The Schema inherits all field/kind names from the Language, plus any -/// additional ones defined in the YAML. +//! YAML/JSON node-types loaders for YEAST. +//! +//! The pure YAML/JSON conversion routines live in [`yeast_schema::node_types_yaml`]. +//! This module re-exports them and adds the tree-sitter-aware adapter +//! [`schema_from_yaml_with_language`]. + +pub use yeast_schema::node_types_yaml::{ + convert, convert_from_json, extend_schema_from_yaml, schema_from_yaml, +}; + +/// Build a Schema from a YAML string, layered on top of a tree-sitter +/// `Language`. 
The Schema inherits all field/kind names from the language +/// (preserving the language's IDs), plus any additional ones defined in +/// the YAML. pub fn schema_from_yaml_with_language( yaml_input: &str, language: &tree_sitter::Language, ) -> Result { - let yaml: YamlNodeTypes = - serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; - - let mut schema = crate::schema::Schema::from_language(language); - apply_yaml_to_schema(&yaml, &mut schema); - + let mut schema = crate::schema::from_language(language); + extend_schema_from_yaml(&mut schema, yaml_input)?; Ok(schema) } - -// --------------------------------------------------------------------------- -// JSON → YAML conversion -// --------------------------------------------------------------------------- - -/// JSON node-types structures (mirrors tree-sitter's format). -#[derive(Deserialize)] -struct JsonNodeInfo { - #[serde(rename = "type")] - kind: String, - named: bool, - #[serde(default)] - fields: BTreeMap, - children: Option, - #[serde(default)] - subtypes: Vec, -} - -#[derive(Deserialize)] -struct JsonNodeType { - #[serde(rename = "type")] - kind: String, - named: bool, -} - -#[derive(Deserialize)] -struct JsonFieldInfo { - multiple: bool, - required: bool, - types: Vec, -} - -/// Convert a tree-sitter node-types.json string to the YAML format. -pub fn convert_from_json(json_input: &str) -> Result { - let nodes: Vec = - serde_json::from_str(json_input).map_err(|e| format!("Failed to parse JSON: {e}"))?; - - // Collect all named and unnamed types for disambiguation decisions. 
- let mut all_named: BTreeSet = BTreeSet::new(); - let mut all_unnamed: BTreeSet = BTreeSet::new(); - for node in &nodes { - if node.named { - all_named.insert(node.kind.clone()); - } else { - all_unnamed.insert(node.kind.clone()); - } - } - - let mut supertypes: BTreeMap> = BTreeMap::new(); - let mut named: BTreeMap>> = BTreeMap::new(); - let mut unnamed: Vec = Vec::new(); - - for node in nodes { - if !node.named { - unnamed.push(node.kind); - continue; - } - - if !node.subtypes.is_empty() { - supertypes.insert(node.kind, node.subtypes); - continue; - } - - if node.fields.is_empty() && node.children.is_none() { - // Leaf token - named.insert(node.kind, None); - } else { - let mut fields = BTreeMap::new(); - for (name, info) in node.fields { - fields.insert(name, info); - } - if let Some(children) = node.children { - fields.insert("$children".to_string(), children); - } - named.insert(node.kind, Some(fields)); - } - } - - // Now emit YAML - let mut out = String::new(); - - // Supertypes - if !supertypes.is_empty() { - writeln!(out, "supertypes:").unwrap(); - for (name, members) in &supertypes { - writeln!(out, " {name}:").unwrap(); - for member in members { - let ref_str = format_type_ref(&member.kind, member.named, &all_named, &all_unnamed); - writeln!(out, " - {ref_str}").unwrap(); - } - } - writeln!(out).unwrap(); - } - - // Named - if !named.is_empty() { - writeln!(out, "named:").unwrap(); - for (name, fields_opt) in &named { - match fields_opt { - None => { - writeln!(out, " {name}:").unwrap(); - } - Some(fields) => { - writeln!(out, " {name}:").unwrap(); - for (field_name, info) in fields { - let suffix = field_suffix(info.multiple, info.required); - let yaml_name = if field_name == "$children" { - format!("$children{suffix}") - } else { - format!("{field_name}{suffix}") - }; - - let type_refs: Vec = info - .types - .iter() - .map(|t| format_type_ref(&t.kind, t.named, &all_named, &all_unnamed)) - .collect(); - - if type_refs.len() == 1 { - writeln!(out, " 
{yaml_name}: {}", type_refs[0]).unwrap(); - } else { - let list = type_refs - .iter() - .map(|s| s.as_str()) - .collect::>() - .join(", "); - writeln!(out, " {yaml_name}: [{list}]").unwrap(); - } - } - } - } - } - writeln!(out).unwrap(); - } - - // Unnamed - if !unnamed.is_empty() { - writeln!(out, "unnamed:").unwrap(); - for name in &unnamed { - writeln!(out, " - {}", force_quote(name)).unwrap(); - } - } - - Ok(out) -} - -fn field_suffix(multiple: bool, required: bool) -> &'static str { - match (multiple, required) { - (false, true) => "", - (false, false) => "?", - (true, true) => "+", - (true, false) => "*", - } -} - -/// Format a type reference for YAML output. Uses the disambiguation rule: -/// plain string if unambiguous, `{unnamed: name}` if the name exists as both -/// named and unnamed and we need the unnamed interpretation. -fn format_type_ref( - kind: &str, - named: bool, - all_named: &BTreeSet, - _all_unnamed: &BTreeSet, -) -> String { - if named { - quote_yaml(kind) - } else { - let is_also_named = all_named.contains(kind); - if is_also_named { - format!("{{unnamed: {}}}", force_quote(kind)) - } else { - force_quote(kind) - } - } -} - -/// Always wrap in double quotes. Used for unnamed node references so they're -/// visually distinct from named ones — YAML treats both forms as equivalent strings. -fn force_quote(s: &str) -> String { - format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")) -} - -/// Quote a YAML string value if it contains special characters or could be -/// misinterpreted. -fn quote_yaml(s: &str) -> String { - let needs_quoting = s.is_empty() - || s.contains(|c: char| { - matches!( - c, - ':' | '{' - | '}' - | '[' - | ']' - | ',' - | '&' - | '*' - | '#' - | '?' - | '|' - | '-' - | '<' - | '>' - | '=' - | '!' 
- | '%' - | '@' - | '`' - | '"' - | '\'' - ) - }) - || s.starts_with(' ') - || s.ends_with(' ') - || s == "true" - || s == "false" - || s == "null" - || s == "yes" - || s == "no" - || s.parse::().is_ok(); - - if needs_quoting { - format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")) - } else { - s.to_string() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_basic_conversion() { - let yaml = r#" -supertypes: - _expression: - - assignment - - binary - -named: - assignment: - left: _lhs - right: _expression - binary: - left: [_expression, _simple_numeric] - operator: ["!=", "+"] - right: _expression - argument_list: - $children*: [_expression, block_argument] - identifier: - -unnamed: - - "!=" - - "+" - - "end" -"#; - - let json_str = convert(yaml).unwrap(); - let result: Vec = serde_json::from_str(&json_str).unwrap(); - - // Check supertype - let expr = &result[0]; - assert_eq!(expr["type"], "_expression"); - assert_eq!(expr["named"], true); - assert_eq!(expr["subtypes"].as_array().unwrap().len(), 2); - - // Check assignment - let assign = result.iter().find(|n| n["type"] == "assignment").unwrap(); - assert_eq!(assign["fields"]["left"]["required"], true); - assert_eq!(assign["fields"]["left"]["multiple"], false); - assert_eq!(assign["fields"]["left"]["types"][0]["type"], "_lhs"); - assert_eq!(assign["fields"]["left"]["types"][0]["named"], true); - - // Check binary.operator — "!=" and "+" should resolve to unnamed - let binary = result.iter().find(|n| n["type"] == "binary").unwrap(); - let op_types = binary["fields"]["operator"]["types"].as_array().unwrap(); - assert_eq!(op_types[0]["type"], "!="); - assert_eq!(op_types[0]["named"], false); - assert_eq!(op_types[1]["type"], "+"); - assert_eq!(op_types[1]["named"], false); - - // Check argument_list has children, not a field - let arg_list = result - .iter() - .find(|n| n["type"] == "argument_list") - .unwrap(); - assert!(arg_list.get("children").is_some()); - 
assert_eq!(arg_list["children"]["multiple"], true); - assert_eq!(arg_list["children"]["required"], false); - - // Check identifier is a leaf - let ident = result.iter().find(|n| n["type"] == "identifier").unwrap(); - assert_eq!(ident["fields"].as_object().unwrap().len(), 0); - - // Check unnamed tokens - let end = result.iter().find(|n| n["type"] == "end").unwrap(); - assert_eq!(end["named"], false); - } - - #[test] - fn test_explicit_unnamed_disambiguation() { - let yaml = r#" -named: - foo: - field: [{unnamed: bar}] - -unnamed: - - bar -"#; - - let json_str = convert(yaml).unwrap(); - let result: Vec = serde_json::from_str(&json_str).unwrap(); - let foo = result.iter().find(|n| n["type"] == "foo").unwrap(); - assert_eq!(foo["fields"]["field"]["types"][0]["named"], false); - } - - #[test] - fn test_field_suffixes() { - let yaml = r#" -named: - test_node: - required_single: foo - optional_single?: foo - required_multiple+: foo - optional_multiple*: foo -"#; - - let json_str = convert(yaml).unwrap(); - let result: Vec = serde_json::from_str(&json_str).unwrap(); - let node = result.iter().find(|n| n["type"] == "test_node").unwrap(); - let fields = node["fields"].as_object().unwrap(); - - assert_eq!(fields["required_single"]["required"], true); - assert_eq!(fields["required_single"]["multiple"], false); - - assert_eq!(fields["optional_single"]["required"], false); - assert_eq!(fields["optional_single"]["multiple"], false); - - assert_eq!(fields["required_multiple"]["required"], true); - assert_eq!(fields["required_multiple"]["multiple"], true); - - assert_eq!(fields["optional_multiple"]["required"], false); - assert_eq!(fields["optional_multiple"]["multiple"], true); - } - - #[test] - fn test_json_to_yaml() { - let json = r#"[ - {"type": "_expression", "named": true, "subtypes": [ - {"type": "assignment", "named": true}, - {"type": "identifier", "named": true} - ]}, - {"type": "assignment", "named": true, "fields": { - "left": {"multiple": false, "required": true, 
"types": [ - {"type": "_expression", "named": true} - ]}, - "right": {"multiple": false, "required": false, "types": [ - {"type": "_expression", "named": true} - ]} - }, "children": { - "multiple": true, "required": false, "types": [ - {"type": "identifier", "named": true} - ] - }}, - {"type": "identifier", "named": true, "fields": {}}, - {"type": "=", "named": false}, - {"type": "end", "named": false} - ]"#; - - let yaml = convert_from_json(json).unwrap(); - - // Verify key structures are present - assert!(yaml.contains("supertypes:")); - assert!(yaml.contains("_expression:")); - assert!(yaml.contains("named:")); - assert!(yaml.contains("assignment:")); - assert!(yaml.contains("left:")); - assert!(yaml.contains("right?:")); - assert!(yaml.contains("$children*:")); - assert!(yaml.contains("identifier:")); - assert!(yaml.contains("unnamed:")); - assert!(yaml.contains("\"=\"")); - assert!(yaml.contains("end")); - } - - #[test] - fn test_round_trip() { - let yaml_input = r#" -supertypes: - _expression: - - assignment - - identifier - -named: - assignment: - left: _expression - right?: _expression - $children*: identifier - identifier: - -unnamed: - - "=" - - end -"#; - - // YAML → JSON → YAML - let json = convert(yaml_input).unwrap(); - let yaml_output = convert_from_json(&json).unwrap(); - // YAML → JSON again (should be identical) - let json2 = convert(&yaml_output).unwrap(); - - let v1: serde_json::Value = serde_json::from_str(&json).unwrap(); - let v2: serde_json::Value = serde_json::from_str(&json2).unwrap(); - assert_eq!(v1, v2); - } -} diff --git a/shared/yeast/src/schema.rs b/shared/yeast/src/schema.rs index da13bb8b6b70..daa8ad98eb5b 100644 --- a/shared/yeast/src/schema.rs +++ b/shared/yeast/src/schema.rs @@ -1,285 +1,54 @@ -use std::collections::{BTreeMap, BTreeSet}; - -use crate::{FieldId, KindId, CHILD_FIELD}; - -#[derive(Clone, Debug)] -pub struct NodeType { - pub kind: String, - pub named: bool, -} - -/// Multiplicity/optionality of a field declaration. 
-#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub struct FieldCardinality { - /// Whether the field may hold more than one child. - pub multiple: bool, - /// Whether at least one child must be present. - pub required: bool, -} - -/// A schema defining node kinds and field names for the output AST. -/// Built from a node-types.yml file, independent of any tree-sitter grammar. -/// -/// # Memory management -/// -/// `register_field`/`register_kind`/`register_unnamed_kind` use `Box::leak` -/// to obtain `&'static str` names. This is intentional: the `&'static str` -/// names appear pervasively in `Node`, `AstCursor`, query patterns, and the -/// extractor's TRAP output, where adding a lifetime would propagate widely. -/// -/// The leak is bounded by the number of distinct kind/field names registered. -/// Schemas are expected to be constructed once per process (e.g. at extractor -/// startup) and reused. Repeated construction in long-running processes will -/// leak memory unboundedly and should be avoided. -#[derive(Clone)] -pub struct Schema { - field_ids: BTreeMap, - field_names: BTreeMap, - next_field_id: FieldId, - kind_ids: BTreeMap, - unnamed_kind_ids: BTreeMap, - kind_names: BTreeMap, - next_kind_id: KindId, - field_types: BTreeMap<(String, FieldId), Vec>, - field_cardinalities: BTreeMap<(String, FieldId), FieldCardinality>, - supertypes: BTreeMap>, -} - -impl Default for Schema { - fn default() -> Self { - Self::new() - } -} - -impl Schema { - pub fn new() -> Self { - Self { - field_ids: BTreeMap::new(), - field_names: BTreeMap::new(), - next_field_id: 1, // 0 is reserved - kind_ids: BTreeMap::new(), - unnamed_kind_ids: BTreeMap::new(), - kind_names: BTreeMap::new(), - next_kind_id: 1, // 0 is reserved - field_types: BTreeMap::new(), - field_cardinalities: BTreeMap::new(), - supertypes: BTreeMap::new(), - } - } - - /// Create a schema from a tree-sitter language, importing all its - /// known field and kind names. 
- pub fn from_language(language: &tree_sitter::Language) -> Self { - let mut schema = Self::new(); - // Import all field names, preserving tree-sitter's IDs - for id in 1..=language.field_count() as u16 { - if let Some(name) = language.field_name_for_id(id) { - schema.field_ids.insert(name.to_string(), id); - schema.field_names.insert(id, name); - if id >= schema.next_field_id { - schema.next_field_id = id + 1; - } +//! YEAST schema types. +//! +//! The schema struct itself lives in the [`yeast_schema`] crate (so it can +//! be shared with the `yeast-macros` proc-macro crate without dragging +//! tree-sitter into proc-macro compiles). This module re-exports its +//! public API and supplies the one tree-sitter-aware adapter the runtime +//! needs: [`from_language`]. + +pub use yeast_schema::schema::{FieldCardinality, NodeType, Schema}; + +/// Build a [`Schema`] from a tree-sitter language, importing all its +/// known field and kind names so the resulting schema's IDs line up with +/// the language's own IDs (i.e. `field_name_for_id` agrees). +pub fn from_language(language: &tree_sitter::Language) -> Schema { + let mut schema = Schema::new(); + + // Import all field names, preserving tree-sitter's IDs. + for id in 1..=language.field_count() as u16 { + if let Some(name) = language.field_name_for_id(id) { + schema.register_field_with_id(name, id); + } + } + + // Import all node kind names, preserving tree-sitter's IDs. + // Track named and unnamed variants separately. For both, prefer the + // canonical ID returned by `id_for_node_kind`, since some languages + // have multiple IDs for the same name (e.g. the reserved error token + // at ID 0 may share a name with a real token). + for id in 0..language.node_kind_count() as u16 { + if let Some(name) = language.node_kind_for_id(id) { + if name.is_empty() { + continue; } - } - // Import all node kind names, preserving tree-sitter's IDs. - // Track named and unnamed variants separately. 
For both named and - // unnamed kinds, use the canonical ID from id_for_node_kind, since - // some languages have multiple IDs for the same name (e.g., the - // reserved error token at ID 0 may share a name with a real token). - for id in 0..language.node_kind_count() as u16 { - if let Some(name) = language.node_kind_for_id(id) { - if !name.is_empty() { - let is_named = language.node_kind_is_named(id); - if is_named { - let canonical_id = language.id_for_node_kind(name, true); - if canonical_id != 0 && !schema.kind_ids.contains_key(name) { - schema.kind_ids.insert(name.to_string(), canonical_id); - schema.kind_names.insert(canonical_id, name); - } - } else { - let canonical_id = language.id_for_node_kind(name, false); - if canonical_id != 0 && !schema.unnamed_kind_ids.contains_key(name) { - schema - .unnamed_kind_ids - .insert(name.to_string(), canonical_id); - schema.kind_names.insert(canonical_id, name); - } - } - // Always track the name for any ID we encounter - schema.kind_names.entry(id).or_insert(name); - if id >= schema.next_kind_id { - schema.next_kind_id = id + 1; - } + let is_named = language.node_kind_is_named(id); + if is_named { + let canonical_id = language.id_for_node_kind(name, true); + if canonical_id != 0 && schema.id_for_node_kind(name).is_none() { + schema.register_kind_with_id(name, canonical_id); + } + } else { + let canonical_id = language.id_for_node_kind(name, false); + if canonical_id != 0 && schema.id_for_unnamed_node_kind(name).is_none() { + schema.register_unnamed_kind_with_id(name, canonical_id); } } + // Always track the name for any ID we encounter (so + // `node_kind_for_id` works for the literal `id` we saw, even + // when it isn't the canonical one). + schema.record_kind_name(id, name); } - schema - } - - /// Register a field name, returning its ID. - /// If already registered, returns the existing ID. 
- pub fn register_field(&mut self, name: &str) -> FieldId { - if name == "child" { - return CHILD_FIELD; - } - if let Some(&id) = self.field_ids.get(name) { - return id; - } - let id = self.next_field_id; - assert!(id < CHILD_FIELD, "too many fields"); - self.next_field_id += 1; - let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); - self.field_ids.insert(name.to_string(), id); - self.field_names.insert(id, leaked); - id - } - - /// Register a named node kind name, returning its ID. - /// If already registered, returns the existing ID. - pub fn register_kind(&mut self, name: &str) -> KindId { - if let Some(&id) = self.kind_ids.get(name) { - return id; - } - let id = self.next_kind_id; - self.next_kind_id += 1; - let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); - self.kind_ids.insert(name.to_string(), id); - self.kind_names.insert(id, leaked); - id } - /// Register an unnamed token kind (e.g. `"="`, `"end"`), returning its ID. - /// If already registered, returns the existing ID. 
- pub fn register_unnamed_kind(&mut self, name: &str) -> KindId { - if let Some(&id) = self.unnamed_kind_ids.get(name) { - return id; - } - let id = self.next_kind_id; - self.next_kind_id += 1; - let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); - self.unnamed_kind_ids.insert(name.to_string(), id); - self.kind_names.insert(id, leaked); - id - } - - pub fn field_id_for_name(&self, name: &str) -> Option { - if name == "child" { - return Some(CHILD_FIELD); - } - self.field_ids.get(name).copied() - } - - pub fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> { - if id == CHILD_FIELD { - return Some("child"); - } - self.field_names.get(&id).copied() - } - - pub fn id_for_node_kind(&self, kind: &str) -> Option { - self.kind_ids.get(kind).copied() - } - - pub fn id_for_unnamed_node_kind(&self, kind: &str) -> Option { - self.unnamed_kind_ids.get(kind).copied() - } - - pub fn node_kind_for_id(&self, id: KindId) -> Option<&'static str> { - self.kind_names.get(&id).copied() - } - - pub fn set_field_types( - &mut self, - parent_kind: &str, - field_id: FieldId, - node_types: Vec, - ) { - self.field_types - .insert((parent_kind.to_string(), field_id), node_types); - } - - pub fn field_types(&self, parent_kind: &str, field_id: FieldId) -> Option<&Vec> { - self.field_types.get(&(parent_kind.to_string(), field_id)) - } - - pub fn set_field_cardinality( - &mut self, - parent_kind: &str, - field_id: FieldId, - cardinality: FieldCardinality, - ) { - self.field_cardinalities - .insert((parent_kind.to_string(), field_id), cardinality); - } - - /// Returns the declared cardinality for a field, if known. - pub fn field_cardinality( - &self, - parent_kind: &str, - field_id: FieldId, - ) -> Option { - self.field_cardinalities - .get(&(parent_kind.to_string(), field_id)) - .copied() - } - - /// Returns an iterator over all `(field_id, field_name)` pairs that are - /// declared as required (`required: true`) for the given `parent_kind`. 
- pub fn required_fields_for_kind<'a>( - &'a self, - parent_kind: &'a str, - ) -> impl Iterator)> + 'a { - self.field_cardinalities - .iter() - .filter(move |((kind, _), card)| kind == parent_kind && card.required) - .map(move |((_, field_id), _)| { - let name = self.field_name_for_id(*field_id); - (*field_id, name) - }) - } - - pub fn set_supertype_members(&mut self, supertype: &str, node_types: Vec) { - self.supertypes.insert(supertype.to_string(), node_types); - } - - fn allows_node( - &self, - node_type: &NodeType, - node_kind: &str, - node_named: bool, - active: &mut BTreeSet, - ) -> bool { - if node_type.kind == node_kind && node_type.named == node_named { - return true; - } - - if !node_type.named { - return false; - } - - let Some(members) = self.supertypes.get(&node_type.kind) else { - return false; - }; - - if !active.insert(node_type.kind.clone()) { - return false; - } - - let matched = members - .iter() - .any(|member| self.allows_node(member, node_kind, node_named, active)); - active.remove(&node_type.kind); - matched - } - - pub fn node_matches_types( - &self, - node_kind: &str, - node_named: bool, - node_types: &[NodeType], - ) -> bool { - node_types.iter().any(|node_type| { - self.allows_node(node_type, node_kind, node_named, &mut BTreeSet::new()) - }) - } + schema } From 74ad78f17b4e7c49cc11587ab6bfa1896f81151f Mon Sep 17 00:00:00 2001 From: Taus Date: Fri, 19 Jun 2026 15:31:02 +0000 Subject: [PATCH 2/5] yeast: add `rules!` macro This macro allows the easy addition of multiple rules at the same time. In addition, it also accepts an input and output schema, which eventually will be used to check the validity of the rewrite rules. 
--- shared/yeast-macros/src/lib.rs | 40 +++ shared/yeast-macros/src/parse.rs | 243 +++++++++++++++++++ shared/yeast/doc/yeast.md | 41 ++++ shared/yeast/src/lib.rs | 2 +- shared/yeast/tests/input-types.yml | 40 +++ shared/yeast/tests/test.rs | 120 +++++++++ unified/extractor/tests/rules_macro_smoke.rs | 25 ++ 7 files changed, 510 insertions(+), 1 deletion(-) create mode 100644 shared/yeast/tests/input-types.yml create mode 100644 unified/extractor/tests/rules_macro_smoke.rs diff --git a/shared/yeast-macros/src/lib.rs b/shared/yeast-macros/src/lib.rs index 7db97f9fb709..0df96c91d26b 100644 --- a/shared/yeast-macros/src/lib.rs +++ b/shared/yeast-macros/src/lib.rs @@ -113,3 +113,43 @@ pub fn rule(input: TokenStream) -> TokenStream { Err(err) => err.to_compile_error().into(), } } + +/// Bundle a list of YEAST rewrite rules with input/output node-types +/// schema paths. Returns a `Vec`; substitutable for +/// `vec![rule!(...), ...]`. +/// +/// Each comma-separated item in the bracketed list may be: +/// +/// 1. A **bare rule body** `(query) => (template)` — the `rule!(...)` +/// wrapper is implicit. +/// 2. An explicit `rule!(...)` invocation, possibly chained as +/// `rule!(...).repeated()` or path-prefixed as `yeast::rule!(...)`. +/// 3. Any other expression returning a `Rule` (helper-function calls, +/// conditionals). +/// +/// ```ignore +/// let translation_rules: Vec = yeast::rules! { +/// input: "tree-sitter-swift/node-types.yml", +/// output: "ast_types.yml", +/// [ +/// (source_file (_)* @cs) => (top_level body: {..cs}), +/// (simple_identifier) @id => (name_expr identifier: (identifier #{id})), +/// rule!((integer_literal) @lit => (int_literal #{lit})).repeated(), +/// helper_fn(), +/// ] +/// }; +/// ``` +/// +/// Paths are resolved relative to the consuming crate's `CARGO_MANIFEST_DIR` +/// (unlike `include_str!`, which resolves relative to the source file). 
The +/// resolved paths are also emitted as `include_str!` references so the +/// consuming crate gets invalidated when a schema YAML changes, prepping +/// the ground for compile-time type-checking against those schemas. +#[proc_macro] +pub fn rules(input: TokenStream) -> TokenStream { + let input2: TokenStream2 = input.into(); + match parse::parse_rules_top(input2) { + Ok(output) => output.into(), + Err(err) => err.to_compile_error().into(), + } +} diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs index 2ab6236fdac9..01070c74bbce 100644 --- a/shared/yeast-macros/src/parse.rs +++ b/shared/yeast-macros/src/parse.rs @@ -897,6 +897,219 @@ fn expect_repetition(tokens: &mut Tokens) -> Result { } } +// --------------------------------------------------------------------------- +// rules! parsing — bundle a list of rules with input/output schema paths. +// +// The macro accepts both bare rule bodies (`(query) => (template)`) and +// explicit `rule!(...)` invocations. The schema paths are recorded but +// not yet consumed; a later change layers compile-time type-checking on +// top, using these paths to load the input/output schemas. +// --------------------------------------------------------------------------- + +/// Parse `rules! { input: "path", output: "path", [ items, ... ] }`. +/// +/// Each item in the bracketed list can be: +/// * a **bare rule body** `(query) => (template)` — wrapped implicitly +/// in `yeast::rule! { ... }` for codegen; +/// * an explicit `rule!(...)` (or `rule!(...).repeated()`, +/// `yeast::rule!(...)`, etc.) — passed through verbatim; +/// * any other expression returning a `Rule` (helper-function calls, +/// conditionals) — passed through verbatim. +/// +/// Returns a `Vec` containing the items in order. 
The expansion +/// also emits `include_str!` references to the resolved schema paths so +/// Cargo treats them as inputs to the consuming crate; this validates +/// path existence at compile time and prepares the ground for later +/// schema-aware checks. +pub fn parse_rules_top(input: TokenStream) -> Result { + let mut tokens = input.into_iter().peekable(); + + let input_path = parse_named_string_arg(&mut tokens, "input")?; + expect_punct(&mut tokens, ',', "expected `,` after input path")?; + let output_path = parse_named_string_arg(&mut tokens, "output")?; + expect_punct(&mut tokens, ',', "expected `,` after output path")?; + + // Resolve paths relative to the consuming crate's CARGO_MANIFEST_DIR + // so callers can write paths like "tree-sitter-swift/node-types.yml" + // alongside their other workspace-relative includes (e.g. include_str!). + let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").map_err(|_| { + syn::Error::new( + Span::call_site(), + "rules!: CARGO_MANIFEST_DIR is not set; cannot resolve schema paths", + ) + })?; + let resolve_path = |raw: &str| -> std::path::PathBuf { + let p = std::path::PathBuf::from(raw); + if p.is_absolute() { + p + } else { + std::path::PathBuf::from(&manifest_dir).join(p) + } + }; + let input_abs = resolve_path(&input_path.value); + let output_abs = resolve_path(&output_path.value); + + let list = expect_group(&mut tokens, Delimiter::Bracket)?; + if let Some(tok) = tokens.next() { + return Err(syn::Error::new_spanned( + tok, + "unexpected token after `rules!` list", + )); + } + + let items = split_top_level_commas(list.stream()); + let emitted_items: Vec = items + .into_iter() + .map(|item| { + // Bare rule body — wrap in `yeast::rule! { ... }` so the + // existing rule-construction macro handles codegen. Other + // items pass through unchanged. + if has_top_level_arrow(&item) { + quote! { yeast::rule! 
{ #item } } + } else { + item + } + }) + .collect(); + + // Emit `include_str!` references to both schema files so Cargo + // treats them as inputs to the consuming crate's compilation. The + // `const _` bindings are unused; rustc/LLVM drop them after the + // file-input dependency edge is recorded. Absolute paths are used + // because `include_str!` resolves relative paths against the source + // file, while `rules!`'s own paths are relative to + // `CARGO_MANIFEST_DIR`. + let input_abs_str = input_abs.to_string_lossy().into_owned(); + let output_abs_str = output_abs.to_string_lossy().into_owned(); + let input_lit = proc_macro2::Literal::string(&input_abs_str); + let output_lit = proc_macro2::Literal::string(&output_abs_str); + + Ok(quote! { + { + const _: &::core::primitive::str = ::core::include_str!(#input_lit); + const _: &::core::primitive::str = ::core::include_str!(#output_lit); + vec![ #(#emitted_items),* ] + } + }) +} + +/// True iff `item` contains a `=>` operator at the top level (not nested +/// inside any group). Used to detect bare rule bodies inside `rules!`. +fn has_top_level_arrow(item: &TokenStream) -> bool { + let toks: Vec = item.clone().into_iter().collect(); + find_top_level_arrow(&toks).is_some() +} + +/// Find the index of the first token of a top-level `=>` operator (the +/// `=`), ignoring `=>` inside any group. Returns `None` if not present. +fn find_top_level_arrow(toks: &[TokenTree]) -> Option { + let mut i = 0; + while i + 1 < toks.len() { + if let (TokenTree::Punct(p1), TokenTree::Punct(p2)) = (&toks[i], &toks[i + 1]) { + if p1.as_char() == '=' + && p1.spacing() == proc_macro2::Spacing::Joint + && p2.as_char() == '>' + { + return Some(i); + } + } + i += 1; + } + None +} + +/// A string literal argument named `expected_name` parsed from `name: "value"`. 
+struct NamedString { + value: String, + #[allow(dead_code)] + span: Span, +} + +fn parse_named_string_arg(tokens: &mut Tokens, expected_name: &str) -> Result { + let name = expect_ident( + tokens, + &format!("expected `{expected_name}:` argument"), + )?; + if name != expected_name { + return Err(syn::Error::new_spanned( + name, + format!("expected `{expected_name}:` argument"), + )); + } + expect_punct( + tokens, + ':', + &format!("expected `:` after `{expected_name}`"), + )?; + let lit = expect_literal(tokens)?; + let span = lit.span(); + let value = string_literal_value(&lit).ok_or_else(|| { + syn::Error::new( + span, + format!("`{expected_name}` must be a string literal path"), + ) + })?; + Ok(NamedString { value, span }) +} + +/// Read a literal as a plain Rust string, stripping the surrounding quotes +/// and unescaping. Falls back to `None` if the literal isn't a string. +fn string_literal_value(lit: &Literal) -> Option { + let raw = lit.to_string(); + let bytes = raw.as_bytes(); + // Match plain `"..."` literals; reject byte strings, raw strings (for + // simplicity), char literals, numbers, etc. + if bytes.first() != Some(&b'"') || bytes.last() != Some(&b'"') { + return None; + } + let mut out = String::with_capacity(raw.len()); + let mut chars = raw[1..raw.len() - 1].chars(); + while let Some(c) = chars.next() { + if c != '\\' { + out.push(c); + continue; + } + match chars.next()? { + 'n' => out.push('\n'), + 't' => out.push('\t'), + 'r' => out.push('\r'), + '\\' => out.push('\\'), + '\'' => out.push('\''), + '"' => out.push('"'), + '0' => out.push('\0'), + other => { + // Unknown escape — give up rather than silently mis-parse. + out.push('\\'); + out.push(other); + } + } + } + Some(out) +} + +/// Split a token stream into top-level comma-separated items. Commas inside +/// any group token (parens, brackets, braces) are ignored so that things +/// like `rule!(a, b)` aren't accidentally split. 
+fn split_top_level_commas(stream: TokenStream) -> Vec { + let mut items = Vec::new(); + let mut current: Vec = Vec::new(); + for tt in stream { + if let TokenTree::Punct(p) = &tt { + if p.as_char() == ',' && p.spacing() == proc_macro2::Spacing::Alone { + if !current.is_empty() { + items.push(current.drain(..).collect()); + } + continue; + } + } + current.push(tt); + } + if !current.is_empty() { + items.push(current.into_iter().collect()); + } + items +} + fn maybe_wrap_capture(tokens: &mut Tokens, base: TokenStream) -> Result { if peek_is_at(tokens) { let name = consume_capture_marker(tokens)?; @@ -970,3 +1183,33 @@ fn maybe_wrap_list_capture(tokens: &mut Tokens, elem: TokenStream) -> Result` is present. + let toks = quote! { (a) => (b) }; + assert!(has_top_level_arrow(&toks)); + // `rule!((a) => (b))`: the `=>` is INSIDE the macro group, so + // it's not at top level. Must NOT be detected as a bare body. + let toks = quote! { rule!((a) => (b)) }; + assert!(!has_top_level_arrow(&toks)); + // Helper call: no `=>` anywhere. + let toks = quote! { make_rule() }; + assert!(!has_top_level_arrow(&toks)); + // Match expressions inside a block: `=>` is inside braces. + let toks = quote! { { match x { 1 => 2, _ => 3 } } }; + assert!(!has_top_level_arrow(&toks)); + // Bare shorthand form: top-level `=>` followed by a bare ident. + let toks = quote! { (a) => kind }; + assert!(has_top_level_arrow(&toks)); + } +} diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md index 8aa050592f6b..3427597be2e6 100644 --- a/shared/yeast/doc/yeast.md +++ b/shared/yeast/doc/yeast.md @@ -437,3 +437,44 @@ For the dbscheme/QL code generator, set `Language::desugar` to a `DesugaringConfig` carrying the same YAML; the generator converts it to JSON for downstream code generation. The `phases` field of the config is unused at code-generation time. + +## The `rules!` macro + +The [`rules!`] macro bundles a list of rewrite rules with the input and +output node-types schema paths. 
It's a drop-in replacement for the +hand-written `vec![rule!(...), rule!(...), ...]` form and accepts a +slightly looser syntax: bare rule bodies don't need an explicit +`rule!(...)` wrapper. + +```rust +let translation_rules: Vec = yeast::rules! { + input: "tree-sitter-swift/node-types.yml", + output: "ast_types.yml", + [ + (simple_identifier) @name + => + (name_expr identifier: (identifier #{name})), + + (integer_literal) @lit + => + (int_literal #{lit}), + ] +}; +``` + +Each comma-separated item in the bracketed list may be: + +- A **bare rule body** `(query) => (template)` — no `rule!(...)` wrapper. +- An explicit `rule!(...)` invocation, with optional postfix calls such + as `rule!(...).repeated()`. +- Any other expression returning a `Rule` (helper functions, etc.). + +Schema paths are resolved relative to the consuming crate's +`CARGO_MANIFEST_DIR` (unlike `include_str!`, which resolves relative +paths against the source file). The resolved paths are emitted as `include_str!` +references in the expansion so the consuming crate's incremental cache +invalidates when a schema YAML changes — laying the groundwork for +schema-aware compile-time checks on the rule bodies. + +The `Vec` produced by `rules!` flows into `add_phase` exactly as +before. \ No newline at end of file diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs index 17c54166e863..4363d3124dca 100644 --- a/shared/yeast/src/lib.rs +++ b/shared/yeast/src/lib.rs @@ -15,7 +15,7 @@ pub mod schema; pub mod tree_builder; mod visitor; -pub use yeast_macros::{query, rule, tree, trees}; +pub use yeast_macros::{query, rule, rules, tree, trees}; use captures::Captures; use query::QueryNode; diff --git a/shared/yeast/tests/input-types.yml b/shared/yeast/tests/input-types.yml new file mode 100644 index 000000000000..6bc184ec6470 --- /dev/null +++ b/shared/yeast/tests/input-types.yml @@ -0,0 +1,40 @@ +# Test input schema for yeast rules! macro tests. 
Covers a small subset of +# tree-sitter-ruby kinds used by the test rules. Kept deliberately small so +# the macro's compile-time loader can be exercised over a known surface. + +named: + program: + $children*: [assignment, call, identifier, for] + + assignment: + left: [identifier, left_assignment_list] + right: [identifier, integer, call] + + left_assignment_list: + $children*: identifier + + for: + pattern: [identifier, left_assignment_list] + value: in + body: do + + in: + $children: [identifier, call] + + do: + $children*: [identifier, assignment, call] + + call: + receiver: [identifier, call] + method: identifier + + identifier: + integer: + +unnamed: + - "=" + - "," + - "for" + - "in" + - "do" + - "end" diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index 57a9e17dbd4c..4a2fb26f4fe3 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -1322,3 +1322,123 @@ fn test_hash_brace_uses_capture_location_for_leaf() { assert_eq!(bar.start_byte(), 4); assert_eq!(bar.end_byte(), 7); } + +// ---- `rules!` macro tests (compile-time type-checking) ---- + +/// `rules!` should accept well-typed rules using the bare-rule-body +/// syntax (no inner `rule!` invocations) and produce a `Vec` that +/// behaves identically to a plain `vec![rule!(...)]` list. +#[test] +fn test_rules_macro_accepts_bare_rule_body() { + let rules: Vec = yeast::rules! { + input: "tests/input-types.yml", + output: "tests/node-types.yml", + [ + (assignment + left: (_) @left + right: (_) @right + ) + => + (assignment + left: {right} + right: {left} + ), + ] + }; + + let dump = run_and_dump("x = 1", rules); + assert_dump_eq( + &dump, + r#" + program + assignment + left: integer "1" + right: identifier "x" + "#, + ); +} + +/// The bare-rule-body shorthand `=> output_kind` should also be accepted. +#[test] +fn test_rules_macro_accepts_bare_shorthand_form() { + let rules: Vec = yeast::rules! 
{ + input: "tests/input-types.yml", + output: "tests/node-types.yml", + [ + (assignment + left: (_) @method + right: (_) @receiver + ) + => call, + ] + }; + + let dump = run_and_dump("x = 1", rules); + assert_dump_eq( + &dump, + r#" + program + call + method: identifier "x" + receiver: integer "1" + "#, + ); +} + +/// Backwards-compat: explicit `rule!(...)` invocations inside `rules!` +/// should still type-check and behave the same as the bare form. +#[test] +fn test_rules_macro_accepts_explicit_rule_macro() { + let rules: Vec = yeast::rules! { + input: "tests/input-types.yml", + output: "tests/node-types.yml", + [ + rule!( + (assignment + left: (_) @left + right: (_) @right + ) + => + (assignment + left: {right} + right: {left} + ) + ), + ] + }; + assert_eq!(rules.len(), 1); +} + +/// `rules!` should pass through items that aren't bare rule bodies or +/// `rule!(...)` calls (e.g. helper-function calls returning a `Rule`), +/// without type-checking them. Bare and explicit rules in the same list +/// still get checked. +#[test] +fn test_rules_macro_allows_non_rule_items() { + fn extra() -> yeast::Rule { + rule!((identifier) => (identifier "extra")) + } + let rules: Vec = yeast::rules! { + input: "tests/input-types.yml", + output: "tests/node-types.yml", + [ + (integer) => (integer "checked"), + extra(), + ] + }; + assert_eq!(rules.len(), 2); +} + +/// `rules!` should accept lists that mix bare-rule and explicit-rule items. +#[test] +fn test_rules_macro_mixes_bare_and_explicit_forms() { + let rules: Vec = yeast::rules! 
{ + input: "tests/input-types.yml", + output: "tests/node-types.yml", + [ + (integer) => (integer "I"), + rule!((identifier) => (identifier "S")), + ] + }; + assert_eq!(rules.len(), 2); +} diff --git a/unified/extractor/tests/rules_macro_smoke.rs b/unified/extractor/tests/rules_macro_smoke.rs new file mode 100644 index 000000000000..cde8ae3ca4ab --- /dev/null +++ b/unified/extractor/tests/rules_macro_smoke.rs @@ -0,0 +1,25 @@ +/// Smoke test: load a few real Swift translation rules through the new +/// `yeast::rules!` macro using the bare-rule-body syntax, and confirm the +/// input + output schemas accept them. Compiles only — any type-checking +/// error surfaces as a compile-time error. +#[test] +fn rules_macro_compiles_against_real_swift_schemas() { + let _rules: Vec = yeast::rules! { + input: "tree-sitter-swift/node-types.yml", + output: "ast_types.yml", + [ + (simple_identifier) @name + => + (name_expr + identifier: (identifier #{name})), + + (integer_literal) @lit + => + (int_literal #{lit}), + + (line_string_literal) @lit + => + (string_literal #{lit}), + ] + }; +} From a099c44a0823452cc2fc90a12caf0009220dff64 Mon Sep 17 00:00:00 2001 From: Taus Date: Thu, 2 Jul 2026 12:05:23 +0000 Subject: [PATCH 3/5] yeast: Require type annotations on root-level Rust interpolations In order to facilitate static type checking of rules (and to make it easier for human readers as well), rust blocks at the root level (i.e. rules of the form `... => { ... }`) must now have a type annotation in front. All other forms are unaffected: if the right hand side of a rule is a tree, we can read the type of the root node directly. For interpolations that happen inside of such a tree, we can recover the type by looking at what field we're interpolating into, and consulting the output schema. All existing uses have been updated to have the appropriate type annotations, though these are of course not checked yet (and so could be wrong). 
Finally, this commit also removes the final catch-all rule `_ @node => {node}`. Because of the preceding rule that matches `(_) @node`, this rule would only ever match unnamed nodes, and I think in practice it did not match at all (at least not in our current set of tests). To give it a proper type we would have to add some notion of an "any" type, which I would like to avoid. If it _does_ turn out to be needed, we can easily add it back (ideally with a test-case that shows why it's still needed). --- shared/yeast-macros/src/parse.rs | 143 +++++++++++++++++- shared/yeast/doc/yeast.md | 65 +++++++- shared/yeast/tests/test.rs | 116 +++++++++++++- .../extractor/src/languages/swift/swift.rs | 55 +++---- 4 files changed, 335 insertions(+), 44 deletions(-) diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs index 01070c74bbce..986a9bac641a 100644 --- a/shared/yeast-macros/src/parse.rs +++ b/shared/yeast-macros/src/parse.rs @@ -617,6 +617,76 @@ fn extract_captures_inner( } } +/// A rule's return-type annotation, when the body is a Rust block. Written +/// between `=>` and the block body using the schema's own vocabulary: +/// +/// ```text +/// => kind { … } // single node of that kind +/// => kind? { … } // Option (0 or 1) +/// => kind* { … } // Vec (0+) +/// ``` +/// +/// Template bodies (`=> (kind …)`) never carry an annotation — the +/// output kind is the template root. The shorthand `=> kind` (no +/// body) also carries no annotation. See `parse_rule_top` for dispatch. +#[derive(Clone, Debug)] +struct ReturnAnnotation { + kind: Ident, + multiplicity: AnnotationMultiplicity, +} + +#[derive(Clone, Copy, Debug, PartialEq)] +enum AnnotationMultiplicity { + Single, + Optional, + Repeated, +} + +/// Peek at the token stream to decide whether the transform following +/// `=>` is a **new** annotation form (`kind [? | *] { … }`). If so, +/// consume the annotation and return it, leaving the `{ … }` body in +/// the stream for the caller to parse. 
Otherwise leave the stream +/// untouched and return `None`. +/// +/// The lookahead distinguishes: +/// `kind {` → annotation (single) +/// `kind? {` → annotation (optional) +/// `kind* {` → annotation (repeated) +/// `kind` → shorthand form (no `{` follows) — NOT an annotation +/// anything else → template or bare block — NOT an annotation +fn try_consume_return_annotation(tokens: &mut Tokens) -> Result> { + // Must start with an identifier (the kind name). + let mut lookahead = tokens.clone(); + let Some(TokenTree::Ident(_)) = lookahead.next() else { + return Ok(None); + }; + // Then optionally `?` or `*`, then a `{` group. + let after_suffix = match lookahead.peek() { + Some(TokenTree::Punct(p)) if p.as_char() == '?' || p.as_char() == '*' => { + lookahead.next(); + lookahead.peek() + } + other => other, + }; + if !matches!(after_suffix, Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Brace) { + return Ok(None); + } + // Commit: consume the ident + suffix from the real stream. + let kind = expect_ident(tokens, "expected output-kind name in annotation")?; + let multiplicity = match tokens.peek() { + Some(TokenTree::Punct(p)) if p.as_char() == '?' => { + tokens.next(); + AnnotationMultiplicity::Optional + } + Some(TokenTree::Punct(p)) if p.as_char() == '*' => { + tokens.next(); + AnnotationMultiplicity::Repeated + } + _ => AnnotationMultiplicity::Single, + }; + Ok(Some(ReturnAnnotation { kind, multiplicity })) +} + /// Parse `rule!( query => transform )`. pub fn parse_rule_top(input: TokenStream) -> Result { let mut tokens = input.into_iter().peekable(); @@ -688,8 +758,52 @@ pub fn parse_rule_top(input: TokenStream) -> Result { }) .collect(); - // Parse transform: either shorthand `=> kind_name` or full `=> (template ...)` - let transform_body = if peek_is_field(&mut tokens) && { + // Parse transform: the token(s) after `=>` fall into one of three + // shapes, dispatched in order: + // + // 1. `kind [? | *] { rust_body }` — annotated Rust body (NEW). 
+ // Static-analysis-ready: the annotation declares the output + // kind and multiplicity in the schema's own vocabulary. + // 2. `kind` alone — shorthand: emit `(kind field: {@cap})…` from + // the query's captures. + // 3. anything else — full template form (`(kind …)` or bare + // `{ … }` splice via `parse_direct_list`). + let annotation = try_consume_return_annotation(&mut tokens)?; + + let transform_body = if let Some(annotation) = annotation { + // Annotation form: `=> kind [? | *] { rust_body }`. + let body_group = expect_group(&mut tokens, Delimiter::Brace)?; + if let Some(tok) = tokens.next() { + return Err(syn::Error::new_spanned( + tok, + "unexpected token after annotated rule body", + )); + } + let body = body_group.stream(); + // The annotation is not yet consumed by codegen — it will drive + // typed handles once the schema-driven codegen lands. For now, + // emit a self-documenting reference to the annotated kind and + // preserve today's `Vec` closure return so behavior + // is unchanged. + let kind_str = annotation.kind.to_string(); + let mult_str = match annotation.multiplicity { + AnnotationMultiplicity::Single => "single", + AnnotationMultiplicity::Optional => "optional", + AnnotationMultiplicity::Repeated => "repeated", + }; + let _ = (kind_str, mult_str); // silence unused warnings until wired + + // For now, adapt the user's typed return value to the framework's + // `Vec` closure result. This uses `IntoFieldIds`, which + // already accepts a bare `Id`, an iterable of ids, or `Option` + // — matching the three annotation multiplicities. + quote! { + let __value = { #body }; + let mut __ids: Vec = Vec::new(); + yeast::IntoFieldIds::extend_into(__value, &mut __ids); + __ids + } + } else if peek_is_field(&mut tokens) && { // Shorthand form: bare identifier = output node kind. // Auto-generate template from captures. 
let mut lookahead = tokens.clone(); @@ -749,6 +863,26 @@ pub fn parse_rule_top(input: TokenStream) -> Result { vec![__id] } } else { + // Reject bare `{ ... }` transforms — they used to be accepted + // as either a Rust body producing a `Vec` or a template + // consisting of a single `{cap}` splice. Both patterns lost + // static-analysis information (no visible output kind), so we + // now require rules with block bodies to use the annotation + // form `=> kind [? | *] { ... }`. Templates must start with a + // parenthesized node (e.g. `(if_expr ...)`). + if let Some(TokenTree::Group(g)) = tokens.peek() { + if g.delimiter() == Delimiter::Brace { + let span = g.span(); + return Err(syn::Error::new( + span, + "bare `{...}` rule bodies are no longer accepted; \ + use the annotation form `=> kind [? | *] { ... }` \ + (where the kind names the output node's schema kind, \ + optionally suffixed with `?` or `*` for multiplicity)", + )); + } + } + // Full template form let transform_items = parse_direct_list(&mut tokens, &ctx_ident)?; @@ -1026,10 +1160,7 @@ struct NamedString { } fn parse_named_string_arg(tokens: &mut Tokens, expected_name: &str) -> Result { - let name = expect_ident( - tokens, - &format!("expected `{expected_name}:` argument"), - )?; + let name = expect_ident(tokens, &format!("expected `{expected_name}:` argument"))?; if name != expected_name { return Err(syn::Error::new_spanned( name, diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md index 3427597be2e6..90edb510c1a1 100644 --- a/shared/yeast/doc/yeast.md +++ b/shared/yeast/doc/yeast.md @@ -312,13 +312,15 @@ already conforms to the output schema. For rules that need the raw (input-schema) capture — typically to read its source text or to translate it explicitly with mutable context state between calls — use `@@name` instead. The body sees the original -input-schema `Id`: +input-schema `Id`. 
Because these rules always have a Rust block body, +they use the annotation form (see [the `rule!` macro +section](#the-rule-macro) for the full grammar): ```rust yeast::rule!( (assignment left: (_) @@raw_lhs right: (_) @rhs) => - { + call { // raw_lhs is untranslated: read its original source text. let text = ctx.ast.source_text(raw_lhs); // rhs is already translated by the auto-translate prefix. @@ -372,26 +374,79 @@ automatically: single captures bind as `Id`, repeated captures (after ## The `rule!` macro -`rule!` combines a query and a transform into a single declaration: +`rule!` combines a query and a transform into a single declaration. +There are three transform forms, each suited to a different level of +rule complexity: ```rust -// Full template form +// 1. Template form — a tree literal describing the output. yeast::rule!( (query_pattern field: (_) @capture) => (output_template field: {capture}) ) -// Shorthand form — captures become fields on the output node +// 2. Shorthand form — captures become fields on a bare output kind. yeast::rule!( (query_pattern field: (_) @capture) => output_kind ) + +// 3. Annotation form — a Rust block body preceded by the output kind. +yeast::rule!( + (query_pattern child: (_)+ @@children) + => + output_kind* { + // arbitrary Rust; must evaluate to a value compatible with the + // declared multiplicity (see below). + let mut result = Vec::new(); + for child in children { + result.extend(ctx.translate(child)?); + } + result + } +) ``` The shorthand `=> kind` form auto-generates the template, mapping each capture name to a field of the same name on the output node. +### Annotation form + +Rules that need imperative logic — mutating [`BuildCtx`] state per +iteration, computing intermediate values, or looping over captures — +use the annotation form. 
It has three shapes distinguished by a suffix +on the output-kind identifier: + +| Syntax | Body must evaluate to | Meaning | +|---------------------|-------------------------------------|--------------------------------| +| `=> kind { ... }` | a single node id of `kind` | Emit exactly one node. | +| `=> kind? { ... }` | an `Option` of a node id of `kind` | Emit 0 or 1 nodes (`None`/`Some`). | +| `=> kind* { ... }` | an iterable of node ids of `kind` | Emit 0+ nodes; flattens into the enclosing splice slot. | + +The suffix mirrors the `?` / `*` markers used elsewhere in the schema +DSL (see [`ast_types.yml`](../../../unified/extractor/ast_types.yml)): +bare identifier = required single, `?` = optional single, `*` = +repeated. + +The annotation names the schema kind of the output, giving the macro +enough information for future static analysis (e.g. computing the +static output type of translated captures at their consumer sites). + +**Bare `=> { ... }` block bodies are rejected** — every Rust-block body +must carry an annotation, so the output kind is always visible without +having to inspect the block's expression. + +### Choosing between the forms + +Prefer the simplest form that fits: + +- If the whole transform is a tree literal, use the **template form**. +- If the transform is a template whose root matches a query capture + 1:1, use the **shorthand form**. +- If the transform needs Rust logic (loops, `let` bindings, calls to + `ctx.translate`, etc.), use the **annotation form**. 
+ ## Integration with the extractor A YEAST desugaring pass is configured with a [`DesugaringConfig`], which diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs index 4a2fb26f4fe3..3a24709dd9ff 100644 --- a/shared/yeast/tests/test.rs +++ b/shared/yeast/tests/test.rs @@ -989,7 +989,7 @@ fn test_one_shot_recurses_into_returned_capture() { yeast::rule!( (assignment left: (_) @left right: (_) @right) => - {left} + identifier { left } ), yeast::rule!((identifier) => (identifier "ID")), yeast::rule!((integer) => (integer "INT")), @@ -1084,7 +1084,7 @@ fn test_raw_capture_marker() { yeast::rule!( (assignment left: (_) @@raw_lhs right: (_) @rhs) => - { + call { let text = ctx.ast.source_text(raw_lhs); tree!((call method: (identifier #{text.as_str()}) @@ -1139,7 +1139,7 @@ fn test_raw_capture_marker_explicit_translate() { yeast::rule!( (assignment left: (_) @@raw_lhs right: (_) @rhs) => - { + call { let translated_lhs = ctx.translate(raw_lhs)?; tree!((call method: {translated_lhs} @@ -1442,3 +1442,113 @@ fn test_rules_macro_mixes_bare_and_explicit_forms() { }; assert_eq!(rules.len(), 2); } + +// ---- Rule-body return-type annotation tests ---- +// +// The annotation form `=> kind [? | *] { rust_body }` is the future +// interface for Rust-bodied rules: the schema-vocabulary annotation +// declares the rule's output kind for static analysis. Today's codegen +// does NOT yet consume the annotation (it just adapts the returned +// value to `Vec` via `IntoFieldIds`); these tests only exercise +// the parser + the runtime-equivalence property. + +/// Annotation form with `*` (repeated): the rule body returns a +/// `Vec` and the annotation says the outputs are `assignment`s. +#[test] +fn test_rule_annotation_repeated() { + // Behaviourally equivalent to a two-node splice template. 
+ let r: Rule = rule!( + (assignment left: (_) @l right: (_) @r) + => + assignment* { + let a1 = tree!((assignment left: {l} right: {r})); + let a2 = tree!((assignment left: {r} right: {l})); + vec![a1, a2] + } + ); + let ast = run_and_ast("x = 1", vec![r]); + // Just verify the run completes without a schema error; two + // top-level `assignment` nodes should appear as siblings. + let mut count = 0usize; + for id in ast.reachable_node_ids() { + if let Some(n) = ast.get_node(id) { + if n.kind_name() == "assignment" { + count += 1; + } + } + } + assert!( + count >= 2, + "expected at least two assignment nodes, got {count}" + ); +} + +/// Annotation form with `?` (optional): the rule body returns +/// `Option`. This uses `None` so the rule effectively deletes the +/// node. +#[test] +fn test_rule_annotation_optional_none() { + // Delete every `integer` (returning None yields no output nodes). + let r: Rule = rule!( + (integer) @lit + => + integer? { + let _ = lit; + None:: + } + ); + let ast = run_and_ast("42", vec![r]); + // No integer node should survive. + for id in ast.reachable_node_ids() { + if let Some(n) = ast.get_node(id) { + assert_ne!(n.kind_name(), "integer", "integer should have been deleted"); + } + } +} + +/// Annotation form (single): the rule body returns a bare `Id`. +#[test] +fn test_rule_annotation_single() { + // Identity on assignment nodes, expressed with the annotation form. + let r: Rule = rule!( + (assignment left: (_) @l right: (_) @r) + => + assignment { + tree!((assignment left: {l} right: {r})) + } + ); + let ast = run_and_ast("x = 1", vec![r]); + let mut has_assignment = false; + for id in ast.reachable_node_ids() { + if let Some(n) = ast.get_node(id) { + if n.kind_name() == "assignment" { + has_assignment = true; + } + } + } + assert!(has_assignment, "expected an assignment node"); +} + +/// The shorthand `=> kind` form (no body, no annotation) must still be +/// distinguished from the annotation form and continue to work. 
+#[test] +fn test_shorthand_still_works_alongside_annotation_syntax() { + let r: Rule = rule!( + (assignment left: (_) @method right: (_) @receiver) + => + call + ); + let ast = run_and_ast("x = 1", vec![r]); + let mut has_call = false; + for id in ast.reachable_node_ids() { + if let Some(n) = ast.get_node(id) { + if n.kind_name() == "call" { + has_call = true; + } + } + } + assert!( + has_call, + "shorthand form should still produce a `call` node" + ); +} diff --git a/unified/extractor/src/languages/swift/swift.rs b/unified/extractor/src/languages/swift/swift.rs index 5689d930bff3..908a7bd437f7 100644 --- a/unified/extractor/src/languages/swift/swift.rs +++ b/unified/extractor/src/languages/swift/swift.rs @@ -104,8 +104,8 @@ fn translation_rules() -> Vec> { ) ), // Declarations may be wrapped in local/global wrapper nodes. - rule!((global_declaration _ @inner) => {inner}), - rule!((local_declaration _ @inner) => {inner}), + rule!((global_declaration _ @inner) => stmt { inner }), + rule!((local_declaration _ @inner) => stmt { inner }), // ---- Literals ---- rule!((integer_literal) => (int_literal)), rule!((hex_literal) => (int_literal)), @@ -198,7 +198,7 @@ fn translation_rules() -> Vec> { type: _? @ty computed_value: (computed_property accessor: _+ @@accessors)) => - {{ + accessor_declaration* { ctx.property_name = Some(tree!((identifier #{pattern}))); ctx.property_type = ty; @@ -210,7 +210,7 @@ fn translation_rules() -> Vec> { result.extend(ctx.translate(acc)?); } result - }} + } ), // Computed property: shorthand getter (no explicit get/set, just // statements) → a single accessor_declaration with kind "get". @@ -249,7 +249,7 @@ fn translation_rules() -> Vec> { value: _? @val observers: (willset_didset_block willset: _? @@ws didset: _? 
@@ds)) => - {{ + member* { let var_decl = tree!( (variable_declaration modifier: {ctx.binding_modifier} @@ -271,7 +271,7 @@ fn translation_rules() -> Vec> { result.extend(ctx.translate(obs)?); } result - }} + } ), // property_binding with any pattern name (identifier or // destructuring). Reads outer modifiers / chained tag from `ctx`. @@ -305,7 +305,7 @@ fn translation_rules() -> Vec> { declarator: _* @@decls (modifiers)* @mods) => - {{ + member* { let binding_text = ctx.ast.source_text(binding_kind); ctx.binding_modifier = Some(ctx.literal("modifier", &binding_text)); ctx.outer_modifiers = mods; @@ -316,7 +316,7 @@ fn translation_rules() -> Vec> { result.extend(ctx.translate(decl)?); } result - }} + } ), // ---- Enums ---- // enum_type_parameter → parameter (with optional name as pattern). @@ -376,7 +376,7 @@ fn translation_rules() -> Vec> { rule!( (enum_entry case: _+ @@cases (modifiers)* @mods) => - {{ + member* { ctx.outer_modifiers = mods; let mut result = Vec::new(); @@ -385,7 +385,7 @@ fn translation_rules() -> Vec> { result.extend(ctx.translate(case)?); } result - }} + } ), // Plain assignment: `x = expr` rule!( @@ -400,9 +400,9 @@ fn translation_rules() -> Vec> { (compound_assign_expr target: {target} operator: (infix_operator #{op}) value: {value}) ), // Unwrap `type` wrapper node - rule!((type name: @inner) => {inner}), + rule!((type name: @inner) => type_expr { inner }), // `directly_assignable_expression` is just a wrapper; unwrap it - rule!((directly_assignable_expression expr: @inner) => {inner}), + rule!((directly_assignable_expression expr: @inner) => expr { inner }), // Pattern with bound_identifier → name_pattern rule!((pattern bound_identifier: @name) => (name_pattern identifier: (identifier #{name}))), // Pattern with 'let' or 'var' binding: extract the inner pattern @@ -410,7 +410,7 @@ fn translation_rules() -> Vec> { rule!( (pattern kind: (binding_pattern binding: _? 
pattern: @pattern)) => - {pattern} + pattern { pattern } ), // case T.foo(x,y) pattern rule!( @@ -463,10 +463,10 @@ fn translation_rules() -> Vec> { rule!( (function_parameter parameter: @@p default_value: _? @def) => - {{ + parameter* { ctx.default_value = def; ctx.translate(p)? - }} + } ), // Parameter with external name and type rule!( @@ -689,7 +689,7 @@ fn translation_rules() -> Vec> { element: (pattern_element pattern: (name_pattern identifier: (identifier #{name}))))) ), // If-condition — unwrap (pass through the inner expression/pattern) - rule!((if_condition kind: @inner) => {inner}), + rule!((if_condition kind: @inner) => expr_or_pattern { inner }), // ---- Loops ---- // For-in loop with optional where-clause guard. rule!( @@ -722,7 +722,7 @@ fn translation_rules() -> Vec> { body: (block stmt: {body})) ), // Labeled statement (e.g. `outer: for ...`). Strip the trailing ':' from the label token. - rule!((labeled_statement label: (statement_label) @lbl statement: @stmt) => { + rule!((labeled_statement label: (statement_label) @lbl statement: @stmt) => labeled_stmt { let text = ctx.ast.source_text(lbl); let name = &text[..text.len() - 1]; tree!((labeled_stmt label: (identifier #{name}) stmt: {stmt})) @@ -744,7 +744,7 @@ fn translation_rules() -> Vec> { rule!((dictionary_literal_item key: @k value: @v) => (key_value_pair key: {k} value: {v})), // ---- Optionals and errors ---- // Optional chaining — unwrap the marker - rule!((optional_chain_marker expr: @inner) => {inner}), + rule!((optional_chain_marker expr: @inner) => expr { inner }), // try/try?/try! expr → unary_expr with operator "try", "try?" or "try!" 
rule!((try_expression (try_operator) @op expr: @inner) => (unary_expr operator: (prefix_operator #{op}) operand: {inner})), rule!((try_expression operator: (try_operator) @op expr: @inner) => (unary_expr operator: (prefix_operator #{op}) operand: {inner})), @@ -800,7 +800,7 @@ fn translation_rules() -> Vec> { rule!( (identifier part: _+ @parts) => - {member_chain(&mut ctx, parts)} + expr { member_chain(&mut ctx, parts) } ), // Scoped import declaration (for example `import struct Foo.Bar`): // flatten the identifier parts into a member_access_expr and bind the @@ -831,7 +831,7 @@ fn translation_rules() -> Vec> { // Super expression → super_expr rule!((super_expression) => (super_expr)), // Modifiers — unwrap to individual modifier children - rule!((modifiers _* @mods) => {mods}), + rule!((modifiers _* @mods) => modifier* { mods }), rule!((attribute) @m => (modifier #{m})), rule!((visibility_modifier) @m => (modifier #{m})), rule!((function_modifier) @m => (modifier #{m})), @@ -843,7 +843,7 @@ fn translation_rules() -> Vec> { rule!((inheritance_modifier) @m => (modifier #{m})), rule!((property_behavior_modifier) @m => (modifier #{m})), // Type annotations — unwrap - rule!((type_annotation type: @inner) => {inner}), + rule!((type_annotation type: @inner) => type_expr { inner }), // user_type is split into simple_user_type parts. // Keep a conservative textual fallback to avoid dropping type information. rule!((user_type) @ty => (named_type_expr name: (identifier #{ty}))), @@ -1018,7 +1018,7 @@ fn translation_rules() -> Vec> { type: _? 
@ty (modifiers)* @mods) => - {{ + accessor_declaration* { ctx.property_name = Some(tree!((identifier #{name}))); ctx.property_type = ty; ctx.outer_modifiers = mods; @@ -1029,7 +1029,7 @@ fn translation_rules() -> Vec> { result.extend(ctx.translate(acc)?); } result - }} + } ), // getter_specifier / setter_specifier → bodyless accessor_declaration // getter_specifier / setter_specifier → bodyless @@ -1056,7 +1056,7 @@ fn translation_rules() -> Vec> { modifier: {chained_modifier(&mut ctx)}) ), // protocol_property_requirements wrapper — should be consumed by above; fallback - rule!((protocol_property_requirements accessor: _* @accs) => {accs}), + rule!((protocol_property_requirements accessor: _* @accs) => accessor_declaration* { accs }), // Computed getter → accessor_declaration (body optional). // Reads property name/type from the outer property_binding rule // and binding/outer modifiers + chained tag from the outer @@ -1116,7 +1116,7 @@ fn translation_rules() -> Vec> { // willset/didset block — spread to children (only reachable as a // fallback; the outer property_binding manual rule normally // captures the willset/didset clauses directly). - rule!((willset_didset_block _* @clauses) => {clauses}), + rule!((willset_didset_block _* @clauses) => accessor_declaration* { clauses }), // willset clause → accessor_declaration (body optional). Reads // `ctx.property_name` set by the outer property_binding rule and // binding/outer modifiers + chained tag from the outer @@ -1152,11 +1152,6 @@ fn translation_rules() -> Vec> { => (unsupported_node) ), - rule!( - _ @node - => - {node} - ), ] } From d1711f5206348d668f8b6d315b4bd8398d34a58e Mon Sep 17 00:00:00 2001 From: Taus Date: Thu, 2 Jul 2026 12:57:43 +0000 Subject: [PATCH 4/5] yeast: Fix bug in matching `(_)` Turns out, `(_)` would match both named and unnamed nodes, as we never checked the value of the `match_unnamed` field. 
This is the real reason why the final catch-all rule we removed in the last commit was superfluous -- unnamed nodes were being caught by the penultimate rule instead (and mapped to `unsupported_node`). Having fixed the bug, we now (correctly) get errors due to unmatched unnamed nodes in the input. To fix this, we change the catch-all rule to match unnamed nodes as well. This restores the previous behaviour exactly. At some point, we should find a better way to handle unnamed nodes, as it seems wasteful to map these to `unsupported_node` (since we in practice only use them for their string content). Perhaps we should not attempt to translate unnamed nodes at all? --- shared/yeast/src/query.rs | 12 +++++++++++- unified/extractor/src/languages/swift/swift.rs | 11 +++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/shared/yeast/src/query.rs b/shared/yeast/src/query.rs index bcf0f7facab1..3e61ac60b2be 100644 --- a/shared/yeast/src/query.rs +++ b/shared/yeast/src/query.rs @@ -66,7 +66,17 @@ impl QueryNode { pub fn do_match(&self, ast: &Ast, node: Id, matches: &mut Captures) -> Result { match self { - QueryNode::Any { .. } => Ok(true), + QueryNode::Any { match_unnamed } => { + if *match_unnamed { + Ok(true) + } else { + // `(_)` only matches named nodes, matching tree-sitter + // semantics. Bare `_` (with `match_unnamed = true`) + // matches any node. 
+ let n = ast.get_node(node).unwrap(); + Ok(n.is_named()) + } + } QueryNode::Node { kind, children } => { let node = ast.get_node(node).unwrap(); let target_kind = ast diff --git a/unified/extractor/src/languages/swift/swift.rs b/unified/extractor/src/languages/swift/swift.rs index 908a7bd437f7..af9158433816 100644 --- a/unified/extractor/src/languages/swift/swift.rs +++ b/unified/extractor/src/languages/swift/swift.rs @@ -1147,8 +1147,15 @@ fn translation_rules() -> Vec> { // Preprocessor conditionals — unsupported rule!((diagnostic) => (unsupported_node)), // ---- Fallbacks ---- - rule!( - (_) + // Bare `_` (rather than `(_)`) so this matches both named nodes + // and unnamed tokens. Any unnamed token that escapes the + // input-schema-specific rules (e.g. captured operators in + // `additive_expression op: @op`) has its auto-translated value + // replaced with an `unsupported_node` whose source range is + // inherited from the original token, so `#{op}` still reads the + // original text. + rule!( + _ => (unsupported_node) ), From 558be37c540534c4d4ae5a9e4f165bf4345718ee Mon Sep 17 00:00:00 2001 From: Taus Date: Thu, 2 Jul 2026 21:03:59 +0000 Subject: [PATCH 5/5] yeast: Fix escaping bug in yeast-macros Happily, it turned out that there was already a library function for handling this case. --- shared/yeast-macros/src/parse.rs | 37 +++++--------------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs index 986a9bac641a..01c0b574b1cf 100644 --- a/shared/yeast-macros/src/parse.rs +++ b/shared/yeast-macros/src/parse.rs @@ -1183,39 +1183,12 @@ fn parse_named_string_arg(tokens: &mut Tokens, expected_name: &str) -> Result Option { - let raw = lit.to_string(); - let bytes = raw.as_bytes(); - // Match plain `"..."` literals; reject byte strings, raw strings (for - // simplicity), char literals, numbers, etc. 
- if bytes.first() != Some(&b'"') || bytes.last() != Some(&b'"') { - return None; - } - let mut out = String::with_capacity(raw.len()); - let mut chars = raw[1..raw.len() - 1].chars(); - while let Some(c) = chars.next() { - if c != '\\' { - out.push(c); - continue; - } - match chars.next()? { - 'n' => out.push('\n'), - 't' => out.push('\t'), - 'r' => out.push('\r'), - '\\' => out.push('\\'), - '\'' => out.push('\''), - '"' => out.push('"'), - '0' => out.push('\0'), - other => { - // Unknown escape — give up rather than silently mis-parse. - out.push('\\'); - out.push(other); - } - } - } - Some(out) + let tokens = TokenStream::from(TokenTree::Literal(lit.clone())); + syn::parse2::(tokens).ok().map(|s| s.value()) } /// Split a token stream into top-level comma-separated items. Commas inside