diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 98b26e3..74f86c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,13 +26,16 @@ jobs: - name: Set up LuaRocks uses: leafo/gh-actions-luarocks@v4 - - name: Install dev dependencies (busted, lua-cjson) + # libpcre2-dev provides the PCRE2 C library that lrexlib-pcre2 links against + # (the regex engine for $match/$replace/$split/$contains, lazy-loaded). + - name: Install the LuaJIT interpreter + PCRE2 + run: sudo apt-get update && sudo apt-get install -y luajit libpcre2-dev + + - name: Install dev dependencies (busted, lua-cjson, lrexlib-pcre2) run: | luarocks install busted luarocks install lua-cjson - - - name: Install the LuaJIT interpreter - run: sudo apt-get update && sudo apt-get install -y luajit + luarocks install lrexlib-pcre2 # Invoke busted's runner directly under LuaJIT (this is exactly what # busted's own bin/busted does) so the whole suite — unit tests + the diff --git a/jsonata-scm-1.rockspec b/jsonata-scm-1.rockspec index 2e5e9eb..9e60f51 100644 --- a/jsonata-scm-1.rockspec +++ b/jsonata-scm-1.rockspec @@ -11,6 +11,7 @@ description = { } dependencies = { "lua >= 5.1", + "lrexlib-pcre2", } test_dependencies = { "busted", diff --git a/spec/jsonata-suite/baseline.lua b/spec/jsonata-suite/baseline.lua index c1aaee8..1216d8d 100644 --- a/spec/jsonata-suite/baseline.lua +++ b/spec/jsonata-suite/baseline.lua @@ -274,6 +274,7 @@ return { ["function-applications/case018"] = true, ["function-applications/case019"] = true, ["function-applications/case020"] = true, + ["function-applications/case021"] = true, ["function-assert/case000"] = true, ["function-assert/case001"] = true, ["function-assert/case002"] = true, @@ -604,6 +605,7 @@ return { ["function-shuffle/case003"] = true, ["function-sift/case000"] = true, ["function-sift/case001"] = true, + ["function-sift/case002"] = true, ["function-sift/case003"] = true, ["function-sift/case004"] = true, 
["function-signatures/case000"] = true, @@ -1049,6 +1051,13 @@ return { ["range-operator/case020"] = true, ["range-operator/case022"] = true, ["range-operator/case023"] = true, + ["regex/case000"] = true, + ["regex/case001"] = true, + ["regex/case002"] = true, + ["regex/case003"] = true, + ["regex/case004"] = true, + ["regex/case005"] = true, + ["regex/case006"] = true, ["regex/case022"] = true, ["regex/case035"] = true, ["regex/case036"] = true, @@ -1252,4 +1261,5 @@ return { ["wildcards/case009"] = true, ["wildcards/case010/0"] = true, ["wildcards/case010/1"] = true, + ["wildcards/case010/2"] = true, } diff --git a/spec/regex_spec.lua b/spec/regex_spec.lua new file mode 100644 index 0000000..215ee5c --- /dev/null +++ b/spec/regex_spec.lua @@ -0,0 +1,77 @@ +local jsonata = require("jsonata") +local parser = require("jsonata.parser") +local function run(src, input) + return jsonata.compile(src):evaluate(input) +end + +describe("M7a: regex literal lexing + / disambiguation", function() + it("parses /ab+/ as a regex node", function() + local ast = parser.parse("/ab+/") + assert.are.equal("regex", ast.type) + assert.are.equal("ab+", ast.source) + assert.are.equal("", ast.flags) + end) + it("captures i/m flags", function() + assert.are.equal("i", parser.parse("/ab+/i").flags) + end) + it("/ after a value is division, not a regex", function() + assert.are.equal(2, run("4/2")) + assert.are.equal(5, run("($x := 10; $x/2)")) + assert.are.equal(5, run("x/2", { x = 10 })) + assert.are.equal(1, run("(1+1)/2")) + end) + it("empty // raises S0301", function() + local ok, err = pcall(parser.parse, "//") + assert.is_false(ok) + assert.are.equal("S0301", err.code) + end) + it("unterminated /re raises S0302", function() + local ok, err = pcall(parser.parse, "/unterminated") + assert.is_false(ok) + assert.are.equal("S0302", err.code) + end) + it("a closing / inside [..] 
does not end the regex (depth tracking)", function() + assert.are.equal("[/]", parser.parse("/[/]/").source) + end) +end) + +describe("M7a: regex value (callable function)", function() + it("a regex is a function value", function() + assert.are.equal("function", run("$type(/b+/)")) + end) + it("applying a regex returns a match object", function() + assert.are.equal("bb", run('($m := /b+/; $m("abbc")).match')) + end) + it("captures appear in groups (0-indexed array access)", function() + assert.are.equal("bb", run('($m := /a(b+)/; $m("xabbc")).groups[0]')) + end) + it("the i flag is case-insensitive", function() + assert.are.equal("Hat", run('($m := /hat/i; $m("a Hat here")).match')) + end) + it("no match returns undefined", function() + assert.is_nil(run('($m := /z+/; $m("abc"))')) + end) +end) + +describe("M7a: $contains / $split with regex", function() + it("$contains with regex", function() + assert.is_true(run('$contains("ababbxabbcc", /ab+/)')) + assert.is_false(run('$contains("ababbxabbcc", /ax+/)')) + assert.is_true(run('$contains("a Hat", /hat/i)')) + end) + it("$contains with a string still works", function() + assert.is_true(run('$contains("Hello World", "lo")')) + assert.is_false(run('$contains("Hello World", "world")')) + end) + it("$split with regex", function() + assert.are.same({ "a", "a", "xa", "cc" }, run('$split("ababbxabbcc", /b+/)')) + assert.are.same({ "a", "b", "c" }, run('$split("a1b2c", /[0-9]/)')) + assert.are.same({ "ababbxabbcc" }, run('$split("ababbxabbcc", /d+/)')) + end) + it("$split with regex + limit", function() + assert.are.same({ "a", "a" }, run('$split("ababbxabbcc", /b+/, 2)')) + end) + it("$split with a string separator still works", function() + assert.are.same({ "Hello", "World" }, run('$split("Hello World", " ")')) + end) +end) diff --git a/src/jsonata/errors.lua b/src/jsonata/errors.lua index d5e7d69..dc843f3 100644 --- a/src/jsonata/errors.lua +++ b/src/jsonata/errors.lua @@ -12,6 +12,9 @@ local MESSAGES = { S0215 = "A 
context variable binding must precede any predicates on a step in a path expression", S0216 = "A context variable binding must precede the order-by clause on a step in a path expression", S0217 = "The object representing the 'parent' cannot be derived from this expression", + S0301 = "Empty regular expressions are not allowed", + S0302 = "No terminating / in regular expression", + S0303 = "Invalid regular expression: {{value}}", S0401 = "Type parameters can only be applied to functions and arrays", S0402 = "Choice groups containing parameterized types are not supported", -- Type errors @@ -33,6 +36,7 @@ local MESSAGES = { T2012 = "The delete clause of the transform expression must evaluate to an array of strings", -- Dynamic / runtime errors D1001 = "Number out of range to be formatted", + D1004 = "Regular expression matches zero length string at position {{position}}", D1002 = "Cannot negate a non-numeric value: {{value}}", D1009 = "Multiple key definitions evaluate to same key: {{value}}", D2014 = "The size of the sequence allocated by the range operator (..) must not exceed 1e7. 
Attempted to allocate {{value}}.", diff --git a/src/jsonata/evaluator.lua b/src/jsonata/evaluator.lua index 8608236..b6d1606 100644 --- a/src/jsonata/evaluator.lua +++ b/src/jsonata/evaluator.lua @@ -3,6 +3,14 @@ local errors = require("jsonata.errors") local functions = require("jsonata.functions") local sort = require("jsonata.sort") +local regexlib -- lazy +local function get_regexlib() + if regexlib == nil then + regexlib = require("jsonata.regex") + end + return regexlib +end + local M = {} -- The function-composition meta-lambda (jsonata chainAST): parsed once, evaluated @@ -502,6 +510,7 @@ local function step_is_self_contained(steps) and ( s1.type == "variable" or s1.type == "function" + or s1.type == "block" or s1.type == "path" or s1.type == "wildcard" or s1.type == "descendant" @@ -1089,6 +1098,44 @@ local function _evaluate(node, input, env) return V.NOTHING end return v + elseif t == "regex" then + local rl = get_regexlib() + node._matcher = node._matcher or rl.compile(node.source, node.flags) + local matcher = node._matcher + local source = node.source + local function closure(str, fromIndex) + if type(str) ~= "string" then + return V.NOTHING + end + local m = rl.first(matcher, str, fromIndex or 0) + if m == nil then + return V.NOTHING + end + local obj = V.object() + V.obj_set(obj, "match", m.match) + V.obj_set(obj, "start", m.start) + V.obj_set(obj, "end", m["end"]) + local groups = V.array({}) + for i = 1, #m.groups do + groups[i] = m.groups[i] + end + V.obj_set(obj, "groups", groups) + V.obj_set(obj, "next", { + _jsonata_function = true, + impl = function() + if m["end"] >= #str then + return V.NOTHING + end + local nxt = closure(str, m["end"]) + if not V.is_nothing(nxt) and V.obj_get(nxt, "match") == "" then + errors.raise("D1004", { position = 0, value = source }) + end + return nxt + end, + }) + return obj + end + return { _jsonata_function = true, regex = true, source = source, flags = node.flags, impl = closure } end errors.raise("D3001", { 
token = t }) end diff --git a/src/jsonata/functions/string.lua b/src/jsonata/functions/string.lua index 3e4dcc7..e1702aa 100644 --- a/src/jsonata/functions/string.lua +++ b/src/jsonata/functions/string.lua @@ -3,6 +3,19 @@ local H = require("jsonata.functions.helpers") local R = {} +-- Lazily reach the evaluator's apply (same pattern as higher_order.lua) to +-- avoid a load-time require cycle. Used to run a regex value against a string. +local eval +local function apply(proc, args) + eval = eval or require("jsonata.evaluator") + return eval.apply(proc, args) +end + +-- A regex literal evaluates to a callable function value tagged `regex = true`. +local function is_regex(x) + return type(x) == "table" and x._jsonata_function and x.regex +end + -- M1 scalar $string (container serialization added in Task 7). local function to_string(x) if V.is_nothing(x) then @@ -159,18 +172,20 @@ R.contains = H.def(function(s, sub) if not require_string(s, "contains", 1) then return V.NOTHING end + if is_regex(sub) then + return not V.is_nothing(apply(sub, { s })) + end require_string(sub, "contains", 2) return string.find(s, sub, 1, true) ~= nil -end, 2) +end, 2, 2, "") R.split = H.def(function(s, sep, limit) if not require_string(s, "split", 1) then return V.NOTHING end - if sep ~= nil and V.typeof(sep) ~= "string" then - H.err("T0410", { name = "split", position = 2, value = sep }) - end - if limit ~= nil then + if limit == nil or V.is_nothing(limit) then + limit = nil + else if V.typeof(limit) ~= "number" then H.err("T0410", { name = "split", position = 3, value = limit }) end @@ -179,20 +194,41 @@ R.split = H.def(function(s, sep, limit) end end local result = V.array({}) - if sep == "" then - for _, ch in ipairs(H.utf8_chars(s)) do - result[#result + 1] = ch - end - else - local pos = 1 + if is_regex(sep) then + local pos = 0 -- 0-based char index into s while true do - local i = string.find(s, sep, pos, true) - if not i then - result[#result + 1] = s:sub(pos) + local m = 
apply(sep, { string.sub(s, pos + 1) }) + if V.is_nothing(m) then break end - result[#result + 1] = s:sub(pos, i - 1) - pos = i + #sep + local mstart = pos + V.obj_get(m, "start") + local mend = pos + V.obj_get(m, "end") + result[#result + 1] = string.sub(s, pos + 1, mstart) + pos = mend + if mend == mstart then -- zero-width match guard + pos = pos + 1 + end + end + result[#result + 1] = string.sub(s, pos + 1) + else + if sep ~= nil and V.typeof(sep) ~= "string" then + H.err("T0410", { name = "split", position = 2, value = sep }) + end + if sep == "" then + for _, ch in ipairs(H.utf8_chars(s)) do + result[#result + 1] = ch + end + else + local pos = 1 + while true do + local i = string.find(s, sep, pos, true) + if not i then + result[#result + 1] = s:sub(pos) + break + end + result[#result + 1] = s:sub(pos, i - 1) + pos = i + #sep + end end end if limit ~= nil then @@ -203,7 +239,7 @@ R.split = H.def(function(s, sep, limit) return trimmed end return result -end, 2, 3) +end, 2, 3, ">") R.join = H.def(function(arr, sep) if nothing_guard(arr) then diff --git a/src/jsonata/parser.lua b/src/jsonata/parser.lua index b9b9f19..b644695 100644 --- a/src/jsonata/parser.lua +++ b/src/jsonata/parser.lua @@ -33,6 +33,8 @@ local function make_parser(source) type = t.type, value = t.value, position = t.position, + source = t.source, + flags = t.flags, } if t.type == "operator" or t.type == "keyword" then local sym = symbols[t.value] @@ -59,7 +61,7 @@ local function make_parser(source) local t = self.node self.advance() if t.nud == nil then - if t.type == "number" or t.type == "string" or t.type == "name" or t.type == "variable" then + if t.type == "number" or t.type == "string" or t.type == "name" or t.type == "variable" or t.type == "regex" then -- terminals build themselves below elseif t.type == "(end)" then errors.raise("S0203", { position = #source }) @@ -97,6 +99,8 @@ local function make_parser(source) return { type = "name", value = t.value, position = t.position } elseif 
t.type == "variable" then return { type = "variable", value = t.value, position = t.position } + elseif t.type == "regex" then + return { type = "regex", source = t.source, flags = t.flags, position = t.position } end errors.raise("S0201", { position = t.position, token = tostring(t.value) }) end diff --git a/src/jsonata/regex.lua b/src/jsonata/regex.lua new file mode 100644 index 0000000..3328377 --- /dev/null +++ b/src/jsonata/regex.lua @@ -0,0 +1,50 @@ +-- Lazy PCRE2 adapter. require("rex_pcre2") happens on first compile, so +-- non-regex programs never load it. +local M = {} + +local rex -- cached module + +local function engine() + if rex == nil then + rex = require("rex_pcre2") + end + return rex +end + +-- Compile /source/flags (jsonata flags: i, m only) into a PCRE2 matcher. +function M.compile(source, flags) + local e = engine() + local F = e.flags() + local cf = 0 + if flags:find("i", 1, true) then + cf = cf + F.CASELESS + end + if flags:find("m", 1, true) then + cf = cf + F.MULTILINE + end + local ok, matcher = pcall(e.new, source, cf) + if not ok then + error({ code = "S0303", position = 0, value = source }, 0) + end + return matcher +end + +-- First match at or after 0-based char index `from`. Returns a plain table +-- { match=, start=<0-based>, ["end"]=<0-based exclusive>, groups={...} } or nil. +-- (PCRE2 byte offsets == char offsets for ASCII; multibyte is a documented edge.) 
+function M.first(matcher, str, from) + local init = (from or 0) + 1 -- 1-based byte + local st, en, caps = matcher:tfind(str, init) + if not st then + return nil + end + local matched = (en < st) and "" or str:sub(st, en) + return { + match = matched, + start = st - 1, + ["end"] = st - 1 + #matched, + groups = caps or {}, + } +end + +return M diff --git a/src/jsonata/tokenizer.lua b/src/jsonata/tokenizer.lua index ab755df..f7c3836 100644 --- a/src/jsonata/tokenizer.lua +++ b/src/jsonata/tokenizer.lua @@ -31,7 +31,7 @@ local ESCAPES = { } function M.new(source) - return setmetatable({ src = source, pos = 1, len = #source }, Tokenizer) + return setmetatable({ src = source, pos = 1, len = #source, _prev = nil }, Tokenizer) end function Tokenizer:_peek() @@ -104,7 +104,7 @@ function Tokenizer:_read_backtick() return { type = "name", value = value, position = start } end -function Tokenizer:next() +function Tokenizer:_next_raw() self:_skip_ws() if self.pos > self.len then return nil @@ -183,4 +183,73 @@ function Tokenizer:next() errors.raise("S0201", { position = start, token = c }) end +-- A `/` is a regex when an operand is expected, division when a value precedes. 
+local VALUE_END_KEYWORDS = { ["true"] = true, ["false"] = true, ["null"] = true }
+
+-- Token types that always complete a value, so a following `/` is division.
+-- "regex" is listed because a regex literal is itself a complete operand;
+-- without it, a `/` after a regex literal would be scanned as a second regex
+-- and fail with S0302 instead of being read as the division operator.
+local VALUE_END_TYPES = { number = true, string = true, variable = true, name = true, regex = true }
+
+-- Closing brackets complete a value as well, e.g. `(1+1)/2`.
+local VALUE_END_OPERATORS = { [")"] = true, ["]"] = true, ["}"] = true }
+
+-- Classify a `/` by the token that precedes it.
+-- @param prev the previously returned token (a table with `type` and `value`),
+--   or nil when nothing has been returned yet
+-- @return true when an operand is expected (a `/` opens a regex literal),
+--   false when a value precedes (a `/` is division)
+local function operand_expected(prev)
+  if prev == nil then
+    return true
+  end
+  local t, v = prev.type, prev.value
+  if VALUE_END_TYPES[t] then
+    return false
+  end
+  if t == "operator" and VALUE_END_OPERATORS[v] then
+    return false
+  end
+  if t == "keyword" and VALUE_END_KEYWORDS[v] then
+    return false
+  end
+  return true
+end
+
+-- Return the next token, or nil once the input is exhausted.
+-- A `/` in operand position is scanned as a regex literal; every other token
+-- is produced by _next_raw. The returned token is stored in self._prev so the
+-- next call can classify a following `/`.
+function Tokenizer:next()
+  self:_skip_ws()
+  local tok
+  -- `/*` is never scanned as a regex: a pattern cannot begin with the `*`
+  -- quantifier, and JSONata uses `/*` to open a comment.
+  -- NOTE(review): assumes _next_raw (or _skip_ws) is where `/*` gets handled;
+  -- confirm against the rest of the tokenizer.
+  if
+    self.pos <= self.len
+    and self:_peek() == "/"
+    and self.src:sub(self.pos + 1, self.pos + 1) ~= "*"
+    and operand_expected(self._prev)
+  then
+    tok = self:_read_regex()
+  else
+    tok = self:_next_raw()
+  end
+  self._prev = tok
+  return tok
+end
+
+-- Scan a regex literal whose opening `/` is at self.pos.
+-- A `/` closes the literal only when it is unescaped and outside every
+-- (), [] or {} group; the `i` / `m` characters that follow are the flags.
+-- @return a token { type = "regex", source = <pattern>, flags = <flags>,
+--   position = <index of the opening `/`> }
+-- Raises S0301 when the pattern is empty and S0302 when the input ends before
+-- a closing `/` is found.
+function Tokenizer:_read_regex()
+  local start = self.pos
+  self.pos = self.pos + 1 -- consume opening '/'
+  local pat_start = self.pos
+  local depth = 0 -- nesting level of (), [] and {}
+  while self.pos <= self.len do
+    local ch = self:_peek()
+    if ch == "\\" then
+      -- Skip the backslash and the character it escapes, so an escaped `/`
+      -- or bracket neither ends the literal nor changes the depth.
+      self.pos = self.pos + 2
+    elseif ch == "/" and depth == 0 then
+      local pattern = self.src:sub(pat_start, self.pos - 1)
+      if pattern == "" then
+        errors.raise("S0301", { position = self.pos })
+      end
+      self.pos = self.pos + 1 -- consume closing '/'
+      local fstart = self.pos
+      while self.pos <= self.len do
+        local f = self:_peek()
+        if f ~= "i" and f ~= "m" then
+          break
+        end
+        self.pos = self.pos + 1
+      end
+      local flags = self.src:sub(fstart, self.pos - 1)
+      return { type = "regex", source = pattern, flags = flags, position = start }
+    else
+      if ch == "(" or ch == "[" or ch == "{" then
+        depth = depth + 1
+      elseif ch == ")" or ch == "]" or ch == "}" then
+        depth = depth - 1
+      end
+      self.pos = self.pos + 1
+    end
+  end
+  errors.raise("S0302", { position = self.pos })
+end
+
 return M