Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,16 @@ jobs:
- name: Set up LuaRocks
uses: leafo/gh-actions-luarocks@v4

- name: Install dev dependencies (busted, lua-cjson)
# libpcre2-dev provides the PCRE2 C library that lrexlib-pcre2 links against
# (the regex engine for $match/$replace/$split/$contains, lazy-loaded).
- name: Install the LuaJIT interpreter + PCRE2
run: sudo apt-get update && sudo apt-get install -y luajit libpcre2-dev

- name: Install dev dependencies (busted, lua-cjson, lrexlib-pcre2)
run: |
luarocks install busted
luarocks install lua-cjson

- name: Install the LuaJIT interpreter
run: sudo apt-get update && sudo apt-get install -y luajit
luarocks install lrexlib-pcre2

# Invoke busted's runner directly under LuaJIT (this is exactly what
# busted's own bin/busted does) so the whole suite — unit tests + the
Expand Down
1 change: 1 addition & 0 deletions jsonata-scm-1.rockspec
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ description = {
}
dependencies = {
"lua >= 5.1",
"lrexlib-pcre2",
}
test_dependencies = {
"busted",
Expand Down
10 changes: 10 additions & 0 deletions spec/jsonata-suite/baseline.lua
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ return {
["function-applications/case018"] = true,
["function-applications/case019"] = true,
["function-applications/case020"] = true,
["function-applications/case021"] = true,
["function-assert/case000"] = true,
["function-assert/case001"] = true,
["function-assert/case002"] = true,
Expand Down Expand Up @@ -604,6 +605,7 @@ return {
["function-shuffle/case003"] = true,
["function-sift/case000"] = true,
["function-sift/case001"] = true,
["function-sift/case002"] = true,
["function-sift/case003"] = true,
["function-sift/case004"] = true,
["function-signatures/case000"] = true,
Expand Down Expand Up @@ -1049,6 +1051,13 @@ return {
["range-operator/case020"] = true,
["range-operator/case022"] = true,
["range-operator/case023"] = true,
["regex/case000"] = true,
["regex/case001"] = true,
["regex/case002"] = true,
["regex/case003"] = true,
["regex/case004"] = true,
["regex/case005"] = true,
["regex/case006"] = true,
["regex/case022"] = true,
["regex/case035"] = true,
["regex/case036"] = true,
Expand Down Expand Up @@ -1252,4 +1261,5 @@ return {
["wildcards/case009"] = true,
["wildcards/case010/0"] = true,
["wildcards/case010/1"] = true,
["wildcards/case010/2"] = true,
}
77 changes: 77 additions & 0 deletions spec/regex_spec.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
local jsonata = require("jsonata")
local parser = require("jsonata.parser")
--- Compile a JSONata expression and evaluate it against an input document.
-- @param src   expression source text handed to jsonata.compile
-- @param input value passed through to the compiled expression's evaluate
-- @return whatever evaluate returns for that input
local function run(src, input)
  local compiled = jsonata.compile(src)
  return compiled:evaluate(input)
end

-- Parser-level checks for regex literals: node shape, flags, telling a regex
-- `/` apart from the division operator, and the S0301/S0302 syntax errors.
describe("M7a: regex literal lexing + / disambiguation", function()
  -- Parse `src`, require that it raises, and hand back the raised error value.
  local function parse_failure(src)
    local ok, err = pcall(parser.parse, src)
    assert.is_false(ok)
    return err
  end

  it("parses /ab+/ as a regex node", function()
    local node = parser.parse("/ab+/")
    assert.are.equal("regex", node.type)
    assert.are.equal("ab+", node.source)
    assert.are.equal("", node.flags)
  end)

  it("captures i/m flags", function()
    local node = parser.parse("/ab+/i")
    assert.are.equal("i", node.flags)
  end)

  it("/ after a value is division, not a regex", function()
    -- Each case: expected quotient, expression, optional input document.
    local cases = {
      { want = 2, src = "4/2" },
      { want = 5, src = "($x := 10; $x/2)" },
      { want = 5, src = "x/2", input = { x = 10 } },
      { want = 1, src = "(1+1)/2" },
    }
    for _, case in ipairs(cases) do
      assert.are.equal(case.want, run(case.src, case.input))
    end
  end)

  it("empty // raises S0301", function()
    local err = parse_failure("//")
    assert.are.equal("S0301", err.code)
  end)

  it("unterminated /re raises S0302", function()
    local err = parse_failure("/unterminated")
    assert.are.equal("S0302", err.code)
  end)

  it("a closing / inside [..] does not end the regex (depth tracking)", function()
    local node = parser.parse("/[/]/")
    assert.are.equal("[/]", node.source)
  end)
end)

-- Evaluation-level checks: a regex literal evaluates to a callable value whose
-- result exposes `match` and `groups`, honours the `i` flag, and yields
-- nothing (nil from run) when the subject does not match.
describe("M7a: regex value (callable function)", function()
  it("a regex is a function value", function()
    local expr = "$type(/b+/)"
    assert.are.equal("function", run(expr))
  end)

  it("applying a regex returns a match object", function()
    local expr = '($m := /b+/; $m("abbc")).match'
    assert.are.equal("bb", run(expr))
  end)

  it("captures appear in groups (0-indexed array access)", function()
    local expr = '($m := /a(b+)/; $m("xabbc")).groups[0]'
    assert.are.equal("bb", run(expr))
  end)

  it("the i flag is case-insensitive", function()
    local expr = '($m := /hat/i; $m("a Hat here")).match'
    assert.are.equal("Hat", run(expr))
  end)

  it("no match returns undefined", function()
    local expr = '($m := /z+/; $m("abc"))'
    assert.is_nil(run(expr))
  end)
end)

-- $contains and $split must accept a regex as their second argument while
-- keeping the plain-string behaviour unchanged.
describe("M7a: $contains / $split with regex", function()
  -- Assert each { src, found } pair evaluates to exactly true or false.
  local function check_contains(cases)
    for _, case in ipairs(cases) do
      if case.found then
        assert.is_true(run(case.src))
      else
        assert.is_false(run(case.src))
      end
    end
  end

  -- Assert each { src, parts } pair evaluates to a deep-equal array.
  local function check_split(cases)
    for _, case in ipairs(cases) do
      assert.are.same(case.parts, run(case.src))
    end
  end

  it("$contains with regex", function()
    check_contains({
      { src = '$contains("ababbxabbcc", /ab+/)', found = true },
      { src = '$contains("ababbxabbcc", /ax+/)', found = false },
      { src = '$contains("a Hat", /hat/i)', found = true },
    })
  end)

  it("$contains with a string still works", function()
    check_contains({
      { src = '$contains("Hello World", "lo")', found = true },
      { src = '$contains("Hello World", "world")', found = false },
    })
  end)

  it("$split with regex", function()
    check_split({
      { src = '$split("ababbxabbcc", /b+/)', parts = { "a", "a", "xa", "cc" } },
      { src = '$split("a1b2c", /[0-9]/)', parts = { "a", "b", "c" } },
      { src = '$split("ababbxabbcc", /d+/)', parts = { "ababbxabbcc" } },
    })
  end)

  it("$split with regex + limit", function()
    check_split({
      { src = '$split("ababbxabbcc", /b+/, 2)', parts = { "a", "a" } },
    })
  end)

  it("$split with a string separator still works", function()
    check_split({
      { src = '$split("Hello World", " ")', parts = { "Hello", "World" } },
    })
  end)
end)
4 changes: 4 additions & 0 deletions src/jsonata/errors.lua
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ local MESSAGES = {
S0215 = "A context variable binding must precede any predicates on a step in a path expression",
S0216 = "A context variable binding must precede the order-by clause on a step in a path expression",
S0217 = "The object representing the 'parent' cannot be derived from this expression",
S0301 = "Empty regular expressions are not allowed",
S0302 = "No terminating / in regular expression",
S0303 = "Invalid regular expression: {{value}}",
S0401 = "Type parameters can only be applied to functions and arrays",
S0402 = "Choice groups containing parameterized types are not supported",
-- Type errors
Expand All @@ -33,6 +36,7 @@ local MESSAGES = {
T2012 = "The delete clause of the transform expression must evaluate to an array of strings",
-- Dynamic / runtime errors
D1001 = "Number out of range to be formatted",
D1004 = "Regular expression matches zero length string at position {{position}}",
D1002 = "Cannot negate a non-numeric value: {{value}}",
D1009 = "Multiple key definitions evaluate to same key: {{value}}",
D2014 = "The size of the sequence allocated by the range operator (..) must not exceed 1e7. Attempted to allocate {{value}}.",
Expand Down
47 changes: 47 additions & 0 deletions src/jsonata/evaluator.lua
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@ local errors = require("jsonata.errors")
local functions = require("jsonata.functions")
local sort = require("jsonata.sort")

-- Cached handle to the jsonata.regex module; stays nil until first requested.
local regexlib

--- Return the jsonata.regex module, requiring it on the first call only.
-- Later calls hand back the cached value without calling require again.
local function get_regexlib()
  if regexlib ~= nil then
    return regexlib
  end
  regexlib = require("jsonata.regex")
  return regexlib
end

local M = {}

-- The function-composition meta-lambda (jsonata chainAST): parsed once, evaluated
Expand Down Expand Up @@ -502,6 +510,7 @@ local function step_is_self_contained(steps)
and (
s1.type == "variable"
or s1.type == "function"
or s1.type == "block"
or s1.type == "path"
or s1.type == "wildcard"
or s1.type == "descendant"
Expand Down Expand Up @@ -1089,6 +1098,44 @@ local function _evaluate(node, input, env)
return V.NOTHING
end
return v
elseif t == "regex" then
local rl = get_regexlib()
node._matcher = node._matcher or rl.compile(node.source, node.flags)
local matcher = node._matcher
local source = node.source
local function closure(str, fromIndex)
if type(str) ~= "string" then
return V.NOTHING
end
local m = rl.first(matcher, str, fromIndex or 0)
if m == nil then
return V.NOTHING
end
local obj = V.object()
V.obj_set(obj, "match", m.match)
V.obj_set(obj, "start", m.start)
V.obj_set(obj, "end", m["end"])
local groups = V.array({})
for i = 1, #m.groups do
groups[i] = m.groups[i]
end
V.obj_set(obj, "groups", groups)
V.obj_set(obj, "next", {
_jsonata_function = true,
impl = function()
if m["end"] >= #str then
return V.NOTHING
end
local nxt = closure(str, m["end"])
if not V.is_nothing(nxt) and V.obj_get(nxt, "match") == "" then
errors.raise("D1004", { position = 0, value = source })
end
return nxt
end,
})
return obj
end
return { _jsonata_function = true, regex = true, source = source, flags = node.flags, impl = closure }
end
errors.raise("D3001", { token = t })
end
Expand Down
70 changes: 53 additions & 17 deletions src/jsonata/functions/string.lua
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@ local H = require("jsonata.functions.helpers")

local R = {}

-- The evaluator module is resolved on first use rather than at load time, so
-- this file never requires it while loading (avoids a require cycle; the same
-- approach is used in higher_order.lua). `apply` is how a regex value is run
-- against a string.
local eval

--- Invoke a JSONata function value through the evaluator.
-- @param proc function value to call
-- @param args array of arguments forwarded to the call
-- @return the result(s) of eval.apply(proc, args)
local function apply(proc, args)
  if not eval then
    eval = require("jsonata.evaluator")
  end
  return eval.apply(proc, args)
end

--- Test whether a value is a regex function value.
-- Regex literals evaluate to a callable function table carrying
-- `_jsonata_function` and the extra tag `regex = true`.
-- @param x any value
-- @return false for non-tables; otherwise the falsy `_jsonata_function`
--         field if the table is not a function value, else its `regex` field
local function is_regex(x)
  if type(x) ~= "table" then
    return false
  end
  local callable = x._jsonata_function
  if not callable then
    return callable
  end
  return x.regex
end

-- M1 scalar $string (container serialization added in Task 7).
local function to_string(x)
if V.is_nothing(x) then
Expand Down Expand Up @@ -159,18 +172,20 @@ R.contains = H.def(function(s, sub)
if not require_string(s, "contains", 1) then
return V.NOTHING
end
if is_regex(sub) then
return not V.is_nothing(apply(sub, { s }))
end
require_string(sub, "contains", 2)
return string.find(s, sub, 1, true) ~= nil
end, 2)
end, 2, 2, "<s-(sf):b>")

R.split = H.def(function(s, sep, limit)
if not require_string(s, "split", 1) then
return V.NOTHING
end
if sep ~= nil and V.typeof(sep) ~= "string" then
H.err("T0410", { name = "split", position = 2, value = sep })
end
if limit ~= nil then
if limit == nil or V.is_nothing(limit) then
limit = nil
else
if V.typeof(limit) ~= "number" then
H.err("T0410", { name = "split", position = 3, value = limit })
end
Expand All @@ -179,20 +194,41 @@ R.split = H.def(function(s, sep, limit)
end
end
local result = V.array({})
if sep == "" then
for _, ch in ipairs(H.utf8_chars(s)) do
result[#result + 1] = ch
end
else
local pos = 1
if is_regex(sep) then
local pos = 0 -- 0-based char index into s
while true do
local i = string.find(s, sep, pos, true)
if not i then
result[#result + 1] = s:sub(pos)
local m = apply(sep, { string.sub(s, pos + 1) })
if V.is_nothing(m) then
break
end
result[#result + 1] = s:sub(pos, i - 1)
pos = i + #sep
local mstart = pos + V.obj_get(m, "start")
local mend = pos + V.obj_get(m, "end")
result[#result + 1] = string.sub(s, pos + 1, mstart)
pos = mend
if mend == mstart then -- zero-width match guard
pos = pos + 1
end
end
result[#result + 1] = string.sub(s, pos + 1)
else
if sep ~= nil and V.typeof(sep) ~= "string" then
H.err("T0410", { name = "split", position = 2, value = sep })
end
if sep == "" then
for _, ch in ipairs(H.utf8_chars(s)) do
result[#result + 1] = ch
end
else
local pos = 1
while true do
local i = string.find(s, sep, pos, true)
if not i then
result[#result + 1] = s:sub(pos)
break
end
result[#result + 1] = s:sub(pos, i - 1)
pos = i + #sep
end
end
end
if limit ~= nil then
Expand All @@ -203,7 +239,7 @@ R.split = H.def(function(s, sep, limit)
return trimmed
end
return result
end, 2, 3)
end, 2, 3, "<s-(sf)n?:a<s>>")

R.join = H.def(function(arr, sep)
if nothing_guard(arr) then
Expand Down
6 changes: 5 additions & 1 deletion src/jsonata/parser.lua
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ local function make_parser(source)
type = t.type,
value = t.value,
position = t.position,
source = t.source,
flags = t.flags,
}
if t.type == "operator" or t.type == "keyword" then
local sym = symbols[t.value]
Expand All @@ -59,7 +61,7 @@ local function make_parser(source)
local t = self.node
self.advance()
if t.nud == nil then
if t.type == "number" or t.type == "string" or t.type == "name" or t.type == "variable" then
if t.type == "number" or t.type == "string" or t.type == "name" or t.type == "variable" or t.type == "regex" then
-- terminals build themselves below
elseif t.type == "(end)" then
errors.raise("S0203", { position = #source })
Expand Down Expand Up @@ -97,6 +99,8 @@ local function make_parser(source)
return { type = "name", value = t.value, position = t.position }
elseif t.type == "variable" then
return { type = "variable", value = t.value, position = t.position }
elseif t.type == "regex" then
return { type = "regex", source = t.source, flags = t.flags, position = t.position }
end
errors.raise("S0201", { position = t.position, token = tostring(t.value) })
end
Expand Down
Loading
Loading