Builtin: Redo serialize.lua (#11427)

Features:

* Support for arbitrary references, including self-referencing
* Short output, references "long" strings as a bonus
* Around the same speed, potentially slower if long, short keys are present
* Properly works with NaN and inf
This commit is contained in:
Lars Müller 2022-06-11 20:00:26 +02:00 committed by GitHub
parent ba65e0ace7
commit 3eafcab64e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 352 additions and 216 deletions

@ -40,13 +40,22 @@ jobs:
luacheck: luacheck:
name: "Builtin Luacheck and Unit Tests" name: "Builtin Luacheck and Unit Tests"
runs-on: ubuntu-20.04 runs-on: ubuntu-20.04
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
- uses: leafo/gh-actions-lua@v9 - uses: leafo/gh-actions-lua@v9
with: with:
luaVersion: "5.1.5" luaVersion: "5.1.5"
- uses: leafo/gh-actions-luarocks@v4 - uses: leafo/gh-actions-luarocks@v4
- name: Install LuaJIT
run: |
cd $HOME
git clone https://github.com/LuaJIT/LuaJIT/
cd LuaJIT
make -j$(nproc)
- name: Install luarocks tools - name: Install luarocks tools
run: | run: |
luarocks install --local luacheck luarocks install --local luacheck
@ -56,6 +65,7 @@ jobs:
run: | run: |
$HOME/.luarocks/bin/luacheck builtin $HOME/.luarocks/bin/luacheck builtin
$HOME/.luarocks/bin/busted builtin $HOME/.luarocks/bin/busted builtin
$HOME/.luarocks/bin/busted builtin --lua=$HOME/LuaJIT/src/luajit
- name: Run checks (devtest) - name: Run checks (devtest)
run: | run: |

@ -1,205 +1,214 @@
--- Lua module to serialize values as Lua code. --- Lua module to serialize values as Lua code.
-- From: https://github.com/fab13n/metalua/blob/no-dll/src/lib/serialize.lua -- From: https://github.com/appgurueu/modlib/blob/master/luon.lua
-- License: MIT -- License: MIT
-- @copyright 2006-2997 Fabien Fleutot <metalua@gmail.com>
-- @author Fabien Fleutot <metalua@gmail.com>
-- @author ShadowNinja <shadowninja@minetest.net>
--------------------------------------------------------------------------------
--- Serialize an object into a source code string. This string, when passed as local next, rawget, pairs, pcall, error, type, setfenv, loadstring
-- an argument to deserialize(), returns an object structurally identical to = next, rawget, pairs, pcall, error, type, setfenv, loadstring
-- the original one. The following are currently supported:
-- * Booleans, numbers, strings, and nil.
-- * Functions; uses interpreter-dependent (and sometimes platform-dependent) bytecode!
-- * Tables; they can cantain multiple references and can be recursive, but metatables aren't saved.
-- This works in two phases:
-- 1. Recursively find and record multiple references and recursion.
-- 2. Recursively dump the value into a string.
-- @param x Value to serialize (nil is allowed).
-- @return load()able string containing the value.
function core.serialize(x)
local local_index = 1 -- Top index of the "_" local table in the dump
-- table->nil/1/2 set of tables seen.
-- nil = not seen, 1 = seen once, 2 = seen multiple times.
local seen = {}
-- nest_points are places where a table appears within itself, directly local table_concat, string_dump, string_format, string_match, math_huge
-- or not. For instance, all of these chunks create nest points in = table.concat, string.dump, string.format, string.match, math.huge
-- table x: "x = {}; x[x] = 1", "x = {}; x[1] = x",
-- "x = {}; x[1] = {y = {x}}". -- Recursively counts occurences of objects (non-primitives including strings) in a table.
-- To handle those, two tables are used by mark_nest_point: local function count_objects(value)
-- * nested - Transient set of tables being currently traversed. local counts = {}
-- Used for detecting nested tables. if value == nil then
-- * nest_points - parent->{key=value, ...} table cantaining the nested -- Early return for nil; tables can't contain nil
-- keys and values in the parent. They're all dumped after all the return counts
-- other table operations have been performed.
--
-- mark_nest_point(p, k, v) fills nest_points with information required
-- to remember that key/value (k, v) creates a nest point in table
-- parent. It also marks "parent" and the nested item(s) as occuring
-- multiple times, since several references to it will be required in
-- order to patch the nest points.
local nest_points = {}
local nested = {}
local function mark_nest_point(parent, k, v)
local nk, nv = nested[k], nested[v]
local np = nest_points[parent]
if not np then
np = {}
nest_points[parent] = np
end end
np[k] = v local function count_values(val)
seen[parent] = 2 local type_ = type(val)
if nk then seen[k] = 2 end if type_ == "boolean" or type_ == "number" then
if nv then seen[v] = 2 end
end
-- First phase, list the tables and functions which appear more than
-- once in x.
local function mark_multiple_occurences(x)
local tp = type(x)
if tp ~= "table" and tp ~= "function" then
-- No identity (comparison is done by value, not by instance)
return return
end end
if seen[x] == 1 then local count = counts[val]
seen[x] = 2 counts[val] = (count or 0) + 1
elseif seen[x] ~= 2 then if type_ == "table" then
seen[x] = 1 if not count then
for k, v in pairs(val) do
count_values(k)
count_values(v)
end
end
elseif type_ ~= "string" and type_ ~= "function" then
error("unsupported type: " .. type_)
end
end
count_values(value)
return counts
end end
if tp == "table" then -- Build a "set" of Lua keywords. These can't be used as short key names.
nested[x] = true -- See https://www.lua.org/manual/5.1/manual.html#2.1
for k, v in pairs(x) do local keywords = {}
if nested[k] or nested[v] then for _, keyword in pairs({
mark_nest_point(x, k, v) "and", "break", "do", "else", "elseif",
"end", "false", "for", "function", "if",
"in", "local", "nil", "not", "or",
"repeat", "return", "then", "true", "until", "while",
"goto" -- LuaJIT, Lua 5.2+
}) do
keywords[keyword] = true
end
local function quote(string)
return string_format("%q", string)
end
local function dump_func(func)
return string_format("loadstring(%q)", string_dump(func))
end
-- Serializes Lua nil, booleans, numbers, strings, tables and even functions
-- Tables are referenced by reference, strings are referenced by value. Supports circular tables.
local function serialize(value, write)
local reference, refnum = "r1", 1
-- [object] = reference string
local references = {}
-- Circular tables that must be filled using `table[key] = value` statements
local to_fill = {}
for object, count in pairs(count_objects(value)) do
local type_ = type(object)
-- Object must appear more than once. If it is a string, the reference has to be shorter than the string.
if count >= 2 and (type_ ~= "string" or #reference + 2 < #object) then
write(reference)
write("=")
if type_ == "table" then
write("{}")
elseif type_ == "function" then
write(dump_func(object))
elseif type_ == "string" then
write(quote(object))
end
write(";")
references[object] = reference
if type_ == "table" then
to_fill[object] = reference
end
refnum = refnum + 1
reference = ("r%X"):format(refnum)
end
end
-- Used to decide whether we should do "key=..."
local function use_short_key(key)
return not references[key] and type(key) == "string" and (not keywords[key]) and string_match(key, "^[%a_][%a%d_]*$")
end
local function dump(value)
-- Primitive types
if value == nil then
return write("nil")
end
if value == true then
return write("true")
end
if value == false then
return write("false")
end
local type_ = type(value)
if type_ == "number" then
return write(string_format("%.17g", value))
end
-- Reference types: table, function and string
local ref = references[value]
if ref then
return write(ref)
end
if type_ == "string" then
return write(quote(value))
end
if type_ == "function" then
return write(dump_func(value))
end
if type_ == "table" then
write("{")
-- First write list keys:
-- Don't use the table length #value here as it may horribly fail
-- for tables which use large integers as keys in the hash part;
-- stop at the first "hole" (nil value) instead
local len = 0
local first = true -- whether this is the first entry, which may not have a leading comma
while true do
local v = rawget(value, len + 1) -- use rawget to avoid metatables like the vector metatable
if v == nil then break end
if first then first = false else write(",") end
dump(v)
len = len + 1
end
-- Now write map keys ([key] = value)
for k, v in next, value do
-- We have written all non-float keys in [1, len] already
if type(k) ~= "number" or k % 1 ~= 0 or k < 1 or k > len then
if first then first = false else write(",") end
if use_short_key(k) then
write(k)
else else
mark_multiple_occurences(k) write("[")
mark_multiple_occurences(v) dump(k)
write("]")
end
write("=")
dump(v)
end end
end end
nested[x] = nil write("}")
return
end end
end end
-- Write the statements to fill circular tables
local dumped = {} -- object->varname set for table, ref in pairs(to_fill) do
local local_defs = {} -- Dumped local definitions as source code lines for k, v in pairs(table) do
write(ref)
-- Mutually recursive local functions: if use_short_key(k) then
local dump_val, dump_or_ref_val write(".")
write(k)
-- If x occurs multiple times, dump the local variable rather than
-- the value. If it's the first time it's dumped, also dump the
-- content in local_defs.
function dump_or_ref_val(x)
if seen[x] ~= 2 then
return dump_val(x)
end
local var = dumped[x]
if var then -- Already referenced
return var
end
-- First occurence, create and register reference
local val = dump_val(x)
local i = local_index
local_index = local_index + 1
var = "_["..i.."]"
local_defs[#local_defs + 1] = var.." = "..val
dumped[x] = var
return var
end
-- Second phase. Dump the object; subparts occuring multiple times
-- are dumped in local variables which can be referenced multiple
-- times. Care is taken to dump local vars in a sensible order.
function dump_val(x)
local tp = type(x)
if x == nil then return "nil"
elseif tp == "string" then return string.format("%q", x)
elseif tp == "boolean" then return x and "true" or "false"
elseif tp == "function" then
return string.format("loadstring(%q)", string.dump(x))
elseif tp == "number" then
-- Serialize numbers reversibly with string.format
return string.format("%.17g", x)
elseif tp == "table" then
local vals = {}
local idx_dumped = {}
local np = nest_points[x]
for i, v in ipairs(x) do
if not np or not np[i] then
vals[#vals + 1] = dump_or_ref_val(v)
end
idx_dumped[i] = true
end
for k, v in pairs(x) do
if (not np or not np[k]) and
not idx_dumped[k] then
vals[#vals + 1] = "["..dump_or_ref_val(k).."] = "
..dump_or_ref_val(v)
end
end
return "{"..table.concat(vals, ", ").."}"
else else
error("Can't serialize data of type "..tp) write("[")
dump(k)
write("]")
end end
write("=")
dump(v)
write(";")
end
end
write("return ")
dump(value)
end end
local function dump_nest_points() function core.serialize(value)
for parent, vals in pairs(nest_points) do local rope = {}
for k, v in pairs(vals) do serialize(value, function(text)
local_defs[#local_defs + 1] = dump_or_ref_val(parent) -- Faster than table.insert(rope, text) on PUC Lua 5.1
.."["..dump_or_ref_val(k).."] = " rope[#rope + 1] = text
..dump_or_ref_val(v) end)
end return table_concat(rope)
end
end
mark_multiple_occurences(x)
local top_level = dump_or_ref_val(x)
dump_nest_points()
if next(local_defs) then
return "local _ = {}\n"
..table.concat(local_defs, "\n")
.."\nreturn "..top_level
else
return "return "..top_level
end
end
-- Deserialization
local function safe_loadstring(...)
local func, err = loadstring(...)
if func then
setfenv(func, {})
return func
end
return nil, err
end end
local function dummy_func() end local function dummy_func() end
local nan = (0/0)^1 -- +nan
function core.deserialize(str, safe) function core.deserialize(str, safe)
if type(str) ~= "string" then local func, err = loadstring(str)
return nil, "Cannot deserialize type '"..type(str) if not func then return nil, err end
.."'. Argument must be a string."
end
if str:byte(1) == 0x1B then
return nil, "Bytecode prohibited"
end
local f, err = loadstring(str)
if not f then return nil, err end
-- The environment is recreated every time so deseralized code cannot -- math.huge is serialized to inf, NaNs are serialized to nan by Lua
-- pollute it with permanent references. local env = {inf = math_huge, nan = nan}
setfenv(f, {loadstring = safe and dummy_func or safe_loadstring}) if safe then
env.loadstring = dummy_func
local good, data = pcall(f)
if good then
return data
else else
return nil, data env.loadstring = function(str, ...)
local func, err = loadstring(str, ...)
if func then
setfenv(func, env)
return func
end
return nil, err
end end
end end
setfenv(func, env)
local success, value_or_err = pcall(func)
if success then
return value_or_err
end
return nil, value_or_err
end

@ -6,38 +6,92 @@ _G.setfenv = require 'busted.compatibility'.setfenv
dofile("builtin/common/serialize.lua") dofile("builtin/common/serialize.lua")
dofile("builtin/common/vector.lua") dofile("builtin/common/vector.lua")
describe("serialize", function() -- Supports circular tables; does not support table keys
it("works", function() -- Correctly checks whether a mapping of references ("same") exists
local test_in = {cat={sound="nyan", speed=400}, dog={sound="woof"}} -- Is significantly more efficient than assert.same
local test_out = core.deserialize(core.serialize(test_in)) local function assert_same(a, b, same)
same = same or {}
if same[a] or same[b] then
assert(same[a] == b and same[b] == a)
return
end
if a == b then
return
end
if type(a) ~= "table" or type(b) ~= "table" then
assert(a == b)
return
end
same[a] = b
same[b] = a
local count = 0
for k, v in pairs(a) do
count = count + 1
assert(type(k) ~= "table")
assert_same(v, b[k], same)
end
for _ in pairs(b) do
count = count - 1
end
assert(count == 0)
end
assert.same(test_in, test_out) local x, y = {}, {}
local t1, t2 = {x, x, y, y}, {x, y, x, y}
assert.same(t1, t2) -- will succeed because it only checks whether the depths match
assert(not pcall(assert_same, t1, t2)) -- will correctly fail because it checks whether the refs match
describe("serialize", function()
local function assert_preserves(value)
local preserved_value = core.deserialize(core.serialize(value))
assert_same(value, preserved_value)
end
it("works", function()
assert_preserves({cat={sound="nyan", speed=400}, dog={sound="woof"}})
end) end)
it("handles characters", function() it("handles characters", function()
local test_in = {escape_chars="\n\r\t\v\\\"\'", non_european="θשׁ٩∂"} assert_preserves({escape_chars="\n\r\t\v\\\"\'", non_european="θשׁ٩∂"})
local test_out = core.deserialize(core.serialize(test_in)) end)
assert.same(test_in, test_out)
it("handles NaN & infinities", function()
local nan = core.deserialize(core.serialize(0/0))
assert(nan ~= nan)
assert_preserves(math.huge)
assert_preserves(-math.huge)
end) end)
it("handles precise numbers", function() it("handles precise numbers", function()
local test_in = 0.2695949158945771 assert_preserves(0.2695949158945771)
local test_out = core.deserialize(core.serialize(test_in))
assert.same(test_in, test_out)
end) end)
it("handles big integers", function() it("handles big integers", function()
local test_in = 269594915894577 assert_preserves(269594915894577)
local test_out = core.deserialize(core.serialize(test_in))
assert.same(test_in, test_out)
end) end)
it("handles recursive structures", function() it("handles recursive structures", function()
local test_in = { hello = "world" } local test_in = { hello = "world" }
test_in.foo = test_in test_in.foo = test_in
assert_preserves(test_in)
end)
local test_out = core.deserialize(core.serialize(test_in)) it("handles cross-referencing structures", function()
assert.same(test_in, test_out) local test_in = {
foo = {
baz = {
{}
},
},
bar = {
baz = {},
},
}
test_in.foo.baz[1].foo = test_in.foo
test_in.foo.baz[1].bar = test_in.bar
test_in.bar.baz[1] = test_in.foo.baz[1]
assert_preserves(test_in)
end) end)
it("strips functions in safe mode", function() it("strips functions in safe mode", function()
@ -47,6 +101,7 @@ describe("serialize", function()
end, end,
foo = "bar" foo = "bar"
} }
setfenv(test_in.func, _G)
local str = core.serialize(test_in) local str = core.serialize(test_in)
assert.not_nil(str:find("loadstring")) assert.not_nil(str:find("loadstring"))
@ -58,13 +113,75 @@ describe("serialize", function()
it("vectors work", function() it("vectors work", function()
local v = vector.new(1, 2, 3) local v = vector.new(1, 2, 3)
assert.same({{x = 1, y = 2, z = 3}}, core.deserialize(core.serialize({v}))) assert_preserves({v})
assert.same({x = 1, y = 2, z = 3}, core.deserialize(core.serialize(v))) assert_preserves(v)
-- abuse -- abuse
v = vector.new(1, 2, 3) v = vector.new(1, 2, 3)
v.a = "bla" v.a = "bla"
assert.same({x = 1, y = 2, z = 3, a = "bla"}, assert_preserves(v)
core.deserialize(core.serialize(v))) end)
it("handles keywords as keys", function()
assert_preserves({["and"] = "keyword", ["for"] = "keyword"})
end)
describe("fuzzing", function()
local atomics = {true, false, math.huge, -math.huge} -- no NaN or nil
local function atomic()
return atomics[math.random(1, #atomics)]
end
local function num()
local sign = math.random() < 0.5 and -1 or 1
local val = math.random(0, 2^52)
local exp = math.random() < 0.5 and 1 or 2^(math.random(-120, 120))
return sign * val * exp
end
local function charcodes(count)
if count == 0 then return end
return math.random(0, 0xFF), charcodes(count - 1)
end
local function str()
return string.char(charcodes(math.random(0, 100)))
end
local primitives = {atomic, num, str}
local function primitive()
return primitives[math.random(1, #primitives)]()
end
local function tab(max_actions)
local root = {}
local tables = {root}
local function random_table()
return tables[#tables == 1 and 1 or math.random(1, #tables)] -- luacheck: ignore
end
for _ = 1, math.random(1, max_actions) do
local tab = random_table()
local value
if math.random() < 0.5 then
if math.random() < 0.5 then
value = random_table()
else
value = {}
table.insert(tables, value)
end
else
value = primitive()
end
tab[math.random() < 0.5 and (#tab + 1) or primitive()] = value
end
return root
end
it("primitives work", function()
for _ = 1, 1e3 do
assert_preserves(primitive())
end
end)
it("tables work", function()
for _ = 1, 100 do
local fuzzed_table = tab(1e3)
assert_same(fuzzed_table, table.copy(fuzzed_table))
assert_preserves(fuzzed_table)
end
end)
end) end)
end) end)