mirror of
https://github.com/appgurueu/modlib.git
synced 2024-11-22 07:13:45 +01:00
Fix JSON reader surrogate pair handling
This commit is contained in:
parent
2a6846c9a2
commit
9664be450e
38
json.lua
38
json.lua
@ -5,6 +5,7 @@ local _ENV = {}
|
|||||||
setfenv(1, _ENV)
|
setfenv(1, _ENV)
|
||||||
|
|
||||||
--! experimental
|
--! experimental
|
||||||
|
-- See https://tools.ietf.org/id/draft-ietf-json-rfc4627bis-09.html#unichars and https://json.org
|
||||||
|
|
||||||
-- Null
|
-- Null
|
||||||
-- TODO consider using userdata (for ex. by using newproxy)
|
-- TODO consider using userdata (for ex. by using newproxy)
|
||||||
@ -100,6 +101,7 @@ for i = 0, 5 do
|
|||||||
end
|
end
|
||||||
|
|
||||||
-- TODO SAX vs DOM
|
-- TODO SAX vs DOM
|
||||||
|
local utf8 = modlib.text.utf8
|
||||||
function read(self, read_)
|
function read(self, read_)
|
||||||
local index = 0
|
local index = 0
|
||||||
local char
|
local char
|
||||||
@ -146,27 +148,51 @@ function read(self, read_)
|
|||||||
read()
|
read()
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
local function utf8_codepoint(codepoint)
|
||||||
|
return syntax_assert(utf8(codepoint), "invalid codepoint")
|
||||||
|
end
|
||||||
local function string()
|
local function string()
|
||||||
local chars = {}
|
local chars = {}
|
||||||
|
local high_surrogate
|
||||||
while true do
|
while true do
|
||||||
|
local string_char, next_high_surrogate
|
||||||
if char == '"' then
|
if char == '"' then
|
||||||
|
if high_surrogate then
|
||||||
|
table_insert(chars, utf8_codepoint(high_surrogate))
|
||||||
|
end
|
||||||
return table_concat(chars)
|
return table_concat(chars)
|
||||||
end
|
end
|
||||||
if char == "\\" then
|
if char == "\\" then
|
||||||
read()
|
read()
|
||||||
if char == "u" then
|
if char == "u" then
|
||||||
local num = 0
|
local codepoint = 0
|
||||||
for i = 3, 0, -1 do
|
for i = 3, 0, -1 do
|
||||||
num = syntax_assert(hex_digit_values[read()], "expected a hex digit") * (16 ^ i) + num
|
codepoint = syntax_assert(hex_digit_values[read()], "expected a hex digit") * (16 ^ i) + codepoint
|
||||||
|
end
|
||||||
|
if high_surrogate and codepoint >= 0xDC00 and codepoint <= 0xDFFF then
|
||||||
|
-- TODO strict mode: throw an error for single surrogates
|
||||||
|
codepoint = 0x10000 + (high_surrogate - 0xD800) * 0x400 + codepoint - 0xDC00
|
||||||
|
-- Don't write the high surrogate
|
||||||
|
high_surrogate = nil
|
||||||
|
end
|
||||||
|
if codepoint >= 0xD800 and codepoint <= 0xDBFF then
|
||||||
|
next_high_surrogate = codepoint
|
||||||
|
else
|
||||||
|
string_char = utf8_codepoint(codepoint)
|
||||||
end
|
end
|
||||||
table_insert(chars, syntax_assert(modlib.text.utf8(num), "invalid codepoint"))
|
|
||||||
else
|
else
|
||||||
table_insert(chars, syntax_assert(decoding_escapes[char], "invalid escape sequence"))
|
string_char = syntax_assert(decoding_escapes[char], "invalid escape sequence")
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
syntax_assert(char, "unclosed string")
|
|
||||||
-- TODO check whether the character is one that must be escaped ("strict" mode)
|
-- TODO check whether the character is one that must be escaped ("strict" mode)
|
||||||
table_insert(chars, char)
|
string_char = syntax_assert(char, "unclosed string")
|
||||||
|
end
|
||||||
|
if high_surrogate then
|
||||||
|
table_insert(chars, utf8_codepoint(high_surrogate))
|
||||||
|
end
|
||||||
|
high_surrogate = next_high_surrogate
|
||||||
|
if string_char then
|
||||||
|
table_insert(chars, string_char)
|
||||||
end
|
end
|
||||||
read()
|
read()
|
||||||
end
|
end
|
||||||
|
17
test.lua
17
test.lua
@ -282,7 +282,22 @@ do
|
|||||||
return json:read_string(json:write_string(object))
|
return json:read_string(json:write_string(object))
|
||||||
end)
|
end)
|
||||||
-- Verify spacing is accepted
|
-- Verify spacing is accepted
|
||||||
assert(modlib.table.equals_noncircular(json:read_string'\t\t\n{ "a" : 1, \t"b":2, "c" : [ 1, 2 ,3 ] } \n\r\t', {a = 1, b = 2, c = {1, 2, 3}}))
|
assert(table.equals_noncircular(json:read_string'\t\t\n{ "a" : 1, \t"b":2, "c" : [ 1, 2 ,3 ] } \n\r\t', {a = 1, b = 2, c = {1, 2, 3}}))
|
||||||
|
-- Simple surrogate pair tests
|
||||||
|
for _, prefix in pairs{"x", ""} do
|
||||||
|
for _, suffix in pairs{"x", ""} do
|
||||||
|
local function test(str, expected_str)
|
||||||
|
if type(expected_str) == "number" then
|
||||||
|
expected_str = text.utf8(expected_str)
|
||||||
|
end
|
||||||
|
return assert(json:read_string('"' .. prefix .. str .. suffix .. '"') == prefix .. expected_str .. suffix)
|
||||||
|
end
|
||||||
|
test([[\uD834\uDD1E]], 0x1D11E)
|
||||||
|
test([[\uDD1E\uD834]], text.utf8(0xDD1E) .. text.utf8(0xD834))
|
||||||
|
test([[\uD834]], 0xD834)
|
||||||
|
test([[\uDD1E]], 0xDD1E)
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
-- luon
|
-- luon
|
||||||
|
Loading…
Reference in New Issue
Block a user