mirror of
https://github.com/appgurueu/modlib.git
synced 2024-11-22 07:13:45 +01:00
Fix JSON reader surrogate pair handling
This commit is contained in:
parent
2a6846c9a2
commit
9664be450e
38
json.lua
38
json.lua
@ -5,6 +5,7 @@ local _ENV = {}
|
||||
setfenv(1, _ENV)
|
||||
|
||||
--! experimental
|
||||
-- See https://tools.ietf.org/id/draft-ietf-json-rfc4627bis-09.html#unichars and https://json.org
|
||||
|
||||
-- Null
|
||||
-- TODO consider using userdata (for ex. by using newproxy)
|
||||
@ -100,6 +101,7 @@ for i = 0, 5 do
|
||||
end
|
||||
|
||||
-- TODO SAX vs DOM
|
||||
local utf8 = modlib.text.utf8
|
||||
function read(self, read_)
|
||||
local index = 0
|
||||
local char
|
||||
@ -146,27 +148,51 @@ function read(self, read_)
|
||||
read()
|
||||
end
|
||||
end
|
||||
local function utf8_codepoint(codepoint)
|
||||
return syntax_assert(utf8(codepoint), "invalid codepoint")
|
||||
end
|
||||
local function string()
|
||||
local chars = {}
|
||||
local high_surrogate
|
||||
while true do
|
||||
local string_char, next_high_surrogate
|
||||
if char == '"' then
|
||||
if high_surrogate then
|
||||
table_insert(chars, utf8_codepoint(high_surrogate))
|
||||
end
|
||||
return table_concat(chars)
|
||||
end
|
||||
if char == "\\" then
|
||||
read()
|
||||
if char == "u" then
|
||||
local num = 0
|
||||
local codepoint = 0
|
||||
for i = 3, 0, -1 do
|
||||
num = syntax_assert(hex_digit_values[read()], "expected a hex digit") * (16 ^ i) + num
|
||||
codepoint = syntax_assert(hex_digit_values[read()], "expected a hex digit") * (16 ^ i) + codepoint
|
||||
end
|
||||
if high_surrogate and codepoint >= 0xDC00 and codepoint <= 0xDFFF then
|
||||
-- TODO strict mode: throw an error for single surrogates
|
||||
codepoint = 0x10000 + (high_surrogate - 0xD800) * 0x400 + codepoint - 0xDC00
|
||||
-- Don't write the high surrogate
|
||||
high_surrogate = nil
|
||||
end
|
||||
if codepoint >= 0xD800 and codepoint <= 0xDBFF then
|
||||
next_high_surrogate = codepoint
|
||||
else
|
||||
string_char = utf8_codepoint(codepoint)
|
||||
end
|
||||
table_insert(chars, syntax_assert(modlib.text.utf8(num), "invalid codepoint"))
|
||||
else
|
||||
table_insert(chars, syntax_assert(decoding_escapes[char], "invalid escape sequence"))
|
||||
string_char = syntax_assert(decoding_escapes[char], "invalid escape sequence")
|
||||
end
|
||||
else
|
||||
syntax_assert(char, "unclosed string")
|
||||
-- TODO check whether the character is one that must be escaped ("strict" mode)
|
||||
table_insert(chars, char)
|
||||
string_char = syntax_assert(char, "unclosed string")
|
||||
end
|
||||
if high_surrogate then
|
||||
table_insert(chars, utf8_codepoint(high_surrogate))
|
||||
end
|
||||
high_surrogate = next_high_surrogate
|
||||
if string_char then
|
||||
table_insert(chars, string_char)
|
||||
end
|
||||
read()
|
||||
end
|
||||
|
17
test.lua
17
test.lua
@ -282,7 +282,22 @@ do
|
||||
return json:read_string(json:write_string(object))
|
||||
end)
|
||||
-- Verify spacing is accepted
|
||||
assert(modlib.table.equals_noncircular(json:read_string'\t\t\n{ "a" : 1, \t"b":2, "c" : [ 1, 2 ,3 ] } \n\r\t', {a = 1, b = 2, c = {1, 2, 3}}))
|
||||
assert(table.equals_noncircular(json:read_string'\t\t\n{ "a" : 1, \t"b":2, "c" : [ 1, 2 ,3 ] } \n\r\t', {a = 1, b = 2, c = {1, 2, 3}}))
|
||||
-- Simple surrogate pair tests
|
||||
for _, prefix in pairs{"x", ""} do
|
||||
for _, suffix in pairs{"x", ""} do
|
||||
local function test(str, expected_str)
|
||||
if type(expected_str) == "number" then
|
||||
expected_str = text.utf8(expected_str)
|
||||
end
|
||||
return assert(json:read_string('"' .. prefix .. str .. suffix .. '"') == prefix .. expected_str .. suffix)
|
||||
end
|
||||
test([[\uD834\uDD1E]], 0x1D11E)
|
||||
test([[\uDD1E\uD834]], text.utf8(0xDD1E) .. text.utf8(0xD834))
|
||||
test([[\uD834]], 0xD834)
|
||||
test([[\uDD1E]], 0xDD1E)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
-- luon
|
||||
|
Loading…
Reference in New Issue
Block a user