Fix & optimize utf8.char

This commit is contained in:
Lars Mueller 2022-10-01 19:31:27 +02:00
parent 4677b2c827
commit cd234548e5

@ -1,4 +1,5 @@
local string_char, table_concat = string.char, table.concat local assert, error, select, string_char, table_concat
= assert, error, select, string.char, table.concat
local utf8 = {} local utf8 = {}
@ -7,28 +8,36 @@ function utf8.is_valid_codepoint(codepoint)
return codepoint <= 0x10FFFF and (codepoint < 0xD800 or codepoint > 0xDFFF) return codepoint <= 0x10FFFF and (codepoint < 0xD800 or codepoint > 0xDFFF)
end end
--- Compute the UTF-8 byte values that encode a single codepoint.
-- The bytes are returned as 1 to 4 separate numbers (lead byte first) rather
-- than as a string, so the caller can hand them to `string.char` in one call
-- without intermediate string concatenation.
-- Only the numeric range is checked: surrogates (0xD800-0xDFFF) are encoded
-- like any other three-byte codepoint.
-- @param codepoint number, expected to be an integer in [0, 0x10FFFF]
-- @return 1 to 4 byte values
-- @raise "codepoint out of range" if codepoint is negative or above 0x10FFFF
local function utf8_bytes(codepoint)
	-- Without this guard a negative value would pass the `<= 0x7F` test below
	-- and be returned as if it were a valid byte.
	if codepoint < 0 then
		error"codepoint out of range"
	end
	if codepoint <= 0x007F then
		-- ASCII: the codepoint is its own encoding
		return codepoint
	end
	-- For the multi-byte forms, peel off 6 payload bits at a time, starting
	-- with the last continuation byte. `(x - x % 0x40) / 0x40` is an exact
	-- shift right by 6 bits that needs neither `//` nor a bit library.
	if codepoint <= 0x7FF then
		local payload_2 = codepoint % 0x40
		codepoint = (codepoint - payload_2) / 0x40
		-- 110xxxxx 10xxxxxx
		return 0xC0 + codepoint, 0x80 + payload_2
	end
	if codepoint <= 0xFFFF then
		local payload_3 = codepoint % 0x40
		codepoint = (codepoint - payload_3) / 0x40
		local payload_2 = codepoint % 0x40
		codepoint = (codepoint - payload_2) / 0x40
		-- 1110xxxx 10xxxxxx 10xxxxxx
		return 0xE0 + codepoint, 0x80 + payload_2, 0x80 + payload_3
	end
	if codepoint <= 0x10FFFF then
		local payload_4 = codepoint % 0x40
		codepoint = (codepoint - payload_4) / 0x40
		local payload_3 = codepoint % 0x40
		codepoint = (codepoint - payload_3) / 0x40
		local payload_2 = codepoint % 0x40
		codepoint = (codepoint - payload_2) / 0x40
		-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		return 0xF0 + codepoint, 0x80 + payload_2, 0x80 + payload_3, 0x80 + payload_4
	end
	error"codepoint out of range"
end
--- Encode any number of codepoints as a single UTF-8 string.
-- @param ... zero or more codepoints (numbers)
-- @return string the concatenated UTF-8 encodings; the empty string when
--   called without arguments
-- @raise propagates the error raised by `utf8_bytes` for an out-of-range
--   codepoint
function utf8.char(...)
	local n_args = select("#", ...)
	if n_args == 0 then
		-- Always return a string so that results can be concatenated safely.
		return ""
	end
	if n_args == 1 then
		-- Common case: no need for a temporary table
		return string_char(utf8_bytes(...))
	end
	local chars = {}
	for i = 1, n_args do
		-- `select(i, ...)` yields arguments i..n, but `utf8_bytes` only
		-- declares one parameter, so the extra values are dropped.
		chars[i] = string_char(utf8_bytes(select(i, ...)))
	end
	return table_concat(chars)
end