From feb6ad276ecd25563d5172083305adc4ab7b3aac Mon Sep 17 00:00:00 2001
From: Lars Mueller
Date: Sat, 1 Oct 2022 19:40:22 +0200
Subject: [PATCH] Add utf8.codepoint (inverse of utf8.char)

utf8.codepoint takes any number of strings, each holding exactly one UTF-8
encoded character (1 to 4 bytes), and returns their codepoints in the same
order. Malformed input raises "invalid UTF-8".

utf8.char additionally gets a fast path for a single argument.

---
Review notes on this revision:
- utf8_codepoint: the empty string now raises "invalid UTF-8" instead of
  failing on a comparison with nil; the continuation byte assertion carries
  the same message as the other checks.
- utf8.codepoint: decodes in a loop instead of recursing over select(2, ...),
  which re-pushed all remaining arguments at every recursion level.
- utf8.char: "end" and the following "if" are on separate lines.
- Hunk headers and the diffstat are updated. The commit id on the first line
  and the blob ids on the "index" line are carried over from the previous
  revision. Indentation is written as tabs - TODO confirm against utf8.lua.

 utf8.lua | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/utf8.lua b/utf8.lua
index 6d5b47c..9bc0b05 100644
--- a/utf8.lua
+++ b/utf8.lua
@@ -34,7 +34,12 @@ end
 
 function utf8.char(...)
 	local n_args = select("#", ...)
-	if n_args == 0 then return end
+	if n_args == 0 then
+		return
+	end
+	if n_args == 1 then
+		return string_char(utf8_bytes(...))
+	end
 	local chars = {}
 	for i = 1, n_args do
 		chars[i] = string_char(utf8_bytes(select(i, ...)))
@@ -45,4 +50,60 @@ end
 -- Overly permissive pattern that greedily matches a single UTF-8 codepoint
 utf8.charpattern = "[%z-\127\194-\253][\128-\191]*"
 
+local unpack = table.unpack or unpack -- Lua 5.2+ moved unpack into the table library
+
+-- Decodes a string holding exactly one UTF-8 encoded character (1 to 4 bytes) to its codepoint.
+-- Raises "invalid UTF-8" for anything else, including the empty string.
+local function utf8_codepoint(str)
+	local first_byte = str:byte()
+	-- str:byte() returns nothing for the empty string; fail cleanly instead of comparing nil below
+	assert(first_byte, "invalid UTF-8")
+	if first_byte < 0x80 then -- ASCII: 0_0000000 to 0_1111111
+		assert(#str == 1, "invalid UTF-8")
+		return first_byte
+	end
+
+	local len, head_bits
+	if first_byte >= 0xC0 and first_byte <= 0xDF then -- 110_00000 to 110_11111
+		len, head_bits = 2, first_byte % 0x20 -- last 5 bits
+	elseif first_byte >= 0xE0 and first_byte <= 0xEF then -- 1110_0000 to 1110_1111
+		len, head_bits = 3, first_byte % 0x10 -- last 4 bits
+	elseif first_byte >= 0xF0 and first_byte <= 0xF7 then -- 11110_000 to 11110_111
+		len, head_bits = 4, first_byte % 0x8 -- last 3 bits
+	else -- stray continuation byte (10xxxxxx) or a lead byte above 0xF7
+		error("invalid UTF-8")
+	end
+	assert(#str == len, "invalid UTF-8")
+
+	-- Accumulate the continuation bytes from least to most significant
+	local codepoint = 0
+	local pow = 1
+	for i = len, 2, -1 do
+		local byte = str:byte(i)
+		local val_bits = byte % 0x40 -- extract last 6 bits xxxxxx from 10xxxxxx
+		assert(byte - val_bits == 0x80, "invalid UTF-8") -- first two bits must be 10
+		codepoint = codepoint + val_bits * pow
+		pow = pow * 0x40
+	end
+	return codepoint + head_bits * pow
+end
+
+-- Inverse of utf8.char: takes any number of single-character strings
+-- and returns their codepoints in the same order.
+function utf8.codepoint(...)
+	local n_args = select("#", ...)
+	if n_args == 0 then
+		return
+	end
+	if n_args == 1 then
+		return utf8_codepoint(...)
+	end
+	-- Decode iteratively: recursing over select(2, ...) would re-push all remaining arguments at every level
+	local codepoints = {}
+	for i = 1, n_args do
+		codepoints[i] = utf8_codepoint((select(i, ...)))
+	end
+	return unpack(codepoints, 1, n_args)
+end
+
 return utf8
\ No newline at end of file