Move text.utf8 to utf8.char

This commit is contained in:
Lars Mueller 2022-09-09 14:17:18 +02:00
parent 341dd5851b
commit 825599f2d0
6 changed files with 31 additions and 26 deletions

@ -5,7 +5,7 @@ local res, code = https.request"https://html.spec.whatwg.org/entities.json"
-- Stop unless the request reported status code 200.
assert(code == 200)
-- Filled by the loop below: entity key with its first and last character removed
-- -> concatenation of the encoded codepoints listed for that entity.
-- (presumably the stripped characters are the leading "&" and trailing ";" - confirm against entities.json)
local entity_map = {}
for entity, chars in pairs(assert(modlib.json:read_string(res))) do
-- NOTE(review): both assignments write the same key, so the second one wins.
-- They look like the before/after pair of the text.utf8 -> utf8.char move;
-- confirm that only the modlib.utf8.char line is meant to remain.
entity_map[entity:sub(2, #entity - 1)] = table.concat(modlib.table.map(chars.codepoints, modlib.text.utf8))
entity_map[entity:sub(2, #entity - 1)] = table.concat(modlib.table.map(chars.codepoints, modlib.utf8.char))
end
local entries = {}
for entity, chars in pairs(entity_map) do

@ -45,6 +45,7 @@ for _, file in pairs{
"table",
"vararg",
"text",
"utf8",
"vector",
"quaternion",
"trie",

@ -101,7 +101,7 @@ for i = 0, 5 do
end
-- TODO SAX vs DOM
local utf8 = modlib.text.utf8
local utf8_char = modlib.utf8.char
function read(self, read_)
local index = 0
local char
@ -149,7 +149,7 @@ function read(self, read_)
end
end
-- Encodes `codepoint` with the UTF-8 encoder and passes the result through
-- syntax_assert with the message "invalid codepoint"
-- (presumably this raises a syntax error when the encoder returns nil - confirm in syntax_assert).
-- NOTE(review): two consecutive return lines appear here; they look like the
-- before/after pair of the text.utf8 -> utf8.char move. Confirm that only the
-- utf8_char line is live.
local function utf8_codepoint(codepoint)
return syntax_assert(utf8(codepoint), "invalid codepoint")
return syntax_assert(utf8_char(codepoint), "invalid codepoint")
end
local function string()
local chars = {}

@ -138,27 +138,6 @@ magic_charset = "[" .. table.concat(magic_charset) .. "]"
-- Prefixes every character of `text` that matches the `magic_charset`
-- character class with "%".
-- Returns both results of gsub: the escaped string and the substitution count.
function escape_magic_chars(text)
	local capture_pattern = "(" .. magic_charset .. ")"
	return text:gsub(capture_pattern, "%%%1")
end
--- Encodes a Unicode codepoint as a UTF-8 byte string.
-- Codepoints up to 0x7F yield one byte; larger ones yield a lead byte followed
-- by 1 to 3 continuation bytes. Surrogates (0xD800-0xDFFF) are not rejected.
-- @tparam number number integer codepoint
-- @treturn[1] string the UTF-8 encoding
-- @treturn[2] nil if `number` is greater than 0x10FFFF
function utf8(number)
	if number <= 0x007F then
		-- Single byte
		return string.char(number)
	end
	if number > 0x10FFFF then
		-- Out of range
		return
	end
	local result = ""
	local i = 0
	while true do
		-- Peel off the low 6 bits as a continuation byte (10xxxxxx)
		local remainder = number % 64
		result = string.char(128 + remainder) .. result
		number = (number - remainder) / 64
		i = i + 1
		-- The lead byte for i continuation bytes has 8 - i - 2 payload bits, so the
		-- rest must be strictly below 2 ^ (8 - i - 2); a value equal to that limit
		-- needs one more continuation byte (e.g. 0x800 -> E0 A0 80, not E0 80).
		if number < 2 ^ (8 - i - 2) then break end
	end
	-- Lead byte: (i + 1) one-bits, a zero bit, then the remaining payload
	return string.char(256 - 2 ^ (8 - i - 1) + number) .. result
end
-- Reserved words, passed as a list through modlib.table.set
-- (presumably yielding a lookup table keyed by word - confirm in modlib.table).
local keywords = modlib.table.set({
	"and", "break", "do", "else", "elseif", "end", "false",
	"for", "function", "if", "in", "local", "nil", "not",
	"or", "repeat", "return", "then", "true", "until", "while",
})
-- Lua 5.2 (LuaJIT) support
keywords["goto"] = true

25
utf8.lua Normal file

@ -0,0 +1,25 @@
local string_char = string.char

local utf8 = {}

--- Encodes a Unicode codepoint as a UTF-8 byte string.
-- Codepoints up to 0x7F yield one byte; larger ones yield a lead byte followed
-- by 1 to 3 continuation bytes. Surrogates (0xD800-0xDFFF) are not rejected.
-- @tparam number codepoint integer codepoint
-- @treturn[1] string the UTF-8 encoding
-- @treturn[2] nil if `codepoint` is greater than 0x10FFFF
function utf8.char(codepoint)
	if codepoint <= 0x007F then
		-- Single byte
		return string_char(codepoint)
	end
	if codepoint > 0x10FFFF then
		-- Out of range
		return -- TODO (?) error instead
	end
	local result = ""
	local i = 0
	repeat
		-- Peel off the low 6 bits as a continuation byte (10xxxxxx)
		local remainder = codepoint % 64
		result = string_char(128 + remainder) .. result
		codepoint = (codepoint - remainder) / 64
		i = i + 1
		-- The lead byte for i continuation bytes has 8 - i - 2 payload bits, so the
		-- rest must be strictly below 2 ^ (8 - i - 2); a value equal to that limit
		-- needs one more continuation byte (e.g. 0x800 -> E0 A0 80, not E0 80).
	until codepoint < 2 ^ (8 - i - 2)
	-- Lead byte: (i + 1) one-bits, a zero bit, then the remaining payload
	return string_char(0x100 - 2 ^ (8 - i - 1) + codepoint) .. result
end

return utf8

@ -6,8 +6,8 @@ local html = setmetatable({}, {__index = function(self, key)
-- Replaces character references in `text`: named ones through the
-- `named_entities` lookup, decimal and hexadecimal ones through the UTF-8 encoder.
-- The chain's last gsub is returned directly, so its second result (the
-- substitution count of that last gsub) is returned alongside the string.
-- NOTE(review): the decimal and hex substitutions each appear twice; these look
-- like the before/after pairs of the text.utf8 -> utf8.char move. Confirm that
-- only the modlib.utf8.char lines are live.
-- NOTE(review): the named pattern only matches letters; entity names that
-- contain digits (e.g. "frac12") would not match - confirm whether intended.
local function unescape(text)
return text
:gsub("&([A-Za-z]+);", named_entities) -- named
:gsub("&#(%d+);", function(digits) return modlib.text.utf8(tonumber(digits)) end) -- decimal
:gsub("&#x(%x+);", function(digits) return modlib.text.utf8(tonumber(digits, 16)) end) -- hex
:gsub("&#(%d+);", function(digits) return modlib.utf8.char(tonumber(digits)) end) -- decimal
:gsub("&#x(%x+);", function(digits) return modlib.utf8.char(tonumber(digits, 16)) end) -- hex
end
self.unescape = unescape
return unescape