mirror of
https://github.com/SpaceVim/SpaceVim.git
synced 2025-02-11 03:35:47 +08:00
280 lines
7.9 KiB
Lua
Vendored
280 lines
7.9 KiB
Lua
Vendored
-- Module table; the public functions and `charpattern` are stored on it.
local utf8 = {}

-- Bitwise helpers come from the `bit` library that ships with LuaJIT.
local bit = require("bit")
local band, bor = bit.band, bit.bor
local rshift, lshift = bit.rshift, bit.lshift

---Lua pattern (a string, not a function) equivalent to
---"[\0-\x7F\xC2-\xF4][\x80-\xBF]*": one lead byte followed by any number of
---continuation bytes. It matches exactly one UTF-8 byte sequence, assuming
---the subject is a valid UTF-8 string.
---The NUL byte is written with the `%z` class instead of a literal "\0".
utf8.charpattern = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"
|
|
|
|
---Builds the message raised when an argument lies outside its allowed range.
---@param idx integer @position of the offending argument in the call
---@param func_name string @name of the function reporting the problem
---@param range_name string @what exactly is out of range (e.g. "value")
---@return string @error message
local function create_errmsg(idx, func_name, range_name)
  local template = "bad argument #%s to '%s' (%s out of range)"
  return template:format(idx, func_name, range_name)
end
|
|
|
|
---Turns a possibly relative string index into an absolute byte position.
---Positive indexes are kept as they are; zero and negative indexes are
---resolved as `#str + idx + 1`, so -1 addresses the last byte.
---Raises `msg` (at level 2, i.e. attributed to the calling function) when the
---resolved position is negative or greater than `#str`.
---Note that a resolved position of exactly 0 is let through.
---@param str string @subject string
---@param idx integer @index to resolve
---@param msg string @error message to raise when the index is rejected
---@return integer @resolved position
local function validate_range(str, idx, msg)
  local size = #str
  local pos = idx
  if not (idx > 0) then
    -- count from the end of the string
    pos = size + idx + 1
  end

  if pos < 0 or size < pos then
    error(msg, 2)
  end
  return pos
end
|
|
|
|
---Encodes each given integer as a UTF-8 byte sequence and returns the
---concatenation of all those sequences (an empty string for no arguments).
---Raises an "out of range" error naming the argument position when a value is
---negative or greater than 0x10FFFF.
---@vararg integer
---@return string
function utf8.char(...)
  local pieces = {}

  for index, code in ipairs({ ... }) do
    if code < 0 or code > 0x10FFFF then
      error(create_errmsg(index, "char", "value"), 2)
    end

    local encoded
    if code < 0x80 then
      -- 1 byte: 0xxx-xxxx
      encoded = string.char(code)
    elseif code < 0x800 then
      -- 2 bytes: 110x-xxxx 10xx-xxxx
      encoded = string.char(
        bor(0xC0, band(rshift(code, 6), 0x1F)),
        bor(0x80, band(code, 0x3F))
      )
    elseif code < 0x10000 then
      -- 3 bytes: 1110-xxxx 10xx-xxxx 10xx-xxxx
      encoded = string.char(
        bor(0xE0, band(rshift(code, 12), 0x0F)),
        bor(0x80, band(rshift(code, 6), 0x3F)),
        bor(0x80, band(code, 0x3F))
      )
    else
      -- 4 bytes: 1111-0xxx 10xx-xxxx 10xx-xxxx 10xx-xxxx
      encoded = string.char(
        bor(0xF0, band(rshift(code, 18), 0x07)),
        bor(0x80, band(rshift(code, 12), 0x3F)),
        bor(0x80, band(rshift(code, 6), 0x3F)),
        bor(0x80, band(code, 0x3F))
      )
    end

    pieces[index] = encoded
  end

  return table.concat(pieces)
end
|
|
|
|
---Finds the byte range of the UTF-8 character that begins at `start_pos`.
---Returns nothing when there is no byte at `start_pos`, when that byte cannot
---start a sequence, when the sequence would run past the end of `s`, or when
---one of the following bytes is not a continuation byte (10xx-xxxx).
---@param s string
---@param start_pos integer
---@return integer? start_pos, integer? end_pos
local function next_char(s, start_pos)
  local lead = s:byte(start_pos)
  if lead == nil then
    -- nothing to read here (e.g. position #s+1 probed by utf8.offset)
    return
  end

  if band(lead, 0x80) == 0x00 then
    -- 0xxx-xxxx: the character is this single byte
    return start_pos, start_pos
  end

  -- how many continuation bytes the lead byte announces
  local trailing
  if lead >= 0xC2 and lead <= 0xDF then
    trailing = 1 -- two-byte sequence
  elseif band(lead, 0xF0) == 0xE0 then
    trailing = 2 -- three-byte sequence (1110-xxxx)
  elseif lead >= 0xF0 and lead <= 0xF4 then
    trailing = 3 -- four-byte sequence
  else
    return -- not a valid first byte
  end

  local end_pos = start_pos + trailing
  if end_pos > #s then
    return -- sequence is cut off by the end of the string
  end

  local tail = { s:byte(start_pos + 1, end_pos) }
  for k = 1, #tail do
    if band(tail[k], 0xC0) ~= 0x80 then
      return -- not a continuation byte
    end
  end

  return start_pos, end_pos
end
|
|
|
|
---Returns an iterator so that the construction
---
---for p, c in utf8.codes(s) do body end
---
---walks over all UTF-8 characters in string s, with p being the position
---(in bytes) where each character starts and c being that character itself,
---i.e. the substring of s holding its byte sequence (a string, not a numeric
---code point).
---The iterator raises an error if it meets any invalid byte sequence.
---@param s string
---@return function iterator
function utf8.codes(s)
  vim.validate({ s = { s, "string" } })

  -- byte position at which the next character is expected to start
  local cursor = 1

  local function step()
    if cursor > #s then
      return
    end

    local first, last = next_char(s, cursor)
    if first == nil then
      error("invalid UTF-8 code", 2)
    end

    cursor = last + 1
    return first, s:sub(first, last)
  end

  return step
end
|
|
|
|
---Returns the code points (as integers) from all characters in s that start
---between byte position i and j (both included).
---The default for i is 1 and for j is i.
---When the range is empty (i > j after resolving relative indexes) no values
---are returned.
---It raises an error if it meets any invalid byte sequence, or if i or j is
---out of range.
---@param s string
---@param i? integer start position. default=1
---@param j? integer end position. default=i
---@return integer ... @one code point per character in the range
function utf8.codepoint(s, i, j)
  vim.validate({
    s = { s, "string" },
    i = { i, "number", true },
    j = { j, "number", true },
  })
  i = validate_range(s, i or 1, create_errmsg(2, "codepoint", "initial position"))
  j = validate_range(s, j or i, create_errmsg(3, "codepoint", "final position"))

  local ret = {}

  -- A pre-checked loop: no character can start inside an empty range, so
  -- nothing must be decoded when i > j.
  while i <= j do
    local char_start, char_end = next_char(s, i)
    if char_start == nil then
      error("invalid UTF-8 code", 2)
    end
    i = char_end + 1

    local len = char_end - char_start + 1
    local cp
    if len == 1 then
      -- single-byte: the byte is the code point
      cp = s:byte(char_start)
    else
      -- multi-byte: a lead byte of an n-byte sequence carries its payload in
      -- the low 7-n bits (0x1F, 0x0F, 0x07 for n = 2, 3, 4) ...
      cp = band(s:byte(char_start), rshift(0xFF, len + 1))
      -- ... and every continuation byte contributes its low 6 bits.
      for k = char_start + 1, char_end do
        cp = bor(lshift(cp, 6), band(s:byte(k), 0x3F))
      end
    end
    ret[#ret + 1] = cp
  end

  return unpack(ret)
end
|
|
|
|
---Returns the number of UTF-8 characters in string s that start between
---positions i and j (both inclusive).
---The default for i is 1 and for j is -1.
---The initial position may be #s + 1 (right after the end); together with an
---empty range (i > j) this yields 0, so the length of "" is 0.
---If it finds any invalid byte sequence, returns nil plus the position of the
---first invalid byte.
---Raises an error if i or j is out of range.
---@param s string
---@param i? integer start position. default=1
---@param j? integer end position. default=-1
---@return integer | nil
---@return integer?
function utf8.len(s, i, j)
  vim.validate({
    s = { s, "string" },
    i = { i, "number", true },
    j = { j, "number", true },
  })

  i = i or 1
  -- validate_range rejects #s + 1, but it is a legal place to start counting
  -- (there are simply no characters left); this also covers the empty string.
  if i ~= #s + 1 then
    i = validate_range(s, i, create_errmsg(2, "len", "initial position"))
  end
  j = validate_range(s, j or -1, create_errmsg(3, "len", "final position"))

  local len = 0

  -- Pre-checked loop so that an empty range counts zero characters.
  while i <= j do
    local char_start, char_end = next_char(s, i)
    if char_start == nil then
      return nil, i
    end

    i = char_end + 1
    len = len + 1
  end

  return len
end
|
|
|
|
---Returns the position (in bytes) where the encoding of the n-th character of
---s (counting from position i) starts.
---A negative n gets characters before position i.
---The default for i is 1 when n is non-negative and #s+1 otherwise, so that
---utf8.offset(s, -n) gets the offset of the n-th character from the end of
---the string.
---If the specified character is neither in the subject nor right after its
---end, the function returns nil.
---
---As a special case, when n is 0 the function returns the start of the
---encoding of the character that contains the i-th byte of s.
---
---Raises an error if i is out of range, or (for n ~= 0) if no valid character
---starts at position i.
---@param s string
---@param n integer
---@param i? integer start position. if n >= 0, default=1, otherwise default=#s+1
---@return integer?
function utf8.offset(s, n, i)
  vim.validate({
    s = { s, "string" },
    n = { n, "number" },
    i = { i, "number", true },
  })

  -- position right after the last byte; a legal value for i and for the result
  local past_end = #s + 1

  if i == nil then
    i = n >= 0 and 1 or past_end
  end

  -- validate_range rejects #s + 1, so that position is accepted separately
  if i ~= past_end then
    i = validate_range(s, i, create_errmsg(3, "offset", "position"))
  end

  if n == 0 then
    if i == past_end then
      -- no byte lives there, so there is nothing to walk back over
      return past_end
    end

    -- walk back to the closest position at which a valid character starts
    for j = i, 1, -1 do
      local char_start = next_char(s, j)
      if char_start then
        return char_start
      end
    end
  elseif n > 0 then
    if i ~= past_end and not next_char(s, i) then
      error("initial position is a continuation byte", 2)
    end

    -- the character starting at i is number 1; count starts going forward
    for j = i, #s do
      local char_start = next_char(s, j)
      if char_start then
        n = n - 1
        if n == 0 then
          return char_start
        end
      end
    end

    -- exactly one character short: the requested one would start right
    -- after the end of the subject
    if n == 1 then
      return past_end
    end
  else
    if i ~= past_end and not next_char(s, i) then
      error("initial position is a continuation byte", 2)
    end

    -- characters *before* i are wanted, so the scan starts at i - 1
    for j = i - 1, 1, -1 do
      local char_start = next_char(s, j)
      if char_start then
        n = n + 1
        if n == 0 then
          return char_start
        end
      end
    end
  end
end
|
|
|
|
-- Export the module table.
return utf8
|