1
0
mirror of https://github.com/SpaceVim/SpaceVim.git synced 2025-02-11 03:35:47 +08:00
SpaceVim/bundle/cmp-dictionary/lua/cmp_dictionary/lib/utf8.lua
2023-06-11 21:41:39 +08:00

280 lines
7.9 KiB
Lua
Vendored

-- Module table; it is returned at the end of the file.
local utf8 = {}
-- Bitwise operations come from the `bit` library (luajit).
local bit = require("bit") -- luajit
local band = bit.band
local bor = bit.bor
local rshift = bit.rshift
local lshift = bit.lshift
---Pattern (a string, not a function) that matches exactly one UTF-8 byte sequence,
---assuming that the subject is a valid UTF-8 string.
---It is equivalent to "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"; the NUL byte is written as %z
---(the Lua 5.1 pattern class for the byte 0) followed by \x01-\x7F instead of a literal \0.
utf8.charpattern = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"
---Builds the error message for an argument that is out of range.
---@param idx integer @position of the offending argument in the call
---@param func_name string @name of the function that received the argument
---@param range_name string @what is out of range, e.g. "value" or "initial position"
---@return string @error message
local function create_errmsg(idx, func_name, range_name)
  local template = "bad argument #%s to '%s' (%s out of range)"
  return template:format(idx, func_name, range_name)
end
---Resolves a string position to an absolute byte index and checks that it is in range.
---A positive idx is used as is; zero or a negative idx is counted back from #str + 1,
---so -1 is the last byte. Consequently 0 resolves to #str + 1 and is rejected,
---while -(#str + 1) resolves to 0 and is accepted.
---@param str string
---@param idx integer
---@param msg string @message of the error raised (at level 2) when the position is out of range
---@return integer @resolved position, between 0 and #str
local function validate_range(str, idx, msg)
  local size = #str
  local pos = idx
  if pos <= 0 then
    -- relative to the end of the string
    pos = size + pos + 1
  end
  if pos < 0 or size < pos then
    error(msg, 2)
  end
  return pos
end
---Encodes each given code point as its UTF-8 byte sequence and returns the
---concatenation of all these sequences.
---Raises an error when a value is negative or greater than 0x10FFFF.
---@vararg integer
---@return string
function utf8.char(...)
  local parts = {}
  for pos, code in ipairs({ ... }) do
    if code < 0 or code > 0x10FFFF then
      error(create_errmsg(pos, "char", "value"), 2)
    end
    local encoded
    if code < 0x80 then
      -- 1 byte: 0xxx-xxxx
      encoded = string.char(code)
    elseif code < 0x800 then
      -- 2 bytes: 110x-xxxx 10xx-xxxx
      encoded = string.char(
        bor(0xC0, band(rshift(code, 6), 0x1F)),
        bor(0x80, band(code, 0x3F))
      )
    elseif code < 0x10000 then
      -- 3 bytes: 1110-xxxx 10xx-xxxx 10xx-xxxx
      encoded = string.char(
        bor(0xE0, band(rshift(code, 12), 0x0F)),
        bor(0x80, band(rshift(code, 6), 0x3F)),
        bor(0x80, band(code, 0x3F))
      )
    else
      -- 4 bytes: 1111-0xxx 10xx-xxxx 10xx-xxxx 10xx-xxxx
      encoded = string.char(
        bor(0xF0, band(rshift(code, 18), 0x07)),
        bor(0x80, band(rshift(code, 12), 0x3F)),
        bor(0x80, band(rshift(code, 6), 0x3F)),
        bor(0x80, band(code, 0x3F))
      )
    end
    parts[pos] = encoded
  end
  return table.concat(parts)
end
---Returns the byte range of the UTF-8 character that starts at start_pos.
---Returns nothing when there is no byte at start_pos, or when the bytes found there are
---not a well-formed sequence: a continuation or otherwise invalid lead byte, a truncated
---sequence, a bad continuation byte, an overlong encoding, or a code point above U+10FFFF.
---Surrogates (U+D800..U+DFFF) are deliberately still accepted, so that every string
---produced by utf8.char can be decoded again.
---@param s string
---@param start_pos integer
---@return integer? start_pos, integer? end_pos
local function next_char(s, start_pos)
  local b1 = s:byte(start_pos)
  if not b1 then
    return -- no byte at this position (e.g. offset's #s+1)
  end
  if b1 < 0x80 then -- single-byte (0xxx-xxxx)
    return start_pos, start_pos
  end

  -- Number of continuation bytes and the allowed range of the first one.
  -- The range is narrowed for the lead bytes that could otherwise start an
  -- overlong form (0xE0, 0xF0) or exceed U+10FFFF (0xF4).
  local extra
  local lo, hi = 0x80, 0xBF
  if 0xC2 <= b1 and b1 <= 0xDF then -- two-byte (0xC0/0xC1 would be overlong)
    extra = 1
  elseif 0xE0 <= b1 and b1 <= 0xEF then -- three-byte (1110-xxxx)
    extra = 2
    if b1 == 0xE0 then
      lo = 0xA0
    end
  elseif 0xF0 <= b1 and b1 <= 0xF4 then -- four-byte (range 0xF0 to 0xF4)
    extra = 3
    if b1 == 0xF0 then
      lo = 0x90
    elseif b1 == 0xF4 then
      hi = 0x8F
    end
  else -- continuation byte or invalid lead byte
    return
  end

  local end_pos = start_pos + extra
  if end_pos > #s then
    return -- truncated sequence
  end

  local b2 = s:byte(start_pos + 1)
  if b2 == nil or b2 < lo or b2 > hi then
    return
  end
  for k = start_pos + 2, end_pos do
    local bn = s:byte(k)
    if bn == nil or bn < 0x80 or bn > 0xBF then -- 10xx-xxxx?
      return
    end
  end
  return start_pos, end_pos
end
---Returns an iterator so that the construction
---
---for p, c in utf8.codes(s) do body end
---
---will iterate over all UTF-8 characters in string s, with p being the position (in bytes)
---where each character starts and c the character itself.
---Note: c is the substring holding the encoded character, not its numeric code point.
---It raises an error if it meets any invalid byte sequence.
---@param s string
---@return function iterator
function utf8.codes(s)
  vim.validate({ s = { s, "string" } })

  local cursor = 1
  local function step()
    if cursor > #s then
      return
    end
    local first, last = next_char(s, cursor)
    if first == nil then
      error("invalid UTF-8 code", 2)
    end
    cursor = last + 1
    return first, s:sub(first, last)
  end
  return step
end
---Returns the code points (as integers) from all characters in s that start between byte position i and j (both included).
---The default for i is 1 and for j is i.
---When the range is empty (i > j after resolving negative positions) no value is returned.
---It raises an error if it meets any invalid byte sequence, or if a position is out of range.
---@param s string
---@param i? integer start position. default=1
---@param j? integer end position. default=i
---@return integer ... @one code point per character
function utf8.codepoint(s, i, j)
  vim.validate({
    s = { s, "string" },
    i = { i, "number", true },
    j = { j, "number", true },
  })
  i = validate_range(s, i or 1, create_errmsg(2, "codepoint", "initial position"))
  j = validate_range(s, j or i, create_errmsg(3, "codepoint", "final position"))

  local ret = {}
  -- Pre-test loop: an empty range must not decode anything.
  while i <= j do
    local char_start, char_end = next_char(s, i)
    if char_start == nil then
      error("invalid UTF-8 code", 2)
    end
    local len = char_end - char_start + 1
    local cp = s:byte(char_start)
    if len > 1 then
      -- Keep only the payload bits of the lead byte:
      -- 0x1F for two-byte, 0x0F for three-byte, 0x07 for four-byte sequences.
      cp = band(cp, rshift(0xFF, len + 1))
      -- Each continuation byte contributes its low six bits.
      for k = char_start + 1, char_end do
        cp = bor(lshift(cp, 6), band(s:byte(k), 0x3F))
      end
    end
    ret[#ret + 1] = cp
    i = char_end + 1
  end
  return unpack(ret)
end
---Returns the number of UTF-8 characters in string s that start between positions i and j (both inclusive).
---The default for i is 1 and for j is -1.
---The initial position may be #s + 1, one past the end; this is what makes utf8.len("") return 0.
---An empty range (i > j after resolving negative positions) also yields 0.
---If it finds any invalid byte sequence, returns fail plus the position of the first invalid byte.
---@param s string
---@param i? integer start position. default=1
---@param j? integer end position. default=-1
---@return integer | nil
---@return integer?
function utf8.len(s, i, j)
  vim.validate({
    s = { s, "string" },
    i = { i, "number", true },
    j = { j, "number", true },
  })
  i = i or 1
  -- validate_range rejects #s + 1, which is a legal initial position here.
  if i ~= #s + 1 then
    i = validate_range(s, i, create_errmsg(2, "len", "initial position"))
  end
  j = validate_range(s, j or -1, create_errmsg(3, "len", "final position"))

  local len = 0
  -- Pre-test loop: nothing is counted when the range is empty.
  while i <= j do
    local char_start, char_end = next_char(s, i)
    if char_start == nil then
      return nil, i
    end
    i = char_end + 1
    len = len + 1
  end
  return len
end
---Returns the position (in bytes) where the encoding of the n-th character of s (counting from position i) starts.
---A negative n gets characters before position i.
---The default for i is 1 when n is non-negative and #s+1 otherwise, so that utf8.offset(s, -n) gets the offset of the n-th character from the end of the string.
---If the specified character is neither in the subject nor right after its end, the function returns fail.
---
---As a special case, when n is 0 the function returns the start of the encoding of the character that contains the i-th byte of s.
---
---Position #s+1 (right after the end) is accepted as i for every n, and is returned for a
---positive n that points exactly one character past the last one.
---Raises an error if i is out of range, or if n is not 0 and i is not the start of a character.
---@param s string
---@param n integer
---@param i? integer start position. if n >= 0, default=1, otherwise default=#s+1
---@return integer?
function utf8.offset(s, n, i)
  vim.validate({
    s = { s, "string" },
    n = { n, "number" },
    i = { i, "number", true },
  })
  local size = #s
  i = i or n >= 0 and 1 or size + 1
  -- validate_range rejects #s + 1, which is a legal position here.
  local past_end = i == size + 1
  if not past_end then
    i = validate_range(s, i, create_errmsg(3, "offset", "position"))
  end

  if n == 0 then
    if past_end then
      return i
    end
    -- Walk back to the closest position where a character starts.
    for j = i, 1, -1 do
      if next_char(s, j) then
        return j
      end
    end
    return nil
  end

  if not past_end and not next_char(s, i) then
    error("initial position is a continuation byte", 2)
  end

  if n > 0 then
    -- The character at i is the first one; count forward from there.
    for j = i, size do
      if next_char(s, j) then
        n = n - 1
        if n == 0 then
          return j
        end
      end
    end
    -- Exactly one character short: the target is right after the end.
    if n == 1 then
      return size + 1
    end
    return nil
  end

  -- n < 0: count characters that start strictly before i.
  for j = i - 1, 1, -1 do
    if next_char(s, j) then
      n = n + 1
      if n == 0 then
        return j
      end
    end
  end
  return nil
end
return utf8