SpaceVim/bundle/cmp-dictionary/lua/cmp_dictionary/lib/utf8.lua

-- Module table; the public functions are attached to it below and it is returned at the end of the file.
local utf8 = {}

-- Bitwise operations come from the `bit` library that ships with LuaJIT.
local bit = require("bit") -- luajit

-- Local aliases for the bit operations used by the encoder and decoder below.
local band = bit.band
local bor = bit.bor
local rshift = bit.rshift
local lshift = bit.lshift

---The pattern (a string, not a function) that matches exactly one UTF-8 byte sequence,
---assuming that the subject is a valid UTF-8 string.
---It is the Lua 5.1 spelling of "[\0-\x7F\xC2-\xF4][\x80-\xBF]*": Lua 5.1 patterns cannot contain
---an embedded zero byte, so NUL is written as the `%z` class and the single-byte range starts at \x01.
utf8.charpattern = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"

---Builds the "bad argument" message used when a position or value argument is out of range.
---@param idx integer @index of the offending argument
---@param func_name string @name of the function that received the argument
---@param range_name string @what is out of range (e.g. "initial position")
---@return string @error message
local function create_errmsg(idx, func_name, range_name)
  local template = "bad argument #%s to '%s' (%s out of range)"
  return template:format(idx, func_name, range_name)
end

---Converts an index of a string to a positive byte position.
---A positive index is taken as-is; any other index is counted from the end of the string
---(-1 is the last byte). The resulting position must lie in 1..#str.
---@param str string
---@param idx integer
---@param msg string @message raised when the position is out of range
---@return integer @position in 1..#str
local function validate_range(str, idx, msg)
  idx = idx > 0 and idx or #str + idx + 1
  -- Compare against 1, not 0: -(#str + 1) resolves to position 0, which is
  -- not a byte of the string and must be rejected like every other miss.
  if idx < 1 or idx > #str then
    error(msg, 2)
  end
  return idx
end

---Encodes each integer argument as a UTF-8 byte sequence and returns the concatenation
---of all these sequences (an empty string when called without arguments).
---Arguments are walked with ipairs, so processing stops at the first nil.
---Raises an error if a value is below 0 or above 0x10FFFF.
---@vararg integer
---@return string
function utf8.char(...)
  -- Trailing byte of a sequence: the 10xx-xxxx marker plus the six payload
  -- bits that sit `shift` bits up in `code`.
  local function continuation(code, shift)
    return bor(0x80, band(rshift(code, shift), 0x3F))
  end

  local pieces = {}
  for index, code in ipairs({ ... }) do
    if code < 0 or code > 0x10FFFF then
      error(create_errmsg(index, "char", "value"), 2)
    end

    local encoded
    if code < 0x80 then
      -- one byte: 0xxx-xxxx
      encoded = string.char(code)
    elseif code < 0x800 then
      -- two bytes: 110x-xxxx 10xx-xxxx
      local lead = bor(0xC0, band(rshift(code, 6), 0x1F))
      encoded = string.char(lead, continuation(code, 0))
    elseif code < 0x10000 then
      -- three bytes: 1110-xxxx 10xx-xxxx 10xx-xxxx
      local lead = bor(0xE0, band(rshift(code, 12), 0x0F))
      encoded = string.char(lead, continuation(code, 6), continuation(code, 0))
    else
      -- four bytes: 1111-0xxx 10xx-xxxx 10xx-xxxx 10xx-xxxx
      local lead = bor(0xF0, band(rshift(code, 18), 0x07))
      encoded = string.char(lead, continuation(code, 12), continuation(code, 6), continuation(code, 0))
    end
    pieces[index] = encoded
  end
  return table.concat(pieces, "")
end

---Returns the byte range of the UTF-8 character that starts at start_pos.
---Returns nothing when start_pos is outside the string, when the byte there cannot start a
---character (continuation byte or invalid lead byte), when the sequence is truncated, when a
---trailing byte is not a continuation byte, or when the sequence is an overlong encoding or
---encodes a value above U+10FFFF.
---@param s string
---@param start_pos integer
---@return integer? start_pos, integer? end_pos
local function next_char(s, start_pos)
  local b1 = s:byte(start_pos)
  if not b1 then
    return -- for offset's #s+1
  end

  if b1 < 0x80 then -- single-byte (0xxx-xxxx)
    return start_pos, start_pos
  end

  local end_pos
  -- Allowed range for the first continuation byte. It is narrower than
  -- 0x80..0xBF for the lead bytes E0, F0 and F4 (see below).
  local lo, hi = 0x80, 0xBF

  if 0xC2 <= b1 and b1 <= 0xDF then -- two-byte (0xC0/0xC1 would be overlong)
    end_pos = start_pos + 1
  elseif 0xE0 <= b1 and b1 <= 0xEF then -- three-byte (1110-xxxx)
    end_pos = start_pos + 2
    if b1 == 0xE0 then
      lo = 0xA0 -- E0 80..9F would encode a value below U+0800 (overlong)
    end
  elseif 0xF0 <= b1 and b1 <= 0xF4 then -- four-byte (range 0xF0 to 0xF4)
    end_pos = start_pos + 3
    if b1 == 0xF0 then
      lo = 0x90 -- F0 80..8F would encode a value below U+10000 (overlong)
    elseif b1 == 0xF4 then
      hi = 0x8F -- F4 90..BF would encode a value above U+10FFFF
    end
  else -- invalid 1st byte
    return
  end

  -- validate (end_pos): the sequence must not run past the end of the string
  if end_pos > #s then
    return
  end

  -- validate (first continuation byte, with the lead-specific bounds)
  local b2 = s:byte(start_pos + 1)
  if b2 < lo or b2 > hi then
    return
  end
  -- validate (remaining continuation bytes): 10xx-xxxx
  for k = start_pos + 2, end_pos do
    local bn = s:byte(k)
    if bn < 0x80 or bn > 0xBF then
      return
    end
  end

  return start_pos, end_pos
end

---Builds an iterator for the construction
---
---for p, c in utf8.codes(s) do body end
---
---which walks over all UTF-8 characters in string s. On each step p is the position (in bytes)
---where the character starts and c is the character itself, i.e. the substring of s holding its
---bytes (not a numeric code point).
---The iterator raises an error if it meets any invalid byte sequence.
---@param s string
---@return function iterator
function utf8.codes(s)
  vim.validate({
    s = { s, "string" },
  })

  local cursor = 1 -- byte position of the next character to yield
  local size = #s
  return function()
    -- Falling off the end returns no values, which ends the generic `for`.
    if cursor <= size then
      local first, last = next_char(s, cursor)
      if not first then
        error("invalid UTF-8 code", 2)
      end

      cursor = last + 1
      return first, s:sub(first, last)
    end
  end
end

---Returns the code points (as integers) from all characters in s that start between byte position i and j (both included).
---The default for i is 1 and for j is i.
---When the range is empty (i is past j) no values are returned.
---It raises an error if a position is out of range or if it meets any invalid byte sequence.
---@param s string
---@param i? integer start position. default=1
---@param j? integer end position. default=i
---@return integer ... @one code point per character in the range
function utf8.codepoint(s, i, j)
  vim.validate({
    s = { s, "string" },
    i = { i, "number", true },
    j = { j, "number", true },
  })
  i = validate_range(s, i or 1, create_errmsg(2, "codepoint", "initial position"))
  j = validate_range(s, j or i, create_errmsg(3, "codepoint", "final position"))

  local ret = {}
  -- `while`, not `repeat ... until`: with an empty range (i > j) nothing may
  -- be decoded, whereas a repeat loop would always decode one character.
  while i <= j do
    local char_start, char_end = next_char(s, i)
    if char_start == nil then
      error("invalid UTF-8 code", 2)
    end

    i = char_end + 1

    local len = char_end - char_start + 1
    if len == 1 then
      -- single-byte: the byte is the code point
      ret[#ret + 1] = s:byte(char_start)
    else
      -- multi-byte: strip the length marker from the lead byte and move its
      -- payload bits above the 6 * (len - 1) bits of the continuation bytes
      local b1 = s:byte(char_start)
      b1 = band(lshift(b1, len + 1), 0xFF) -- e.g. 110x-xxxx -> xxxx-x000
      b1 = lshift(b1, len * 5 - 7) -- >> len+1 and << (len-1)*6

      -- gather the six payload bits of every continuation byte
      local cp = 0
      for k = char_start + 1, char_end do
        cp = bor(lshift(cp, 6), band(s:byte(k), 0x3F))
      end

      ret[#ret + 1] = bor(b1, cp)
    end
  end

  return unpack(ret)
end

---Returns the number of UTF-8 characters in string s that start between positions i and j (both inclusive).
---The default for i is 1 and for j is -1.
---i may be #s + 1 (right after the last byte) and j may resolve to 0 (right before the first byte);
---both select an empty range, so the result is 0. In particular utf8.len("") is 0.
---If it finds any invalid byte sequence, returns fail plus the position of the first invalid byte.
---@param s string
---@param i? integer start position. default=1
---@param j? integer end position. default=-1
---@return integer | nil
---@return integer?
function utf8.len(s, i, j)
  vim.validate({
    s = { s, "string" },
    i = { i, "number", true },
    j = { j, "number", true },
  })
  local size = #s
  i = i or 1
  j = j or -1

  -- Starting right after the last byte is legal and selects an empty range.
  -- For the empty string that position is 1, the default start.
  if i ~= size + 1 then
    i = validate_range(s, i, create_errmsg(2, "len", "initial position"))
  end
  -- A final position of 0, or -(#s + 1) which resolves to 0, means "ends
  -- before the first byte". That is what the default j = -1 denotes for an
  -- empty string, so it must not be treated as out of range.
  if j == 0 or j == -(size + 1) then
    j = 0
  else
    j = validate_range(s, j, create_errmsg(3, "len", "final position"))
  end

  local len = 0

  -- `while`, not `repeat ... until`: an empty range must count zero characters.
  while i <= j do
    local char_start, char_end = next_char(s, i)
    if char_start == nil then
      return nil, i
    end

    i = char_end + 1
    len = len + 1
  end

  return len
end

---Returns the position (in bytes) where the encoding of the n-th character of s (counting from position i) starts.
---A negative n gets characters before position i.
---The default for i is 1 when n is non-negative and #s+1 otherwise, so that utf8.offset(s, -n) gets the offset of the n-th character from the end of the string.
---If the specified character is neither in the subject nor right after its end, the function returns fail.
---
---As a special case, when n is 0 the function returns the start of the encoding of the character that contains the i-th byte of s.
---
---It raises an error if i is out of range, or if n is not 0 and no valid character starts at position i.
---@param s string
---@param n integer
---@param i? integer start position. if n >= 0, default=1, otherwise default=#s+1
---@return integer?
function utf8.offset(s, n, i)
  vim.validate({
    s = { s, "string" },
    n = { n, "number" },
    i = { i, "number", true },
  })

  local size = #s
  i = i or n >= 0 and 1 or size + 1

  -- The position right after the last byte is a legal starting point for
  -- every n (for the empty string it is position 1), so only other
  -- positions go through the range check.
  if i ~= size + 1 then
    i = validate_range(s, i, create_errmsg(3, "offset", "position"))
  end

  if n == 0 then
    if i == size + 1 then
      return i -- nothing to walk back over right after the end
    end
    -- walk back to the nearest position where a character starts
    for j = i, 1, -1 do
      if next_char(s, j) then
        return j
      end
    end
  elseif n > 0 then
    if i ~= size + 1 and not next_char(s, i) then
      error("initial position is a continuation byte", 2)
    end

    -- the character at i is the first one, then count forwards
    for j = i, size do
      if next_char(s, j) then
        n = n - 1
        if n == 0 then
          return j
        end
      end
    end

    -- Exactly one character short: the requested character is the one that
    -- would start right after the end of the subject.
    if n == 1 then
      return size + 1
    end
  else
    if i ~= size + 1 and not next_char(s, i) then
      error("initial position is a continuation byte", 2)
    end

    -- Count backwards starting with the character BEFORE position i; the
    -- character at i itself is not one of the "characters before i".
    for j = i - 1, 1, -1 do
      if next_char(s, j) then
        n = n + 1
        if n == 0 then
          return j
        end
      end
    end
  end
end

return utf8