2025-10-21 00:40:14 +02:00

713 lines
13 KiB
Lua

-- Lex, by LoganDark
-- Can be loaded using os.loadAPI, has only a single function: lex.lex('code here')
-- If loaded using dofile(), it returns the lex function (for environments outside ComputerCraft)
-- It returns a list of lists, where each list is one line.
-- Each line contains tokens (in the order they are found), where each token is formatted like this:
-- {
-- type = one of the token types below,
-- data = the source code that makes up the token,
-- posFirst = the position (inclusive) within THAT LINE that the token starts
-- posLast = the position (inclusive) within THAT LINE that the token ends
-- }
-- Possible token types:
-- whitespace: Self-explanatory. Can match spaces, newlines, tabs, and carriage returns (although I don't know why anyone would use those... WINDOWS)
-- comment: Either multi-line or single-line comments.
-- string: A string. Usually the part of the string that is not an escape.
-- escape: Can only be found within strings (although they are separate tokens)
-- keyword: Keywords. Like "while", "end", "do", etc
-- value: Special values. Only true, false, and nil.
-- ident: Identifier. Variables, function names, etc..
-- number: Numbers!
-- symbol: Symbols, like brackets, parenthesis, ., .., ... etc
-- operator: Operators, like =, ==, >=, <=, ~=, etc
-- unidentified: Anything that isn't one of the above tokens. Consider them ERRORS.
local chars = {
whitespace = {
[' '] = true,
['\n'] = true,
['\t'] = true,
['\r'] = true
},
validEscapes = {
['a'] = true,
['b'] = true,
['f'] = true,
['n'] = true,
['r'] = true,
['t'] = true,
['v'] = true,
['"'] = true,
['\''] = true,
['\\'] = true,
['\n'] = true
},
ident = {
['a'] = true,
['b'] = true,
['c'] = true,
['d'] = true,
['e'] = true,
['f'] = true,
['g'] = true,
['h'] = true,
['i'] = true,
['j'] = true,
['k'] = true,
['l'] = true,
['m'] = true,
['n'] = true,
['o'] = true,
['p'] = true,
['q'] = true,
['r'] = true,
['s'] = true,
['t'] = true,
['u'] = true,
['v'] = true,
['w'] = true,
['x'] = true,
['y'] = true,
['z'] = true,
['A'] = true,
['B'] = true,
['C'] = true,
['D'] = true,
['E'] = true,
['F'] = true,
['G'] = true,
['H'] = true,
['I'] = true,
['J'] = true,
['K'] = true,
['L'] = true,
['M'] = true,
['N'] = true,
['O'] = true,
['P'] = true,
['Q'] = true,
['R'] = true,
['S'] = true,
['T'] = true,
['U'] = true,
['V'] = true,
['W'] = true,
['X'] = true,
['Y'] = true,
['Z'] = true,
['_'] = true,
['0'] = true,
['1'] = true,
['2'] = true,
['3'] = true,
['4'] = true,
['5'] = true,
['6'] = true,
['7'] = true,
['8'] = true,
['9'] = true,
start = {
['a'] = true,
['b'] = true,
['c'] = true,
['d'] = true,
['e'] = true,
['f'] = true,
['g'] = true,
['h'] = true,
['i'] = true,
['j'] = true,
['k'] = true,
['l'] = true,
['m'] = true,
['n'] = true,
['o'] = true,
['p'] = true,
['q'] = true,
['r'] = true,
['s'] = true,
['t'] = true,
['u'] = true,
['v'] = true,
['w'] = true,
['x'] = true,
['y'] = true,
['z'] = true,
['A'] = true,
['B'] = true,
['C'] = true,
['D'] = true,
['E'] = true,
['F'] = true,
['G'] = true,
['H'] = true,
['I'] = true,
['J'] = true,
['K'] = true,
['L'] = true,
['M'] = true,
['N'] = true,
['O'] = true,
['P'] = true,
['Q'] = true,
['R'] = true,
['S'] = true,
['T'] = true,
['U'] = true,
['V'] = true,
['W'] = true,
['X'] = true,
['Y'] = true,
['Z'] = true,
['_'] = true
},
},
digits = {
['0'] = true,
['1'] = true,
['2'] = true,
['3'] = true,
['4'] = true,
['5'] = true,
['6'] = true,
['7'] = true,
['8'] = true,
['9'] = true,
hex = {
['0'] = true,
['1'] = true,
['2'] = true,
['3'] = true,
['4'] = true,
['5'] = true,
['6'] = true,
['7'] = true,
['8'] = true,
['9'] = true,
['a'] = true,
['b'] = true,
['c'] = true,
['d'] = true,
['e'] = true,
['f'] = true,
['A'] = true,
['B'] = true,
['C'] = true,
['D'] = true,
['E'] = true,
['F'] = true
}
},
symbols = {
['+'] = true,
['-'] = true,
['*'] = true,
['/'] = true,
['^'] = true,
['%'] = true,
[','] = true,
['{'] = true,
['}'] = true,
['['] = true,
[']'] = true,
['('] = true,
[')'] = true,
[';'] = true,
['#'] = true,
['.'] = true,
[':'] = true,
equality = {
['~'] = true,
['='] = true,
['>'] = true,
['<'] = true
},
operators = {
['+'] = true,
['-'] = true,
['*'] = true,
['/'] = true,
['^'] = true,
['%'] = true,
['#'] = true
}
}
}
local keywords = {
structure = {
['and'] = true,
['break'] = true,
['do'] = true,
['else'] = true,
['elseif'] = true,
['end'] = true,
['for'] = true,
['function'] = true,
['goto'] = true,
['if'] = true,
['in'] = true,
['local'] = true,
['not'] = true,
['or'] = true,
['repeat'] = true,
['return'] = true,
['then'] = true,
['until'] = true,
['while'] = true
},
values = {
['true'] = true,
['false'] = true,
['nil'] = true,
['self'] = true,
}
}
function lex(text)
local pos = 1
local start = 1
local len = #text
local buffer = {}
local lines = {}
local function look(delta)
delta = pos + (delta or 0)
return text:sub(delta, delta)
end
local function get()
local char = text:sub(pos, pos)
pos = pos + 1
return char
end
local function getLevel()
local num = 0
while look(num) == '=' do
num = num + 1
end
if look(num) == '[' then
pos = pos + num
return num
else
return nil
end
end
local function getToken()
return text:sub(start, pos - 1)
end
local currentLineLength = 0
local lineoffset = 0
local function token(type, text)
local tk = buffer[#buffer]
if not tk or tk.type ~= type then
local tk = {
type = type,
data = text or getToken(),
posFirst = start - lineoffset,
posLast = pos - 1 - lineoffset
}
if tk.data ~= '' then
buffer[#buffer + 1] = tk
end
else
tk.data = tk.data .. (text or getToken())
tk.posLast = tk.posFirst + #tk.data - 1
--tk.posLast = getCol(pos - 1)
end
currentLineLength = currentLineLength + (text or getToken()):len()
start = pos
return tk
end
local function newline()
lines[#lines + 1] = buffer
buffer = {}
get()
token('newline')
buffer[1] = nil
lineoffset = lineoffset + currentLineLength
currentLineLength = 0
end
local function getData(level, type)
while true do
local char = get()
if char == '' then
return
elseif char == '\n' then
pos = pos - 1
token(type)
newline()
elseif char == ']' then
local valid = true
for i = 1, level do
if look() == '=' then
pos = pos + 1
else
valid = false
break
end
end
if valid and look() == ']' then
pos = pos + 1
return
end
end
end
end
while true do
while true do
local char = look()
if char == '\n' then
token('whitespace')
newline()
elseif chars.whitespace[char] then
pos = pos + 1
else
break
end
end
token('whitespace')
local char = get()
if char == '' then
break
elseif char == '-' and look() == '-' then
pos = pos + 1
if look() == '[' then
pos = pos + 1
local level = getLevel()
if level then
getData(level, 'comment')
else
while true do
local char2 = get()
if char2 == '' or char2 == '\n' then
pos = pos - 1
token('comment')
if char2 == '\n' then
newline()
end
break
end
end
end
else
while true do
local char2 = get()
if char2 == '' or char2 == '\n' then
pos = pos - 1
token('comment')
if char2 == '\n' then
newline()
end
break
end
end
end
token('comment')
elseif char == '\'' or char == '"' then
local cbuf = #buffer
while true do
if not buffer[cbuf] then
break
elseif buffer[cbuf].type == "whitespace" then
cbuf = cbuf-1
elseif buffer[cbuf].type == "ident" then
buffer[cbuf].type = "function"
break
else
break
end
end
while true do
local char2 = get()
if char2 == '\\' then
pos = pos - 1
token('string')
get()
local char3 = get()
if chars.digits[char3] then
for i = 1, 2 do
if chars.digits[look()] then
pos = pos + 1
end
end
elseif char3 == 'x' then
if chars.digits.hex[look()] and chars.digits.hex[look(1)] then
pos = pos + 2
else
token('unidentified')
end
elseif char3 == '\n' then
pos = pos - 1
token('escape')
newline()
elseif not chars.validEscapes[char3] then
token('unidentified')
end
token('escape')
elseif char2 == '\n' then
pos = pos - 1
token('string')
newline()
break
elseif char2 == char or char2 == '' then
break
end
end
token('string')
elseif chars.ident.start[char] then
while chars.ident[look()] do
pos = pos + 1
end
local word = getToken()
if word == 'self' or word == '_ENV' or word == "_G" then
token('arg')
elseif word == 'function' then
local findBracket = false
local c = 0
while true do
_G.debugstr = ""
local lChar = look(c)
_G.debugstr = debugstr..lChar
if lChar == " " or lChar == "\t" then
c = c+1
elseif lChar == "(" then
findBracket = true
break
else
break
end
end
if findBracket then
local cbuf = #buffer
local findEquals = false
while true do
--_G.debugstr = debugstr..buffer[cbuf].type
if not buffer[cbuf] then
break
elseif buffer[cbuf].type == "whitespace" then
cbuf = cbuf-1
elseif buffer[cbuf].data == "=" and not findEquals then
cbuf = cbuf-1
findEquals = true
elseif buffer[cbuf].type == "ident" and findEquals then
buffer[cbuf].type = "nfunction"
break
else
break
end
end
end
token('function')
elseif keywords.structure[word] then
token('keyword')
elseif keywords.values[word] then
token('value')
else
local findBracket = false
local c = 0
while true do
local lChar = look(c)
if lChar == " " or lChar == "\t" then
c = c+1
elseif lChar == "(" then
findBracket = true
break
else
break
end
end
if findBracket then
if buffer[#buffer-1] and buffer[#buffer-1].data == "function" and buffer[#buffer].type == "whitespace" then
token('nfunction')
else
token('function')
end
else
local b = #buffer
local isArg = true
local closedArgs = false
while true do
local buf = buffer[b]
if not buf then
isArg = false
break
elseif buf.data == "(" or buf.type == "whitespace" or buf.data == "," or buf.type == "arg" then
if buf.data == "(" then
closedArgs = true
end
b = b-1
elseif (buf.data == "function" or buf.type == "nfunction") and closedArgs then
token('arg')
break
else
isArg = false
break
end
end
if not isArg then
token('ident')
end
end
end
elseif chars.digits[char] or (char == '.' and chars.digits[look()]) then
if char == '0' and look() == 'x' then
pos = pos + 1
while chars.digits.hex[look()] do
pos = pos + 1
end
else
while chars.digits[look()] do
pos = pos + 1
end
if look() == '.' then
pos = pos + 1
while chars.digits[look()] do
pos = pos + 1
end
end
if look():lower() == 'e' then
pos = pos + 1
if look() == '-' then
pos = pos + 1
end
while chars.digits[look()] do
pos = pos + 1
end
end
end
token('number')
elseif char == '[' then
local level = getLevel()
if level then
local cbuf = #buffer
while true do
if not buffer[cbuf] then
break
elseif buffer[cbuf].type == "whitespace" then
cbuf = cbuf-1
elseif buffer[cbuf].type == "ident" then
buffer[cbuf].type = "function"
break
else
break
end
end
getData(level, 'string')
token('string')
else
token('symbol')
end
elseif char == '.' then
if look() == '.' then
pos = pos + 1
if look() == '.' then
pos = pos + 1
token('value')
else
token('operator')
end
else
token('symbol')
end
elseif chars.symbols.equality[char] then
if look() == '=' then
pos = pos + 1
else
end
token('operator')
elseif chars.symbols[char] then
if chars.symbols.operators[char] then
token('operator')
else
if char == "{" then
local cbuf = #buffer
while true do
if not buffer[cbuf] then
break
elseif buffer[cbuf].type == "whitespace" then
cbuf = cbuf-1
elseif buffer[cbuf].type == "ident" then
buffer[cbuf].type = "function"
break
else
break
end
end
end
token('symbol')
end
else
token('unidentified')
end
end
lines[#lines + 1] = buffer
return lines
end
return lex