512 lines
19 KiB
Lua
512 lines
19 KiB
Lua
----------------------------------------------------------------------
|
|
-- Metalua: $Id: mll.lua,v 1.3 2006/11/15 09:07:50 fab13n Exp $
|
|
--
|
|
-- Summary: generic Lua-style lexer definition. You need this plus
|
|
-- some keyword additions to create the complete Lua lexer,
|
|
-- as is done in mlp_lexer.lua.
|
|
--
|
|
-- TODO:
|
|
--
|
|
-- * Make it easy to define new flavors of strings. Replacing the
|
|
-- lexer.patterns.long_string regexp by an extensible list, with
|
|
-- customizable token tag, would probably be enough. Maybe add:
|
|
-- + an index of capture for the regexp, that would specify
|
|
-- which capture holds the content of the string-like token
|
|
-- + a token tag
|
|
-- + or a string->string transformer function.
|
|
--
|
|
-- * There are some _G.table to prevent a namespace clash which has
|
|
-- now disappeared. Remove them.
|
|
----------------------------------------------------------------------
|
|
--
|
|
-- Copyright (c) 2006, Fabien Fleutot <metalua@gmail.com>.
|
|
--
|
|
-- This software is released under the MIT Licence, see licence.txt
|
|
-- for details.
|
|
--
|
|
----------------------------------------------------------------------
|
|
|
|
module ("lexer", package.seeall)
|
|
|
|
-- don't load metalua.runtime as it loads metalua.base, which pollutes
|
|
-- global namespace and overwrites pairs/ipairs -- PK 6/4/2012
|
|
require 'metalua.table2'
|
|
|
|
-- Lexer prototype.
--   alpha: set of alphanumeric keywords (filled by lexer:add).
--   sym:   maps a punctuation character to the list of multi-character
--          symbols starting with it (filled by lexer:add).
lexer = { }
lexer.alpha = { }
lexer.sym   = { }
-- The prototype is used as metatable of the streams and clones built
-- from it, hence it must index itself.
lexer.__index = lexer

-- Debug trace hook, a no-op by default; swap with the commented line
-- below to get traces.
local function debugf () end
--local debugf=printf
|
|
|
|
----------------------------------------------------------------------
|
|
-- Patterns used by [lexer:extract] to decompose the raw string into
|
|
-- correctly tagged tokens.
|
|
----------------------------------------------------------------------
|
|
-- Every pattern is anchored ("^") at the offset it is matched from, and
-- ends with an empty "()" capture giving the offset reached.
do
   local p = { }

   -- Run (possibly empty) of blanks: space, CR, LF, tab.
   p.spaces = "^[ \r\n\t]*()"

   -- "--" comment closed by a newline; captures the text and the offset
   -- of that newline.
   p.short_comment = "^%-%-([^\n]*)()\n"

   -- "--" comment running up to the very end of the source.
   p.final_short_comment = "^%-%-([^\n]*)()$"

   -- "--[==[ ... ]==]" comment; captures the "=" level, the content
   -- (minus one leading newline) and the offset after the closing bracket.
   p.long_comment = "^%-%-%[(=*)%[\n?(.-)%]%1%]()"

   -- "[==[ ... ]==]" string, same captures as long_comment.
   p.long_string = "^%[(=*)%[\n?(.-)%]%1%]()"

   -- Decimal mantissa: the two alternatives are tried in order.
   p.number_mantissa = { "^%d+%.?%d*()", "^%d*%.%d+()" }

   -- Optional exponent following a mantissa.
   p.number_exponant = "^[eE][%+%-]?%d+()"

   -- Hexadecimal literal.
   p.number_hex = "^0[xX]%x+()"

   -- Identifier or alphanumeric keyword; captures the word itself.
   p.word = "^([%a_][%w_]*)()"

   lexer.patterns = p
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- unescape a whole string, applying [unesc_digits] and
|
|
-- [unesc_letter] as many times as required.
|
|
----------------------------------------------------------------------
|
|
-- Decode the escape sequences found in the body [s] of a short string.
--
-- Handles decimal escapes (a backslash followed by one to three digits,
-- value in [0..255]) and the single-character escapes listed in
-- [letters]. The string is decoded in a single left-to-right pass, so a
-- character produced by an escape is never re-read as the start of
-- another one (e.g. "\92n" yields a backslash followed by "n").
--
-- Returns exactly one value, the decoded string.
-- Raises an error on an unknown escape letter or on a decimal escape
-- greater than 255. A lone backslash ending [s] is left untouched.
local function unescape_string (s)

   -- Replacement of each single-character escape.
   local letters = {
      a = "\a", b = "\b", f = "\f",
      n = "\n", r = "\r", t = "\t", v = "\v",
      ["\\"] = "\\", ["'"] = "'", ['"'] = '"', ["\n"] = "\n" }

   -- Called for each backslash. [letter] is the non-digit character
   -- following it (or ""), [digits] the run of up to three digits which
   -- comes next (or "").
   local function unescape (letter, digits)
      if letter ~= "" then
         -- Letter escape: the digits after it are ordinary characters.
         local char = letters [letter] or
            error([[Unknown escape sequence '\]]..letter..[[']])
         return char .. digits
      elseif digits ~= "" then
         local code = tonumber (digits)
         if code > 255 then
            error ("Illegal escape sequence '\\"..digits..
                   "' in string: ASCII codes must be in [0..255]")
         end
         return string.char (code)
      end
      -- Backslash at the very end of [s]: returning nothing makes gsub
      -- keep the original text.
   end

   -- The parentheses drop the substitution count returned by gsub.
   return (s :gsub ("\\(%D?)(%d?%d?%d?)", unescape))
end
|
|
|
|
-- Names of the extractor methods, in the order lexer:extract() tries
-- them. The whitespace/comment skipper must remain first:
-- lexer:extract() relies on it to locate the beginning of the token.
lexer.extractors = {
   "skip_whitespaces_and_comments",
   "extract_short_string",
   "extract_word",
   "extract_number",
   "extract_long_string",
   "extract_symbol",
}

-- Metatable set on every token built by lexer:extract(). Empty by
-- default; a pretty-printer can be enabled by restoring:
--    __tostring = function(a)
--       return string.format ("`%s{'%s'}",a.tag, a[1])
--    end
lexer.token_metatable = { }

-- Metatable set on the first/last lineinfo tables of every token.
lexer.lineinfo_metatable = { }
|
|
|
|
----------------------------------------------------------------------
|
|
-- Really extract next token from the raw string
|
|
-- (and update the index).
|
|
-- loc: offset of the position just after spaces and comments
|
|
-- previous_i: offset in src before extraction began
|
|
----------------------------------------------------------------------
|
|
-- Extract the next token from the source and advance [self.i].
-- The methods named in [self.extractors] are tried in order; the first
-- one which returns a tag wins, and its (tag, content) pair is wrapped
-- into a complete token. Raises an error if no extractor returns a tag.
function lexer:extract ()
   -- start_offset: offset in src before anything has been consumed.
   -- loc: offset just after the leading spaces and comments; refreshed
   --      below once the first extractor has run.
   local start_offset = self.i
   local loc = self.i

   -- Wrap [tag] and [content] into a token carrying line info, attached
   -- comments and the token metatable. Also brings [self.line] and
   -- [self.column_offset] up to date with the text consumed so far.
   local function build_token (tag, content)
      assert (tag and content)
      -- first_line: line # at the beginning of the token.
      -- first_column_offset: offset of the last '\n' before the token.
      local first_line          = self.line
      local first_column_offset = self.column_offset
      local previous_line_length = nil

      -- Walk through the '\n' characters located after [start_offset],
      -- up to and including offset [self.i].
      local nl = self.src :find ("\n", start_offset+1, true)
      while nl and nl <= self.i do
         previous_line_length = nl - self.column_offset
         if loc and nl <= loc then -- '\n' before the beginning of token
            first_column_offset = nl
            first_line = first_line + 1
         end
         self.line = self.line + 1
         self.column_offset = nl
         nl = self.src :find ("\n", nl+1, true)
      end

      -- lineinfo entries: [1]=line, [2]=column, [3]=char, [4]=filename
      local first = { first_line, loc - first_column_offset,
                      loc, self.src_name }
      local last  = { self.line, self.i - self.column_offset - 1,
                      self.i - 1, self.src_name }
      -- Original note: Pluto barfs when the metatable is set :(
      setmetatable (first, lexer.lineinfo_metatable)
      setmetatable (last,  lexer.lineinfo_metatable)

      local token = { tag = tag, lineinfo = { first=first, last=last }, content }

      -- A column of -1 means the '\n' right after the token has already
      -- been counted: report the end of the token on the previous line.
      if last[2] == -1 then
         last[1], last[2] = last[1] - 1, previous_line_length - 1
      end

      -- Hand the pending comments over to this token, to its first
      -- lineinfo, and to the lineinfo of the previously consumed token.
      local comments = self.attached_comments
      if #comments > 0 then
         token.lineinfo.comments = comments
         first.comments = comments
         if self.lineinfo_last then
            self.lineinfo_last.comments = comments
         end
      end
      self.attached_comments = { }

      return setmetatable (token, self.token_metatable)
   end --</function build_token>

   local extractors = self.extractors
   for rank = 1, #extractors do
      local tag, content = self [extractors [rank]] (self)
      -- [loc] is placed just after the leading whitespaces and comments;
      -- for this to work, the whitespace extractor *must be* at index 1.
      if rank == 1 then loc = self.i end
      if tag then return build_token (tag, content) end
   end

   error "None of the lexer extractors returned anything!"
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- skip whites and comments
|
|
-- FIXME: doesn't take into account:
|
|
-- - unterminated long comments
|
|
-- - short comments at last line without a final \n
|
|
----------------------------------------------------------------------
|
|
-- Advance [self.i] past blanks and comments. Each comment skipped is
-- appended to [self.attached_comments] as
-- { content, start offset, end offset, "long" | "short" }.
-- Returns "Eof", "eof" when the end of the source is reached (a "--"
-- comment ending the source without a newline counts as end of source,
-- and is neither recorded nor skipped); returns nothing otherwise.
function lexer:skip_whitespaces_and_comments()
   local src, patterns = self.src, self.patterns
   local comments = self.attached_comments

   while true do -- loop as long as a comment chunk is found
      local found = false

      -- skip spaces
      self.i = src :match (patterns.spaces, self.i)

      -- skip a long comment if any
      local _, long_text, after_long =
         src :match (patterns.long_comment, self.i)
      if after_long then
         comments [#comments+1] = { long_text, self.i, after_long, "long" }
         self.i, found = after_long, true
      end

      -- skip a short comment if any
      local short_text, after_short =
         src :match (patterns.short_comment, self.i)
      if after_short then
         comments [#comments+1] = { short_text, self.i, after_short, "short" }
         self.i, found = after_short, true
      end

      if self.i > #src then return "Eof", "eof" end
      if not found then break end
   end

   if src :match (patterns.final_short_comment, self.i) then
      return "Eof", "eof"
   end
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- extract a '...' or "..." short string
|
|
----------------------------------------------------------------------
|
|
-- Extract a '...' or "..." string starting at [self.i].
-- Returns "String" and the unescaped content, or nothing when the next
-- character is not a quote. Raises "Unterminated string" when a raw
-- CR, LF or the end of the source is met before the closing quote.
function lexer:extract_short_string()
   -- quote: opening char, either simple-quote or double-quote
   local quote = self.src :sub (self.i, self.i)
   if quote ~= "'" and quote ~= '"' then return end

   -- first: index of the first character inside the quotes
   local first = self.i + 1
   -- Next "interesting" character (backslash, CR, LF or the closing
   -- quote), followed by the position just after it.
   local interesting = "([\\\r\n"..quote.."])()"

   local j = first
   while true do
      local x
      x, j = self.src :match (interesting, j)
      if x == '\\' then
         j = j + 1          -- don't parse the escaped char
      elseif x == quote then
         break              -- unescaped end of string
      else                  -- eof, '\r' or '\n' before end of string
         error "Unterminated string"
      end
   end
   self.i = j               -- just after the closing quote

   -- Parentheses: keep a single result even if the helper returns more,
   -- so that exactly (tag, content) is returned.
   return "String", (unescape_string (self.src :sub (first, j-2)))
end
|
|
|
|
----------------------------------------------------------------------
|
|
--
|
|
----------------------------------------------------------------------
|
|
-- Extract an identifier or an alphanumeric keyword starting at
-- [self.i]. Returns "Keyword", word if the word is registered in
-- [self.alpha], "Id", word otherwise, or nothing if no word starts here.
function lexer:extract_word()
   local word, after = self.src :match (self.patterns.word, self.i)
   if not word then return end
   self.i = after
   local tag = self.alpha [word] and "Keyword" or "Id"
   return tag, word
end
|
|
|
|
----------------------------------------------------------------------
|
|
--
|
|
----------------------------------------------------------------------
|
|
-- Extract a numeric literal starting at [self.i]: either hexadecimal,
-- or a decimal mantissa with an optional exponent.
-- Returns "Number" and the value computed by tonumber(), or nothing if
-- no number starts here.
function lexer:extract_number()
   local src, patterns, start = self.src, self.patterns, self.i

   local stop = src :match (patterns.number_hex, start)
   if not stop then
      local mantissa = patterns.number_mantissa
      stop = src :match (mantissa[1], start) or src :match (mantissa[2], start)
      if not stop then return end
      -- optional exponent after the mantissa
      stop = src :match (patterns.number_exponant, stop) or stop
   end

   -- Number found, interpret with tonumber() and return it
   local value = tonumber (src :sub (start, stop-1))
   self.i = stop
   return "Number", value
end
|
|
|
|
----------------------------------------------------------------------
|
|
--
|
|
----------------------------------------------------------------------
|
|
-- Extract a [[...]] / [==[...]==] long string starting at [self.i].
-- Returns "String" and its raw content (no escape processing), or
-- nothing if no complete long string starts here.
function lexer:extract_long_string()
   local _, content, after =
      self.src :match (self.patterns.long_string, self.i)
   if not after then return end
   self.i = after
   return "String", content
end
|
|
|
|
----------------------------------------------------------------------
|
|
--
|
|
----------------------------------------------------------------------
|
|
-- Extract a symbol starting at [self.i]. Always succeeds.
-- Among the multi-character symbols registered for the current
-- character, the longest one matching the source is chosen, so the
-- result does not depend on the order in which symbols were added
-- (e.g. "..." wins over ".." whatever their registration order).
-- When none matches, the single character itself is the symbol.
-- Returns "Keyword" and the symbol.
function lexer:extract_symbol()
   local src, start = self.src, self.i
   local k = src :sub (start, start)

   -- Default: single char symbol, consuming one character.
   local best, length = k, 1

   local candidates = self.sym [k]
   if candidates then
      for _, sym in pairs (candidates) do
         if #sym > length and sym == src :sub (start, start + #sym - 1) then
            best, length = sym, #sym
         end
      end
   end

   self.i = start + length
   return "Keyword", best
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- Add a keyword to the list of keywords recognized by the lexer.
|
|
----------------------------------------------------------------------
|
|
-- Register the keyword [w], or every keyword of the list [w].
--  * a word (as per [self.patterns.word]) is recorded in [self.alpha];
--  * a symbol made of two or more punctuation characters is appended
--    to the list [self.sym] keeps for its first character;
--  * a single punctuation character needs no registration;
--  * anything else raises "Invalid keyword".
function lexer:add (w, ...)
   assert(not ..., "lexer:add() takes only one arg, although possibly a table")

   if type (w) == "table" then
      for _, x in ipairs (w) do self:add (x) end
      return
   end

   if w:match (self.patterns.word .. "$") then
      self.alpha [w] = true
   elseif w:match "^%p%p+$" then
      local head = w:sub (1,1)
      local symbols = self.sym [head]
      if not symbols then
         symbols = { }
         self.sym [head] = symbols
      end
      symbols [#symbols+1] = w
   elseif not w:match "^%p$" then
      error "Invalid keyword"
   end
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- Return the [n]th next token, without consuming it.
|
|
-- [n] defaults to 1. If it goes past the end of the stream, an EOF
|
|
-- token is returned.
|
|
----------------------------------------------------------------------
|
|
-- Return the [n]th upcoming token ([n] defaults to 1) without consuming
-- it. Tokens are extracted on demand and buffered in [self.peeked].
function lexer:peek (n)
   n = n or 1
   local buffer = self.peeked
   -- Fill the look-ahead buffer up to index [n]; the loop does not run
   -- when enough tokens are already buffered.
   for i = #buffer + 1, n do
      buffer [i] = self:extract()
   end
   return buffer [n]
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- Return the [n]th next token, removing it as well as the 0..n-1
|
|
-- previous tokens. [n] defaults to 1. If it goes past the end of the
|
|
-- stream, an EOF token is returned.
|
|
----------------------------------------------------------------------
|
|
-- Consume the [n] next tokens ([n] defaults to 1) and return the last
-- one consumed. Also records where that token ends in
-- [self.lastline] (line number) and [self.lineinfo_last] (lineinfo).
function lexer:next (n)
   n = n or 1
   -- Make sure the [n] tokens are in the look-ahead buffer, so that
   -- every removal below yields a token.
   self:peek (n)
   local a
   for i = 1, n do
      a = _G.table.remove (self.peeked, 1)
   end
   self.lastline      = a.lineinfo.last[1]
   self.lineinfo_last = a.lineinfo.last
   return a
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- Returns an object which saves the stream's current state.
|
|
----------------------------------------------------------------------
|
|
-- The state holds the offset at [1] and a copy of the look-ahead buffer
-- at [2], plus, under named keys, the line-tracking fields and the
-- pending comments.

-- Shallow copy of the array part of [list]; nil stays nil.
local function copy_list (list)
   if not list then return nil end
   local copy = { }
   for i = 1, #list do copy [i] = list [i] end
   return copy
end

function lexer:save ()
   return {
      self.i; copy_list (self.peeked);
      line              = self.line;
      column_offset     = self.column_offset;
      attached_comments = copy_list (self.attached_comments);
      lineinfo_last     = self.lineinfo_last;
      lastline          = self.lastline }
end

----------------------------------------------------------------------
-- Restore the stream's state, as saved by method [save].
----------------------------------------------------------------------
-- The lists held by [s] are copied rather than adopted, so that the
-- same state can be restored several times. A state which only holds
-- the offset and the peeked tokens (no [line] field) is accepted: the
-- other fields of the stream are then left untouched.
function lexer:restore (s)
   self.i      = s[1]
   self.peeked = copy_list (s[2])
   if s.line then
      self.line              = s.line
      self.column_offset     = s.column_offset
      self.attached_comments = copy_list (s.attached_comments)
      self.lineinfo_last     = s.lineinfo_last
      self.lastline          = s.lastline
   end
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- Resynchronize: cancel any token in self.peeked, by emptying the
|
|
-- list and resetting the indexes
|
|
----------------------------------------------------------------------
|
|
-- Forget the tokens already peeked: rewind [self.i], [self.line] and
-- [self.column_offset] to the beginning of the first peeked token,
-- empty [self.peeked], and take back the comments which had been
-- attached to that token. Does nothing when no token is peeked.
function lexer:sync()
   local p1 = self.peeked[1]
   if not p1 then return end

   -- lineinfo entries: [1]=line, [2]=column, [3]=char offset
   local first = p1.lineinfo.first
   self.line, self.i = first[1], first[3]
   self.column_offset = self.i - first[2]
   self.peeked = { }
   self.attached_comments = first.comments or { }
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- Take the source and offset of an old lexer.
|
|
----------------------------------------------------------------------
|
|
-- Make this stream continue reading where the stream [old] stands:
-- source, source name, offset, line tracking and pending comments are
-- taken from [old]. Returns [self].
function lexer:takeover(old)
   -- A stream created from a bare table has no look-ahead buffer yet:
   -- give it one instead of failing while indexing a nil [peeked].
   if self.peeked then self:sync() else self.peeked = { } end

   -- Tokens peeked by [old] but not consumed yet have already moved its
   -- offset forward; rewind it so that they are not skipped.
   -- NOTE(review): this discards the look-ahead buffer of [old], which
   -- will simply re-extract those tokens if it is used again.
   if old.peeked then self.sync (old) end

   self.line, self.column_offset, self.i, self.src, self.attached_comments =
      old.line, old.column_offset, old.i, old.src, old.attached_comments
   self.src_name      = old.src_name      or self.src_name
   self.lineinfo_last = old.lineinfo_last or self.lineinfo_last
   return self
end
|
|
|
|
-- function lexer:lineinfo()
|
|
-- if self.peeked[1] then return self.peeked[1].lineinfo.first
|
|
-- else return { self.line, self.i-self.column_offset, self.i } end
|
|
-- end
|
|
|
|
|
|
----------------------------------------------------------------------
|
|
-- Return the current position in the sources. This position is between
|
|
-- two tokens, and can be within a space / comment area, and therefore
|
|
-- have a non-null width. :lineinfo_left() returns the beginning of the
|
|
-- separation area, :lineinfo_right() returns the end of that area.
|
|
--
|
|
-- ____ last consumed token ____ first unconsumed token
|
|
-- / /
|
|
-- XXXXX <spaces and comments> YYYYY
|
|
-- \____ \____
|
|
-- :lineinfo_left() :lineinfo_right()
|
|
----------------------------------------------------------------------
|
|
-- Lineinfo of the beginning of the first unconsumed token (which gets
-- peeked if it wasn't already).
function lexer:lineinfo_right()
   local upcoming = self:peek (1)
   return upcoming.lineinfo.first
end
|
|
|
|
-- Lineinfo of the end of the last consumed token, as recorded in
-- [self.lineinfo_last].
function lexer:lineinfo_left()
   local last = self.lineinfo_last
   return last
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- Create a new lexstream.
|
|
----------------------------------------------------------------------
|
|
-- Create a new token stream, using [self] as its metatable.
--   src_or_stream: either a source string, or an existing stream whose
--                  source and position are taken over.
--   name: name of the source, stored in the lineinfo; defaults to "?".
-- Raises an error when [src_or_stream] is neither a string nor a table.
function lexer:newstream (src_or_stream, name)
   name = name or "?"
   local kind = type (src_or_stream)

   if kind == 'table' then -- it's a stream
      return setmetatable ({ }, self) :takeover (src_or_stream)
   elseif kind ~= 'string' then
      error (":newstream() takes a source string or a stream, not a "..
             kind, 2)
   end

   -- it's a source string
   local src = src_or_stream
   local stream = {
      src_name      = name;   -- Name of the file
      src           = src;    -- The source, as a single string
      peeked        = { };    -- Already peeked, but not discarded yet, tokens
      i             = 1;      -- Character offset in src
      line          = 1;      -- Current line number
      column_offset = 0;      -- distance from beginning of file to last '\n'
      attached_comments = { },-- comments accumulator
      lineinfo_last = { 1, 1, 1, name }
   }
   setmetatable (stream, self)

   -- skip initial sharp-bang for unix scripts
   -- FIXME: redundant with mlp.chunk()
   if src :match "^#" then
      local nl = src :find ("\n", 1, true)
      if nl then
         -- Start on the second line, and account for the skipped '\n'
         -- so that line and column numbers stay accurate.
         stream.i, stream.line, stream.column_offset = nl + 1, 2, nl
      else
         -- The sharp-bang line is the whole source: nothing to lex.
         stream.i = #src + 1
      end
   end
   return stream
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- if there's no ... args, return the token a (whose truth value is
|
|
-- true) if it's a `Keyword{ }, or false. If there are ... args, they
|
|
-- have to be strings. If the token a is a keyword, and its content
|
|
-- is one of the ... args, then returns it (its truth value is
|
|
-- true). If not a keyword or not in ..., return false.
|
|
----------------------------------------------------------------------
|
|
-- Tell whether the token [a] is a keyword, optionally restricted to the
-- keywords passed as extra arguments.
-- Returns the keyword's content when [a] is a `Keyword{ } token and
-- either no extra argument is given or one of them equals its content;
-- returns false otherwise.
function lexer:is_keyword (a, ...)
   if not (a and a.tag == "Keyword") then return false end

   local content = a[1]
   local words = {...}
   if #words == 0 then return content end

   for _, candidate in ipairs (words) do
      if candidate == content then return candidate end
   end
   return false
end
|
|
|
|
----------------------------------------------------------------------
|
|
-- Cause an error if the next token isn't a keyword whose content
|
|
-- is listed among ... args (which have to be strings).
|
|
----------------------------------------------------------------------
|
|
-- Consume the next token and check that it is a keyword, whose content
-- is one of the ... args when some are given. Returns the keyword's
-- content; raises an error listing the expected keywords otherwise.
function lexer:check (...)
   local words = {...}
   local a = self:next()

   -- Raise the "unexpected token" error.
   local function fail ()
      local got      = tostring (a)
      local expected = _G.table.concat (words,"', '")
      error ("Got " .. got ..
             ", expected one of these keywords : '" ..
             expected .. "'")
   end

   if not (a and a.tag == "Keyword") then fail () end

   local content = a[1]
   if #words == 0 then return content end
   for _, candidate in ipairs (words) do
      if candidate == content then return candidate end
   end
   fail ()
end
|
|
|
|
----------------------------------------------------------------------
|
|
--
|
|
----------------------------------------------------------------------
|
|
-- Create a lexer derived from [self]: it gets its own deep copies of
-- the keyword tables [alpha] and [sym], has [self] as metatable, and
-- indexes itself so that it can in turn serve as a metatable.
function lexer:clone()
   local copy = { }
   copy.alpha   = table.deep_copy (self.alpha)
   copy.sym     = table.deep_copy (self.sym)
   copy.__index = copy
   return setmetatable (copy, self)
end
|