String tokenizer

Here is a small example of a string tokenizer built with Lua string patterns (Lua's lightweight form of regular expressions). It can be handy if you plan on creating your own language in Lua. I'm playing around with it a bit and will try to make something that can create a syntax tree from the tokens later on.


--# Main
-- Demo entry point (Codea-style setup callback): tokenizes a sample
-- expression and prints the type and value of every token, one per line.
function setup()
    local sample = 'foo.fie.fum(1+2 >= 100.5, "hello world")'
    local result = tokenize(sample)
    for index = 1, #result do
        local tok = result[index]
        print(tok.type, tok.value)
    end
end
--# Tokenize
-- Tokenizer state shared by the matcher closures below; tokenize() resets
-- all three on every call.
--   source : the not-yet-consumed remainder of the input string
--   tokens : list of tokens produced so far
--   cursor : number of characters already consumed from the input
local source, tokens, cursor

-- Ordered list of matcher functions. Rules are tried in registration order,
-- so earlier rules take priority over later ones.
local patterns = {}

--- Register a lexical rule.
-- @param pattern  Lua pattern describing the lexeme. It is wrapped in a
--                 capture and anchored to the start of the remaining input.
-- @param createFn Optional function(text) returning a token table. It
--                 receives the rule's own first capture if the pattern has
--                 one, otherwise the whole matched text. When omitted, the
--                 matched text is consumed and no token is emitted
--                 (used for whitespace).
local function on(pattern, createFn)
    -- Built once per rule instead of on every match attempt.
    local anchored = "^(" .. pattern .. ")"
    table.insert(patterns, function ()
        local _, len, res, group = string.find(source, anchored)
        -- A zero-length match consumes nothing; treating it as "no match"
        -- guarantees the tokenize() loop always makes progress.
        if len and len > 0 then
            if createFn then
                local token = createFn(group or res)
                -- from: 0-based offset of the first matched character;
                -- to:   0-based offset just past the last matched character.
                token.from, token.to = cursor, cursor+len
                table.insert(tokens, token)
            end
            source = string.sub(source, len+1)
            cursor = cursor + len
            return true
        end
    end)
end

-- Whitespace: consumed, no token.
on("%s+")
-- Identifiers: a letter or underscore followed by letters, digits or
-- underscores, so names such as foo_bar and x2 stay in one token.
on("[A-Za-z_][%w_]*", function (w) return {type="name", value=w} end)
-- Decimal numbers must be tried before plain integers so that "100.5"
-- is not split at the dot.
on("%d+%.%d+", function (d) return {type="number", value=tonumber(d)} end)
on("%d+", function (d) return {type="number", value=tonumber(d)} end)
-- Double-quoted strings; the inner capture strips the quotes from the value.
on('"([^"]*)"', function (s) return {type="string", value=s} end)
-- One- or two-character operators such as "+", ">=", "==", "&&".
on("[=<>!+%.%-*&|/%^][=<>&|]?", function (op) return {type="operator", value=op} end)
-- Brackets and the comma are reported as operators too.
on("[{}%(%),]", function (op) return {type="operator", value=op} end)

-- Try each rule in order against the start of the remaining input.
-- Returns true as soon as one rule matches, nil if none does.
local function dispatch()
    for _, matcher in ipairs(patterns) do
        if matcher() then return true end
    end
end

--- Split a source string into a list of tokens.
-- @param src The string to tokenize.
-- @return A list of token tables, each with fields type, value, from and to.
--         If some input cannot be matched by any rule, a message
--         "tokenizer failed at <remaining input>" is printed and the tokens
--         collected up to that point are returned.
function tokenize(src)
    source, tokens, cursor = src, {}, 0
    while #source>0 and dispatch() do end
    if #source > 0 then print("tokenizer failed at " .. source) end
    return tokens
end

(I'm a little curious about building something based on this page: http://javascript.crockford.com/tdop/tdop.html)

Thanks for sharing. Works great. I might use it for emulating the Codea editor (for inline code examples).

@tnlogy Can this be modified to take in a program and break it down, giving a listing of variable names and how many times they’re used? It might be useful for finding variables that are no longer used in a large program that’s been changed many times.

Glad to hear. I was thinking about parsing the shader code and maybe making some kind of editor for it, or making a language for describing logic in tile-based games. Well, I'm currently just messing around with it. :slight_smile: