//
// Copyright (c) 2006, Brian Frank and Andy Frank
// Licensed under the Academic Free License version 3.0
//
// History:
// 3 Sep 05 Brian Frank Creation
// 18 May 06 Brian Frank Ported from Java to Fan
//
**
** Tokenizer inputs a Str and outputs a list of Tokens.
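**
** Example (illustrative; assumes 'compiler' and 'loc' are in scope):
**   toks := Tokenizer.make(compiler, loc, "x := 3\n", false).tokenize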
**
class Tokenizer : CompilerSupport
{
//////////////////////////////////////////////////////////////////////////
// Constructor
//////////////////////////////////////////////////////////////////////////
**
** Construct with the characters of a source file. The buffer
** passed must be normalized such that all newlines are
** represented strictly as \n, never \r or \r\n (see
** File.readAllStr). If isDoc is false, we skip all star-star
** Fandoc comments.
**
new make(Compiler compiler, Location location, Str buf, Bool isDoc)
: super(compiler)
{
this.buf = buf
this.filename = location.file
this.isDoc = isDoc
this.tokens = TokenVal[,]
this.inStrLiteral = false
// initialize cur and peek
cur = peek = ' '
if (buf.size > 0) cur = buf[0]
if (buf.size > 1) peek = buf[1]
pos = 0
// if the first line starts with #, then skip the entire line
// so that Unix shebang lines can specify the executable to run
if (cur === '#')
{
while (true)
{
if (cur === '\n') { consume; break }
if (cur === 0) break
consume
}
}
}
//////////////////////////////////////////////////////////////////////////
// Access
//////////////////////////////////////////////////////////////////////////
**
** Tokenize the entire input into a list of tokens.
**
TokenVal[] tokenize()
{
while (true)
{
tok := next
tokens.add(tok)
if (tok.kind === Token.eof) break
}
return tokens
}
**
** Return the next token in the buffer.
**
private TokenVal next()
{
while (true)
{
// save current line and column
line := this.line
col := this.col
// find next token
TokenVal tok := find
if (tok == null) continue
// fill in token's location
tok.file = filename
tok.line = line
tok.col = col
tok.newline = lastLine < line
// save last line
lastLine = line
return tok
}
return null // TODO - shouldn't need this
}
**
** Find the next token or return null.
**
private TokenVal find()
{
// skip whitespace
if (cur.isSpace) { consume; return null }
// raw string literal r"c:\dir\foo.txt"
if (cur === 'r' && peek === '"' && !inStrLiteral) return rawStr
// alpha means keyword or identifier
if (cur.isAlpha || cur === '_') return word
// number or .number (note that + and - are handled as unary operator)
if (cur.isDigit) return number
if (cur === '.' && peek.isDigit) return number
// str literal
if (cur === '"') return str
if (cur === '`') return uri
if (cur === '\'') return ch
// comments
if (cur === '*' && peek === '*') return docComment
if (cur === '/' && peek === '/') return skipCommentSL
if (cur === '/' && peek === '*') return skipCommentML
// symbols
return symbol
}
//////////////////////////////////////////////////////////////////////////
// Word
//////////////////////////////////////////////////////////////////////////
**
** Parse a word token: alpha (alpha|number)*
** Words are either keywords or identifiers
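** Examples:
**   "class"  -> keyword token
**   "fooBar" -> identifier token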
**
private TokenVal word()
{
// store starting position of word
start := pos
// find end of word to compute length
while (cur.isAlphaNum || cur === '_') consume
// create Str (gc note this string might now reference buf)
word := buf[start...pos]
// check keywords
keyword := Token.keywords[word]
if (keyword != null)
return TokenVal.make(keyword)
// otherwise this is a normal identifier
return TokenVal.make(Token.identifier, word)
}
//////////////////////////////////////////////////////////////////////////
// Number
//////////////////////////////////////////////////////////////////////////
**
** Parse a number literal token: int, float, decimal, or duration.
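** Examples (as handled by the code below):
**   123    -> intLiteral
**   1_000  -> intLiteral (underscores stripped)
**   3f     -> floatLiteral
**   2.5    -> decimalLiteral (dot without 'f' suffix)
**   5sec   -> durationLiteral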
**
private TokenVal number()
{
// check for hex value
if (cur === '0' && peek === 'x')
return hex
// find end of literal
start := pos
dot := false
exp := false
// whole part
while (cur.isDigit || cur === '_') consume
// fraction part
if (cur === '.' && peek.isDigit)
{
dot = true
consume
while (cur.isDigit || cur === '_') consume
}
// exponent
if (cur === 'e' || cur === 'E')
{
consume
exp = true
if (cur === '-' || cur === '+') consume
if (!cur.isDigit) throw err("Expected exponent digits")
while (cur.isDigit || cur === '_') consume
}
// string value of literal
str := buf[start...pos].replace("_", "")
// check for suffixes
floatSuffix := false
decimalSuffix := false
Int dur := null
if (cur.isLower && peek.isLower)
{
if (cur === 'n' && peek === 's') { consume; consume; dur = 1 }
else if (cur === 'm' && peek === 's') { consume; consume; dur = 1_000_000 }
else if (cur === 's' && peek === 'e') { consume; consume; if (cur !== 'c') throw err("Expected 'sec' in Duration literal"); consume; dur = 1_000_000_000 }
else if (cur === 'm' && peek === 'i') { consume; consume; if (cur !== 'n') throw err("Expected 'min' in Duration literal"); consume; dur = 60_000_000_000 }
else if (cur === 'h' && peek === 'r') { consume; consume; dur = 3_600_000_000_000 }
else if (cur === 'd' && peek === 'a') { consume; consume; if (cur !== 'y') throw err("Expected 'day' in Duration literal"); consume; dur = 86_400_000_000_000 }
}
else if (cur === 'f' || cur === 'F')
{
consume
floatSuffix = true
}
else if (cur === 'd' || cur === 'D')
{
consume
decimalSuffix = true
}
try
{
// float literal
if (floatSuffix)
{
num := Float.fromStr(str)
return TokenVal.make(Token.floatLiteral, num)
}
// decimal literal
if (decimalSuffix || dot || exp)
{
num := Decimal.fromStr(str)
if (dur != null)
return TokenVal.make(Token.durationLiteral, Duration.make((num*dur.toDecimal).toInt))
else
return TokenVal.make(Token.decimalLiteral, num)
}
// int literal
num := Int.fromStr(str)
if (dur != null)
return TokenVal.make(Token.durationLiteral, Duration.make(num*dur))
else
return TokenVal.make(Token.intLiteral, num)
}
catch (ParseErr e)
{
throw err("Invalid numeric literal '$str'")
}
}
**
** Process hex int/long literal starting with 0x
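** Example: 0xcafe_babe (underscores allowed, max 16 nibbles)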
**
private TokenVal hex()
{
consume // 0
consume // x
// read first hex
Int val := cur.fromDigit(16)
if (val == null) throw err("Expecting hex number")
consume
Int nibCount := 1
while (true)
{
Int nib := cur.fromDigit(16)
if (nib == null)
{
if (cur === '_') { consume; continue }
break
}
nibCount++
if (nibCount > 16) throw err("Hex literal too big")
val = (val << 4) + nib
consume
}
return TokenVal.make(Token.intLiteral, val)
}
//////////////////////////////////////////////////////////////////////////
// String
//////////////////////////////////////////////////////////////////////////
**
** Parse a raw string literal token.
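** Example: r"c:\dir\foo.txt" (backslashes taken literally)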
**
private TokenVal rawStr()
{
// consume opening 'r' and quote
consume
consume
// string contents
s := StrBuf.make
while (cur !== '"')
{
if (cur <= 0) throw err("Unexpected end of string literal")
s.addChar(cur)
consume
}
// close quote
consume
return TokenVal.make(Token.strLiteral, s.toStr)
}
**
** Parse a string literal token.
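** Examples:
**   "hello"  -> single strLiteral token
**   "a $b c" -> token stream ("a " + b + " c") via interpolation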
**
private TokenVal str()
{
inStrLiteral = true
try
{
// consume opening quote
consume
// store starting position
s := StrBuf.make
// loop until we find end of string
interpolated := false
while (true)
{
if (cur === '"') { consume; break }
if (cur === 0) throw err("Unexpected end of string")
if (cur === '$')
{
// if we have detected an interpolated string, then
// insert opening paren to treat whole string atomically
if (!interpolated)
{
interpolated = true
tokens.add(makeVirtualToken(Token.lparen))
}
// process interpolation; it returns false
// if at end of the string literal
if (!strInterpolation(s.toStr))
{
tokens.add(makeVirtualToken(Token.rparen))
return null
}
s.clear
}
else if (cur === '\\')
{
s.add(escape.toChar)
}
else
{
s.addChar(cur)
consume
}
}
// if interpolated, then add rparen to treat the whole string atomically
if (interpolated)
{
tokens.add(makeVirtualToken(Token.strLiteral, s.toStr))
tokens.add(makeVirtualToken(Token.rparen))
return null
}
else
{
return TokenVal.make(Token.strLiteral, s.toStr)
}
}
finally
{
inStrLiteral = false
}
}
**
** When we hit a $ inside a string it indicates an embedded
** expression. We make this look like a stream of tokens
** such that:
** "a ${b} c" -> "a " + b + " c"
** Return true if more in the string literal.
**
private Bool strInterpolation(Str s)
{
consume // $
tokens.add(makeVirtualToken(Token.strLiteral, s))
tokens.add(makeVirtualToken(Token.plus))
// if '{' then we allow a full expression between the braces
if (cur === '{')
{
tokens.add(makeVirtualToken(Token.lparen))
consume
while (true)
{
if (cur === '"') throw err("Unexpected end of string, missing }")
tok := next
if (tok.kind == Token.rbrace) break
tokens.add(tok)
}
tokens.add(makeVirtualToken(Token.rparen))
}
// else also allow a single identifier with
// dotted accessors x, x.y, x.y.z
else
{
tok := next
if (tok.kind !== Token.identifier &&
tok.kind !== Token.thisKeyword &&
tok.kind !== Token.superKeyword)
throw err("Expected identifier after \$")
tokens.add(tok)
while (true)
{
if (cur !== '.') break
tokens.add(next) // dot
tok = next
if (tok.kind !== Token.identifier) throw err("Expected identifier")
tokens.add(tok)
}
}
// if at end of string, all done
if (cur === '\"')
{
consume
return false
}
// add plus and return true to keep chugging
tokens.add(makeVirtualToken(Token.plus))
return true
}
**
** Create a virtual token for string interpolation.
**
private TokenVal makeVirtualToken(Token kind, Obj value := null)
{
tok := TokenVal.make(kind, value)
tok.file = filename
tok.line = line
tok.col = col
return tok
}
//////////////////////////////////////////////////////////////////////////
// Uri
//////////////////////////////////////////////////////////////////////////
**
** Parse a uri literal token.
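** Example: `dir/file.txt` (reserved escapes such as \# are kept
** as-is, other escapes such as \n are decoded)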
**
private TokenVal uri()
{
// consume opening backtick
consume
// store starting position
s := StrBuf.make
// loop until we find end of string
while (true)
{
ch := cur
if (ch === '`') { consume; break }
if (ch === 0 || ch === '\n') throw err("Unexpected end of uri")
if (ch === '$') throw err("Uri interpolation not supported yet")
if (ch === '\\')
{
switch (peek)
{
case ':': case '/': case '?': case '#':
case '[': case ']': case '@': case '\\':
case '&': case '=': case ';':
s.addChar(ch)
s.addChar(peek)
consume
consume
default:
s.addChar(escape)
}
}
else
{
consume
s.addChar(ch)
}
}
return TokenVal.make(Token.uriLiteral, s.toStr)
}
//////////////////////////////////////////////////////////////////////////
// Char
//////////////////////////////////////////////////////////////////////////
**
** Parse a char literal token.
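** Examples: 'a' or '\n' -> intLiteral of the character code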
**
private TokenVal ch()
{
// consume opening quote
consume
// if \ then process as escape
Int c
if (cur === '\\')
{
c = escape
}
else
{
c = cur
consume
}
// expecting ' quote
if (cur !== '\'') throw err("Expecting ' close of char literal")
consume
return TokenVal.make(Token.intLiteral, c)
}
**
** Parse an escape sequence which starts with a \
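** Examples: \n -> 0x0a, \u0041 -> 'A'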
**
private Int escape()
{
// consume slash
if (cur !== '\\') throw err("Internal error")
consume
// check basics
switch (cur)
{
case 'b': consume; return '\b'
case 'f': consume; return '\f'
case 'n': consume; return '\n'
case 'r': consume; return '\r'
case 't': consume; return '\t'
case '"': consume; return '"'
case '$': consume; return '$'
case '\'': consume; return '\''
case '`': consume; return '`'
case '\\': consume; return '\\'
}
// check for uxxxx
if (cur === 'u')
{
consume
n3 := cur.fromDigit(16); consume
n2 := cur.fromDigit(16); consume
n1 := cur.fromDigit(16); consume
n0 := cur.fromDigit(16); consume
if (n3 == null || n2 == null || n1 == null || n0 == null) throw err("Invalid hex value for \\uxxxx")
return ((n3 << 12) | (n2 << 8) | (n1 << 4) | n0)
}
throw err("Invalid escape sequence")
}
//////////////////////////////////////////////////////////////////////////
// Comments
//////////////////////////////////////////////////////////////////////////
**
** Skip a single-line // comment
**
private TokenVal skipCommentSL()
{
consume // first slash
consume // next slash
while (true)
{
if (cur === '\n') { consume; break }
if (cur === 0) break
consume
}
return null
}
**
** Skip a multi-line /* comment. Note that unlike C/Java,
** slash/star comments can be nested.
**
private TokenVal skipCommentML()
{
consume // slash
consume // star
depth := 1
while (true)
{
if (cur === '*' && peek === '/') { consume; consume; depth--; if (depth <= 0) break }
if (cur === '/' && peek === '*') { consume; consume; depth++; continue }
if (cur === 0) break
consume
}
return null
}
**
** Parse a star-star Fandoc comment into a documentation comment token.
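** Example: the lines
**   ** First line
**   ** Second line
** become a docComment token with ["First line", "Second line"]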
**
private TokenVal docComment()
{
// if doc is off, then just skip the line and be done
if (!isDoc) { skipCommentSL; return null }
while (cur === '*') consume
if (cur === ' ') consume
// parse comment
lines := Str[,]
s := StrBuf.make
while (cur > 0)
{
// add to buffer and advance
c := cur
consume
// if not at newline, then loop
if (c !== '\n')
{
s.addChar(c)
continue
}
// add line and reset buffer (but don't add leading empty lines)
line := s.toStr
if (!lines.isEmpty || !line.trim.isEmpty) lines.add(line)
s.clear
// we're at a newline; check for leading whitespace(0+)/star(2+)/whitespace(1)
while (cur === ' ' || cur === '\t') consume
if (cur !== '*' || peek !== '*') break
while (cur === '*') consume
if (cur === ' ' || cur === '\t') consume
}
lines.add(s.toStr)
// strip trailing empty lines
while (!lines.isEmpty)
if (lines.last.trim.isEmpty) lines.removeAt(-1)
else break
return TokenVal.make(Token.docComment, lines)
}
//////////////////////////////////////////////////////////////////////////
// Symbol
//////////////////////////////////////////////////////////////////////////
**
** Parse a symbol token (typically into an operator).
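** Examples: "+=" -> assignPlus, "<=>" -> cmp, "?->" -> safeArrow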
**
private TokenVal symbol()
{
c := cur
consume
switch (c)
{
case '\r':
throw err("Carriage return \\r not allowed in source")
case '!':
if (cur === '=')
{
consume
if (cur === '=') { consume; return TokenVal.make(Token.notSame) }
return TokenVal.make(Token.notEq)
}
return TokenVal.make(Token.bang)
case '#':
return TokenVal.make(Token.pound)
case '%':
if (cur === '=') { consume; return TokenVal.make(Token.assignPercent) }
return TokenVal.make(Token.percent)
case '&':
if (cur === '=') { consume; return TokenVal.make(Token.assignAmp) }
if (cur === '&') { consume; return TokenVal.make(Token.doubleAmp) }
return TokenVal.make(Token.amp)
case '(':
return TokenVal.make(Token.lparen)
case ')':
return TokenVal.make(Token.rparen)
case '*':
if (cur === '=') { consume; return TokenVal.make(Token.assignStar) }
return TokenVal.make(Token.star)
case '+':
if (cur === '=') { consume; return TokenVal.make(Token.assignPlus) }
if (cur === '+') { consume; return TokenVal.make(Token.increment) }
return TokenVal.make(Token.plus)
case ',':
return TokenVal.make(Token.comma)
case '-':
if (cur === '>') { consume; return TokenVal.make(Token.arrow) }
if (cur === '-') { consume; return TokenVal.make(Token.decrement) }
if (cur === '=') { consume; return TokenVal.make(Token.assignMinus) }
return TokenVal.make(Token.minus)
case '.':
if (cur === '.')
{
consume
if (cur === '.') { consume; return TokenVal.make(Token.dotDotDot) }
return TokenVal.make(Token.dotDot)
}
return TokenVal.make(Token.dot)
case '/':
if (cur === '=') { consume; return TokenVal.make(Token.assignSlash) }
return TokenVal.make(Token.slash)
case ':':
if (cur === ':') { consume; return TokenVal.make(Token.doubleColon) }
if (cur === '=') { consume; return TokenVal.make(Token.defAssign) }
return TokenVal.make(Token.colon)
case ';':
return TokenVal.make(Token.semicolon)
case '<':
if (cur === '=')
{
consume
if (cur === '>') { consume; return TokenVal.make(Token.cmp) }
return TokenVal.make(Token.ltEq)
}
if (cur === '<')
{
consume
if (cur === '=') { consume; return TokenVal.make(Token.assignLshift) }
return TokenVal.make(Token.lshift)
}
return TokenVal.make(Token.lt)
case '=':
if (cur === '=')
{
consume
if (cur === '=') { consume; return TokenVal.make(Token.same) }
return TokenVal.make(Token.eq)
}
return TokenVal.make(Token.assign)
case '>':
if (cur === '=') { consume; return TokenVal.make(Token.gtEq) }
if (cur === '>')
{
consume
if (cur === '=') { consume; return TokenVal.make(Token.assignRshift) }
return TokenVal.make(Token.rshift)
}
return TokenVal.make(Token.gt)
case '?':
if (cur === ':') { consume; return TokenVal.make(Token.elvis) }
if (cur === '.') { consume; return TokenVal.make(Token.safeDot) }
if (cur === '-')
{
consume
if (cur !== '>') throw err("Expected '?->' symbol")
consume
return TokenVal.make(Token.safeArrow)
}
return TokenVal.make(Token.question)
case '@':
return TokenVal.make(Token.at)
case '[':
return TokenVal.make(Token.lbracket)
case ']':
return TokenVal.make(Token.rbracket)
case '^':
if (cur === '=') { consume; return TokenVal.make(Token.assignCaret) }
return TokenVal.make(Token.caret)
case '{':
return TokenVal.make(Token.lbrace)
case '|':
if (cur === '|') { consume; return TokenVal.make(Token.doublePipe) }
if (cur === '=') { consume; return TokenVal.make(Token.assignPipe) }
return TokenVal.make(Token.pipe)
case '}':
return TokenVal.make(Token.rbrace)
case '~':
return TokenVal.make(Token.tilde)
}
if (c === 0)
return TokenVal.make(Token.eof)
throw err("Unexpected symbol: " + c.toChar + " (0x" + c.toHex + ")")
}
//////////////////////////////////////////////////////////////////////////
// Utils
//////////////////////////////////////////////////////////////////////////
**
** Return a CompilerException for current location in source.
**
override CompilerErr err(Str msg, Location loc := null)
{
if (loc == null) loc = Location.make(filename, line, col)
return super.err(msg, loc)
}
//////////////////////////////////////////////////////////////////////////
// Consume
//////////////////////////////////////////////////////////////////////////
**
** Consume the cur char and advance to next char in buffer:
** - updates cur and peek fields
** - updates the line and col count
** - at end of file, sets cur and peek to 0
**
private Void consume()
{
// if cur is a line break, then advance line number,
// because the char we are getting ready to make cur
// is the first char on the next line
if (cur === '\n')
{
line++
col = 1
}
else
{
col++
}
// get the next character from the buffer, any
// problems mean that we have read past the end
cur = peek
pos++
if (pos+1 < buf.size)
peek = buf[pos+1] // next peek is cur+1
else
peek = 0
}
//////////////////////////////////////////////////////////////////////////
// Test
//////////////////////////////////////////////////////////////////////////
static Void main()
{
t1 := Duration.now
files := File.make(`/dev/fan/src/testSys/fan/`).list
files.each |File f|
{
tok := Tokenizer.make(null, Location.make(f.name), f.readAllStr, false).tokenize
echo("-- " + f + " [" + tok.size + "]")
}
t2 := Duration.now
echo("Time: " + (t2-t1).toMillis)
echo("Time: " + (t2-t1))
}
//////////////////////////////////////////////////////////////////////////
// Fields
//////////////////////////////////////////////////////////////////////////
private Str buf            // source buffer
private Int pos            // index into buf for cur
private Bool isDoc         // if true return doc comments, otherwise skip them
private Str filename       // source file name
private Int line := 1      // current line number
private Int col := 1       // current column number
private Int cur            // current char
private Int peek           // next char
private Int lastLine       // line number of last token returned from next()
private TokenVal[] tokens  // token accumulator
private Bool inStrLiteral  // true when inside a string literal token
}