1 //
2 // Copyright (c) 2006, Brian Frank and Andy Frank
3 // Licensed under the Academic Free License version 3.0
4 //
5 // History:
6 // 3 Sep 05 Brian Frank Creation
7 // 18 May 06 Brian Frank Ported from Java to Fan
8 //
9
10 **
11 ** Tokenizer inputs a Str and output a list of Tokens
12 **
13 class Tokenizer : CompilerSupport
14 {
15
16 //////////////////////////////////////////////////////////////////////////
17 // Constructor
18 //////////////////////////////////////////////////////////////////////////
19
  **
  ** Construct with characters of source file.  The buffer
  ** passed must be normalized in that all newlines must be
  ** represented strictly as \n and not \r or \r\n (see
  ** File.readAllStr).  If isDoc is false, we skip all star-star
  ** Fandoc comments.
  **
  new make(Compiler compiler, Location location, Str buf, Bool isDoc)
    : super(compiler)
  {
    this.buf = buf
    this.filename = location.file
    this.isDoc = isDoc
    this.tokens = TokenVal[,]
    this.inStrLiteral = false

    // initialize cur and peek from the first two chars of the
    // buffer; space is a harmless placeholder for missing chars
    // since find() skips whitespace
    cur = peek = ' '
    if (buf.size > 0) cur = buf[0]
    if (buf.size > 1) peek = buf[1]
    pos = 0

    // if first line starts with #, then treat it like an end of
    // line, so that Unix guys can specify the executable to run
    // (shebang line); skip through the newline or end of file
    if (cur === '#')
    {
      while (true)
      {
        if (cur === '\n') { consume; break }
        if (cur === 0) break
        consume
      }
    }
  }
54
55 //////////////////////////////////////////////////////////////////////////
56 // Access
57 //////////////////////////////////////////////////////////////////////////
58
59 **
60 ** Tokenize the entire input into a list of tokens.
61 **
62 TokenVal[] tokenize()
63 {
64 while (true)
65 {
66 tok := next
67 tokens.add(tok)
68 if (tok.kind === Token.eof) break
69 }
70 return tokens
71 }
72
  **
  ** Return the next token in the buffer.  Loops because find()
  ** returns null for whitespace and comments (and for
  ** interpolated strings, which push their tokens directly
  ** onto the accumulator).
  **
  private TokenVal next()
  {
    while (true)
    {
      // save current line/col so the token is stamped with
      // its starting position, not where find() stopped
      line := this.line
      col := this.col

      // find next token
      TokenVal tok := find
      if (tok == null) continue

      // fill in token's location
      tok.file = filename
      tok.line = line
      tok.col = col
      tok.newline = lastLine < line

      // save last line
      lastLine = line

      return tok
    }
    return null // TODO - shouldn't need this (unreachable after while(true))
  }
101
  **
  ** Find the next token or return null when the consumed
  ** characters produce no token (whitespace, skipped comments,
  ** interpolated strings).  Dispatches on cur/peek; the order
  ** of checks matters (e.g. raw string before identifier).
  **
  private TokenVal find()
  {
    // skip whitespace
    if (cur.isSpace) { consume; return null }

    // raw string literal r"c:\dir\foo.txt"; suppressed inside a
    // Str literal so an interpolated identifier 'r' followed by
    // the closing quote is not misread as a raw string
    if (cur === 'r' && peek === '"' && !inStrLiteral) return rawStr

    // alpha means keyword or identifier
    if (cur.isAlpha || cur === '_') return word

    // number or .number (note that + and - are handled as unary operator)
    if (cur.isDigit) return number
    if (cur === '.' && peek.isDigit) return number

    // str literal
    if (cur === '"') return str
    if (cur === '`') return uri
    if (cur === '\'') return ch

    // comments
    if (cur === '*' && peek === '*') return docComment
    if (cur === '/' && peek === '/') return skipCommentSL
    if (cur === '/' && peek === '*') return skipCommentML

    // symbols
    return symbol
  }
133
134 //////////////////////////////////////////////////////////////////////////
135 // Word
136 //////////////////////////////////////////////////////////////////////////
137
138 **
139 ** Parse a word token: alpha (alpha|number)*
140 ** Words are either keywords or identifiers
141 **
142 private TokenVal word()
143 {
144 // store starting position of word
145 start := pos
146
147 // find end of word to compute length
148 while (cur.isAlphaNum || cur === '_') consume
149
150 // create Str (gc note this string might now reference buf)
151 word := buf[start...pos]
152
153 // check keywords
154 keyword := Token.keywords[word]
155 if (keyword != null)
156 return TokenVal.make(keyword)
157
158 // otherwise this is a normal identifier
159 return TokenVal.make(Token.identifier, word)
160 }
161
162 //////////////////////////////////////////////////////////////////////////
163 // Number
164 //////////////////////////////////////////////////////////////////////////
165
166 **
167 ** Parse a number literal token.
168 **
169 private TokenVal number()
170 {
171 Bool neg := false
172 Int whole := 0
173 Int wholeCount := 0
174 Float fraction := 0.0
175 Int exp := 0
176 Token tok := Token.intLiteral // int or float literal
177
178 // check for hex value
179 if (cur === '0' && peek === 'x')
180 return hex
181
182 // read whole part
183 while (cur.isDigit)
184 {
185 whole = whole*10 + cur.fromDigit
186 consume
187 // it would be nice to actually trap max long value, but
188 // as a simpler catch all we know decimal numbers must
189 // never be bigger than 19 digits
190 wholeCount++
191 if (wholeCount > 19) throw err("Numeric literal too big")
192 if (cur === '_') consume
193 }
194
195 // if dot then read fraction
196 if (cur === '.' && peek.isDigit)
197 {
198 consume
199 for (Float m := 0.1; cur.isDigit; m /= 10.0)
200 {
201 fraction = fraction + cur.fromDigit.toFloat*m
202 consume
203 if (cur === '_') consume
204 }
205 tok = Token.floatLiteral
206 }
207
208 // check for exponent
209 if (cur === 'e' || cur === 'E')
210 {
211 consume
212 Bool negExp := false
213 if (cur === '-') { consume; negExp = true }
214 else if (cur === '+') { consume }
215 if (!cur.isDigit) throw err("Expected exponent digits")
216 while (cur.isDigit)
217 {
218 exp = exp*10 + cur.fromDigit
219 consume
220 if (cur === '_') consume
221 }
222 if (negExp) exp = -exp
223 tok = Token.floatLiteral
224 }
225
226 // check for F suffix
227 Int dur := null
228 if (cur === 'f' || cur === 'F')
229 {
230 consume
231 tok = Token.floatLiteral
232 }
233 // check if a duration
234 else
235 {
236 if (cur === 'n' && peek === 's') { consume; consume; dur = 1 }
237 if (cur === 'm' && peek === 's') { consume; consume; dur = 1000000 }
238 if (cur === 's' && peek === 'e') { consume; consume; if (cur !== 'c') throw err("Expected 'sec' in Duration literal"); consume; dur = 1_000_000_000 }
239 if (cur === 'm' && peek === 'i') { consume; consume; if (cur !== 'n') throw err("Expected 'min' in Duration literal"); consume; dur = 60_000_000_000 }
240 if (cur === 'h' && peek === 'r') { consume; consume; dur = 3_600_000_000_000 }
241 if (cur === 'd' && peek === 'a') { consume; consume; if (cur !== 'y') throw err("Expected 'day' in Duration literal"); consume; dur = 86_400_000_000_000 }
242 }
243
244 // int literal
245 if (tok === Token.intLiteral)
246 {
247 if (fraction !== 0.0 || exp !== 0) throw err("Int literals may not have fractional or exponent components")
248 Int val := whole
249 if (neg) val = -val
250 if (dur != null)
251 return TokenVal.make(Token.durationLiteral, Duration.make(val*dur))
252 else
253 return TokenVal.make(tok, val)
254 }
255
256 // float literal
257 if (tok === Token.floatLiteral)
258 {
259 Float val := whole.toFloat + fraction
260 if (exp !== 0) val = val * 10f.pow(exp.toFloat)
261 if (neg) val = -val
262 if (dur != null)
263 return TokenVal.make(Token.durationLiteral, Duration.make((val*dur.toFloat).toInt))
264 else
265 return TokenVal.make(tok, val)
266 }
267
268 throw err("Internal error")
269 }
270
  **
  ** Process hex int literal starting with 0x.  Underscores may
  ** be used as digit separators after the first digit.
  **
  TokenVal hex()
  {
    consume // 0
    consume // x

    // read first hex digit (at least one is required)
    Int val := cur.fromDigit(16)
    if (val == null) throw err("Expecting hex number")
    consume
    Int nibCount := 1
    while (true)
    {
      Int nib := cur.fromDigit(16)
      if (nib == null)
      {
        // underscores are separators; anything else ends the literal
        if (cur === '_') { consume; continue }
        break
      }
      nibCount++
      // 16 nibbles is the most that fit in a 64-bit literal
      if (nibCount > 16) throw err("Hex literal too big")
      val = (val << 4) + nib;
      consume
    }

    return TokenVal.make(Token.intLiteral, val)
  }
300
301 //////////////////////////////////////////////////////////////////////////
302 // String
303 //////////////////////////////////////////////////////////////////////////
304
305 **
306 ** Parse a raw string literal token.
307 **
308 private TokenVal rawStr()
309 {
310 // consume opening 'r' and quote
311 consume
312 consume
313
314 // string contents
315 s := StrBuf.make
316 while (cur !== '"')
317 {
318 if (cur <= 0) throw err("Unexpected end of string literal")
319 s.addChar(cur)
320 consume
321 }
322
323 // close quote
324 consume
325
326 return TokenVal.make(Token.strLiteral, s.toStr)
327 }
328
  **
  ** Parse a string literal token.  A simple literal yields one
  ** strLiteral token.  An interpolated string returns null and
  ** instead pushes the equivalent token stream directly onto
  ** the accumulator:
  **   "a ${b} c"  ->  ("a " + (b) + " c")
  **
  private TokenVal str()
  {
    inStrLiteral = true
    try
    {
      // consume opening quote
      consume

      // buffer accumulates literal text between interpolations
      s := StrBuf.make

      // loop until we find end of string
      interpolated := false
      while (true)
      {
        if (cur === '"') { consume; break }
        if (cur === 0) throw err("Unexpected end of string")
        if (cur === '$')
        {
          // if we have detected an interpolated string, then
          // insert opening paren to treat whole string atomically
          if (!interpolated)
          {
            interpolated = true
            tokens.add(makeVirtualToken(Token.lparen))
          }

          // process interpolated string, it returns false
          // if at end of string literal
          if (!strInterpolation(s.toStr))
          {
            tokens.add(makeVirtualToken(Token.rparen))
            return null
          }

          s.clear
        }
        else if (cur === '\\')
        {
          s.add(escape.toChar)
        }
        else
        {
          s.addChar(cur)
          consume
        }
      }

      // if interpolated then we add rparen to treat whole atomically
      if (interpolated)
      {
        tokens.add(makeVirtualToken(Token.strLiteral, s.toStr))
        tokens.add(makeVirtualToken(Token.rparen))
        return null
      }
      else
      {
        return TokenVal.make(Token.strLiteral, s.toStr)
      }
    }
    finally
    {
      inStrLiteral = false
    }
  }
397
  **
  ** When we hit a $ inside a string it indicates an embedded
  ** expression.  We make this look like a stream of tokens
  ** such that:
  **   "a ${b} c" -> "a " + b + " c"
  ** The Str passed is the literal text accumulated so far,
  ** which is flushed as a strLiteral token followed by plus.
  ** Return true if more remains in the string literal.
  **
  private Bool strInterpolation(Str s)
  {
    consume // $
    tokens.add(makeVirtualToken(Token.strLiteral, s))
    tokens.add(makeVirtualToken(Token.plus))

    // if { we allow an expression b/w {...}
    if (cur === '{')
    {
      // wrap the embedded expression in virtual parens so it
      // binds tighter than the surrounding + concatenation
      tokens.add(makeVirtualToken(Token.lparen))
      consume
      while (true)
      {
        if (cur === '"') throw err("Unexpected end of string, missing }")
        tok := next
        if (tok.kind == Token.rbrace) break
        tokens.add(tok)
      }
      tokens.add(makeVirtualToken(Token.rparen))
    }

    // else also allow a single identifier with
    // dotted accessors x, x.y, x.y.z
    else
    {
      tok := next
      if (tok.kind != Token.identifier) throw err("Expected identifier after \$")
      tokens.add(tok)
      while (true)
      {
        if (cur !== '.') break
        tokens.add(next) // dot
        tok = next
        if (tok.kind !== Token.identifier) throw err("Expected identifier")
        tokens.add(tok)
      }
    }

    // if at end of string, all done
    if (cur === '\"')
    {
      consume
      return false
    }

    // add plus and return true to keep chugging
    tokens.add(makeVirtualToken(Token.plus))
    return true
  }
454
455 **
456 ** Create a virtual token for string interpolation.
457 **
458 private TokenVal makeVirtualToken(Token kind, Obj value := null)
459 {
460 tok := TokenVal.make(kind, value)
461 tok.file = filename
462 tok.line = line
463 tok.col = col
464 return tok
465 }
466
467 //////////////////////////////////////////////////////////////////////////
468 // Uri
469 //////////////////////////////////////////////////////////////////////////
470
471 **
472 ** Parse a uri literal token.
473 **
474 private TokenVal uri()
475 {
476 // consume opening backtick
477 consume
478
479 // store starting position
480 s := StrBuf.make
481
482 // loop until we find end of string
483 while (true)
484 {
485 ch := cur
486 if (ch === '`') { consume; break }
487 if (ch === 0 || ch === '\n') throw err("Unexpected end of uri")
488 if (ch === '$') throw err("Uri interpolation not supported yet")
489 if (ch === '\\') ch = escape
490 else consume
491 s.addChar(ch)
492 }
493
494 return TokenVal.make(Token.uriLiteral, s.toStr)
495 }
496
497 //////////////////////////////////////////////////////////////////////////
498 // Char
499 //////////////////////////////////////////////////////////////////////////
500
501 **
502 ** Parse a char literal token.
503 **
504 private TokenVal ch()
505 {
506 // consume opening quote
507 consume
508
509 // if \ then process as escape
510 Int c
511 if (cur === '\\')
512 {
513 c = escape
514 }
515 else
516 {
517 c = cur
518 consume
519 }
520
521 // expecting ' quote
522 if (cur !== '\'') throw err("Expecting ' close of char literal")
523 consume
524
525 return TokenVal.make(Token.intLiteral, c)
526 }
527
  **
  ** Parse an escape sequence which starts with a \ and
  ** return the resulting character code.
  **
  Int escape()
  {
    // consume slash
    if (cur !== '\\') throw err("Internal error")
    consume

    // check basics
    switch (cur)
    {
      case 'b': consume; return '\b'
      case 'f': consume; return '\f'
      case 'n': consume; return '\n'
      case 'r': consume; return '\r'
      case 't': consume; return '\t'
      case '"': consume; return '"'
      case '$': consume; return '$'
      case '\'': consume; return '\''
      case '`': consume; return '`'
      case '\\': consume; return '\\'
    }

    // check for \uxxxx with exactly four hex digits
    if (cur === 'u')
    {
      consume
      // consume all four chars first; fromDigit returns null
      // for a non-hex char, which is validated below
      n3 := cur.fromDigit(16); consume
      n2 := cur.fromDigit(16); consume
      n1 := cur.fromDigit(16); consume
      n0 := cur.fromDigit(16); consume
      if (n3 == null || n2 == null || n1 == null || n0 == null) throw err("Invalid hex value for \\uxxxx")
      return ((n3 << 12) | (n2 << 8) | (n1 << 4) | n0)
    }

    throw err("Invalid escape sequence")
  }
566
567 //////////////////////////////////////////////////////////////////////////
568 // Comments
569 //////////////////////////////////////////////////////////////////////////
570
571 **
572 ** Skip a single line // comment
573 **
private TokenVal skipCommentSL()
575 {
576 consume // first slash
577 consume // next slash
578 while (true)
579 {
580 if (cur === '\n') { consume; break }
581 if (cur === 0) break
582 consume
583 }
584 return null
585 }
586
  **
  ** Skip a multi line /* comment.  Note unlike C/Java,
  ** slash/star comments can be nested.  Always returns null
  ** since comments produce no token.
  **
  private TokenVal skipCommentML()
  {
    consume // opening slash
    consume // opening star
    depth := 1
    while (true)
    {
      // track nesting depth so nested /* */ pairs work
      if (cur === '*' && peek === '/') { consume; consume; depth--; if (depth <= 0) break }
      if (cur === '/' && peek === '*') { consume; consume; depth++; continue }
      if (cur === 0) break // unterminated comment ends silently at eof
      consume
    }
    return null
  }
605
  **
  ** Parse a Javadoc style ** comment into a docComment token
  ** whose value is the list of comment lines with the leading
  ** star-star prefix stripped.
  **
  private TokenVal docComment()
  {
    // if doc is off, then just skip the line and be done
    if (!isDoc) { skipCommentSL; return null }

    // strip the leading stars and one optional space
    while (cur === '*') consume
    if (cur === ' ') consume

    // parse comment
    lines := Str[,]
    s := StrBuf.make
    while (cur > 0)
    {
      // add to buffer and advance
      c := cur
      consume

      // if not at newline, then loop
      if (c !== '\n')
      {
        s.addChar(c)
        continue
      }

      // add line and reset buffer (but don't add leading empty lines)
      line := s.toStr
      if (!lines.isEmpty || !line.trim.isEmpty) lines.add(line)
      s.clear

      // we are at a newline, check for leading whitespace(0+)/star(2+)/whitespace(1);
      // a line without the ** prefix ends the doc comment
      while (cur === ' ' || cur === '\t') consume
      if (cur !== '*' || peek !== '*') break
      while (cur === '*') consume
      if (cur === ' ' || cur === '\t') consume
    }
    lines.add(s.toStr)

    // strip trailing empty lines
    while (!lines.isEmpty)
      if (lines.last.trim.isEmpty) lines.removeAt(-1)
      else break

    return TokenVal.make(Token.docComment, lines)
  }
653
654 //////////////////////////////////////////////////////////////////////////
655 // Symbol
656 //////////////////////////////////////////////////////////////////////////
657
  **
  ** Parse a symbol token (typically into an operator).  The
  ** first char is consumed, then cur is used to greedily match
  ** multi-char operators.  A zero char maps to the eof token.
  **
  private TokenVal symbol()
  {
    c := cur
    consume
    switch (c)
    {
      case '\r':
        // buffer must be normalized to \n only (see make)
        throw err("Carriage return \\r not allowed in source")
      case '!':
        // ! != !==
        if (cur === '=')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.notSame) }
          return TokenVal.make(Token.notEq)
        }
        return TokenVal.make(Token.bang)
      case '#':
        return TokenVal.make(Token.pound)
      case '%':
        if (cur === '=') { consume; return TokenVal.make(Token.assignPercent) }
        return TokenVal.make(Token.percent)
      case '&':
        if (cur === '=') { consume; return TokenVal.make(Token.assignAmp) }
        if (cur === '&') { consume; return TokenVal.make(Token.doubleAmp) }
        return TokenVal.make(Token.amp)
      case '(':
        return TokenVal.make(Token.lparen)
      case ')':
        return TokenVal.make(Token.rparen)
      case '*':
        if (cur === '=') { consume; return TokenVal.make(Token.assignStar) }
        return TokenVal.make(Token.star)
      case '+':
        if (cur === '=') { consume; return TokenVal.make(Token.assignPlus) }
        if (cur === '+') { consume; return TokenVal.make(Token.increment) }
        return TokenVal.make(Token.plus)
      case ',':
        return TokenVal.make(Token.comma)
      case '-':
        if (cur === '>') { consume; return TokenVal.make(Token.arrow) }
        if (cur === '-') { consume; return TokenVal.make(Token.decrement) }
        if (cur === '=') { consume; return TokenVal.make(Token.assignMinus) }
        return TokenVal.make(Token.minus)
      case '.':
        // . .. ...  (note .digit is routed to number by find)
        if (cur === '.')
        {
          consume
          if (cur === '.') { consume; return TokenVal.make(Token.dotDotDot) }
          return TokenVal.make(Token.dotDot)
        }
        return TokenVal.make(Token.dot)
      case '/':
        // comments are routed to skipComment* by find
        if (cur === '=') { consume; return TokenVal.make(Token.assignSlash) }
        return TokenVal.make(Token.slash)
      case ':':
        if (cur === ':') { consume; return TokenVal.make(Token.doubleColon) }
        if (cur === '=') { consume; return TokenVal.make(Token.defAssign) }
        return TokenVal.make(Token.colon)
      case ';':
        return TokenVal.make(Token.semicolon)
      case '<':
        // < <= <=> << <<=
        if (cur === '=')
        {
          consume
          if (cur === '>') { consume; return TokenVal.make(Token.cmp) }
          return TokenVal.make(Token.ltEq)
        }
        if (cur === '<')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.assignLshift) }
          return TokenVal.make(Token.lshift)
        }
        return TokenVal.make(Token.lt)
      case '=':
        // = == ===
        if (cur === '=')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.same) }
          return TokenVal.make(Token.eq)
        }
        return TokenVal.make(Token.assign)
      case '>':
        // > >= >> >>=
        if (cur === '=') { consume; return TokenVal.make(Token.gtEq) }
        if (cur === '>')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.assignRshift) }
          return TokenVal.make(Token.rshift)
        }
        return TokenVal.make(Token.gt)
      case '?':
        return TokenVal.make(Token.question)
      case '@':
        return TokenVal.make(Token.at)
      case '[':
        return TokenVal.make(Token.lbracket)
      case ']':
        return TokenVal.make(Token.rbracket)
      case '^':
        if (cur === '=') { consume; return TokenVal.make(Token.assignCaret) }
        return TokenVal.make(Token.caret)
      case '{':
        return TokenVal.make(Token.lbrace)
      case '|':
        if (cur === '|') { consume; return TokenVal.make(Token.doublePipe) }
        if (cur === '=') { consume; return TokenVal.make(Token.assignPipe) }
        return TokenVal.make(Token.pipe)
      case '}':
        return TokenVal.make(Token.rbrace)
      case '~':
        return TokenVal.make(Token.tilde)
    }

    // zero signals end of file (see consume)
    if (c === 0)
      return TokenVal.make(Token.eof)

    throw err("Unexpected symbol: " + c.toChar + " (0x" + c.toHex + ")")
  }
780
781 //////////////////////////////////////////////////////////////////////////
782 // Utils
783 //////////////////////////////////////////////////////////////////////////
784
785 **
786 ** Return a CompilerException for current location in source.
787 **
788 override CompilerErr err(Str msg, Location loc := null)
789 {
790 if (loc == null) loc = Location.make(filename, line, col)
791 return super.err(msg, loc);
792 }
793
794 ////////////////////////////////////////////////////////////////
795 // Consume
796 ////////////////////////////////////////////////////////////////
797
  **
  ** Consume the cur char and advance to next char in buffer:
  **   - updates cur and peek fields
  **   - updates the line and col count
  **   - end of file, sets fields to 0
  **
  private Void consume()
  {
    // if cur is a line break, then advance line number,
    // because the char we are getting ready to make cur
    // is the first char on the next line
    if (cur === '\n')
    {
      line++
      col = 1
    }
    else
    {
      col++
    }

    // get the next character from the buffer, any
    // problems mean that we have read past the end
    cur = peek
    pos++
    if (pos+1 < buf.size)
      peek = buf[pos+1] // next peek is cur+1
    else
      peek = 0 // zero signals end of file to the tokenize methods
  }
828
829 //////////////////////////////////////////////////////////////////////////
830 // Test
831 //////////////////////////////////////////////////////////////////////////
832
  **
  ** Ad hoc test harness: tokenize every file under a hard-coded
  ** development directory and echo token counts plus elapsed time.
  ** NOTE(review): the path /dev/fan/src/sysTest/fan/ is machine
  ** specific and a null Compiler is passed to make - confirm this
  ** is dev-only code before relying on it.
  **
  static Void main()
  {
    t1 := Duration.now
    files := File.make(`/dev/fan/src/sysTest/fan/`).list
    files.each |File f|
    {
      tok := Tokenizer.make(null, Location.make(f.name), f.readAllStr, false).tokenize
      echo("-- " + f + " [" + tok.size + "]")
    }
    t2 := Duration.now
    echo("Time: " + (t2-t1).toMillis)
    echo("Time: " + (t2-t1))
  }
846
847 //////////////////////////////////////////////////////////////////////////
848 // Fields
849 //////////////////////////////////////////////////////////////////////////
850
  private Str buf            // normalized source character buffer (\n newlines only)
  private Int pos            // index into buf for cur
  private Bool isDoc         // if false, star-star Fandoc comments are skipped
  private Str filename       // source file name used for token locations
  private Int line := 1      // line number of cur (1 based)
  private Int col := 1       // column number of cur (1 based)
  private Int cur            // current char, or 0 once past end of buf
  private Int peek           // next char, or 0 once past end of buf
  private Int lastLine       // line number of last token returned from next()
  private TokenVal[] tokens  // token accumulator
  private Bool inStrLiteral  // are we currently tokenizing inside a Str literal
862
863
864 }