1 //
2 // Copyright (c) 2006, Brian Frank and Andy Frank
3 // Licensed under the Academic Free License version 3.0
4 //
5 // History:
6 // 3 Sep 05 Brian Frank Creation
7 // 18 May 06 Brian Frank Ported from Java to Fan
8 //
9
10 **
11 ** Tokenizer inputs a Str and output a list of Tokens
12 **
13 class Tokenizer : CompilerSupport
14 {
15
16 //////////////////////////////////////////////////////////////////////////
17 // Constructor
18 //////////////////////////////////////////////////////////////////////////
19
  **
  ** Construct with characters of source file.  The buffer
  ** passed must be normalized in that all newlines must be
  ** represented strictly as \n and not \r or \r\n (see
  ** File.readAllStr).  If isDoc is false, we skip all star-star
  ** Fandoc comments.
  **
  new make(Compiler compiler, Location location, Str buf, Bool isDoc)
    : super(compiler)
  {
    this.buf = buf
    this.filename = location.file
    this.isDoc = isDoc
    this.tokens = TokenVal[,]

    // initialize cur and peek from the first two chars;
    // default to space when the buffer is shorter than that
    cur = peek = ' '
    if (buf.size > 0) cur = buf[0]
    if (buf.size > 1) peek = buf[1]
    pos = 0

    // if first line starts with #, then treat it like an end of
    // line, so that Unix guys can specify the executable to run
    // (shebang line); skip everything through the newline or eof
    if (cur === '#')
    {
      while (true)
      {
        if (cur === '\n') { consume; break }
        if (cur === 0) break
        consume
      }
    }
  }
53
54 //////////////////////////////////////////////////////////////////////////
55 // Access
56 //////////////////////////////////////////////////////////////////////////
57
58 **
59 ** Tokenize the entire input into a list of tokens.
60 **
61 TokenVal[] tokenize()
62 {
63 while (true)
64 {
65 tok := next
66 tokens.add(tok)
67 if (tok.kind === Token.eof) break
68 }
69 return tokens
70 }
71
  **
  ** Return the next token in the buffer.  Loops until 'find'
  ** produces a token, since 'find' returns null when it skips
  ** whitespace or comments.
  **
  private TokenVal next()
  {
    while (true)
    {
      // save current line/col before any chars are consumed, so
      // the token is stamped with its *starting* position
      line := this.line
      col := this.col

      // find next token; null means chars were skipped
      TokenVal tok := find
      if (tok == null) continue

      // fill in token's location
      tok.file = filename
      tok.line = line
      tok.col = col
      tok.newline = lastLine < line   // true if first token on its line

      // save last line
      lastLine = line;

      return tok
    }
    return null // unreachable after while(true); TODO - shouldn't need this
  }
100
  **
  ** Find the next token or return null to indicate the current
  ** chars were skipped (whitespace or a comment).  Ordering of
  ** these checks matters: '.digit' must be caught before the dot
  ** operator, and '**'/'//'/'/*' before the '*' and '/' symbols.
  **
  private TokenVal find()
  {
    // skip whitespace
    if (cur.isSpace) { consume; return null }

    // alpha means keyword or identifier
    if (cur.isAlpha || cur === '_') return word

    // number or .number (note that + and - are handled as unary operator)
    if (cur.isDigit) return number
    if (cur === '.' && peek.isDigit) return number

    // str literal
    if (cur === '"') return str
    if (cur === '`') return uri
    if (cur === '\'') return ch

    // comments
    if (cur === '*' && peek === '*') return docComment
    if (cur === '/' && peek === '/') return skipCommentSL
    if (cur === '/' && peek === '*') return skipCommentML

    // symbols
    return symbol
  }
129
130 //////////////////////////////////////////////////////////////////////////
131 // Word
132 //////////////////////////////////////////////////////////////////////////
133
134 **
135 ** Parse a word token: alpha (alpha|number)*
136 ** Words are either keywords or identifiers
137 **
138 private TokenVal word()
139 {
140 // store starting position of word
141 start := pos
142
143 // find end of word to compute length
144 while (cur.isAlphaNum || cur === '_') consume
145
146 // create Str (gc note this string might now reference buf)
147 word := buf[start...pos]
148
149 // check keywords
150 keyword := Token.keywords[word]
151 if (keyword != null)
152 return TokenVal.make(keyword)
153
154 // otherwise this is a normal identifier
155 return TokenVal.make(Token.identifier, word);
156 }
157
158 //////////////////////////////////////////////////////////////////////////
159 // Number
160 //////////////////////////////////////////////////////////////////////////
161
  **
  ** Parse a number literal token: Int, Float, or Duration.
  ** Handles hex (0x...), whole/fraction/exponent parts, the
  ** 'f'/'F' float suffix, and duration suffixes (ns, ms, sec,
  ** min, hr, day).  Underbars may separate digit groups.
  **
  private TokenVal number()
  {
    Bool neg := false             // NOTE(review): never set true; unary minus is handled as an operator
    Int whole := 0                // whole part accumulated as Int
    Int wholeCount := 0           // digit count used as overflow guard
    Float fraction := 0.0         // fractional part
    Int exp := 0                  // exponent (negated below if needed)
    Token tok := Token.intLiteral // int or float literal

    // check for hex value
    if (cur === '0' && peek === 'x')
      return hex

    // read whole part
    while (cur.isDigit)
    {
      whole = whole*10 + cur.fromDigit
      consume
      // it would be nice to actually trap max long value, but
      // as a simpler catch all we know decimal numbers must
      // never be bigger than 19 digits
      wholeCount++
      if (wholeCount > 19) throw err("Numeric literal too big")
      if (cur === '_') consume
    }

    // if dot then read fraction
    if (cur === '.' && peek.isDigit)
    {
      consume
      // m is the place value of the next fractional digit
      for (Float m := 0.1; cur.isDigit; m /= 10.0)
      {
        fraction = fraction + cur.fromDigit.toFloat*m
        consume
        if (cur === '_') consume
      }
      tok = Token.floatLiteral
    }

    // check for exponent
    if (cur === 'e' || cur === 'E')
    {
      consume
      Bool negExp := false
      if (cur === '-') { consume; negExp = true }
      else if (cur === '+') { consume }
      if (!cur.isDigit) throw err("Expected exponent digits")
      while (cur.isDigit)
      {
        exp = exp*10 + cur.fromDigit
        consume
        if (cur === '_') consume
      }
      if (negExp) exp = -exp
      tok = Token.floatLiteral
    }

    // check for F suffix
    Int dur := null   // duration multiplier in ns, or null if not a duration
    if (cur === 'f' || cur === 'F')
    {
      consume
      tok = Token.floatLiteral
    }
    // check if a duration suffix follows; note these checks run
    // sequentially (not else-if) and use two-char lookahead to
    // disambiguate ms vs min, etc.
    else
    {
      if (cur === 'n' && peek === 's') { consume; consume; dur = 1 }
      if (cur === 'm' && peek === 's') { consume; consume; dur = 1000000 }
      if (cur === 's' && peek === 'e') { consume; consume; if (cur !== 'c') throw err("Expected 'sec' in Duration literal"); consume; dur = 1_000_000_000 }
      if (cur === 'm' && peek === 'i') { consume; consume; if (cur !== 'n') throw err("Expected 'min' in Duration literal"); consume; dur = 60_000_000_000 }
      if (cur === 'h' && peek === 'r') { consume; consume; dur = 3_600_000_000_000 }
      if (cur === 'd' && peek === 'a') { consume; consume; if (cur !== 'y') throw err("Expected 'day' in Duration literal"); consume; dur = 86_400_000_000_000 }
    }

    // int literal
    if (tok === Token.intLiteral)
    {
      if (fraction !== 0.0 || exp !== 0) throw err("Int literals may not have fractional or exponent components")
      Int val := whole
      if (neg) val = -val
      if (dur != null)
        return TokenVal.make(Token.durationLiteral, Duration.make(val*dur))
      else
        return TokenVal.make(tok, val)
    }

    // float literal
    if (tok === Token.floatLiteral)
    {
      Float val := whole.toFloat + fraction
      if (exp !== 0) val = val * 10f.pow(exp.toFloat)
      if (neg) val = -val
      if (dur != null)
        return TokenVal.make(Token.durationLiteral, Duration.make((val*dur.toFloat).toInt))
      else
        return TokenVal.make(tok, val)
    }

    throw err("Internal error")
  }
266
  **
  ** Process hex int literal starting with 0x.  Accepts up to
  ** 16 hex digits (64 bits) with optional underbar separators.
  ** NOTE(review): not declared private like the other scanner
  ** methods — confirm whether any external caller relies on it.
  **
  TokenVal hex()
  {
    consume // 0
    consume // x

    // read first hex digit; fromDigit(16) yields null for non-hex
    Int val := cur.fromDigit(16)
    if (val == null) throw err("Expecting hex number")
    consume
    Int nibCount := 1
    while (true)
    {
      Int nib := cur.fromDigit(16)
      if (nib == null)
      {
        // underbars may separate digit groups; anything else ends the literal
        if (cur === '_') { consume; continue }
        break
      }
      nibCount++
      if (nibCount > 16) throw err("Hex literal too big")
      val = (val << 4) + nib;
      consume
    }

    return TokenVal.make(Token.intLiteral, val)
  }
296
297 //////////////////////////////////////////////////////////////////////////
298 // String
299 //////////////////////////////////////////////////////////////////////////
300
  **
  ** Parse a string literal token.  A plain string returns one
  ** strLiteral token.  An interpolated string ("$x" or "${expr}")
  ** returns null after pushing an equivalent token stream
  ** ( "lit" + expr + "lit" ) directly onto the tokens list.
  **
  private TokenVal str()
  {
    // consume opening quote
    consume

    // accumulates chars of the current literal segment
    s := StrBuf.make

    // loop until we find end of string
    interpolated := false
    while (true)
    {
      if (cur === '"') { consume; break }
      if (cur === 0) throw err("Unexpected end of string")
      if (cur === '$')
      {
        // if we have detected an interpolated string, then
        // insert opening paren to treat whole string atomically
        if (!interpolated)
        {
          interpolated = true
          tokens.add(makeVirtualToken(Token.lparen))
        }

        // process interpolated string; it returns false when the
        // closing quote of the string literal has been consumed
        if (!strInterpolation(s.toStr))
        {
          tokens.add(makeVirtualToken(Token.rparen))
          return null
        }

        // start a fresh literal segment after the embedded expression
        s.clear
      }
      else if (cur === '\\')
      {
        s.add(escape.toChar)
      }
      else
      {
        s.addChar(cur)
        consume
      }
    }

    // if interpolated then we add rparen to treat whole atomically
    if (interpolated)
    {
      tokens.add(makeVirtualToken(Token.strLiteral, s.toStr))
      tokens.add(makeVirtualToken(Token.rparen));
      return null;
    }
    else
    {
      return TokenVal.make(Token.strLiteral, s.toStr)
    }
  }
361
  **
  ** When we hit a $ inside a string it indicates an embedded
  ** expression.  We make this look like a stream of tokens
  ** such that:
  **   "a ${b} c" -> "a " + b + " c"
  ** Return true if more remains in the string literal, false
  ** if the closing quote has been consumed.
  **
  private Bool strInterpolation(Str s)
  {
    consume // $

    // emit the literal segment seen so far, then a plus to
    // concatenate it with the embedded expression
    tokens.add(makeVirtualToken(Token.strLiteral, s))
    tokens.add(makeVirtualToken(Token.plus))

    // if { we allow an expression b/w {...}; wrapped in parens so
    // the expression's precedence can't leak into the concatenation
    if (cur === '{')
    {
      tokens.add(makeVirtualToken(Token.lparen));
      consume
      while (true)
      {
        if (cur === '"') throw err("Unexpected end of string, missing }");
        tok := next
        if (tok.kind == Token.rbrace) break
        tokens.add(tok)
      }
      tokens.add(makeVirtualToken(Token.rparen))
    }

    // else also allow a single identifier with
    // dotted accessors x, x.y, x.y.z
    else
    {
      tok := next
      if (tok.kind != Token.identifier) throw err("Expected identifier after \$");
      tokens.add(tok)
      while (true)
      {
        if (cur !== '.') break
        tokens.add(next) // dot
        tok = next
        if (tok.kind !== Token.identifier) throw err("Expected identifier");
        tokens.add(tok)
      }
    }

    // if at end of string, all done
    if (cur === '\"')
    {
      consume
      return false
    }

    // add plus and return true to keep chugging
    tokens.add(makeVirtualToken(Token.plus))
    return true
  }
418
419 **
420 ** Create a virtual token for string interpolation.
421 **
422 private TokenVal makeVirtualToken(Token kind, Obj value := null)
423 {
424 tok := TokenVal.make(kind, value);
425 tok.file = filename
426 tok.line = line
427 tok.col = col
428 return tok
429 }
430
431 //////////////////////////////////////////////////////////////////////////
432 // Uri
433 //////////////////////////////////////////////////////////////////////////
434
435 **
436 ** Parse a uri literal token.
437 **
438 private TokenVal uri()
439 {
440 // consume opening backtick
441 consume
442
443 // store starting position
444 s := StrBuf.make
445
446 // loop until we find end of string
447 while (true)
448 {
449 ch := cur
450 if (ch === '`') { consume; break; }
451 if (ch === 0 || ch === '\n') throw err("Unexpected end of uri")
452 if (ch === '$') throw err("Uri interpolation not supported yet")
453 if (ch === '\\') ch = escape
454 else consume
455 s.addChar(ch)
456 }
457
458 return TokenVal.make(Token.uriLiteral, s.toStr)
459 }
460
461 //////////////////////////////////////////////////////////////////////////
462 // Char
463 //////////////////////////////////////////////////////////////////////////
464
465 **
466 ** Parse a char literal token.
467 **
468 private TokenVal ch()
469 {
470 // consume opening quote
471 consume
472
473 // if \ then process as escape
474 Int c
475 if (cur === '\\')
476 {
477 c = escape
478 }
479 else
480 {
481 c = cur
482 consume
483 }
484
485 // expecting ' quote
486 if (cur !== '\'') throw err("Expecting ' close of char literal")
487 consume
488
489 return TokenVal.make(Token.intLiteral, c)
490 }
491
  **
  ** Parse an escape sequence which starts with a \ and return
  ** the resulting char code.  Used by str, uri, and ch literals.
  ** NOTE(review): not declared private like most helpers here —
  ** confirm whether external callers rely on it.
  **
  Int escape()
  {
    // consume slash
    if (cur !== '\\') throw err("Internal error");
    consume

    // check basics
    switch (cur)
    {
      case 'b': consume; return '\b'
      case 'f': consume; return '\f'
      case 'n': consume; return '\n'
      case 'r': consume; return '\r'
      case 't': consume; return '\t'
      case '"': consume; return '"'
      case '$': consume; return '$'
      case '\'': consume; return '\''
      case '`': consume; return '`'
      case '\\': consume; return '\\'
    }

    // check for \uxxxx: exactly four hex digits, most
    // significant nibble first; validity checked after all
    // four are consumed so the error covers the whole sequence
    if (cur === 'u')
    {
      consume
      n3 := cur.fromDigit(16); consume
      n2 := cur.fromDigit(16); consume
      n1 := cur.fromDigit(16); consume
      n0 := cur.fromDigit(16); consume
      if (n3 == null || n2 == null || n1 == null || n0 == null) throw err("Invalid hex value for \\uxxxx");
      return ((n3 << 12) | (n2 << 8) | (n1 << 4) | n0)
    }

    throw err("Invalid escape sequence")
  }
530
531 //////////////////////////////////////////////////////////////////////////
532 // Comments
533 //////////////////////////////////////////////////////////////////////////
534
535 **
536 ** Skip a single line // comment
537 **
538 private TokenVal skipCommentSL()
539 {
540 consume // first slash
541 consume // next slash
542 while (true)
543 {
544 if (cur === '\n') { consume; break }
545 if (cur === 0) break
546 consume
547 }
548 return null
549 }
550
  **
  ** Skip a multi line /* comment.  Note unlike C/Java,
  ** slash/star comments can be nested.  Always returns null.
  **
  private TokenVal skipCommentML()
  {
    consume // slash
    consume // star
    depth := 1   // nesting depth of open /* */ pairs
    while (true)
    {
      // closing */ pops one level; done when the outermost closes
      if (cur === '*' && peek === '/') { consume; consume; depth--; if (depth <= 0) break }
      // nested /* pushes one level
      if (cur === '/' && peek === '*') { consume; consume; depth++; continue }
      if (cur === 0) break   // unterminated comment silently ends at eof
      consume
    }
    return null
  }
569
  **
  ** Parse a star-star Fandoc comment into a docComment token
  ** whose value is the list of comment lines with leading
  ** stars and one space of padding stripped.
  **
  private TokenVal docComment()
  {
    // if doc is off, then just skip the line and be done
    // (skipCommentSL works because cur/peek are the two stars)
    if (!isDoc) { skipCommentSL; return null }

    // strip leading stars and one optional space on the first line
    while (cur === '*') consume
    if (cur === ' ') consume

    // parse comment
    lines := Str[,]
    s := StrBuf.make
    while (cur > 0)
    {
      // add to buffer and advance
      c := cur
      consume

      // if not at newline, then loop
      if (c !== '\n')
      {
        s.addChar(c)
        continue
      }

      // add line and reset buffer (but don't add leading empty lines)
      line := s.toStr
      if (!lines.isEmpty || !line.trim.isEmpty) lines.add(line)
      s.clear

      // we at a newline, check for leading whitespace(0+)/star(2+)/whitespace(1);
      // if the next line doesn't continue with ** the doc block is done
      while (cur === ' ' || cur === '\t') consume
      if (cur !== '*' || peek !== '*') break
      while (cur === '*') consume
      if (cur === ' ' || cur === '\t') consume
    }
    lines.add(s.toStr)

    // strip trailing empty lines
    while (!lines.isEmpty)
      if (lines.last.trim.isEmpty) lines.removeAt(-1)
      else break

    return TokenVal.make(Token.docComment, lines)
  }
617
618 //////////////////////////////////////////////////////////////////////////
619 // Symbol
620 //////////////////////////////////////////////////////////////////////////
621
  **
  ** Parse a symbol token (typically into an operator).  The
  ** first char is consumed up front; multi-char operators are
  ** resolved by checking cur/another consume afterwards.
  ** A 0 char produces the eof token; any unmatched char errors.
  **
  private TokenVal symbol()
  {
    c := cur
    consume
    switch (c)
    {
      case '\r':
        // source must be normalized to \n only (see make)
        throw err("Carriage return \\r not allowed in source");
      case '!':
        if (cur === '=')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.notSame) }
          return TokenVal.make(Token.notEq)
        }
        return TokenVal.make(Token.bang)
      case '#':
        return TokenVal.make(Token.pound)
      case '%':
        if (cur === '=') { consume; return TokenVal.make(Token.assignPercent) }
        return TokenVal.make(Token.percent);
      case '&':
        if (cur === '=') { consume; return TokenVal.make(Token.assignAmp) }
        if (cur === '&') { consume; return TokenVal.make(Token.doubleAmp) }
        return TokenVal.make(Token.amp)
      case '(':
        return TokenVal.make(Token.lparen)
      case ')':
        return TokenVal.make(Token.rparen)
      case '*':
        if (cur === '=') { consume; return TokenVal.make(Token.assignStar) }
        return TokenVal.make(Token.star)
      case '+':
        if (cur === '=') { consume; return TokenVal.make(Token.assignPlus) }
        if (cur === '+') { consume; return TokenVal.make(Token.increment) }
        return TokenVal.make(Token.plus)
      case ',':
        return TokenVal.make(Token.comma)
      case '-':
        if (cur === '>') { consume; return TokenVal.make(Token.arrow) }
        if (cur === '-') { consume; return TokenVal.make(Token.decrement) }
        if (cur === '=') { consume; return TokenVal.make(Token.assignMinus) }
        return TokenVal.make(Token.minus)
      case '.':
        if (cur === '.')
        {
          consume
          if (cur === '.') { consume; return TokenVal.make(Token.dotDotDot) }
          return TokenVal.make(Token.dotDot)
        }
        return TokenVal.make(Token.dot)
      case '/':
        if (cur === '=') { consume; return TokenVal.make(Token.assignSlash) }
        return TokenVal.make(Token.slash)
      case ':':
        if (cur === ':') { consume; return TokenVal.make(Token.doubleColon) }
        if (cur === '=') { consume; return TokenVal.make(Token.defAssign) }
        return TokenVal.make(Token.colon)
      case ';':
        return TokenVal.make(Token.semicolon)
      case '<':
        if (cur === '=')
        {
          consume
          // <=> is the comparison operator
          if (cur === '>') { consume; return TokenVal.make(Token.cmp) }
          return TokenVal.make(Token.ltEq)
        }
        if (cur === '<')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.assignLshift) }
          return TokenVal.make(Token.lshift)
        }
        return TokenVal.make(Token.lt)
      case '=':
        if (cur === '=')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.same) }
          return TokenVal.make(Token.eq)
        }
        return TokenVal.make(Token.assign)
      case '>':
        if (cur === '=') { consume; return TokenVal.make(Token.gtEq) }
        if (cur === '>')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.assignRshift) }
          return TokenVal.make(Token.rshift)
        }
        return TokenVal.make(Token.gt)
      case '?':
        return TokenVal.make(Token.question)
      case '@':
        return TokenVal.make(Token.at)
      case '[':
        return TokenVal.make(Token.lbracket)
      case ']':
        return TokenVal.make(Token.rbracket)
      case '^':
        if (cur === '=') { consume; return TokenVal.make(Token.assignCaret) }
        return TokenVal.make(Token.caret)
      case '{':
        return TokenVal.make(Token.lbrace)
      case '|':
        if (cur === '|') { consume; return TokenVal.make(Token.doublePipe) }
        if (cur === '=') { consume; return TokenVal.make(Token.assignPipe) }
        return TokenVal.make(Token.pipe)
      case '}':
        return TokenVal.make(Token.rbrace)
      case '~':
        return TokenVal.make(Token.tilde)
    }

    // 0 means we have consumed past the end of the buffer
    if (c === 0)
      return TokenVal.make(Token.eof)

    throw err("Unexpected symbol: " + c.toChar + " (0x" + c.toHex + ")")
  }
744
745 //////////////////////////////////////////////////////////////////////////
746 // Utils
747 //////////////////////////////////////////////////////////////////////////
748
749 **
750 ** Return a CompilerException for current location in source.
751 **
752 override CompilerErr err(Str msg, Location loc := null)
753 {
754 if (loc == null) loc = Location.make(filename, line, col);
755 return super.err(msg, loc);
756 }
757
758 ////////////////////////////////////////////////////////////////
759 // Consume
760 ////////////////////////////////////////////////////////////////
761
  **
  ** Consume the cur char and advance to next char in buffer:
  **   - updates cur and peek fields
  **   - updates the line and col count
  **   - end of file, sets fields to 0
  **
  private Void consume()
  {
    // if cur is a line break, then advance line number,
    // because the char we are getting ready to make cur
    // is the first char on the next line
    if (cur === '\n')
    {
      line++
      col = 1
    }
    else
    {
      col++
    }

    // get the next character from the buffer, any
    // problems mean that we have read past the end
    cur = peek
    pos++
    if (pos+1 < buf.size)
      peek = buf[pos+1] // next peek is cur+1
    else
      peek = 0          // 0 signals end of file to the scanners
  }
792
793 //////////////////////////////////////////////////////////////////////////
794 // Test
795 //////////////////////////////////////////////////////////////////////////
796
797 static Void main()
798 {
799 t1 := Duration.now
800 files := File.make(`/dev/fan/src/sysTest/fan/`).list
801 files.each |File f|
802 {
803 tok := Tokenizer.make(null, Location.make(f.name), f.readAllStr, false).tokenize
804 echo("-- " + f + " [" + tok.size + "]")
805 }
806 t2 := Duration.now
807 echo("Time: " + (t2-t1).toMillis)
808 echo("Time: " + (t2-t1))
809 }
810
811 //////////////////////////////////////////////////////////////////////////
812 // Fields
813 //////////////////////////////////////////////////////////////////////////
814
  private Str buf // source chars; newlines normalized to \n (see make)
  private Int pos // index into buf for cur
  private Bool isDoc // return documentation comments or if false ignore them
  private Str filename // source file name (used for token/error locations)
  private Int line := 1 // 1-based line number of cur
  private Int col := 1 // 1-based column number of cur
  private Int cur // current char; 0 once past end of buf
  private Int peek // next char; 0 once past end of buf
  private Int lastLine // line number of last token returned from next()
  private TokenVal[] tokens // token accumulator (also fed directly by interpolation)
825
826
827 }
2 // Copyright (c) 2006, Brian Frank and Andy Frank
3 // Licensed under the Academic Free License version 3.0
4 //
5 // History:
6 // 3 Sep 05 Brian Frank Creation
7 // 18 May 06 Brian Frank Ported from Java to Fan
8 //
9
10 **
11 ** Tokenizer inputs a Str and output a list of Tokens
12 **
13 class Tokenizer : CompilerSupport
14 {
15
16 //////////////////////////////////////////////////////////////////////////
17 // Constructor
18 //////////////////////////////////////////////////////////////////////////
19
20 **
21 ** Construct with characters of source file. The buffer
22 ** passed must be normalized in that all newlines must be
23 ** represented strictly as \n and not \r or \r\n (see
24 ** File.readAllStr). If isDoc is false, we skip all star-star
25 ** Fandoc comments.
26 **
27 new make(Compiler compiler, Location location, Str buf, Bool isDoc)
28 : super(compiler)
29 {
30 this.buf = buf
31 this.filename = location.file
32 this.isDoc = isDoc
33 this.tokens = TokenVal[,]
34
35 // initialize cur and peek
36 cur = peek = ' '
37 if (buf.size > 0) cur = buf[0]
38 if (buf.size > 1) peek = buf[1]
39 pos = 0
40
41 // if first line starts with #, then treat it like an end of
42 // line, so that Unix guys can specify the executable to run
43 if (cur === '#')
44 {
45 while (true)
46 {
47 if (cur === '\n') { consume; break }
48 if (cur === 0) break
49 consume
50 }
51 }
52 }
53
54 //////////////////////////////////////////////////////////////////////////
55 // Access
56 //////////////////////////////////////////////////////////////////////////
57
58 **
59 ** Tokenize the entire input into a list of tokens.
60 **
61 TokenVal[] tokenize()
62 {
63 while (true)
64 {
65 tok := next
66 tokens.add(tok)
67 if (tok.kind === Token.eof) break
68 }
69 return tokens
70 }
71
72 **
73 ** Return the next token in the buffer.
74 **
75 private TokenVal next()
76 {
77 while (true)
78 {
79 // save current line
80 line := this.line
81 col := this.col
82
83 // find next token
84 TokenVal tok := find
85 if (tok == null) continue
86
87 // fill in token's location
88 tok.file = filename
89 tok.line = line
90 tok.col = col
91 tok.newline = lastLine < line
92
93 // save last line
94 lastLine = line;
95
96 return tok
97 }
98 return null // TODO - shouldn't need this
99 }
100
101 **
102 ** Find the next token or return null.
103 **
104 private TokenVal find()
105 {
106 // skip whitespace
107 if (cur.isSpace) { consume; return null }
108
109 // alpha means keyword or identifier
110 if (cur.isAlpha || cur === '_') return word
111
112 // number or .number (note that + and - are handled as unary operator)
113 if (cur.isDigit) return number
114 if (cur === '.' && peek.isDigit) return number
115
116 // str literal
117 if (cur === '"') return str
118 if (cur === '`') return uri
119 if (cur === '\'') return ch
120
121 // comments
122 if (cur === '*' && peek === '*') return docComment
123 if (cur === '/' && peek === '/') return skipCommentSL
124 if (cur === '/' && peek === '*') return skipCommentML
125
126 // symbols
127 return symbol
128 }
129
130 //////////////////////////////////////////////////////////////////////////
131 // Word
132 //////////////////////////////////////////////////////////////////////////
133
134 **
135 ** Parse a word token: alpha (alpha|number)*
136 ** Words are either keywords or identifiers
137 **
138 private TokenVal word()
139 {
140 // store starting position of word
141 start := pos
142
143 // find end of word to compute length
144 while (cur.isAlphaNum || cur === '_') consume
145
146 // create Str (gc note this string might now reference buf)
147 word := buf[start...pos]
148
149 // check keywords
150 keyword := Token.keywords[word]
151 if (keyword != null)
152 return TokenVal.make(keyword)
153
154 // otherwise this is a normal identifier
155 return TokenVal.make(Token.identifier, word);
156 }
157
158 //////////////////////////////////////////////////////////////////////////
159 // Number
160 //////////////////////////////////////////////////////////////////////////
161
162 **
163 ** Parse a number literal token.
164 **
165 private TokenVal number()
166 {
167 Bool neg := false
168 Int whole := 0
169 Int wholeCount := 0
170 Float fraction := 0.0
171 Int exp := 0
172 Token tok := Token.intLiteral // int or float literal
173
174 // check for hex value
175 if (cur === '0' && peek === 'x')
176 return hex
177
178 // read whole part
179 while (cur.isDigit)
180 {
181 whole = whole*10 + cur.fromDigit
182 consume
183 // it would be nice to actually trap max long value, but
184 // as a simpler catch all we know decimal numbers must
185 // never be bigger than 19 digits
186 wholeCount++
187 if (wholeCount > 19) throw err("Numeric literal too big")
188 if (cur === '_') consume
189 }
190
191 // if dot then read fraction
192 if (cur === '.' && peek.isDigit)
193 {
194 consume
195 for (Float m := 0.1; cur.isDigit; m /= 10.0)
196 {
197 fraction = fraction + cur.fromDigit.toFloat*m
198 consume
199 if (cur === '_') consume
200 }
201 tok = Token.floatLiteral
202 }
203
204 // check for exponent
205 if (cur === 'e' || cur === 'E')
206 {
207 consume
208 Bool negExp := false
209 if (cur === '-') { consume; negExp = true }
210 else if (cur === '+') { consume }
211 if (!cur.isDigit) throw err("Expected exponent digits")
212 while (cur.isDigit)
213 {
214 exp = exp*10 + cur.fromDigit
215 consume
216 if (cur === '_') consume
217 }
218 if (negExp) exp = -exp
219 tok = Token.floatLiteral
220 }
221
222 // check for F suffix
223 Int dur := null
224 if (cur === 'f' || cur === 'F')
225 {
226 consume
227 tok = Token.floatLiteral
228 }
229 // check if a duration
230 else
231 {
232 if (cur === 'n' && peek === 's') { consume; consume; dur = 1 }
233 if (cur === 'm' && peek === 's') { consume; consume; dur = 1000000 }
234 if (cur === 's' && peek === 'e') { consume; consume; if (cur !== 'c') throw err("Expected 'sec' in Duration literal"); consume; dur = 1_000_000_000 }
235 if (cur === 'm' && peek === 'i') { consume; consume; if (cur !== 'n') throw err("Expected 'min' in Duration literal"); consume; dur = 60_000_000_000 }
236 if (cur === 'h' && peek === 'r') { consume; consume; dur = 3_600_000_000_000 }
237 if (cur === 'd' && peek === 'a') { consume; consume; if (cur !== 'y') throw err("Expected 'day' in Duration literal"); consume; dur = 86_400_000_000_000 }
238 }
239
240 // int literal
241 if (tok === Token.intLiteral)
242 {
243 if (fraction !== 0.0 || exp !== 0) throw err("Int literals may not have fractional or exponent components")
244 Int val := whole
245 if (neg) val = -val
246 if (dur != null)
247 return TokenVal.make(Token.durationLiteral, Duration.make(val*dur))
248 else
249 return TokenVal.make(tok, val)
250 }
251
252 // float literal
253 if (tok === Token.floatLiteral)
254 {
255 Float val := whole.toFloat + fraction
256 if (exp !== 0) val = val * 10f.pow(exp.toFloat)
257 if (neg) val = -val
258 if (dur != null)
259 return TokenVal.make(Token.durationLiteral, Duration.make((val*dur.toFloat).toInt))
260 else
261 return TokenVal.make(tok, val)
262 }
263
264 throw err("Internal error")
265 }
266
267 **
268 ** Process hex int/long literal starting with 0x
269 **
270 TokenVal hex()
271 {
272 consume // 0
273 consume // x
274
275 // read first hex
276 Int val := cur.fromDigit(16)
277 if (val == null) throw err("Expecting hex number")
278 consume
279 Int nibCount := 1
280 while (true)
281 {
282 Int nib := cur.fromDigit(16)
283 if (nib == null)
284 {
285 if (cur === '_') { consume; continue }
286 break
287 }
288 nibCount++
289 if (nibCount > 16) throw err("Hex literal too big")
290 val = (val << 4) + nib;
291 consume
292 }
293
294 return TokenVal.make(Token.intLiteral, val)
295 }
296
297 //////////////////////////////////////////////////////////////////////////
298 // String
299 //////////////////////////////////////////////////////////////////////////
300
301 **
302 ** Parse a string literal token.
303 **
304 private TokenVal str()
305 {
306 // consume opening quote
307 consume
308
309 // store starting position
310 s := StrBuf.make
311
312 // loop until we find end of string
313 interpolated := false
314 while (true)
315 {
316 if (cur === '"') { consume; break }
317 if (cur === 0) throw err("Unexpected end of string")
318 if (cur === '$')
319 {
320 // if we have detected an interpolated string, then
321 // insert opening paren to treat whole string atomically
322 if (!interpolated)
323 {
324 interpolated = true
325 tokens.add(makeVirtualToken(Token.lparen))
326 }
327
328 // process interpolated string, it returns null
329 // if at end of string literal
330 if (!strInterpolation(s.toStr))
331 {
332 tokens.add(makeVirtualToken(Token.rparen))
333 return null
334 }
335
336 s.clear
337 }
338 else if (cur === '\\')
339 {
340 s.add(escape.toChar)
341 }
342 else
343 {
344 s.addChar(cur)
345 consume
346 }
347 }
348
349 // if interpolated then we add rparen to treat whole atomically
350 if (interpolated)
351 {
352 tokens.add(makeVirtualToken(Token.strLiteral, s.toStr))
353 tokens.add(makeVirtualToken(Token.rparen));
354 return null;
355 }
356 else
357 {
358 return TokenVal.make(Token.strLiteral, s.toStr)
359 }
360 }
361
  **
  ** When we hit a $ inside a string it indicates an embedded
  ** expression. We make this look like a stream of tokens
  ** such that:
  **   "a ${b} c" -> "a " + b + " c"
  ** Return true if more in the string literal.
  **
  ** On entry cur is the '$'. The literal text accumulated so far
  ** is passed as 's'; it is emitted as a strLiteral token followed
  ** by a virtual plus, then the embedded expression's tokens.
  **
  private Bool strInterpolation(Str s)
  {
    consume // the $
    tokens.add(makeVirtualToken(Token.strLiteral, s))
    tokens.add(makeVirtualToken(Token.plus))

    // if { we allow a full expression between {...}; tokenize it
    // normally and wrap it in virtual parens to preserve precedence
    if (cur === '{')
    {
      tokens.add(makeVirtualToken(Token.lparen));
      consume
      while (true)
      {
        // hitting the string's closing quote before the closing
        // brace means the interpolation was never terminated
        if (cur === '"') throw err("Unexpected end of string, missing }");
        tok := next
        if (tok.kind == Token.rbrace) break
        tokens.add(tok)
      }
      tokens.add(makeVirtualToken(Token.rparen))
    }

    // else also allow a single identifier with
    // dotted accessors x, x.y, x.y.z
    else
    {
      tok := next
      if (tok.kind != Token.identifier) throw err("Expected identifier after \$");
      tokens.add(tok)
      while (true)
      {
        if (cur !== '.') break
        tokens.add(next) // dot
        tok = next
        if (tok.kind !== Token.identifier) throw err("Expected identifier");
        tokens.add(tok)
      }
    }

    // if at end of string, all done
    if (cur === '\"')
    {
      consume
      return false
    }

    // add plus and return true to keep chugging
    tokens.add(makeVirtualToken(Token.plus))
    return true
  }
418
419 **
420 ** Create a virtual token for string interpolation.
421 **
422 private TokenVal makeVirtualToken(Token kind, Obj value := null)
423 {
424 tok := TokenVal.make(kind, value);
425 tok.file = filename
426 tok.line = line
427 tok.col = col
428 return tok
429 }
430
431 //////////////////////////////////////////////////////////////////////////
432 // Uri
433 //////////////////////////////////////////////////////////////////////////
434
435 **
436 ** Parse a uri literal token.
437 **
438 private TokenVal uri()
439 {
440 // consume opening backtick
441 consume
442
443 // store starting position
444 s := StrBuf.make
445
446 // loop until we find end of string
447 while (true)
448 {
449 ch := cur
450 if (ch === '`') { consume; break; }
451 if (ch === 0 || ch === '\n') throw err("Unexpected end of uri")
452 if (ch === '$') throw err("Uri interpolation not supported yet")
453 if (ch === '\\') ch = escape
454 else consume
455 s.addChar(ch)
456 }
457
458 return TokenVal.make(Token.uriLiteral, s.toStr)
459 }
460
461 //////////////////////////////////////////////////////////////////////////
462 // Char
463 //////////////////////////////////////////////////////////////////////////
464
465 **
466 ** Parse a char literal token.
467 **
468 private TokenVal ch()
469 {
470 // consume opening quote
471 consume
472
473 // if \ then process as escape
474 Int c
475 if (cur === '\\')
476 {
477 c = escape
478 }
479 else
480 {
481 c = cur
482 consume
483 }
484
485 // expecting ' quote
486 if (cur !== '\'') throw err("Expecting ' close of char literal")
487 consume
488
489 return TokenVal.make(Token.intLiteral, c)
490 }
491
  **
  ** Parse an escape sequence which starts with a \
  ** Returns the decoded character code. On entry cur must be the
  ** backslash; on exit the entire sequence has been consumed.
  **
  Int escape()
  {
    // consume slash
    if (cur !== '\\') throw err("Internal error");
    consume

    // single character escapes
    switch (cur)
    {
      case 'b': consume; return '\b'
      case 'f': consume; return '\f'
      case 'n': consume; return '\n'
      case 'r': consume; return '\r'
      case 't': consume; return '\t'
      case '"': consume; return '"'
      case '$': consume; return '$'
      case '\'': consume; return '\''
      case '`': consume; return '`'
      case '\\': consume; return '\\'
    }

    // check for \uxxxx - exactly four hex digits, most significant first
    if (cur === 'u')
    {
      consume
      n3 := cur.fromDigit(16); consume
      n2 := cur.fromDigit(16); consume
      n1 := cur.fromDigit(16); consume
      n0 := cur.fromDigit(16); consume
      // fromDigit returns null for any non-hex char (including EOF)
      if (n3 == null || n2 == null || n1 == null || n0 == null) throw err("Invalid hex value for \\uxxxx");
      return ((n3 << 12) | (n2 << 8) | (n1 << 4) | n0)
    }

    throw err("Invalid escape sequence")
  }
530
531 //////////////////////////////////////////////////////////////////////////
532 // Comments
533 //////////////////////////////////////////////////////////////////////////
534
535 **
536 ** Skip a single line // comment
537 **
538 private TokenVal skipCommentSL()
539 {
540 consume // first slash
541 consume // next slash
542 while (true)
543 {
544 if (cur === '\n') { consume; break }
545 if (cur === 0) break
546 consume
547 }
548 return null
549 }
550
551 **
552 ** Skip a multi line /* comment. Note unlike C/Java,
553 ** slash/star comments can be nested.
554 **
555 private TokenVal skipCommentML()
556 {
557 consume // first slash
558 consume // next slash
559 depth := 1
560 while (true)
561 {
562 if (cur === '*' && peek === '/') { consume; consume; depth--; if (depth <= 0) break }
563 if (cur === '/' && peek === '*') { consume; consume; depth++; continue }
564 if (cur === 0) break
565 consume
566 }
567 return null
568 }
569
  **
  ** Parse a Javadoc style comment into a documentation comment token.
  ** On entry cur sits on the leading ** of the first line.  The token
  ** value is the list of comment lines with the leading whitespace,
  ** stars, and at most one following space stripped from each line.
  **
  private TokenVal docComment()
  {
    // if doc is off, then just skip the line and be done
    if (!isDoc) { skipCommentSL; return null }

    // strip the leading stars and a single optional space
    while (cur === '*') consume
    if (cur === ' ') consume

    // parse comment
    lines := Str[,]
    s := StrBuf.make
    while (cur > 0)
    {
      // add to buffer and advance
      c := cur
      consume

      // if not at newline, then loop
      if (c !== '\n')
      {
        s.addChar(c)
        continue
      }

      // add line and reset buffer (but don't add leading empty lines)
      line := s.toStr
      if (!lines.isEmpty || !line.trim.isEmpty) lines.add(line)
      s.clear

      // we are at a newline: a continuation line must look like
      // whitespace(0+)/star(2+)/whitespace(0-1), otherwise the
      // doc comment is over
      while (cur === ' ' || cur === '\t') consume
      if (cur !== '*' || peek !== '*') break
      while (cur === '*') consume
      if (cur === ' ' || cur === '\t') consume
    }
    // whatever is still buffered when the comment ends is the last line
    lines.add(s.toStr)

    // strip trailing empty lines
    while (!lines.isEmpty)
      if (lines.last.trim.isEmpty) lines.removeAt(-1)
      else break

    return TokenVal.make(Token.docComment, lines)
  }
617
618 //////////////////////////////////////////////////////////////////////////
619 // Symbol
620 //////////////////////////////////////////////////////////////////////////
621
  **
  ** Parse a symbol token (typically into an operator).
  ** Uses maximal munch: after consuming the first char, cur/peek
  ** are checked to greedily match the longest operator (e.g. '!'
  ** then '!=' then '!==').  A zero char produces the eof token.
  **
  private TokenVal symbol()
  {
    c := cur
    consume
    switch (c)
    {
      // \r should have been normalized away by the caller (see make)
      case '\r':
        throw err("Carriage return \\r not allowed in source");
      case '!':   // ! != !==
        if (cur === '=')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.notSame) }
          return TokenVal.make(Token.notEq)
        }
        return TokenVal.make(Token.bang)
      case '#':
        return TokenVal.make(Token.pound)
      case '%':   // % %=
        if (cur === '=') { consume; return TokenVal.make(Token.assignPercent) }
        return TokenVal.make(Token.percent);
      case '&':   // & &= &&
        if (cur === '=') { consume; return TokenVal.make(Token.assignAmp) }
        if (cur === '&') { consume; return TokenVal.make(Token.doubleAmp) }
        return TokenVal.make(Token.amp)
      case '(':
        return TokenVal.make(Token.lparen)
      case ')':
        return TokenVal.make(Token.rparen)
      case '*':   // * *=
        if (cur === '=') { consume; return TokenVal.make(Token.assignStar) }
        return TokenVal.make(Token.star)
      case '+':   // + += ++
        if (cur === '=') { consume; return TokenVal.make(Token.assignPlus) }
        if (cur === '+') { consume; return TokenVal.make(Token.increment) }
        return TokenVal.make(Token.plus)
      case ',':
        return TokenVal.make(Token.comma)
      case '-':   // - -> -- -=
        if (cur === '>') { consume; return TokenVal.make(Token.arrow) }
        if (cur === '-') { consume; return TokenVal.make(Token.decrement) }
        if (cur === '=') { consume; return TokenVal.make(Token.assignMinus) }
        return TokenVal.make(Token.minus)
      case '.':   // . .. ...
        if (cur === '.')
        {
          consume
          if (cur === '.') { consume; return TokenVal.make(Token.dotDotDot) }
          return TokenVal.make(Token.dotDot)
        }
        return TokenVal.make(Token.dot)
      case '/':   // / /=  (comments are handled before we get here)
        if (cur === '=') { consume; return TokenVal.make(Token.assignSlash) }
        return TokenVal.make(Token.slash)
      case ':':   // : :: :=
        if (cur === ':') { consume; return TokenVal.make(Token.doubleColon) }
        if (cur === '=') { consume; return TokenVal.make(Token.defAssign) }
        return TokenVal.make(Token.colon)
      case ';':
        return TokenVal.make(Token.semicolon)
      case '<':   // < <= <=> << <<=
        if (cur === '=')
        {
          consume
          if (cur === '>') { consume; return TokenVal.make(Token.cmp) }
          return TokenVal.make(Token.ltEq)
        }
        if (cur === '<')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.assignLshift) }
          return TokenVal.make(Token.lshift)
        }
        return TokenVal.make(Token.lt)
      case '=':   // = == ===
        if (cur === '=')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.same) }
          return TokenVal.make(Token.eq)
        }
        return TokenVal.make(Token.assign)
      case '>':   // > >= >> >>=
        if (cur === '=') { consume; return TokenVal.make(Token.gtEq) }
        if (cur === '>')
        {
          consume
          if (cur === '=') { consume; return TokenVal.make(Token.assignRshift) }
          return TokenVal.make(Token.rshift)
        }
        return TokenVal.make(Token.gt)
      case '?':
        return TokenVal.make(Token.question)
      case '@':
        return TokenVal.make(Token.at)
      case '[':
        return TokenVal.make(Token.lbracket)
      case ']':
        return TokenVal.make(Token.rbracket)
      case '^':   // ^ ^=
        if (cur === '=') { consume; return TokenVal.make(Token.assignCaret) }
        return TokenVal.make(Token.caret)
      case '{':
        return TokenVal.make(Token.lbrace)
      case '|':   // | || |=
        if (cur === '|') { consume; return TokenVal.make(Token.doublePipe) }
        if (cur === '=') { consume; return TokenVal.make(Token.assignPipe) }
        return TokenVal.make(Token.pipe)
      case '}':
        return TokenVal.make(Token.rbrace)
      case '~':
        return TokenVal.make(Token.tilde)
    }

    // the zero char marks end of file
    if (c === 0)
      return TokenVal.make(Token.eof)

    throw err("Unexpected symbol: " + c.toChar + " (0x" + c.toHex + ")")
  }
744
745 //////////////////////////////////////////////////////////////////////////
746 // Utils
747 //////////////////////////////////////////////////////////////////////////
748
749 **
750 ** Return a CompilerException for current location in source.
751 **
752 override CompilerErr err(Str msg, Location loc := null)
753 {
754 if (loc == null) loc = Location.make(filename, line, col);
755 return super.err(msg, loc);
756 }
757
758 ////////////////////////////////////////////////////////////////
759 // Consume
760 ////////////////////////////////////////////////////////////////
761
762 **
763 ** Consume the cur char and advance to next char in buffer:
764 ** - updates cur and peek fields
765 ** - updates the line and col count
766 ** - end of file, sets fields to 0
767 **
768 private Void consume()
769 {
770 // if cur is a line break, then advance line number,
771 // because the char we are getting ready to make cur
772 // is the first char on the next line
773 if (cur === '\n')
774 {
775 line++
776 col = 1
777 }
778 else
779 {
780 col++
781 }
782
783 // get the next character from the buffer, any
784 // problems mean that we have read past the end
785 cur = peek
786 pos++
787 if (pos+1 < buf.size)
788 peek = buf[pos+1] // next peek is cur+1
789 else
790 peek = 0
791 }
792
793 //////////////////////////////////////////////////////////////////////////
794 // Test
795 //////////////////////////////////////////////////////////////////////////
796
static Void main()
  {
    // ad hoc test harness: tokenize every file in the sysTest
    // directory and report the token counts plus elapsed time
    start := Duration.now
    dir := File.make(`/dev/fan/src/sysTest/fan/`)
    dir.list.each |File f|
    {
      toks := Tokenizer.make(null, Location.make(f.name), f.readAllStr, false).tokenize
      echo("-- " + f + " [" + toks.size + "]")
    }
    elapsed := Duration.now - start
    echo("Time: " + elapsed.toMillis)
    echo("Time: " + elapsed)
  }
810
811 //////////////////////////////////////////////////////////////////////////
812 // Fields
813 //////////////////////////////////////////////////////////////////////////
814
  private Str buf            // source chars (newlines normalized to \n, see make)
  private Int pos            // index into buf for cur
  private Bool isDoc         // return documentation comments or if false ignore them
  private Str filename       // source file name for token/error locations
  private Int line := 1      // line number of cur (1 based)
  private Int col := 1       // column number of cur (1 based)
  private Int cur            // current char (0 at end of file)
  private Int peek           // next char (0 at end of file)
  private Int lastLine       // line number of last token returned from next()
  private TokenVal[] tokens  // token accumulator
825
826
827 }