go-libvirt-plain/internal/lvgen/lvlexer.go

// Copyright 2017 The go-libvirt Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package lvgen

import (
	"fmt"
	"io"
	"io/ioutil"
	"strings"
	"unicode"
	"unicode/utf8"
)

// eof is returned by the lexer when there's no more input.
const eof = -1

// oneRuneTokens lists the runes the lexer will consider to be tokens when it
// finds them. These are returned to the parser using the integer value of their
// runes.
var oneRuneTokens = `{}[]<>(),=;:*`

var keywords = map[string]int{
"hyper": HYPER,
"int": INT,
"short": SHORT,
"char": CHAR,
"bool": BOOL,
"case": CASE,
"const": CONST,
"default": DEFAULT,
"double": DOUBLE,
"enum": ENUM,
"float": FLOAT,
"opaque": OPAQUE,
"string": STRING,
"struct": STRUCT,
"switch": SWITCH,
"typedef": TYPEDEF,
"union": UNION,
"unsigned": UNSIGNED,
"void": VOID,
"program": PROGRAM,
"version": VERSION,
}
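
// Note that multi-word XDR type names arrive as separate tokens: "unsigned
// hyper", for example, lexes as UNSIGNED followed by HYPER, and recombining
// them is left to the grammar.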

// item is a lexeme, or what the lexer returns to the parser.
type item struct {
typ int
val string
line, column int
}

// String will display lexer items for humans to debug. There are some
// calculations here due to the way goyacc arranges token values; see the
// generated file y.go for an idea of what's going on, but the basic idea is
// that the lower token type values are reserved for single-rune tokens, which
// the lexer reports using the value of the rune itself. Everything else is
// allocated a range of type values above all the possible single-rune values.
func (i item) String() string {
tokType := i.typ
if tokType >= yyPrivate {
if tokType < yyPrivate+len(yyTok2) {
tokType = yyTok2[tokType-yyPrivate]
}
}
	return fmt.Sprintf("%s %q %d:%d", yyTokname(tokType), i.val, i.line, i.column)
}
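
// As a hedged illustration of that numbering: a one-rune token such as '=' is
// reported to the parser as int('='), while a named token like STRUCT or
// CONSTANT is assigned a value at or above yyPrivate and must be mapped
// through yyTok2 before yyTokname can print it. The exact values live in the
// generated y.go.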

// Lexer stores the state of this lexer.
type Lexer struct {
input string // the string we're scanning.
start int // start position of the item.
pos int // current position in the input.
line int // the current line (for error reporting).
column int // current position within the current line.
width int // width of the last rune scanned.
items chan item // channel of scanned lexer items (lexemes).
lastItem item // The last item the lexer handed the parser
}

// NewLexer will return a new lexer for the passed-in reader.
func NewLexer(rdr io.Reader) (*Lexer, error) {
l := &Lexer{}
b, err := ioutil.ReadAll(rdr)
if err != nil {
return nil, err
}
l.input = string(b)
l.items = make(chan item)
return l, nil
}

// Run starts the lexer, and should be called in a goroutine.
func (l *Lexer) Run() {
for state := lexText; state != nil; {
state = state(l)
}
close(l.items)
}

// emit passes the current lexeme to the parser as a token of type t.
func (l *Lexer) emit(t int) {
l.items <- item{t, l.input[l.start:l.pos], l.line, l.column}
l.start = l.pos
}

// Lex returns the next token to the parser. Together with Error it satisfies
// the yyLexer interface expected by the goyacc-generated parser.
func (l *Lexer) Lex(st *yySymType) int {
	s := <-l.items
	l.lastItem = s
	st.val = s.val
	return s.typ
}
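
// runLexer is a minimal wiring sketch, not part of the original generator: it
// shows how a Lexer is typically driven by the goyacc-generated yyParse from
// this package's y.go. The helper name and return convention are assumptions
// for illustration.
func runLexer(rdr io.Reader) (int, error) {
	l, err := NewLexer(rdr)
	if err != nil {
		return 0, err
	}
	go l.Run()             // the lexer feeds the items channel
	return yyParse(l), nil // the parser pulls tokens via Lex
}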

// Error is called by the parser when it finds a problem.
func (l *Lexer) Error(s string) {
fmt.Printf("parse error at %d:%d: %v\n", l.lastItem.line+1, l.lastItem.column+1, s)
fmt.Printf("error at %q\n", l.lastItem.val)
}

// errorf is used by the lexer to report errors. It sends an ERROR item on the
// items channel and returns a nil state function, which stops the lexer's
// state machine.
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
	l.items <- item{ERROR, fmt.Sprintf(format, args...), l.line, l.column}
	return nil
}

// next returns the rune at the current location, and advances to the next rune
// in the input.
func (l *Lexer) next() (r rune) {
if l.pos >= len(l.input) {
l.width = 0
return eof
}
r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
l.pos += l.width
l.column++
if r == '\n' {
l.line++
l.column = 0
}
return r
}

// ignore discards the current text from start to pos.
func (l *Lexer) ignore() {
l.start = l.pos
}

// backup moves back one character, but can only be called once per next() call.
func (l *Lexer) backup() {
l.pos -= l.width
if l.column > 0 {
l.column--
} else {
l.line--
}
l.width = 0
}

// peek looks ahead at the next rune in the stream without consuming it.
func (l *Lexer) peek() rune {
r := l.next()
l.backup()
return r
}

// accept will advance to the next rune if it's contained in the string of valid
// runes passed in by the caller.
func (l *Lexer) accept(valid string) bool {
	if strings.ContainsRune(valid, l.next()) {
		return true
	}
	l.backup()
	return false
}

// acceptRun advances over a number of valid runes, stopping as soon as it hits
// one not on the list.
func (l *Lexer) acceptRun(valid string) {
	for strings.ContainsRune(valid, l.next()) {
	}
	l.backup()
}
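
// Together accept and acceptRun handle small character classes concisely; for
// example, lexNumber below consumes an optional sign and then a run of digits
// with:
//
//	l.accept("-")
//	l.acceptRun("0123456789")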

// keyword checks whether the current lexeme is a keyword or not. If so it
// returns the keyword's token id, otherwise it returns IDENTIFIER.
func (l *Lexer) keyword() int {
	ident := l.input[l.start:l.pos]
	if tok, ok := keywords[ident]; ok {
		return tok
	}
	return IDENTIFIER
}

// oneRuneToken determines whether a rune is a token. If so it returns the token
// id and true, otherwise it returns false.
func (l *Lexer) oneRuneToken(r rune) (int, bool) {
	if strings.ContainsRune(oneRuneTokens, r) {
		return int(r), true
	}
	return 0, false
}

// State functions

type stateFn func(*Lexer) stateFn
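
// Each state function consumes input, possibly emits items, and returns the
// next state; returning nil ends Run's loop. Scanning "typedef int foo;"
// bounces between states roughly like this (illustrative, not traced):
//
//	lexText -> lexIdent -> lexText -> lexIdent -> lexText -> lexIdent ->
//	lexText (emits ';') -> nil at eof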

// lexText is the master lex routine. The lexer is started in this state.
func lexText(l *Lexer) stateFn {
for {
if strings.HasPrefix(l.input[l.pos:], "/*") {
return lexBlockComment
}
r := l.next()
if r == eof {
break
}
if unicode.IsSpace(r) {
l.ignore()
return lexText
}
if l.column == 1 && r == '%' {
l.backup()
return lexDirective
}
if unicode.IsLetter(r) {
l.backup()
return lexIdent
}
if unicode.IsNumber(r) || r == '-' {
l.backup()
return lexNumber
}
		if t, isToken := l.oneRuneToken(r); isToken {
l.emit(t)
}
}
return nil
}

// lexBlockComment is used when we find a comment marker '/*' in the input.
func lexBlockComment(l *Lexer) stateFn {
for {
if strings.HasPrefix(l.input[l.pos:], "*/") {
// Found the end. Advance past the '*/' and discard the comment body.
l.next()
l.next()
l.ignore()
return lexText
}
if l.next() == eof {
return l.errorf("unterminated block comment")
}
}
}
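
// Note that block comments do not nest: the scan ends at the first "*/", so
// an input like "/* a /* b */" is treated as a single complete comment.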

// lexIdent handles identifiers.
func lexIdent(l *Lexer) stateFn {
for {
r := l.next()
if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' {
continue
}
l.backup()
break
}
// We may have a keyword, so check for that before emitting.
l.emit(l.keyword())
return lexText
}

// lexNumber handles decimal and hexadecimal numbers. Decimal numbers may begin
// with a '-'; hex numbers begin with '0x' and do not accept leading '-'.
func lexNumber(l *Lexer) stateFn {
// Leading '-' is ok
digits := "0123456789"
neg := l.accept("-")
if !neg {
// allow '0x' for hex numbers, as long as there's not a leading '-'.
r := l.peek()
if r == '0' {
l.next()
if l.accept("x") {
digits = "0123456789ABCDEFabcdef"
}
}
}
// followed by any number of digits
l.acceptRun(digits)
r := l.peek()
if unicode.IsLetter(r) {
l.next()
return l.errorf("invalid number: %q", l.input[l.start:l.pos])
}
l.emit(CONSTANT)
return lexText
}
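
// A few examples of what this state accepts: "42", "-17", and "0x1f" all emit
// CONSTANT, while "12ab" or "0x1G" trip the trailing-letter check and report
// an invalid number.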

// lexDirective handles lines beginning with '%'. These are used to emit C code
// directly to the output file. For now we're ignoring them, but some of the
// constants in the protocol file do depend on values from #included header
// files, so that may need to change.
func lexDirective(l *Lexer) stateFn {
for {
r := l.next()
if r == '\n' {
l.ignore()
return lexText
}
if r == eof {
return l.errorf("unterminated directive")
}
}
}
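
// For example, a pass-through line such as
//
//	%#include "virnetprotocol.h"
//
// is consumed up to the newline and dropped. (The header name here is only
// illustrative.)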