From 6e924657b1a53b2dc0e8946cfe17001e4267c5a9 Mon Sep 17 00:00:00 2001 From: Geoff Hickey Date: Fri, 3 Nov 2017 13:37:16 -0400 Subject: [PATCH] Move the rpcgen lexer to its own file. --- internal/constants/constants.go | 5 +- internal/lvgen/generate.go | 285 +---------------------------- internal/lvgen/lvlexer.go | 305 ++++++++++++++++++++++++++++++++ 3 files changed, 310 insertions(+), 285 deletions(-) create mode 100644 internal/lvgen/lvlexer.go diff --git a/internal/constants/constants.go b/internal/constants/constants.go index d321bec..4b7a90c 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package constants provides shared data for the libvirt package. +// Package constants provides shared data for the libvirt package. This file +// includes only things not generated automatically by the parser that runs on +// libvirt's remote_protocol.x file - see constants.gen.go for the generated +// definitions. package constants const ( diff --git a/internal/lvgen/generate.go b/internal/lvgen/generate.go index b534d23..7f6ff4a 100644 --- a/internal/lvgen/generate.go +++ b/internal/lvgen/generate.go @@ -21,7 +21,6 @@ package lvgen import ( "fmt" "io" - "io/ioutil" "os" "strconv" "strings" @@ -61,6 +60,7 @@ type ConstItem struct { Val string } +// Generator holds all the information parsed out of the protocol file. type Generator struct { // Enums holds the list of enums found by the parser. Enums []ConstItem @@ -224,289 +224,6 @@ func fixAbbrevs(s string) string { return s } -// TODO: Move this lexer to its own file? - -// eof is returned by the lexer when there's no more input. -const eof = -1 - -type item struct { - typ int - val string - line, column int -} - -// String will display lexer items for humans to debug. There are some -// calculations here due to the way goyacc arranges token values; see the -// generated file y.go for an idea what's going on here, but the basic idea is -// that the lower token type values are reserved for single-rune tokens, which -// the lexer reports using the value of the rune itself. Everything else is -// allocated a range of type value up above all the possible single-rune values. -func (i item) String() string { - tokType := i.typ - if tokType >= yyPrivate { - if tokType < yyPrivate+len(yyTok2) { - tokType = yyTok2[tokType-yyPrivate] - } - } - rv := fmt.Sprintf("%s %q %d:%d", yyTokname(tokType), i.val, i.line, i.column) - return rv -} - -// Lexer stores the state of this lexer. -type Lexer struct { - input string // the string we're scanning. - start int // start position of the item. - pos int // current position in the input. - line int // the current line (for error reporting). - column int // current position within the current line. - width int // width of the last rune scanned. - items chan item // channel of scanned lexer items (lexemes). - lastItem item // The last item the lexer handed the parser -} - -// NewLexer will return a new lexer for the passed-in reader. -func NewLexer(rdr io.Reader) (*Lexer, error) { - l := &Lexer{} - - b, err := ioutil.ReadAll(rdr) - if err != nil { - return nil, err - } - l.input = string(b) - l.items = make(chan item) - - return l, nil -} - -// Run starts the lexer, and should be called in a goroutine. -func (l *Lexer) Run() { - for state := lexText; state != nil; { - state = state(l) - } - close(l.items) -} - -// emit returns a token to the parser. 
-func (l *Lexer) emit(t int) { - l.items <- item{t, l.input[l.start:l.pos], l.line, l.column} - l.start = l.pos -} - -// Lex gets the next token. -func (l *Lexer) Lex(st *yySymType) int { - s := <-l.items - l.lastItem = s - st.val = s.val - // fmt.Println("Lex returning", s) - return int(s.typ) -} - -// Error is called by the parser when it finds a problem. -func (l *Lexer) Error(s string) { - fmt.Printf("parse error at %d:%d: %v\n", l.lastItem.line+1, l.lastItem.column+1, s) - fmt.Printf("error at %q\n", l.lastItem.val) -} - -// errorf is used by the lexer to report errors. It inserts an ERROR token into -// the items channel, and sets the state to nil, which stops the lexer's state -// machine. -func (l *Lexer) errorf(format string, args ...interface{}) stateFn { - l.items <- item{ERROR, fmt.Sprintf(format, args), l.line, l.column} - return nil -} - -// next returns the rune at the current location, and advances to the next rune -// in the input. -func (l *Lexer) next() (r rune) { - if l.pos >= len(l.input) { - l.width = 0 - return eof - } - r, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) - l.pos += l.width - l.column++ - if r == '\n' { - l.line++ - l.column = 0 - } - return r -} - -// ignore discards the current text from start to pos. -func (l *Lexer) ignore() { - l.start = l.pos -} - -// backup moves back one character, but can only be called once per next() call. -func (l *Lexer) backup() { - l.pos -= l.width - if l.column > 0 { - l.column-- - } else { - l.line-- - } - l.width = 0 -} - -// peek looks ahead at the next rune in the stream without consuming it. -func (l *Lexer) peek() rune { - r := l.next() - l.backup() - return r -} - -// accept will advance to the next rune if it's contained in the string of valid -// runes passed in by the caller. -func (l *Lexer) accept(valid string) bool { - if strings.IndexRune(valid, l.next()) >= 0 { - return true - } - l.backup() - return false -} - -// acceptRun advances over a number of valid runes, stopping as soon as it hits -// one not on the list. -func (l *Lexer) acceptRun(valid string) { - for strings.IndexRune(valid, l.next()) >= 0 { - } - l.backup() -} - -// keyword checks whether the current lexeme is a keyword or not. If so it -// returns the keyword's token id, otherwise it returns IDENTIFIER. -func (l *Lexer) keyword() int { - ident := l.input[l.start:l.pos] - tok, ok := keywords[ident] - if ok == true { - return int(tok) - } - return IDENTIFIER -} - -// oneRuneToken determines whether a rune is a token. If so it returns the token -// id and true, otherwise it returns false. -func (l *Lexer) oneRuneToken(r rune) (int, bool) { - if strings.IndexRune(oneRuneTokens, r) >= 0 { - return int(r), true - } - - return 0, false -} - -// State functions -type stateFn func(*Lexer) stateFn - -// lexText is the master lex routine. The lexer is started in this state. -func lexText(l *Lexer) stateFn { - for { - if strings.HasPrefix(l.input[l.pos:], "/*") { - return lexBlockComment - } - r := l.next() - if r == eof { - break - } - if unicode.IsSpace(r) { - l.ignore() - return lexText - } - if l.column == 1 && r == '%' { - l.backup() - return lexDirective - } - if unicode.IsLetter(r) { - l.backup() - return lexIdent - } - if unicode.IsNumber(r) || r == '-' { - l.backup() - return lexNumber - } - if t, isToken := l.oneRuneToken(r); isToken == true { - l.emit(t) - } - } - - return nil -} - -// lexBlockComment is used when we find a comment marker '/*' in the input. 
-func lexBlockComment(l *Lexer) stateFn {
-	for {
-		if strings.HasPrefix(l.input[l.pos:], "*/") {
-			// Found the end. Advance past the '*/' and discard the comment body.
-			l.next()
-			l.next()
-			l.ignore()
-			return lexText
-		}
-		if l.next() == eof {
-			return l.errorf("unterminated block comment")
-		}
-	}
-}
-
-// lexIdent handles identifiers.
-func lexIdent(l *Lexer) stateFn {
-	for {
-		r := l.next()
-		if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' {
-			continue
-		}
-		l.backup()
-		break
-	}
-	// We may have a keyword, so check for that before emitting.
-	l.emit(l.keyword())
-
-	return lexText
-}
-
-// lexNumber handles decimal and hexadecimal numbers. Decimal numbers may begin
-// with a '-'; hex numbers begin with '0x' and do not accept leading '-'.
-func lexNumber(l *Lexer) stateFn {
-	// Leading '-' is ok
-	digits := "0123456789"
-	neg := l.accept("-")
-	if !neg {
-		// allow '0x' for hex numbers, as long as there's not a leading '-'.
-		r := l.peek()
-		if r == '0' {
-			l.next()
-			if l.accept("x") {
-				digits = "0123456789ABCDEFabcdef"
-			}
-		}
-	}
-	// followed by any number of digits
-	l.acceptRun(digits)
-	r := l.peek()
-	if unicode.IsLetter(r) {
-		l.next()
-		return l.errorf("invalid number: %q", l.input[l.start:l.pos])
-	}
-	l.emit(CONSTANT)
-	return lexText
-}
-
-// lexDirective handles lines beginning with '%'. These are used to emit C code
-// directly to the output file. For now we're ignoring them, but some of the
-// constants in the protocol file do depend on values from #included header
-// files, so that may need to change.
-func lexDirective(l *Lexer) stateFn {
-	for {
-		r := l.next()
-		if r == '\n' {
-			l.ignore()
-			return lexText
-		}
-		if r == eof {
-			return l.errorf("unterminated directive")
-		}
-	}
-}
-
 //---------------------------------------------------------------------------
 // Routines called by the parser's actions.
 //---------------------------------------------------------------------------
diff --git a/internal/lvgen/lvlexer.go b/internal/lvgen/lvlexer.go
new file mode 100644
index 0000000..7504e43
--- /dev/null
+++ b/internal/lvgen/lvlexer.go
@@ -0,0 +1,305 @@
+// Copyright 2017 The go-libvirt Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package lvgen
+
+import (
+	"fmt"
+	"io"
+	"io/ioutil"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// eof is returned by the lexer when there's no more input.
+const eof = -1
+
+type item struct {
+	typ int
+	val string
+	line, column int
+}
+
+// String displays lexer items for humans to debug. There are some
+// calculations here due to the way goyacc arranges token values; see the
+// generated file y.go for an idea of what's going on, but the basic idea is
+// that the lower token type values are reserved for single-rune tokens, which
+// the lexer reports using the value of the rune itself. Everything else is
+// allocated a range of type values above all the possible single-rune values.
+func (i item) String() string {
+	tokType := i.typ
+	if tokType >= yyPrivate {
+		if tokType < yyPrivate+len(yyTok2) {
+			tokType = yyTok2[tokType-yyPrivate]
+		}
+	}
+	rv := fmt.Sprintf("%s %q %d:%d", yyTokname(tokType), i.val, i.line, i.column)
+	return rv
+}
+
+// Lexer stores the state of this lexer.
+type Lexer struct {
+	input string // the string we're scanning.
+	start int // start position of the item.
+	pos int // current position in the input.
+	line int // the current line (for error reporting).
+	column int // current position within the current line.
+	width int // width of the last rune scanned.
+	items chan item // channel of scanned lexer items (lexemes).
+	lastItem item // the last item the lexer handed the parser.
+}
+
+// NewLexer will return a new lexer for the passed-in reader.
+func NewLexer(rdr io.Reader) (*Lexer, error) {
+	l := &Lexer{}
+
+	b, err := ioutil.ReadAll(rdr)
+	if err != nil {
+		return nil, err
+	}
+	l.input = string(b)
+	l.items = make(chan item)
+
+	return l, nil
+}
+
+// Run starts the lexer, and should be called in a goroutine.
+func (l *Lexer) Run() {
+	for state := lexText; state != nil; {
+		state = state(l)
+	}
+	close(l.items)
+}
+
+// emit sends a token to the parser on the items channel.
+func (l *Lexer) emit(t int) {
+	l.items <- item{t, l.input[l.start:l.pos], l.line, l.column}
+	l.start = l.pos
+}
+
+// Lex gets the next token.
+func (l *Lexer) Lex(st *yySymType) int {
+	s := <-l.items
+	l.lastItem = s
+	st.val = s.val
+	// fmt.Println("Lex returning", s)
+	return int(s.typ)
+}
+
+// Error is called by the parser when it finds a problem.
+func (l *Lexer) Error(s string) {
+	fmt.Printf("parse error at %d:%d: %v\n", l.lastItem.line+1, l.lastItem.column+1, s)
+	fmt.Printf("error at %q\n", l.lastItem.val)
+}
+
+// errorf is used by the lexer to report errors. It inserts an ERROR token into
+// the items channel, and sets the state to nil, which stops the lexer's state
+// machine.
+func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
+	l.items <- item{ERROR, fmt.Sprintf(format, args...), l.line, l.column}
+	return nil
+}
+
+// next returns the rune at the current location, and advances to the next rune
+// in the input.
+func (l *Lexer) next() (r rune) {
+	if l.pos >= len(l.input) {
+		l.width = 0
+		return eof
+	}
+	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
+	l.pos += l.width
+	l.column++
+	if r == '\n' {
+		l.line++
+		l.column = 0
+	}
+	return r
+}
+
+// ignore discards the current text from start to pos.
+func (l *Lexer) ignore() {
+	l.start = l.pos
+}
+
+// backup moves back one character, but can only be called once per next() call.
+func (l *Lexer) backup() {
+	l.pos -= l.width
+	if l.column > 0 {
+		l.column--
+	} else {
+		l.line--
+	}
+	l.width = 0
+}
+
+// peek looks ahead at the next rune in the stream without consuming it.
+func (l *Lexer) peek() rune {
+	r := l.next()
+	l.backup()
+	return r
+}
+
+// accept will advance to the next rune if it's contained in the string of valid
+// runes passed in by the caller.
+func (l *Lexer) accept(valid string) bool {
+	if strings.IndexRune(valid, l.next()) >= 0 {
+		return true
+	}
+	l.backup()
+	return false
+}
+
+// acceptRun advances over a number of valid runes, stopping as soon as it hits
+// one not on the list.
+func (l *Lexer) acceptRun(valid string) {
+	for strings.IndexRune(valid, l.next()) >= 0 {
+	}
+	l.backup()
+}
+
+// keyword checks whether the current lexeme is a keyword or not. If so it
+// returns the keyword's token id, otherwise it returns IDENTIFIER.
+func (l *Lexer) keyword() int {
+	ident := l.input[l.start:l.pos]
+	tok, ok := keywords[ident]
+	if ok {
+		return int(tok)
+	}
+	return IDENTIFIER
+}
+
+// oneRuneToken determines whether a rune is a token. If so it returns the token
+// id and true, otherwise it returns 0 and false.
+func (l *Lexer) oneRuneToken(r rune) (int, bool) {
+	if strings.IndexRune(oneRuneTokens, r) >= 0 {
+		return int(r), true
+	}
+
+	return 0, false
+}
+
+// State functions
+type stateFn func(*Lexer) stateFn
+
+// lexText is the master lex routine. The lexer is started in this state.
+func lexText(l *Lexer) stateFn {
+	for {
+		if strings.HasPrefix(l.input[l.pos:], "/*") {
+			return lexBlockComment
+		}
+		r := l.next()
+		if r == eof {
+			break
+		}
+		if unicode.IsSpace(r) {
+			l.ignore()
+			return lexText
+		}
+		if l.column == 1 && r == '%' {
+			l.backup()
+			return lexDirective
+		}
+		if unicode.IsLetter(r) {
+			l.backup()
+			return lexIdent
+		}
+		if unicode.IsNumber(r) || r == '-' {
+			l.backup()
+			return lexNumber
+		}
+		if t, isToken := l.oneRuneToken(r); isToken {
+			l.emit(t)
+		}
+	}
+
+	return nil
+}
+
+// lexBlockComment is used when we find a comment marker '/*' in the input.
+func lexBlockComment(l *Lexer) stateFn {
+	for {
+		if strings.HasPrefix(l.input[l.pos:], "*/") {
+			// Found the end. Advance past the '*/' and discard the comment body.
+			l.next()
+			l.next()
+			l.ignore()
+			return lexText
+		}
+		if l.next() == eof {
+			return l.errorf("unterminated block comment")
+		}
+	}
+}
+
+// lexIdent handles identifiers.
+func lexIdent(l *Lexer) stateFn {
+	for {
+		r := l.next()
+		if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' {
+			continue
+		}
+		l.backup()
+		break
+	}
+	// We may have a keyword, so check for that before emitting.
+	l.emit(l.keyword())
+
+	return lexText
+}
+
+// lexNumber handles decimal and hexadecimal numbers. Decimal numbers may begin
+// with a '-'; hex numbers begin with '0x' and do not accept a leading '-'.
+func lexNumber(l *Lexer) stateFn {
+	// Leading '-' is ok
+	digits := "0123456789"
+	neg := l.accept("-")
+	if !neg {
+		// allow '0x' for hex numbers, as long as there's not a leading '-'.
+		r := l.peek()
+		if r == '0' {
+			l.next()
+			if l.accept("x") {
+				digits = "0123456789ABCDEFabcdef"
+			}
+		}
+	}
+	// followed by any number of digits
+	l.acceptRun(digits)
+	r := l.peek()
+	if unicode.IsLetter(r) {
+		l.next()
+		return l.errorf("invalid number: %q", l.input[l.start:l.pos])
+	}
+	l.emit(CONSTANT)
+	return lexText
+}
+
+// lexDirective handles lines beginning with '%'. These are used to emit C code
+// directly to the output file. For now we're ignoring them, but some of the
+// constants in the protocol file do depend on values from #included header
+// files, so that may need to change.
+func lexDirective(l *Lexer) stateFn {
+	for {
+		r := l.next()
+		if r == '\n' {
+			l.ignore()
+			return lexText
+		}
+		if r == eof {
+			return l.errorf("unterminated directive")
+		}
+	}
+}
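-- 

How the pieces fit together: Run drives the stateFn state machine in its own
goroutine, and the goyacc-generated parser pulls tokens back out of the items
channel through Lex. A minimal driver sketch, assuming the yyParse entry point
that goyacc generates into y.go; the name parseProtocol is hypothetical, used
here only for illustration:

	func parseProtocol(rdr io.Reader) error {
		lexer, err := NewLexer(rdr)
		if err != nil {
			return err
		}
		// Start the state machine; it closes the items channel when the
		// input is exhausted, which Lex then reports to the parser as EOF.
		go lexer.Run()
		if rv := yyParse(lexer); rv != 0 {
			return fmt.Errorf("parse failed with code %d", rv)
		}
		return nil
	}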
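Because tests live in the same package lvgen, the lexer can also be exercised
without the parser by draining the unexported items channel directly. A
smoke-test sketch along those lines, with the strings and testing packages
imported (the file name lvlexer_test.go is hypothetical, and the input assumes
"const" is in the keywords table and '=' and ';' are in oneRuneTokens):

	func TestLexSmoke(t *testing.T) {
		l, err := NewLexer(strings.NewReader("const A_VALUE = 0x1f;"))
		if err != nil {
			t.Fatal(err)
		}
		go l.Run()
		// Run closes the channel when it finishes, ending this loop.
		for it := range l.items {
			if it.typ == ERROR {
				t.Fatalf("unexpected lex error: %v", it)
			}
			t.Log(it) // formatted by item's String method
		}
	}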