parser: implement basic dosini parsing #2
Merged

amery merged 3 commits from pr-amery-parser into main 1 year ago

8 changed files with 392 additions and 2 deletions
@@ -0,0 +1,144 @@

package parser

import "asciigoat.org/core/lexer"

// Run parses the source
func (p *Parser) Run() error {
    p.setDefaults()
    p.pos.Reset()

    return lexer.Run(p.lexStart)
}

func (p *Parser) lexStart() (lexer.StateFn, error) {
    for {
        r, _, err := p.src.ReadRune()
        switch {
        case err != nil:
            return p.emitError("", err)
        case IsNewLine(r):
            // new line
            p.lexMoreNewLine(r)
            p.stepLine()
        case IsSpace(r):
            // whitespace
            p.stepRune()
        case IsCommentStart(r):
            // switch to comment lexer
            p.src.UnreadRune()
            return p.lexComment, nil
        case IsSectionStart(r):
            // section
            return p.lexSectionStart, nil
        default:
            // entry
            p.src.UnreadRune()
            return p.lexEntryStart, nil
        }
    }
}

func (p *Parser) lexMoreNewLine(r1 rune) {
    // r1 is guaranteed to be either '\r' or '\n'
    r2, _, err := p.src.ReadRune()
    switch r1 {
    case '\n':
        switch {
        case r2 == '\r':
            // LF CR
        case err == nil:
            // LF
            p.src.UnreadRune()
        default:
            // LF EOF
        }
    case '\r':
        switch {
        case r2 == '\n':
            // CR LF
        case err == nil:
            // CR
            p.src.UnreadRune()
        default:
            // CR EOF
        }
    default:
        panic("unreachable")
    }
}

func (p *Parser) lexComment() (lexer.StateFn, error) {
    // until the end of the line
    p.src.AcceptAll(IsNotNewLine)

    err := p.emitString(TokenComment)
    return p.lexStart, err
}

func (p *Parser) lexSectionStart() (lexer.StateFn, error) {
    if err := p.emitString(TokenSectionStart); err != nil {
        return nil, err
    }

    // remove whitespace between `[` and the name
    if p.src.AcceptAll(IsSpaceNotNewLine) {
        p.stepString()
    }

    if !p.src.AcceptAll(IsName) {
        // no name
        return p.emitError("section name missing", lexer.ErrUnacceptableRune)
    }

    if err := p.emitString(TokenSectionName); err != nil {
        return nil, err
    }

    // remove whitespace between the name and the closing `]`
    if p.src.AcceptAll(IsSpaceNotNewLine) {
        p.stepString()
    }

    r, _, err := p.src.ReadRune()
    switch {
    case err != nil:
        return p.emitError("", err)
    case IsSectionEnd(r):
        err := p.emitString(TokenSectionEnd)
        return p.lexStart, err
    default:
        return p.emitInvalidRune(r)
    }
}

func (p *Parser) lexEntryStart() (lexer.StateFn, error) {
    p.src.AcceptAll(IsName)
    if err := p.emitString(TokenFieldKey); err != nil {
        return nil, err
    }

    // ignore whitespace between key and the '=' sign
    if p.src.AcceptAll(IsSpaceNotNewLine) {
        p.stepString()
    }

    r, _, err := p.src.ReadRune()
    switch {
    case err != nil:
        return p.emitError("", err)
    case r != RuneFieldEqual:
        return p.emitInvalidRune(r)
    }

    // discard the '=' itself, then any whitespace before the value,
    // so neither ends up in the emitted TokenFieldValue
    p.stepRune()
    if p.src.AcceptAll(IsSpaceNotNewLine) {
        p.stepString()
    }

    p.src.AcceptAll(IsNotNewLine)
    if err := p.emitString(TokenFieldValue); err != nil {
        return nil, err
    }

    return p.lexStart, nil
}
@@ -0,0 +1,67 @@

package parser

import (
    "strings"

    "asciigoat.org/core/lexer"
)

const (
    RuneComment      = ';' // RuneComment is the standard dosini comment character
    RuneCommentExtra = '#' // RuneCommentExtra is UNIX shell's comment character
    RuneSectionStart = '[' // RuneSectionStart indicates the start of a section declaration
    RuneSectionEnd   = ']' // RuneSectionEnd indicates the end of a section declaration
    RuneFieldEqual   = '=' // RuneFieldEqual separates field keys from their values
)

var (
    // RunesComment is a string containing all runes acceptable to start comments
    RunesComment = string([]rune{
        RuneComment,
        RuneCommentExtra,
    })
    // RunesSpecial is a string containing all the runes with special meaning
    RunesSpecial = string([]rune{
        RuneComment,
        RuneCommentExtra,
        RuneSectionStart,
        RuneSectionEnd,
        RuneFieldEqual,
    })
)

var (
    // IsNewLine tells if the rune indicates a line break or the start of one
    IsNewLine = lexer.NewIsIn("\r\n")
    // IsNotNewLine tells if the rune is not a line break nor the start of one
    IsNotNewLine = lexer.NewIsNot(IsNewLine)
    // IsSpace tells if the rune is considered whitespace by Unicode
    IsSpace = lexer.IsSpace
    // IsNotSpace tells if the rune is not considered whitespace by Unicode
    IsNotSpace = lexer.NewIsNot(IsSpace)
    // IsCommentStart tells if the rune starts a comment
    IsCommentStart = lexer.NewIsIn(RunesComment)
)

// IsSpaceNotNewLine indicates a rune is whitespace but not a new line
func IsSpaceNotNewLine(r rune) bool {
    return IsSpace(r) && !IsNewLine(r)
}

// IsSectionStart indicates the rune starts the section declaration
func IsSectionStart(r rune) bool { return r == RuneSectionStart }

// IsSectionEnd indicates the rune ends the section declaration
func IsSectionEnd(r rune) bool { return r == RuneSectionEnd }

// IsName indicates a rune is acceptable for section or field names
func IsName(r rune) bool {
    switch {
    case IsSpace(r):
        return false
    case strings.ContainsRune(RunesSpecial, r):
        return false
    default:
        return true
    }
}
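
Reviewer note: to make the classification concrete, an in-package probe like the following (a hypothetical helper, not part of the diff) prints how a few representative runes are classified:

package parser

import "fmt"

// probePredicates prints how a handful of runes are classified by the
// predicates above; handy while reviewing, not shipped with the change.
func probePredicates() {
    for _, r := range []rune{'h', '-', '=', '[', ' ', '\n', '#'} {
        fmt.Printf("%q: IsName=%v IsSpaceNotNewLine=%v IsCommentStart=%v\n",
            r, IsName(r), IsSpaceNotNewLine(r), IsCommentStart(r))
    }
}
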
@@ -0,0 +1,109 @@

// Package parser parses dosini-style files
package parser

import (
    "io"
    "log"

    "asciigoat.org/core/lexer"
)

// Parser parses a dosini-style document
type Parser struct {
    src *lexer.Reader
    pos lexer.Position

    // OnToken is called for each identified token. If it returns an error,
    // parsing is interrupted.
    OnToken func(pos lexer.Position, typ TokenType, value string) error

    // OnError is called in case of a parsing error, and it's allowed
    // to replace the error returned by [Parser.Run].
    // OnError is called for io.EOF, but [Parser.Run] will consider it
    // normal termination.
    OnError func(pos lexer.Position, content string, err error) error
}

func defaultOnToken(pos lexer.Position, typ TokenType, value string) error {
    log.Printf("%s:%v:%v: %q", typ, pos.Line, pos.Column, value)
    return nil
}

func defaultOnError(pos lexer.Position, content string, err error) error {
    log.Printf("%s:%v:%v: %q: %s", "error", pos.Line, pos.Column, content, err)

    return lexer.Error{
        Line:   pos.Line,
        Column: pos.Column,

        Content: content,
        Err:     err,
    }
}

func (p *Parser) setDefaults() {
    if p.OnToken == nil {
        p.OnToken = defaultOnToken
    }
    if p.OnError == nil {
        p.OnError = defaultOnError
    }
}

func (p *Parser) emitString(typ TokenType) error {
    s := p.src.Emit()
    err := p.OnToken(p.pos, typ, s)
    p.pos.StepN(len(s))

    return err
}

func (p *Parser) emitError(content string, err error) (lexer.StateFn, error) {
    err2 := p.OnError(p.pos, content, err)
    switch {
    case err2 != nil:
        // return wrapped error
        return nil, err2
    default:
        // return original error
        return nil, err
    }
}

func (p *Parser) emitInvalidRune(r rune) (lexer.StateFn, error) {
    return p.emitError(string([]rune{r}), lexer.ErrUnacceptableRune)
}

// stepLine discards the data and moves the position
// to the next line.
func (p *Parser) stepLine() {
    p.src.Discard()
    p.pos.StepLine()
}

// stepRune discards the data and moves the position
// one rune forward on the same line.
func (p *Parser) stepRune() {
    p.src.Discard()
    p.pos.Step()
}

// stepString discards the data and moves the position
// forward on the same line by the length of the
// discarded content.
func (p *Parser) stepString() {
    s := p.src.Emit()
    p.pos.StepN(len(s))
}

// NewParser creates a dosini-style parser using
// an [io.Reader] as source
func NewParser(r io.Reader) *Parser {
    if r == nil {
        return nil
    }

    return &Parser{
        src: lexer.NewReader(r),
    }
}
@@ -0,0 +1,31 @@

package parser

//go:generate go run golang.org/x/tools/cmd/stringer -type=TokenType

// A TokenType is a type of Token
type TokenType uint

const (
    // TokenUnknown represents a Token that hasn't been identified
    TokenUnknown TokenType = iota
    // TokenSectionStart indicates the opening marker of a section declaration.
    // The left square bracket.
    TokenSectionStart
    // TokenSectionEnd indicates the closing marker of a section declaration.
    // The right square bracket.
    TokenSectionEnd
    // TokenSectionName represents the section name between the square brackets
    TokenSectionName
    // TokenSectionSubname represents a secondary name in the section, written
    // between quotes after the section name.
    // e.g.
    //    [section_name "section_subname"]
    TokenSectionSubname
    // TokenComment represents a comment, including the initial ';' or '#', until
    // the end of the line.
    TokenComment
    // TokenFieldKey represents a field name in a `key = value` entry
    TokenFieldKey
    // TokenFieldValue represents a field value in a `key = value` entry
    TokenFieldValue
)
@@ -0,0 +1,30 @@

// Code generated by "stringer -type=TokenType"; DO NOT EDIT.

package parser

import "strconv"

func _() {
    // An "invalid array index" compiler error signifies that the constant values have changed.
    // Re-run the stringer command to generate them again.
    var x [1]struct{}
    _ = x[TokenUnknown-0]
    _ = x[TokenSectionStart-1]
    _ = x[TokenSectionEnd-2]
    _ = x[TokenSectionName-3]
    _ = x[TokenSectionSubname-4]
    _ = x[TokenComment-5]
    _ = x[TokenFieldKey-6]
    _ = x[TokenFieldValue-7]
}

const _TokenType_name = "TokenUnknownTokenSectionStartTokenSectionEndTokenSectionNameTokenSectionSubnameTokenCommentTokenFieldKeyTokenFieldValue"

var _TokenType_index = [...]uint8{0, 12, 29, 44, 60, 79, 91, 104, 119}

func (i TokenType) String() string {
    if i >= TokenType(len(_TokenType_index)-1) {
        return "TokenType(" + strconv.FormatInt(int64(i), 10) + ")"
    }
    return _TokenType_name[_TokenType_index[i]:_TokenType_index[i+1]]
}
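
Reviewer note: the generated name/index pair is how stringer avoids one string per value; String slices a single packed constant. Hand-evaluated examples:

// TokenComment.String()  == "TokenComment"    (_TokenType_name[79:91])
// TokenType(42).String() == "TokenType(42)"   (out of range, numeric fallback)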