build-sys: use local asciigoat.org/core [DO-NOT-MERGE]

Signed-off-by: Alejandro Mery <amery@jpi.io>
parser: implement initial tokeniser
2023-08-30 20:54:44 +00:00 · 2023-08-30 20:54:21 +00:00 · 2023-08-30 20:54:21 +00:00 · 2023-08-30 20:54:21 +00:00 · 2023-08-30 20:47:12 +00:00
10 changed files with 156 additions and 286 deletions
@@ -1,47 +0,0 @@
-# asciigoat's INI parser
-
-[![Go Reference][godoc-badge]][godoc]
-[![Go Report Card][goreport-badge]][goreport]
-
-`asciigoat.org/ini` is a simple Go library that very loosly parses
-[`INI`-style][wikipedia-dosini] documents allowing the implementation
-of stricter parsers of similar form.
-
-**asciigoat** is [MIT](https://opensource.org/license/mit/) licensed.
-
-[godoc]: https://pkg.go.dev/asciigoat.org/ini
-[godoc-badge]: https://pkg.go.dev/badge/asciigoat.org/ini.svg
-[goreport]: https://goreportcard.com/report/asciigoat.org/ini
-[goreport-badge]: https://goreportcard.com/badge/asciigoat.org/ini
-
-[godoc-lexer]: https://pkg.go.dev/asciigoat.org/core/lexer
-[godoc-parser-parser]: https://pkg.go.dev/asciigoat.org/ini/parser#Parser
-
-[wikipedia-dosini]: https://en.wikipedia.org/wiki/INI_file
-
-## Parser
-
-[`parser.Parser`][godoc-parser-parser] uses
-[`asciigoat`'s lexer][godoc-lexer] to process an `INI`-style document
-emiting tokens and errors via callbacks.
-
-## Other Implementations
-
-Other implementations exist, and they are mature and feature-rich, but they
-are highly opinionated about what's a valid file. Built around maps they don't
-allow repeating names and constraint what characters can be used.
-
-These are great when you can adapt, or already agree, to their conditions but
-that's not always the case when you are parsing configuration files from
-other applications and that's what [asciigoat.org/ini][godoc] attempts to solve.
-
-* [gcfg](https://pkg.go.dev/gopkg.in/gcfg.v1)
-* [unknwon's go-ini](https://github.com/go-ini/ini)
-* [wlevene's GoINI](https://github.com/wlevene/ini)
-
-## See also
-
-* [asciigoat.org/core](https://asciigoat.org/core)
-* [oss.jpi.io](https://oss.jpi.io)
-* [INI file][wikipedia-dosini] (_wikipedia_)
-* [TOML](https://www.kelche.co/blog/go/toml/)
@@ -2,6 +2,8 @@ module asciigoat.org/ini

 go 1.19

+replace asciigoat.org/core => ../core
+
 require (
 	asciigoat.org/core v0.3.6
 	github.com/mgechev/revive v1.3.3
@@ -1,5 +1,3 @@
-asciigoat.org/core v0.3.6 h1:b1vL090OxylmSOwLQryjrmC8FhhCtktMyeJSy1e6LwI=
-asciigoat.org/core v0.3.6/go.mod h1:tXj+JUutxRbcO40ZQRuUVaZ4rnYz1kAZ0nblisV8u74=
 github.com/BurntSushi/toml v1.3.2 h1:o7IhLm0Msx3BaB+n3Ag7L8EVlByGnpq14C4YWiu/gL8=
 github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
 github.com/chavacava/garif v0.0.0-20230608123814-4bd63c2919ab h1:5JxePczlyGAtj6R1MUEFZ/UFud6FfsOejq7xLC2ZIb0=
@@ -1,6 +1,8 @@
 package parser

-import "asciigoat.org/core/lexer"
+import (
+	"asciigoat.org/core/lexer"
+)

 // Run parses the source
 func (p *Parser) Run() error {
@@ -15,43 +17,36 @@ func (p *Parser) lexStart() (lexer.StateFn, error) {
 		r, _, err := p.src.ReadRune()
 		switch {
 		case err != nil:
+			// read error
 			return p.emitError("", err)
 		case IsNewLine(r):
 			// new line
-			p.lexMoreNewLine(r)
+			p.lexNewLine(r)
 			p.stepLine()
 		case IsSpace(r):
 			// whitespace
 			p.stepRune()
-		case IsCommentStart(r):
-			// switch to comment lexer
-			p.src.UnreadRune()
-			return p.lexComment, nil
-		case IsSectionStart(r):
-			// section
-			return p.lexSectionStart, nil
 		default:
-			// entry
+			// token
 			p.src.UnreadRune()
-			return p.lexEntryStart, nil
+			return p.lexToken, nil
 		}
 	}
 }

-func (p *Parser) lexMoreNewLine(r1 rune) {
-	// r1 is warrantied to be either '\r' or '\n'
+func (p *Parser) lexToken() (lexer.StateFn, error) {
+	p.src.AcceptAll(IsNotSpace)
+
+	p.pushString(TokenUnknown)
+
+	return p.lexStart, nil
+}
+
+func (p *Parser) lexNewLine(r1 rune) {
+	// r1 is guaranteed to be either \n or \r
 	r2, _, err := p.src.ReadRune()
+
 	switch r1 {
-	case '\n':
-		switch {
-		case r2 == '\r':
-			// LN CR
-		case err == nil:
-			// LN
-			p.src.UnreadRune()
-		default:
-			// LN EOF
-		}
 	case '\r':
 		switch {
 		case r2 == '\n':
@@ -62,83 +57,17 @@ func (p *Parser) lexMoreNewLine(r1 rune) {
 		default:
 			// CR EOF
 		}
+	case '\n':
+		switch {
+		case r2 == '\r':
+			// LN CR
+		case err == nil:
+			// LN
+			p.src.UnreadRune()
+		default:
+			// LN EOF
+		}
 	default:
 		panic("unreachable")
 	}
 }
-
-func (p *Parser) lexComment() (lexer.StateFn, error) {
-	// until the end of the line
-	p.src.AcceptAll(IsNotNewLine)
-
-	err := p.emitString(TokenComment)
-	return p.lexStart, err
-}
-
-func (p *Parser) lexSectionStart() (lexer.StateFn, error) {
-	if err := p.emitString(TokenSectionStart); err != nil {
-		return nil, err
-	}
-
-	// remove whitespace between `[` and the name
-	if p.src.AcceptAll(IsSpaceNotNewLine) {
-		p.stepString()
-	}
-
-	if !p.src.AcceptAll(IsName) {
-		// no name
-		return p.emitError("section name missing", lexer.ErrUnacceptableRune)
-	}
-
-	if err := p.emitString(TokenSectionName); err != nil {
-		return nil, err
-	}
-
-	// remove whitespace between the name andthe closing `]`
-	if p.src.AcceptAll(IsSpaceNotNewLine) {
-		p.stepString()
-	}
-
-	r, _, err := p.src.ReadRune()
-	switch {
-	case err != nil:
-		return p.emitError("", err)
-	case IsSectionEnd(r):
-		err := p.emitString(TokenSectionEnd)
-		return p.lexStart, err
-	default:
-		return p.emitInvalidRune(r)
-	}
-}
-
-func (p *Parser) lexEntryStart() (lexer.StateFn, error) {
-	p.src.AcceptAll(IsName)
-	if err := p.emitString(TokenFieldKey); err != nil {
-		return nil, err
-	}
-
-	// ignore whitespace between key and the '=' sign
-	if p.src.AcceptAll(IsSpaceNotNewLine) {
-		p.stepString()
-	}
-
-	r, _, err := p.src.ReadRune()
-	switch {
-	case err != nil:
-		return p.emitError("", err)
-	case r != RuneFieldEqual:
-		return p.emitInvalidRune(r)
-	}
-
-	// ignore whitespace between the '=' and the value
-	if p.src.AcceptAll(IsSpaceNotNewLine) {
-		p.stepString()
-	}
-
-	p.src.AcceptAll(IsNotNewLine)
-	if err := p.emitString(TokenFieldValue); err != nil {
-		return nil, err
-	}
-
-	return p.lexStart, err
-}
@@ -0,0 +1,15 @@
+package parser
+
+import "asciigoat.org/core/lexer"
+
+func (p *Parser) emitError(content string, err error) (lexer.StateFn, error) {
+	err2 := p.OnError(p.pos, content, err)
+	switch {
+	case err2 != nil:
+		// return wrapped error
+		return nil, err2
+	default:
+		// return original error
+		return nil, err
+	}
+}
@@ -0,0 +1,44 @@
+package parser
+
+import "log"
+
+func (p *Parser) push(tok Token) {
+	n := len(p.queue)
+	p.queue = append(p.queue, tok)
+
+	log.Printf("queue[%v]: %s", n, tok)
+}
+
+func (p *Parser) pushString(typ TokenType) {
+	s := p.src.Emit()
+
+	el := Token{
+		Type:     typ,
+		Value:    s,
+		Position: p.pos,
+	}
+
+	p.pos.StepN(len(s))
+
+	p.push(el)
+}
+
+// stepLine discards the data and moves the position
+// to the next line
+func (p *Parser) stepLine() {
+	p.src.Discard()
+	p.pos.StepLine()
+}
+
+// stepRune discards the data and moves the position
+// one rune forward on the same line
+func (p *Parser) stepRune() {
+	p.src.Discard()
+	p.pos.Step()
+}
+
+func (p *Parser) stepString() string {
+	s := p.src.Emit()
+	p.pos.StepN(len(s))
+	return s
+}
@@ -1,67 +1,19 @@
 package parser

-import (
-	"strings"
-
-	"asciigoat.org/core/lexer"
-)
-
-const (
-	RuneComment      = ';' // RuneComment is the standard dosini comment character
-	RuneCommentExtra = '#' // RuneCommentExtra is UNIX shell's comment character
-	RuneSectionStart = '[' // RuneSectionStart indicates the start of a section declaration
-	RuneSectionEnd   = ']' // RuneSectionEnd indiciates the end of a section declaration
-	RuneFieldEqual   = '=' // RuneFieldEqual separates field keys from their values
-)
+import "asciigoat.org/core/lexer"

 var (
-	// RunesComment is a string containing all runes acceptable to start comments
-	RunesComment = string([]rune{
-		RuneComment,
-		RuneCommentExtra,
-	})
-	// RunesSpecial is a string containing all the runes with special meaning
-	RunesSpecial = string([]rune{
-		RuneComment,
-		RuneCommentExtra,
-		RuneSectionStart,
-		RuneSectionEnd,
-		RuneFieldEqual,
-	})
-)
-
-var (
-	// IsNewLine tells if the rune indicates a line break or the start of one
-	IsNewLine = lexer.NewIsIn("\r\n")
-	// IsNotNewLine tells if the rune is not a line break nor the start of one
-	IsNotNewLine = lexer.NewIsNot(IsNewLine)
-	// IsSpace tells if the rune is considered whitespace by Unicode
+	// IsNewLine tells if a rune represents a line break or the start of one
+	IsNewLine = lexer.NewIsIn("\n\r")
+	// IsSpace tells if a rune is considered whitespace by Unicode
 	IsSpace = lexer.IsSpace
-	// IsNotSpace tells if the rune is not considered whitespace by Unicode
+	// IsNotNewLine tells if a rune is anything other than line breaks
+	IsNotNewLine = lexer.NewIsNot(IsNewLine)
+	// IsNotSpace tells if a rune is anything other than whitespace
 	IsNotSpace = lexer.NewIsNot(IsSpace)
-	// IsCommentStart ...
-	IsCommentStart = lexer.NewIsIn(RunesComment)
 )

 // IsSpaceNotNewLine indicates a rune is whitespace but not a new line
 func IsSpaceNotNewLine(r rune) bool {
 	return IsSpace(r) && !IsNewLine(r)
 }
-
-// IsSectionStart indicates the rune starts the section declaration
-func IsSectionStart(r rune) bool { return r == RuneSectionStart }
-
-// IsSectionEnd indicates the rune ends the section declaration
-func IsSectionEnd(r rune) bool { return r == RuneSectionEnd }
-
-// IsName indicates a rune is acceptable for section or field names
-func IsName(r rune) bool {
-	switch {
-	case IsSpace(r):
-		return false
-	case strings.ContainsRune(RunesSpecial, r):
-		return false
-	default:
-		return true
-	}
-}
@@ -3,7 +3,6 @@ package parser

 import (
 	"io"
-	"log"

 	"asciigoat.org/core/lexer"
 )
@@ -11,91 +10,61 @@ import (
 // Parser parses a dosini-style document
 type Parser struct {
 	src *lexer.Reader
-	pos lexer.Position

-	// OnToken is called for each identified token. if it returns an error
-	// parsing is interrupted.
-	OnToken func(pos lexer.Position, typ TokenType, value string) error
+	pos   lexer.Position
+	queue []Token

-	// OnError is called in case of a parsing error, and it's allowed
-	// to replace the error returned by [Parser.Run].
-	// OnError is called for io.EOF, but [Parser.Run] will consider it
-	// normal termination.
+	// OnSection is called after a [section] is parsed.
+	// Returning an error will abort the process.
+	OnSection func(pos lexer.Position, name, subname string, hasSubname bool) error
+
+	// OnField is called after a `key = value` entry is parsed
+	// Returning an error will abort the process.
+	OnField func(pos lexer.Position, key, value string) error
+
+	// OnComment is called after a comment is parsed
+	// Returning an error will abort the process.
+	OnComment func(pos lexer.Position, comment string) error
+
+	// OnError is called after each parsing error, which you are allowed to
+	// override.
+	// OnError is called for EOF as well, but this error isn't returned as such by
+	// Parser.Run(). The caller will receive a nil error instead, indicating the
+	// process terminated correctly.
 	OnError func(pos lexer.Position, content string, err error) error
 }

-func defaultOnToken(pos lexer.Position, typ TokenType, value string) error {
-	log.Printf("%s:%v:%v: %q", typ, pos.Line, pos.Column, value)
-	return nil
-}
+func defaultOnSection(_ lexer.Position, _, _ string, _ bool) error { return nil }
+func defaultOnField(_ lexer.Position, _, _ string) error           { return nil }
+func defaultOnComment(_ lexer.Position, _ string) error            { return nil }

 func defaultOnError(pos lexer.Position, content string, err error) error {
-	log.Printf("%s:%v:%v: %q: %s", "error", pos.Line, pos.Column, content, err)
-
-	return lexer.Error{
-		Line:   pos.Line,
-		Column: pos.Column,
-
+	return &lexer.Error{
+		Line:    pos.Line,
+		Column:  pos.Column,
 		Content: content,
 		Err:     err,
 	}
 }

 func (p *Parser) setDefaults() {
-	if p.OnToken == nil {
-		p.OnToken = defaultOnToken
+	if p.OnSection == nil {
+		p.OnSection = defaultOnSection
 	}
+
+	if p.OnField == nil {
+		p.OnField = defaultOnField
+	}
+
+	if p.OnComment == nil {
+		p.OnComment = defaultOnComment
+	}
+
 	if p.OnError == nil {
 		p.OnError = defaultOnError
 	}
 }

-func (p *Parser) emitString(typ TokenType) error {
-	s := p.src.Emit()
-	err := p.OnToken(p.pos, typ, s)
-	p.pos.StepN(len(s))
-
-	return err
-}
-
-func (p *Parser) emitError(content string, err error) (lexer.StateFn, error) {
-	err2 := p.OnError(p.pos, content, err)
-	switch {
-	case err2 != nil:
-		// return wrapped error
-		return nil, err2
-	default:
-		// return original error
-		return nil, err
-	}
-}
-
-func (p *Parser) emitInvalidRune(r rune) (lexer.StateFn, error) {
-	return p.emitError(string([]rune{r}), lexer.ErrUnacceptableRune)
-}
-
-// stepLine discards the data and moves the position
-// to the next line.
-func (p *Parser) stepLine() {
-	p.src.Discard()
-	p.pos.StepLine()
-}
-
-// stepRune discards the data and moves the position
-// one rune forward on the same line.
-func (p *Parser) stepRune() {
-	p.src.Discard()
-	p.pos.Step()
-}
-
-// stepString discards the data and moves the position
-// forward on the same line the length of the discarded
-// content.
-func (p *Parser) stepString() {
-	s := p.src.Emit()
-	p.pos.StepN(len(s))
-}
-
 // NewParser creates a dosini-style parser using
 // an [io.Reader] as source
 func NewParser(r io.Reader) *Parser {
@@ -1,6 +1,11 @@
 package parser

 //go:generate go run golang.org/x/tools/cmd/stringer -type=TokenType
+import (
+	"fmt"
+
+	"asciigoat.org/core/lexer"
+)

 // A TokenType is a type of Token
 type TokenType uint
@@ -8,12 +13,6 @@ type TokenType uint
 const (
 	// TokenUnknown represents a Token that hasn't been identified
 	TokenUnknown TokenType = iota
-	// TokenSectionStart indicates the opening marker of a section declaration.
-	// The left squared bracket.
-	TokenSectionStart
-	// TokenSectionEnd indicates the closing marker of a section declaration.
-	// The right squared bracket.
-	TokenSectionEnd
 	// TokenSectionName represents the section name between the squared brackets
 	TokenSectionName
 	// TokenSectionSubname represents a secondary name in the section represented
@@ -22,10 +21,21 @@ const (
 	// [section_name "section_subname"]
 	TokenSectionSubname
 	// TokenComment represents a comment, including the initial ';' or '#' until
-	// the end of the line.
+	// the end of the line
 	TokenComment
 	// TokenFieldKey represents a field name in a `key = value` entry
 	TokenFieldKey
 	// TokenFieldValue represents a field value in a `key = value` entry
 	TokenFieldValue
 )
+
+// A Token is an element from the document
+type Token struct {
+	Type     TokenType
+	Position lexer.Position
+	Value    string
+}
+
+func (t Token) String() string {
+	return fmt.Sprintf("%s:%v:%v: %q", t.Type, t.Position.Line, t.Position.Column, t.Value)
+}
@@ -9,18 +9,16 @@ func _() {
 	// Re-run the stringer command to generate them again.
 	var x [1]struct{}
 	_ = x[TokenUnknown-0]
-	_ = x[TokenSectionStart-1]
-	_ = x[TokenSectionEnd-2]
-	_ = x[TokenSectionName-3]
-	_ = x[TokenSectionSubname-4]
-	_ = x[TokenComment-5]
-	_ = x[TokenFieldKey-6]
-	_ = x[TokenFieldValue-7]
+	_ = x[TokenSectionName-1]
+	_ = x[TokenSectionSubname-2]
+	_ = x[TokenComment-3]
+	_ = x[TokenFieldKey-4]
+	_ = x[TokenFieldValue-5]
 }

-const _TokenType_name = "TokenUnknownTokenSectionStartTokenSectionEndTokenSectionNameTokenSectionSubnameTokenCommentTokenFieldKeyTokenFieldValue"
+const _TokenType_name = "TokenUnknownTokenSectionNameTokenSectionSubnameTokenCommentTokenFieldKeyTokenFieldValue"

-var _TokenType_index = [...]uint8{0, 12, 29, 44, 60, 79, 91, 104, 119}
+var _TokenType_index = [...]uint8{0, 12, 28, 47, 59, 72, 87}

 func (i TokenType) String() string {
 	if i >= TokenType(len(_TokenType_index)-1) {
Author	SHA1	Message	Date
amery	62328d9e43	build-sys: use local asciigoat.org/core [DO-NOT-MERGE] Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-08-30 20:54:44 +00:00
amery	5be0785a55	parser: implement initial tokeniser only logging position, errors and non-whitespace elements Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-08-30 20:54:21 +00:00
amery	35b9d56b3d	parser: add internal []Token queue to the Parser Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-08-30 20:54:21 +00:00
amery	604ecfaed2	parser: introduce Token and TokenType enum Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-08-30 20:54:21 +00:00
amery	5288cd4537	parser: add placeholder for ini Parser Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-08-30 20:47:12 +00:00