parser: Unescaped [WIP]

Signed-off-by: Alejandro Mery <amery@jpi.io>
build-sys: use local darvaza.org/core [DO-NOT-MERGE]
2023-09-05 13:22:39 +00:00 · 2023-09-05 13:22:09 +00:00 · 2023-09-05 13:22:09 +00:00 · 2023-09-05 13:22:09 +00:00 · 2023-09-05 13:22:09 +00:00 · 2023-09-05 13:22:09 +00:00
7 changed files with 279 additions and 7 deletions
@@ -0,0 +1,7 @@
+{
+    "cSpell.words": [
+        "asciigoat",
+        "Subname",
+        "unescapes"
+    ]
+}
@@ -2,6 +2,11 @@ module asciigoat.org/ini

 go 1.19

+replace (
+	asciigoat.org/core => ../core
+	darvaza.org/core => ../../darvaza.org/core
+)
+
 require (
 	asciigoat.org/core v0.3.9
 	github.com/mgechev/revive v1.3.3
@@ -1,5 +1,3 @@
-asciigoat.org/core v0.3.9 h1:hgDDz4ecm3ZvehX++m8A/IzAt+B5oDPiRtxatzfUHPQ=
-asciigoat.org/core v0.3.9/go.mod h1:CAaHwyw8MpAq4a1MYtN2dxJrsK+hmIdW50OndaQZYPI=
 github.com/BurntSushi/toml v1.3.2 h1:o7IhLm0Msx3BaB+n3Ag7L8EVlByGnpq14C4YWiu/gL8=
 github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
 github.com/chavacava/garif v0.1.0 h1:2JHa3hbYf5D9dsgseMKAmc/MZ109otzgNFk5s87H9Pc=
@@ -0,0 +1,88 @@
+package parser
+
+import (
+	"strings"
+
+	"asciigoat.org/core/lexer"
+)
+
+// commaArrayParser holds the state used by SplitCommaArray while
+// breaking a string into its comma separated elements.
+type commaArrayParser struct {
+	// TextParser is the embedded reader the state functions call
+	// ReadRune, UnreadRune, Emit and Discard on.
+	TextParser
+
+	// out collects the elements in the order they are found
+	out []string
+}
+
+// lexStart skips whatever precedes the next element and hands over
+// to the state function able to read it.
+func (p *commaArrayParser) lexStart() (lexer.StateFn, error) {
+	for {
+		ch, _, err := p.ReadRune()
+		if err != nil {
+			// nothing else to read
+			return nil, err
+		}
+
+		if ch == RuneQuotes {
+			// opening quote, the element is a quoted literal
+			return p.lexQuotedString, nil
+		}
+
+		if IsNewLine(ch) {
+			// line breaks between elements are acceptable. put the rune
+			// back so AcceptNewLine can take the complete line break
+			// before it's dropped
+			p.UnreadRune()
+			p.AcceptNewLine()
+			p.Discard()
+			continue
+		}
+
+		if lexer.IsSpace(ch) {
+			// whitespace outside quotes carries no meaning
+			p.Discard()
+			continue
+		}
+
+		// first rune of an unquoted element, let lexWord read it again
+		p.UnreadRune()
+		return p.lexWord, nil
+	}
+}
+
+// lexWord accumulates an unquoted element until a comma or a read
+// error (normally the end of the input) is found, and appends it to
+// the output with its trailing whitespace removed.
+//
+// On a comma it returns lexStart to look for the next element,
+// otherwise it returns the read error to stop the run.
+func (p *commaArrayParser) lexWord() (lexer.StateFn, error) {
+	for {
+		r, _, err := p.ReadRune()
+		switch {
+		case err != nil:
+			// done. there is no delimiter to remove here, but trailing
+			// whitespace has to go just like on comma-terminated
+			// elements, otherwise "a, b " would yield "b ".
+			_, s := p.Emit()
+			s = strings.TrimRightFunc(s, IsSpace)
+			p.out = append(p.out, s)
+			return nil, err
+		case r == ',':
+			// done
+			_, s := p.Emit()
+			// remove comma, trim and append to output
+			s = strings.TrimRightFunc(s[:len(s)-1], IsSpace)
+			p.out = append(p.out, s)
+			return p.lexStart, nil
+		}
+	}
+}
+
+// lexQuotedString reads a quoted element, whose opening quote was
+// already consumed by lexStart, and appends its unquoted content to
+// the output before going back to lexStart.
+func (p *commaArrayParser) lexQuotedString() (lexer.StateFn, error) {
+	value, err := lexQuotedString(&p.TextParser)
+	if err == nil {
+		// the raw text isn't needed anymore, only the unquoted value
+		p.Discard()
+		p.out = append(p.out, value)
+		return p.lexStart, nil
+	}
+
+	return nil, err
+}
+
+// Run drives the state machine starting at lexStart and returns the
+// elements collected so far together with whatever error
+// [lexer.Run] reported.
+func (p *commaArrayParser) Run() ([]string, error) {
+	if err := lexer.Run(p.lexStart); err != nil {
+		return p.out, err
+	}
+
+	return p.out, nil
+}
+
+// SplitCommaArray breaks a string into its comma separated elements,
+// dropping the whitespace around them and honouring quoted literals.
+// An empty string yields no elements and no error.
+func SplitCommaArray(s string) ([]string, error) {
+	if s == "" {
+		return nil, nil
+	}
+
+	var p commaArrayParser
+	p.InitString(s)
+	return p.Run()
+}
@@ -1,6 +1,8 @@
 package parser

 import (
+	"io/fs"
+
 	"asciigoat.org/core/lexer"
 )

@@ -25,3 +27,36 @@ func ErrPlusPosition(pos lexer.Position, e *lexer.Error) *lexer.Error {

 	return NewError(pos, e.Content, e.Hint, e.Err)
 }
+
+// NewErrIncompleteQuotedString returns a [lexer.Error]
+// indicating the quoted string being parsed wasn't correctly
+// terminated
+func NewErrIncompleteQuotedString(p *TextParser) *lexer.Error {
+	return newErrIncomplete(p, "incomplete quoted string")
+}
+
+// NewErrIncompleteEscaped returns a [lexer.Error]
+// indicating the text being parsed wasn't correctly
+// terminated
+func NewErrIncompleteEscaped(p *TextParser) *lexer.Error {
+	return newErrIncomplete(p, "incomplete escaped string")
+}
+
+// newErrIncomplete emits everything accepted so far and wraps it in
+// a [lexer.Error] carrying the given hint and [fs.ErrInvalid], with
+// the position offset by the positional length of the emitted text.
+func newErrIncomplete(p *TextParser, hint string) *lexer.Error {
+	where, content := p.Emit()
+	where.Add(GetPositionalLength(content))
+
+	return NewError(where, content, hint, fs.ErrInvalid)
+}
+
+// NewErrInvalidEscapeSequence returns a [lexer.Error] indicating
+// the specified sequence, at the end of the accepted buffer,
+// is invalid.
+//
+// The reported position is the parser's current position offset by
+// the positional length of the accepted text preceding seq, and the
+// error content is seq itself.
+func NewErrInvalidEscapeSequence(p *TextParser, seq string) *lexer.Error {
+	pos, s := p.Position(), p.String()
+
+	// text accepted before the offending sequence. clamp the cut
+	// point so a seq longer than the buffer can't slice out of range.
+	n := len(s) - len(seq)
+	if n < 0 {
+		n = 0
+	}
+	pos.Add(GetPositionalLength(s[:n]))
+
+	return NewError(pos, seq, "invalid escape character", fs.ErrInvalid)
+}
@@ -7,11 +7,13 @@ import (
 )

 const (
-	RuneComment      = ';' // RuneComment is the standard dosini comment character
-	RuneCommentExtra = '#' // RuneCommentExtra is UNIX shell's comment character
-	RuneSectionStart = '[' // RuneSectionStart indicates the start of a section declaration
-	RuneSectionEnd   = ']' // RuneSectionEnd indiciates the end of a section declaration
-	RuneFieldEqual   = '=' // RuneFieldEqual separates field keys from their values
+	RuneComment      = ';'  // RuneComment is the standard INI comment character
+	RuneCommentExtra = '#'  // RuneCommentExtra is UNIX shell's comment character
+	RuneSectionStart = '['  // RuneSectionStart indicates the start of a section declaration
+	RuneSectionEnd   = ']'  // RuneSectionEnd indicates the end of a section declaration
+	RuneFieldEqual   = '='  // RuneFieldEqual separates field keys from their values
+	RuneQuotes       = '"'  // RuneQuotes indicates the start and end of a quoted value
+	RuneEscape       = '\\' // RuneEscape indicates the next rune is escaped
 )

 var (
@@ -27,6 +29,8 @@ var (
 		RuneSectionStart,
 		RuneSectionEnd,
 		RuneFieldEqual,
+		RuneQuotes,
+		RuneEscape,
 	})
 )

@@ -0,0 +1,135 @@
+package parser
+
+import (
+	"strings"
+
+	"asciigoat.org/core/lexer"
+)
+
+// AcceptQuotedString consumes a quoted string from the source and
+// returns its content without the quotes and with escapes resolved.
+//
+// The boolean tells if a quoted string was accepted. When the next
+// rune isn't a quote it is put back and ("", false, nil) is returned.
+func (p *TextParser) AcceptQuotedString() (string, bool, error) {
+	r, _, err := p.ReadRune()
+	if err != nil {
+		// nothing to read
+		return "", false, err
+	}
+
+	if r != RuneQuotes {
+		// not a quoted string, leave the rune for someone else
+		p.UnreadRune()
+		return "", false, nil
+	}
+
+	// opening quote accepted, read the rest
+	s, lexErr := lexQuotedString(p)
+	if lexErr != nil {
+		// bad quoted string
+		return "", false, lexErr
+	}
+
+	return s, true, nil
+}
+
+// lexQuotedString reads the remainder of a quoted string whose
+// opening quote has already been accepted. It first tries the cheap
+// path without escapes, and only switches to the rune by rune
+// unescaping once an escape character shows up.
+func lexQuotedString(p *TextParser) (string, *lexer.Error) {
+	s, done, err := lexQuotedStringNoEscape(p)
+	if err != nil {
+		return "", err
+	}
+
+	if !done {
+		// escape character detected
+		return lexQuotedStringEscaped(p)
+	}
+
+	return s, nil
+}
+
+// lexQuotedStringNoEscape reads up to the closing quote and returns
+// the accepted text without its first and last byte, which callers
+// arrange to be the opening and closing quotes.
+//
+// If an escape character is found first, it is put back and
+// ("", false, nil) is returned so the caller can continue with
+// lexQuotedStringEscaped.
+func lexQuotedStringNoEscape(p *TextParser) (string, bool, *lexer.Error) {
+	for {
+		r, _, err := p.ReadRune()
+		if err != nil {
+			// ran out of input before the closing quote
+			return "", false, NewErrIncompleteQuotedString(p)
+		}
+
+		if r == RuneQuotes {
+			// closing quote, strip both quotes and we are done
+			quoted := p.String()
+			return quoted[1 : len(quoted)-1], true, nil
+		}
+
+		if r == RuneEscape {
+			// needs unescaping, leave the escape character unread
+			p.UnreadRune()
+			return "", false, nil
+		}
+
+		if IsNewLine(r) {
+			// new lines within quoted values are acceptable
+			p.UnreadRune()
+			p.AcceptNewLine()
+		}
+
+		// anything else just stays in the accepted buffer
+	}
+}
+
+// lexQuotedStringEscaped finishes a quoted string in which
+// lexQuotedStringNoEscape found an escape character, building the
+// unescaped content rune by rune.
+//
+// The only escape sequence accepted so far is an escaped new line,
+// which is left out of the result. Any other sequence is reported
+// via NewErrInvalidEscapeSequence.
+func lexQuotedStringEscaped(p *TextParser) (string, *lexer.Error) {
+	var out strings.Builder
+
+	// seed with the text already accepted, minus the opening quote
+	_, _ = out.WriteString(p.String()[1:])
+
+	for {
+		r, _, err := p.ReadRune()
+		if err != nil {
+			// ran out of input before the closing quote
+			return "", NewErrIncompleteQuotedString(p)
+		}
+
+		if r == RuneQuotes {
+			// closing quote
+			return out.String(), nil
+		}
+
+		if r != RuneEscape {
+			// plain rune, goes to the result verbatim
+			_, _ = out.WriteRune(r)
+			continue
+		}
+
+		// escape character, the meaning depends on what follows
+		next, _, err := p.ReadRune()
+		if err != nil {
+			// nothing after the escape character
+			return "", NewErrIncompleteEscaped(p)
+		}
+
+		if !IsNewLine(next) {
+			// TODO: check valid escape character and
+			// append to result
+			seq := string([]rune{r, next})
+			return "", NewErrInvalidEscapeSequence(p, seq)
+		}
+
+		// escaped new line, skip
+		p.UnreadRune()
+		p.AcceptNewLine()
+	}
+}
+
+// Unquoted returns the content of a quoted string, without the
+// quotes and with escapes resolved. Strings not starting with a
+// quote, and the empty string, are returned as they came.
+//
+// Parsing stops at the closing quote; whatever follows it isn't
+// examined here.
+func Unquoted(s string) (string, error) {
+	if s == "" {
+		return "", nil
+	}
+
+	var p TextParser
+	p.InitString(s)
+
+	content, quoted, err := p.AcceptQuotedString()
+	if err != nil {
+		// bad string
+		return "", err
+	}
+
+	if !quoted {
+		// not quoted, nothing to do
+		return s, nil
+	}
+
+	return content, nil
+}
Author	SHA1	Message	Date
amery	3bf20948c0	parser: Unescaped [WIP] Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-09-05 13:22:39 +00:00
amery	0dd29272e9	build-sys: use local darvaza.org/core [DO-NOT-MERGE] Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-09-05 13:22:09 +00:00
amery	7fab1a799a	build-sys: use local asciigoat.org/core [DO-NOT-MERGE] Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-09-05 13:22:09 +00:00
amery	16dfde1503	vscode: add Subname to the dictionary Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-09-05 13:22:09 +00:00
amery	41d7c6e04d	vscode: add unescapes to the dictionary Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-09-05 13:22:09 +00:00
amery	48adaeb8a8	vscode: add asciigoat to the dictionary Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-09-05 13:22:09 +00:00
amery	99ca8d0b3b	Merge branch 'pr-amery-basic' into next-amery	2023-09-05 13:22:01 +00:00
amery	986b6d1c6d	Merge pull request 'parser: Unquoted(), AcceptQuotedString() and SplitCommaArray' (#9 ) Reviewed-on: #9	2023-09-05 15:20:38 +02:00
amery	d41cd781d9	parser: introduce SplitCommaArray to splits comma separated strings removing whitespace and respecting quoted literals. Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-09-04 18:58:06 +00:00
amery	651fcb6215	parser: Unquoted(), AcceptQuotedString() TODO: reduce quoted strings with escaped characters Signed-off-by: Alejandro Mery <amery@jpi.io>	2023-09-04 18:58:06 +00:00