diff --git a/parser/comma_array.go b/parser/comma_array.go
new file mode 100644
index 0000000..2503172
--- /dev/null
+++ b/parser/comma_array.go
@@ -0,0 +1,130 @@
+package parser
+
+import (
+	"strings"
+
+	"asciigoat.org/core/lexer"
+)
+
+// commaArrayParser splits a comma separated list into its elements,
+// appending each one to out as it is recognised.
+type commaArrayParser struct {
+	TextParser
+
+	out []string
+}
+
+// lexStart discards whitespace and new lines in front of an element and
+// hands over to lexQuotedString or lexWord depending on its first rune.
+func (p *commaArrayParser) lexStart() (lexer.StateFn, error) {
+	for {
+		r, _, err := p.ReadRune()
+		switch {
+		case err != nil:
+			// EOF
+			return nil, err
+		case r == RuneQuotes:
+			// Quoted Value
+			return p.lexQuotedString, nil
+		case IsNewLine(r):
+			// new lines are acceptable when parsing a string for
+			// comma delimited arrays, but make sure we discard
+			// them completely
+			p.UnreadRune()
+			p.AcceptNewLine()
+			p.Discard()
+		case lexer.IsSpace(r):
+			// discard whitespace outside quotes
+			p.Discard()
+		default:
+			p.UnreadRune()
+			return p.lexWord, nil
+		}
+	}
+}
+
+// lexWord consumes an unquoted element up to the next comma or the
+// end of the input and stores it without trailing whitespace.
+func (p *commaArrayParser) lexWord() (lexer.StateFn, error) {
+	for {
+		r, _, err := p.ReadRune()
+		switch {
+		case err != nil:
+			// done. trim like the comma terminated case does,
+			// store what we got and move on
+			_, s := p.Emit()
+			s = strings.TrimRightFunc(s, IsSpace)
+			p.out = append(p.out, s)
+			return nil, err
+		case r == ',':
+			// done
+			_, s := p.Emit()
+			// remove comma, trim and append to output
+			s = strings.TrimRightFunc(s[:len(s)-1], IsSpace)
+			p.out = append(p.out, s)
+			return p.lexStart, nil
+		}
+	}
+}
+
+// lexQuotedString consumes a quoted element, stores it unquoted and
+// moves on to the separator expected after it.
+func (p *commaArrayParser) lexQuotedString() (lexer.StateFn, error) {
+	s, err := lexQuotedString(&p.TextParser)
+	if err != nil {
+		return nil, err
+	}
+
+	p.Discard()
+	p.out = append(p.out, s)
+	return p.lexSeparator, nil
+}
+
+// lexSeparator discards the whitespace and the comma that follow a
+// quoted element, so lexWord doesn't turn that comma into a spurious
+// empty element.
+func (p *commaArrayParser) lexSeparator() (lexer.StateFn, error) {
+	for {
+		r, _, err := p.ReadRune()
+		switch {
+		case err != nil:
+			// EOF
+			return nil, err
+		case r == ',':
+			// separator consumed, on to the next element
+			p.Discard()
+			return p.lexStart, nil
+		case IsNewLine(r):
+			// same handling of new lines as lexStart
+			p.UnreadRune()
+			p.AcceptNewLine()
+			p.Discard()
+		case lexer.IsSpace(r):
+			p.Discard()
+		default:
+			// no separator, let lexStart decide what this is
+			p.UnreadRune()
+			return p.lexStart, nil
+		}
+	}
+}
+
+// Run hands lexStart to [lexer.Run] and returns the elements collected
+// along with the error it reported.
+func (p *commaArrayParser) Run() ([]string, error) {
+	err := lexer.Run(p.lexStart)
+
+	return p.out, err
+}
+
+// SplitCommaArray splits comma separated strings, removing whitespace
+// and respecting quoted literals. An empty string yields no elements
+// and no error.
+func SplitCommaArray(s string) ([]string, error) {
+	if s != "" {
+		var p commaArrayParser
+		p.InitString(s)
+		return p.Run()
+	}
+	return nil, nil
+}
diff --git a/parser/error.go b/parser/error.go
index 3cc5a53..ca8f19d 100644
--- a/parser/error.go
+++ b/parser/error.go
@@ -1,6 +1,8 @@
 package parser
 
 import (
+	"io/fs"
+
 	"asciigoat.org/core/lexer"
 )
 
@@ -25,3 +27,19 @@ func ErrPlusPosition(pos lexer.Position, e *lexer.Error) *lexer.Error {
 
 	return NewError(pos, e.Content, e.Hint, e.Err)
 }
+
+// NewErrIncompleteQuotedString returns a [lexer.Error]
+// indicating the quoted string being parsed wasn't correctly
+// terminated
+func NewErrIncompleteQuotedString(p *TextParser) *lexer.Error {
+	return newErrIncomplete(p, "incomplete quoted string")
+}
+
+// newErrIncomplete emits the text accepted so far and reports it,
+// together with the given hint, as a [lexer.Error] carrying [fs.ErrInvalid]
+func newErrIncomplete(p *TextParser, hint string) *lexer.Error {
+	pos, s := p.Emit()
+	pos.Add(GetPositionalLength(s))
+
+	return NewError(pos, s, hint, fs.ErrInvalid)
+}
diff --git a/parser/lexer_runes.go b/parser/lexer_runes.go
index 1d8d080..e410a71 100644
--- a/parser/lexer_runes.go
+++ b/parser/lexer_runes.go
@@ -7,11 +7,13 @@ import (
 )
 
 const (
-	RuneComment      = ';' // RuneComment is the standard dosini comment character
-	RuneCommentExtra = '#' // RuneCommentExtra is UNIX shell's comment character
-	RuneSectionStart = '[' // RuneSectionStart indicates the start of a section declaration
-	RuneSectionEnd   = ']' // RuneSectionEnd indiciates the end of a section declaration
-	RuneFieldEqual   = '=' // RuneFieldEqual separates field keys from their values
+	RuneComment      = ';'  // RuneComment is the standard INI comment character
+	RuneCommentExtra = '#'  // RuneCommentExtra is UNIX shell's comment character
+	RuneSectionStart = '['  // RuneSectionStart indicates the start of a section declaration
+	RuneSectionEnd   = ']'  // RuneSectionEnd indicates the end of a section declaration
+	RuneFieldEqual   = '='  // RuneFieldEqual separates field keys from their values
+	RuneQuotes       = '"'  // RuneQuotes indicates the start and end of a quoted value
+	RuneEscape       = '\\' // RuneEscape indicates the next rune is escaped
 )
 
 var (
@@ -27,6 +29,8 @@ var (
 		RuneSectionStart,
 		RuneSectionEnd,
 		RuneFieldEqual,
+		RuneQuotes,
+		RuneEscape,
 	})
 )
 
diff --git a/parser/text_quoted.go b/parser/text_quoted.go
new file mode 100644
index 0000000..59f14b3
--- /dev/null
+++ b/parser/text_quoted.go
@@ -0,0 +1,106 @@
+package parser
+
+import (
+	"strings"
+
+	"asciigoat.org/core/lexer"
+)
+
+// AcceptQuotedString consumes a quoted string from the source
+// and returns it without its quotes. The boolean result is false
+// when the next rune isn't a quote, in which case that rune is
+// put back.
+func (p *TextParser) AcceptQuotedString() (string, bool, error) {
+	r, _, err := p.ReadRune()
+	switch {
+	case err != nil:
+		// nothing here
+		return "", false, err
+	case r != RuneQuotes:
+		// not for us
+		p.UnreadRune()
+		return "", false, nil
+	default:
+		// let's roll
+		s, err := lexQuotedString(p)
+		switch {
+		case err != nil:
+			// bad quoted string
+			return "", false, err
+		default:
+			// success
+			return s, true, nil
+		}
+	}
+}
+
+// lexQuotedString reads up to the closing quote of a quoted string
+// whose opening quote has already been read, and returns the content
+// between them.
+func lexQuotedString(p *TextParser) (string, *lexer.Error) {
+	for {
+		r, _, err := p.ReadRune()
+		switch {
+		case err != nil:
+			// incomplete
+			return "", NewErrIncompleteQuotedString(p)
+		case r == RuneQuotes:
+			// end, remove quotes and process escaped characters
+			return lexReturnUnescapedQuotedString(p)
+		case r == RuneEscape:
+			// escaped, take another
+			_, _, err := p.ReadRune()
+			if err != nil {
+				// incomplete
+				return "", NewErrIncompleteQuotedString(p)
+			}
+		case IsNewLine(r):
+			// new lines within quoted values are acceptable
+			p.UnreadRune()
+			p.AcceptNewLine()
+		default:
+			// continue
+		}
+	}
+}
+
+// lexReturnUnescapedQuotedString strips the surrounding quotes from
+// the accepted text. Escaped characters aren't supported yet and are
+// reported as [lexer.ErrNotImplemented].
+func lexReturnUnescapedQuotedString(p *TextParser) (string, *lexer.Error) {
+	// remove quotes
+	s := p.String()
+	l := len(s)
+	s = s[1 : l-1]
+
+	if strings.ContainsRune(s, RuneEscape) {
+		// TODO: implement unescaping
+		err := NewError(p.Position(), s, "escaped characters", lexer.ErrNotImplemented)
+		return "", err
+	}
+
+	return s, nil
+}
+
+// Unquoted removes the quotes from the content and unescapes it.
+// Content that doesn't start with a quote is returned unchanged.
+func Unquoted(s string) (string, error) {
+	var p TextParser
+	if s == "" {
+		return "", nil
+	}
+
+	p.InitString(s)
+	unquoted, ok, err := p.AcceptQuotedString()
+	switch {
+	case err != nil:
+		// bad string
+		return "", err
+	case ok:
+		// success
+		return unquoted, nil
+	default:
+		// not quoted
+		return s, nil
+	}
+}