From fee7165fbf67bf4690e859e75579c03a12604842 Mon Sep 17 00:00:00 2001
From: Alejandro Mery
Date: Wed, 30 Aug 2023 00:25:15 +0000
Subject: [PATCH] parser: implement initial tokeniser only logging position, errors and non-whitespace elements

Signed-off-by: Alejandro Mery
---
 parser/lexer.go       | 66 +++++++++++++++++++++++++++++++++++++++++--
 parser/lexer_runes.go | 19 +++++++++++++
 2 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100644 parser/lexer_runes.go

diff --git a/parser/lexer.go b/parser/lexer.go
index e62bfe9..fcffe77 100644
--- a/parser/lexer.go
+++ b/parser/lexer.go
@@ -1,11 +1,73 @@
 package parser
 
-import "asciigoat.org/core/lexer"
+import (
+	"asciigoat.org/core/lexer"
+)
 
 // Run parses the source
 func (p *Parser) Run() error {
 	p.setDefaults()
 	p.pos.Reset()
 
-	return lexer.Run(nil)
+	return lexer.Run(p.lexStart)
+}
+
+func (p *Parser) lexStart() (lexer.StateFn, error) {
+	for {
+		r, _, err := p.src.ReadRune()
+		switch {
+		case err != nil:
+			// read error: hand it to emitError and leave the loop
+			return p.emitError("", err)
+		case IsNewLine(r):
+			// line break: consume its paired second rune, if any, before stepLine
+			p.lexNewLine(r)
+			p.stepLine()
+		case IsSpace(r):
+			// whitespace other than \n and \r, which the case above already matched
+			p.stepRune()
+		default:
+			// first rune of a token: put it back so lexToken reads it again
+			p.src.UnreadRune()
+			return p.lexToken, nil
+		}
+	}
+}
+
+func (p *Parser) lexToken() (lexer.StateFn, error) {
+	p.src.AcceptAll(IsNotSpace)
+
+	p.pushString(TokenUnknown)
+
+	return p.lexStart, nil
+}
+
+func (p *Parser) lexNewLine(r1 rune) {
+	// r1 is guaranteed to be either \n or \r; anything else panics below
+	r2, _, err := p.src.ReadRune()
+
+	switch r1 {
+	case '\r':
+		switch {
+		case r2 == '\n':
+			// CR LF: both runes are consumed as a single line break
+		case err == nil:
+			// lone CR: r2 is not part of this line break, put it back
+			p.src.UnreadRune()
+		default:
+			// CR followed by a read error (e.g. EOF): nothing to put back
+		}
+	case '\n':
+		switch {
+		case r2 == '\r':
+			// LF CR: both runes are consumed as a single line break
+		case err == nil:
+			// lone LF: r2 is not part of this line break, put it back
+			p.src.UnreadRune()
+		default:
+			// LF followed by a read error (e.g. EOF): nothing to put back
+		}
+	default:
+		panic("unreachable")
+	}
 }
diff --git a/parser/lexer_runes.go b/parser/lexer_runes.go
new file mode 100644
index 0000000..308cbe9
--- /dev/null
+++ b/parser/lexer_runes.go
@@ -0,0 +1,19 @@
+package parser
+
+import "asciigoat.org/core/lexer"
+
+var (
+	// IsNewLine tells if a rune represents a line break or the start of one
+	IsNewLine = lexer.NewIsIn("\n\r")
+	// IsSpace tells if a rune is considered whitespace by unicode
+	IsSpace = lexer.IsSpace
+	// IsNotNewLine tells if a rune is anything other than line breaks
+	IsNotNewLine = lexer.NewIsNot(IsNewLine)
+	// IsNotSpace tells if a rune is anything other than whitespace
+	IsNotSpace = lexer.NewIsNot(IsSpace)
+)
+
+// IsSpaceNotNewLine indicates a rune is whitespace but not a new line
+func IsSpaceNotNewLine(r rune) bool {
+	return IsSpace(r) && !IsNewLine(r)
+}