jtr/DevDisciples.Parsing/Lexer.cs

namespace DevDisciples.Parsing;

/// <summary>
/// Base class for rule-driven lexers: repeatedly applies an ordered list of rules to the
/// source text and collects the tokens those rules produce.
/// </summary>
/// <typeparam name="TToken">Enum type identifying the kinds of token this lexer emits.</typeparam>
public abstract partial class Lexer<TToken> where TToken : Enum
{
    /// <summary>
    /// Rules tried in list order at every position; the first rule that returns
    /// <c>true</c> ends the search for that position.
    /// </summary>
    /// <remarks>
    /// Left unassigned here (<c>default!</c>), so it must be set during initialisation;
    /// <see cref="Lex"/> dereferences it unconditionally.
    /// </remarks>
    protected List<Rule> Rules { get; init; } = default!;

    /// <summary>Token type appended once after the whole source has been consumed.</summary>
    protected abstract TToken EndOfSource { get; }

    /// <summary>
    /// Lexes <paramref name="source"/> into a list of tokens terminated by an
    /// <see cref="EndOfSource"/> token whose lexeme is <c>"&lt;EOF&gt;"</c>.
    /// </summary>
    /// <param name="file">File identifier handed to the lexing context and the source wrapper.</param>
    /// <param name="source">The text to lex.</param>
    /// <returns>The tokens accumulated in the context, ending with the end-of-source token.</returns>
    public List<Token> Lex(string file, string source)
    {
        var ctx = new Context(file, new Source(file, source), new List<Token>());

        while (!ctx.Source.Ended())
        {
            var matched = false;

            // First matching rule wins; the remaining rules are not consulted for this position.
            for (var i = 0; i < Rules.Count; i++)
            {
                if (Rules[i](ctx))
                {
                    matched = true;
                    break;
                }
            }

            if (!matched)
            {
                // NOTE(review): this relies on Report.Halt not returning normally. If it did,
                // nothing has been consumed and the loop would retry the same position - confirm.
                Report.Halt(ctx.Source, $"Unexpected character '{ctx.Source.Current}'.");
            }
        }

        ctx.AddToken(EndOfSource, "<EOF>", ctx.Source.Line, ctx.Source.Column);

        return ctx.Tokens;
    }

    /// <summary>
    /// Tries to match the single character <paramref name="char"/> at the current source
    /// position and, on success, adds a token of the given <paramref name="type"/>.
    /// </summary>
    /// <param name="ctx">Lexing context providing the source and receiving the token.</param>
    /// <param name="type">Token type to emit when the character matches.</param>
    /// <param name="char">Character to match.</param>
    /// <returns><c>true</c> when the character matched and a token was added; otherwise <c>false</c>.</returns>
    protected static bool Match(Context ctx, TToken type, char @char)
    {
        if (!ctx.Source.Match(@char)) return false;

        // Capture the position before the column is advanced below, so the token
        // records the column value from before this lexeme was accounted for.
        var line = ctx.Source.Line;
        var column = ctx.Source.Column;
        var lexeme = ctx.Source.Extract();

        ctx.Source.Column += 1;

        ctx.AddToken(type, lexeme, line, column);

        return true;
    }

    /// <summary>
    /// Tries to match the exact character <paramref name="sequence"/> at the current source
    /// position and, on success, adds a token of the given type.
    /// </summary>
    /// <remarks>
    /// Do not use this method for keywords!
    /// It would treat an identifier named 'ifelse' as separate 'if' and 'else' tokens.
    /// </remarks>
    /// <param name="ctx">Lexing context providing the source and receiving the token.</param>
    /// <param name="token">Token type to emit when the sequence matches.</param>
    /// <param name="sequence">Character sequence to match.</param>
    /// <returns><c>true</c> when the sequence matched and a token was added; otherwise <c>false</c>.</returns>
    protected static bool Match(Context ctx, TToken token, string sequence)
    {
        if (!ctx.Source.Match(sequence)) return false;

        // Capture the position before the column is advanced by the sequence length.
        var line = ctx.Source.Line;
        var column = ctx.Source.Column;
        var lexeme = ctx.Source.Extract();

        ctx.Source.Column += sequence.Length;

        // Emit through the context like the single-character overload and the
        // end-of-source token do, rather than constructing the Token by hand.
        ctx.AddToken(token, lexeme, line, column);

        return true;
    }
}