Monkey 的词法分析

2026 年 4 月 12 日

1407 字

7 分钟

Token

Monkey语言: 分为表达式和语句, 下面给出一些语法.

// Integers and arithmetic expressions, with parentheses for grouping
let version = 1 + (50 / 2) - (8 * 3);

// Strings
let name = "The Monkey programming language";

// Booleans
let isMonkeyFastNow = true;

// Arrays and hash maps: an array literal whose elements are hash literals
let people = [{"name": "Anna", "age": 24}, {"name": "Bob", "age": 99}];

当然现在的进度还没有实现这么复杂的语句.

词法分析就是把源代码转换成一系列的 token。先给出 token 的定义:

/// A single lexical token: its kind plus the text it was produced from.
pub const Token = struct {
    /// Kind of the token (and, for integers, the parsed value).
    Type: TokenType,
    Literal: []const u8, // raw source slice, used when printing error messages
};

对于Monkey有如下的一些token

/// Every kind of token the Monkey lexer can emit.
/// A tagged union so that the `int` variant can carry its parsed value.
pub const TokenType = union(enum(u8)) {
    illegal,
    eof,
    ident,
    int: i64, // carries the parsed value, saving a later string -> int conversion

    // Operators
    assign,
    plus,
    minus,
    bang,
    asterisk,
    slash,
    lt,
    gt,
    eq,
    not_eq,

    // Delimiters
    comma,
    semicolon,
    lparen,
    rparen,
    lbrace,
    rbrace,

    // Keywords
    function,
    let,
    true,
    false,
    @"if",
    @"else",
    @"return",
};

这里重点讲一下为什么 int 要带一个 i64 的值: 词法分析阶段就把数字解析出来, 后面使用时可以省掉一次 str -> int 的转换开销. 至于为什么用枚举(带标签的联合), 就比较好理解了.

Lexer

使用状态机的思想, 测试用例来自原书

先思考测试用例来决定我们要编写怎么样的代码(直接抄的)

// Lexes a Monkey program covering `let` bindings, function literals, calls,
// every single-character operator, `if`/`else`/`return`, and the two-character
// operators `==` / `!=`, then checks each token's tag and literal in order.
test "nextToken_From go" {
    const gpa = std.testing.allocator;
    const input =
        \\let five = 5; 
        \\let ten = 10; 
        \\let add = fn(x, y) { 
        \\x + y; 
        \\}; 
        \\let result = add(five, ten); 
        \\!-/*5; 
        \\5 < 10 > 5; 
        \\if (5 < 10) { 
        \\return true; 
        \\} else { 
        \\return false; 
        \\} 
        \\10 == 10; 
        \\10 != 9;
    ;
    var l: Lexer = .init(input);
    const tokenList = try l.nextToken(gpa);
    defer gpa.free(tokenList);

    // Only the active tag is compared, so the payload of `.int` is not checked here.
    const Expected = struct {
        expected_type: std.meta.Tag(token.TokenType),
        expected_literal: []const u8,
    };

    const tests = [_]Expected{
        .{ .expected_type = .let, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "five" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .let, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "ten" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .let, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "add" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .function, .expected_literal = "fn" },
        .{ .expected_type = .lparen, .expected_literal = "(" },
        .{ .expected_type = .ident, .expected_literal = "x" },
        .{ .expected_type = .comma, .expected_literal = "," },
        .{ .expected_type = .ident, .expected_literal = "y" },
        .{ .expected_type = .rparen, .expected_literal = ")" },
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .ident, .expected_literal = "x" },
        .{ .expected_type = .plus, .expected_literal = "+" },
        .{ .expected_type = .ident, .expected_literal = "y" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .let, .expected_literal = "let" },
        .{ .expected_type = .ident, .expected_literal = "result" },
        .{ .expected_type = .assign, .expected_literal = "=" },
        .{ .expected_type = .ident, .expected_literal = "add" },
        .{ .expected_type = .lparen, .expected_literal = "(" },
        .{ .expected_type = .ident, .expected_literal = "five" },
        .{ .expected_type = .comma, .expected_literal = "," },
        .{ .expected_type = .ident, .expected_literal = "ten" },
        .{ .expected_type = .rparen, .expected_literal = ")" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .bang, .expected_literal = "!" },
        .{ .expected_type = .minus, .expected_literal = "-" },
        .{ .expected_type = .slash, .expected_literal = "/" },
        .{ .expected_type = .asterisk, .expected_literal = "*" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .lt, .expected_literal = "<" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .gt, .expected_literal = ">" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },

        .{ .expected_type = .@"if", .expected_literal = "if" },
        .{ .expected_type = .lparen, .expected_literal = "(" },
        .{ .expected_type = .int, .expected_literal = "5" },
        .{ .expected_type = .lt, .expected_literal = "<" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .rparen, .expected_literal = ")" },
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .@"return", .expected_literal = "return" },
        .{ .expected_type = .true, .expected_literal = "true" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },
        .{ .expected_type = .@"else", .expected_literal = "else" },
        .{ .expected_type = .lbrace, .expected_literal = "{" },
        .{ .expected_type = .@"return", .expected_literal = "return" },
        .{ .expected_type = .false, .expected_literal = "false" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .rbrace, .expected_literal = "}" },

        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .eq, .expected_literal = "==" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },
        .{ .expected_type = .int, .expected_literal = "10" },
        .{ .expected_type = .not_eq, .expected_literal = "!=" },
        .{ .expected_type = .int, .expected_literal = "9" },
        .{ .expected_type = .semicolon, .expected_literal = ";" },

        .{ .expected_type = .eof, .expected_literal = "" },
    };

    // Compare lengths first: a lexer that returns too few tokens must fail the
    // test cleanly instead of indexing past the end of `tokenList`.
    try std.testing.expectEqual(tests.len, tokenList.len);

    // Lengths are known to match, so both sequences can be walked in lockstep.
    for (tests, tokenList) |expected, actual| {
        try std.testing.expectEqual(expected.expected_type, std.meta.activeTag(actual.Type));
        try std.testing.expectEqualStrings(expected.expected_literal, actual.Literal);
    }
}

接下来进行分词

const token = @import("./token.zig");
const std = @import("std");

/// Hand-written lexer for Monkey source text.
///
/// `nextToken` scans the whole input with a small state machine (a labeled
/// `switch`) and returns every token, ending with an `eof` token.
pub const Lexer = struct {
    /// States of the scanning loop in `nextToken`.
    const State = enum { start, int, identifier };

    /// Index of the byte currently held in `ch`.
    /// `usize` so that sources longer than 255 bytes do not overflow the cursor.
    atPosition: usize,
    /// Index of the next byte that `readChar` will load.
    rdPosition: usize,
    /// Current byte, or `null` once the input is exhausted.
    ch: ?u8,
    /// Input being tokenized; token literals are slices into this buffer.
    source: []const u8,

    /// Creates a lexer over `source` and loads its first byte into `ch`.
    pub fn init(source: []const u8) Lexer {
        var l: Lexer = .{ .source = source, .atPosition = 0, .rdPosition = 0, .ch = null };
        l.readChar();
        return l;
    }

    /// Tokenizes the rest of the input and returns the tokens as a slice owned
    /// by the caller (release it with `gpa.free`). The last token is `eof`.
    ///
    /// Token literals point into `source`, so `source` must outlive the result.
    /// Fails if allocation fails or if `std.fmt.parseInt` rejects an integer
    /// literal (e.g. it does not fit in `i64`); tokens collected so far are
    /// freed in that case.
    pub fn nextToken(l: *Lexer, gpa: std.mem.Allocator) ![]token.Token {
        var list: std.ArrayList(token.Token) = .empty;
        // Do not leak the partially built list when an append or parse fails.
        errdefer list.deinit(gpa);

        state: switch (State.start) {
            .start => {
                // Skip whitespace BEFORE testing for end of input, so that input
                // ending in whitespace reaches the `eof` branch instead of
                // unwrapping a null `ch`.
                l.skipWhitespace();
                const ch = l.ch orelse {
                    try list.append(gpa, .{ .Type = .eof, .Literal = "" });
                    break :state;
                };

                if (std.ascii.isDigit(ch)) {
                    continue :state .int;
                }
                if (std.ascii.isAlphabetic(ch) or ch == '_') {
                    continue :state .identifier;
                }

                const start = l.atPosition;
                var token_type: token.TokenType = switch (ch) {
                    '=' => .assign,
                    '!' => .bang,
                    '+' => .plus,
                    '-' => .minus,
                    '*' => .asterisk,
                    '/' => .slash,
                    '<' => .lt,
                    '>' => .gt,
                    ',' => .comma,
                    ';' => .semicolon,
                    '(' => .lparen,
                    ')' => .rparen,
                    '{' => .lbrace,
                    '}' => .rbrace,
                    else => .illegal,
                };
                // `=` and `!` become `==` / `!=` when directly followed by `=`;
                // consuming the second byte widens the literal slice below.
                if (l.peekChar() == '=') {
                    switch (ch) {
                        '=' => {
                            l.readChar();
                            token_type = .eq;
                        },
                        '!' => {
                            l.readChar();
                            token_type = .not_eq;
                        },
                        else => {},
                    }
                }
                // `rdPosition` is one past the last byte consumed for this token.
                try list.append(gpa, .{ .Type = token_type, .Literal = l.source[start..l.rdPosition] });
                l.readChar();
                continue :state .start;
            },
            .int => {
                const start = l.atPosition;
                while (l.ch) |c| {
                    if (!std.ascii.isDigit(c)) break;
                    l.readChar();
                }
                const str = l.source[start..l.atPosition];
                const num = try std.fmt.parseInt(i64, str, 10);
                try list.append(gpa, .{ .Type = .{ .int = num }, .Literal = str });
                continue :state .start;
            },
            .identifier => {
                const start = l.atPosition;
                while (l.ch) |c| {
                    if (!(std.ascii.isDigit(c) or std.ascii.isAlphabetic(c) or c == '_')) break;
                    l.readChar();
                }
                const str = l.source[start..l.atPosition];
                const tokentype = l.lookIndent(str);
                try list.append(gpa, .{ .Type = tokentype, .Literal = str });
                continue :state .start;
            },
        }
        return list.toOwnedSlice(gpa);
    }

    /// Advances past ASCII whitespace; stops at the first other byte or at end of input.
    fn skipWhitespace(l: *Lexer) void {
        while (l.ch) |c| {
            if (!std.ascii.isWhitespace(c)) break;
            l.readChar();
        }
    }

    /// Loads the next byte into `ch` and advances both cursors.
    /// At end of input `ch` becomes `null` and `rdPosition` stays put.
    pub fn readChar(l: *Lexer) void {
        l.atPosition = l.rdPosition;
        if (l.rdPosition >= l.source.len) {
            l.ch = null;
            return;
        }
        l.ch = l.source[l.rdPosition];
        l.rdPosition += 1;
    }

    /// Returns the byte after the current one without consuming it,
    /// or 0 when there is none.
    pub fn peekChar(l: *Lexer) u8 {
        if (l.rdPosition >= l.source.len) {
            return 0;
        }
        return l.source[l.rdPosition];
    }

    /// Maps `ident` to the token type found in `token.keywords`,
    /// or to `.ident` when it has no entry there.
    pub fn lookIndent(l: *Lexer, ident: []const u8) token.TokenType {
        _ = l;
        if (token.keywords.get(ident)) |token_type| {
            return token_type;
        }
        return .ident;
    }
};

这里没什么好讲的(bushi

Monkey 的词法分析

https://momo.motues.top/blog/zig/自制解释器/词法分析/

作者

Motues

发布时间

2026 年 4 月 12 日

许可协议

CC BY-NC-SA 4.0