Building a Parser from scratch

01 - Basic expressions and Tokenizer

001 Tokenizer Parser

解析语法, 人们一开始尝试使用正则表达式, 但是正则表达式不适合解析, 我们确实使用了它(只限于标记器模块)

另一门课

// __test__/run.js
const {Parser} = require('../src/Parser')
const parser = new Parser()

const program = `42`

const ast = parser.parse(program)

console.log(JSON.stringify(ast, null, 2))
// src/Parser.js
/**
 * Letter parser: recursive descent implementation
 */
class Parser {
    /**
     * Parses a string into an AST
     */
    parse(string) {
        this._string = string

        // Parse recursively starting from the main
        // entry point, the Program

        return this.Program()
    }

    /**
     * Main entry point
     *
     * Program
     *      : NumericLiteral
     *      ;
     */
    Program() {
        return this.NumericLiteral()
    }

    /**
     * NumericLiteral
     *      : NUMBER
     *      ;
     */
    NumericLiteral() {
        return {
            type: 'NumericLiteral',
            value: Number(this._string)
        }
    }

}

module.exports = {
    Parser
}

002 Numbers Strings

// src/Tokenizer.js
/**
 * Tokenizer class.
 *
 * Lazily pulls a token from a stream.
 */
class Tokenizer {
    /**
     * Initializes the string.
     */
    init(string) {
        this._string = string
        // 指针, 指向当前解析字符位置
        this._cursor = 0
    }

    hasMoreTokens() {
        return this._cursor < this._string.length;
    }

    /**
     * Whther the tokenizer reached EOF.
     */
    isEOF() {
        return this._cursor === this._string.length;
    }

    /**
     * Obtains next token.
     */
    getNextToken() {
        if (!this.hasMoreTokens()) {
            return null
        }

        // 从当前指针处开始
        const string = this._string.slice(this._cursor)

        // Numbers
        if (!Number.isNaN(Number(string[0]))) {
            let number = ''
            while (!Number.isNaN(Number(string[this._cursor]))) {
                number += string[this._cursor++]
            }
            return {
                type: 'NUMBER',
                value: number
            }
        }

        // String
        if (string[0] === '"') {
            let s = ''
            do {
                s += string[this._cursor++]
            } while (string[this._cursor] !== '"' && !this.isEOF())
            s += this._cursor++ // skip "
            return {
                type: 'STRING',
                value: s
            }
        }
    }
}

module.exports = {
    Tokenizer
}
// src/Parser.js
/**
 * Letter parser: recursive descent implementation
 */
const {Tokenizer} = require('./Tokenizer')

class Parser {
    /**
     * Initializes the parser.
     */
    constructor() {
        this._string = ''
        this._tokenizer = new Tokenizer()
    }


    /**
     * Parses a string into an AST
     */
    parse(string) {
        this._string = string
        this._tokenizer.init(string)

        // Prime the tokenizer to obtain the frist
        // token which is our lookahead.The lookahead is
        // used for predictive parsing.
        // 预测解析 向后看一个char LL(1)
        this._lookahead = this._tokenizer.getNextToken()

        // Parse recursively starting from the main
        // entry point, the Program

        return this.Program()
    }

    /**
     * Main entry point
     *
     * Program
     *      : Literal
     *      ;
     */
    Program() {
        return {
            type: 'Program',
            body: this.Literal()
        }
    }

    /**
     * Literal
     *      : NumericLiteral
     *      : StringLiteral
     *      ;
     */
    Literal() {
        switch (this._lookahead.type) {
            case 'NUMBER':
                return this.NumericLiteral()
            case 'STRING':
                return this.StringLiteral()
        }
        throw new SyntaxError(`Literal: unexpected literal production`)
    }

    /**
     * StringLiteral
     *      : STRING
     *      ;
     */
    StringLiteral() {
        const token = this._eat('STRING')
        return {
            type: 'StringLiteral',
            // 去除 " "
            value: token.value.slice(1, -1)
        }
    }

    /**
     * NumericLiteral
     *      : NUMBER
     *      ;
     */
    NumericLiteral() {
        // _eat 消耗当前token, 让解析器前移
        const token = this._eat('NUMBER')
        return {
            type: 'NumericLiteral',
            value: Number(token.value)
        }
    }

    /**
     * Expects a token of a given type.
     */
    _eat(tokenType) {
        const token = this._lookahead

        if (token == null) {
            throw new SyntaxError(
                `Unexpected end of input, expected: "${tokenType}"`
            )
        }

        if (token.type !== tokenType) {
            throw new SyntaxError(
                `Unexpected token: "${token.value}", expected: "${tokenType}"`
            )
        }

        // Advance to next token.
        this._lookahead = this._tokenizer.getNextToken()

        return token
    }
}

module.exports = {
    Parser
}

003 From State Machines to Regular Expressions

// src/Tokenizer.js
/**
 * Tokenizer spec.
 */
const Spec = [
    // Whitespace:
    [/^\s+/, null],
    // Comments
    // Skip single-line comments:
    [/^\/\/.*/, null],
    // Skip multi-line comments:
    [/^\/\*[\s\S]*?\*\//, null],
    // Numbers:
    [/^\d+/, 'NUMBER'],
    // STRING
    [/^"[^"]*"/, 'STRING'],
    [/^'[^']*'/, 'STRING']
]

/**
 * Tokenizer class.
 *
 * Lazily pulls a token from a stream.
 */
class Tokenizer {
    // ...

    /**
     * Obtains next token.
     */
    getNextToken() {
        if (!this.hasMoreTokens()) {
            return null
        }

        // 从当前指针处开始
        const string = this._string.slice(this._cursor)

        for (const [regexp, tokenType] of Spec) {
            const tokenValue = this._match(regexp, string)

            // Coun't match this rule,continue.
            if (tokenValue == null) {
                continue
            }

            // Should skip token,e.g.whitespace.
            if (tokenType == null) {
                return this.getNextToken()
            }

            return {
                type: tokenType,
                value: tokenValue
            }
        }

        throw new SyntaxError(`Unexpected token: "${string[0]}"`)
    }

    /**
     * Matches a token for a regular expression.
     */
    _match(regexp, string) {
        const matched = regexp.exec(string)
        if (matched == null) {
            return null
        }
        // 推进cursor
        this._cursor += matched[0].length
        return matched[0]
    }
}

module.exports = {
    Tokenizer
}

02 - Program structure

001 Statements and Statement list

// __test__/run.js
const {Parser} = require('../src/Parser')
const assert = require("assert");
const parser = new Parser()

/**
 * List of tests
 */
const tests = [
    require('./literals-test'),
    require('./statement-list-test'),
]

/**
 * For manual tests
 */
function exec() {
    const program = `
    /**
     * start
     */
    // number
    42;
    // string
    "a";
    'b';
    
`

    const ast = parser.parse(program)

    console.log(JSON.stringify(ast, null, 2))
}

// Manual test
// exec()

/**
 * test function
 */
function test(program, expected) {
    const ast = parser.parse(program)
    assert.deepEqual(ast, expected)
}

tests.forEach(testRun => testRun(test))

console.log('All assertions passed!')
// src/Parser.js
class Parser {
    // ...
    Program() {
        return {
            type: 'Program',
            body: this.StatementList()
        }
    }

    /**
     * 涉及到左递归,但是递归下降的无法实现
     * StatementList
     *      : Statement
     *      重构为右递归
     *      : StatementList Statement -> Statement Statement Statement Statement
     *      ;
     */
    StatementList() {
        const statementList = [this.Statement()]

        while (this._lookahead != null) {
            statementList.push(this.Statement())
        }

        return statementList
    }

    /**
     * Statement
     *     : ExpressionStatement
     *     ;
     */
    Statement() {
        return this.ExpressionStatement()
    }

    /**
     * ExpressionStatement
     *      : Expression ';'
     *      ;
     */
    ExpressionStatement() {
        const expression = this.Expression()
        this._eat(';')
        return {
            type: 'ExpressionStatement',
            expression
        }
    }

    /**
     * Expression
     *      : Literal
     *      ;
     */
    Expression() {
        return this.Literal()
    }
}
// src/Tokenizer.js
/**
 * Tokenizer spec.
 */
const Spec = [
    // ...
    // Symbols,delimiters:
    [/^;/, ';'],
    // ...
]
// __test__/statement-list-test.js
module.exports = test => {
    test(` 
        /**
         * start
         */
        // number
        42;
        // string
        "a";
        'b';
    `, {
        "type": "Program",
        "body": [
            {
                "type": "ExpressionStatement",
                "expression": {
                    "type": "NumericLiteral",
                    "value": 42
                }
            },
            {
                "type": "ExpressionStatement",
                "expression": {
                    "type": "StringLiteral",
                    "value": "a"
                }
            },
            {
                "type": "ExpressionStatement",
                "expression": {
                    "type": "StringLiteral",
                    "value": "b"
                }
            }
        ]
    })
}
// __test__/literals-test.js
module.exports = test => {
    // NumericLiteral
    test(`42;`, {
            "type": "Program",
            "body": [
                {
                    expression: {
                        type: 'NumericLiteral',
                        value: 42
                    },
                    type: 'ExpressionStatement'
                }
            ],
        }
    )
    // StringLiteral
    test(`"a";`, {
            "type": "Program",
            "body": [
                {
                    expression: {
                        type: 'StringLiteral',
                        value: 'a'
                    },
                    type: 'ExpressionStatement'
                }
            ],
        }
    )
    test(`'b';`, {
            "type": "Program",
            "body": [
                {
                    expression: {
                        type: 'StringLiteral',
                        value: 'b'
                    },
                    type: 'ExpressionStatement'
                }
            ],
        }
    )
}

002 Blocks nested scopes

// __test__/empty-statement-test.js
module.exports = test => {
    test(';', {
        type: 'Program',
        body: [
            {
                type: 'EmptyStatement'
            }
        ]
    })
}
// __test__/block-test.js
module.exports = test => {
    test(`
        {
            42;
            
            "hello";
        }
    `, {
        type: 'Program',
        body: [
            {
                type: 'BlockStatement',
                body: [
                    {
                        type: 'ExpressionStatement',
                        expression: {
                            type: 'NumericLiteral',
                            value: 42
                        },
                    },
                    {
                        type: 'ExpressionStatement',
                        expression: {
                            type: 'StringLiteral',
                            value: 'hello'
                        },
                    }
                ]
            }
        ]
    })
    // empty block
    test(`
        { 
        
        }
    `, {
        type: 'Program',
        body: [
            {
                type: 'BlockStatement',
                body: []
            }
        ]
    })
    // nest block

    test(`
        {
            42;
            {
                "hello";
            }
        }
    `, {
        type: 'Program',
        body: [
            {
                type: 'BlockStatement',
                body: [
                    {
                        expression: {
                            type: 'NumericLiteral',
                            value: 42
                        },
                        type: 'ExpressionStatement'
                    },
                    {
                        body: [
                            {
                                expression: {
                                    type: 'StringLiteral',
                                    value: 'hello'
                                },
                                type: 'ExpressionStatement'
                            }
                        ],
                        type: 'BlockStatement'
                    }
                ],
            }]
    })
}
// src/Tokenizer.js
/**
 * Tokenizer spec.
 */
const Spec = [
    // ...
    // Symbols,delimiters:
    [/^;/, ';'],
    [/^\{/, '{'],
    [/^\}/, '}'],
    // ...
]
// src/Parser.js
class Parser {
    // ...
    StatementList(stopLookahead = null) {
        const statementList = [this.Statement()]

        while (this._lookahead != null &&
        this._lookahead.type !== stopLookahead) {
            statementList.push(this.Statement())
        }

        return statementList
    }

    /**
     * Statement
     *     : ExpressionStatement
     *     | BlockStatement
     *     | EmptyStatement
     *     ;
     */
    Statement() {
        switch (this._lookahead.type) {
            case ';':
                return this.EmptyStatement()
            case '{':
                return this.BlockStatement()
            default:
                return this.ExpressionStatement()
        }
    }

    /**
     * EmptyStatement
     *      : ';'
     *      ;
     */
    EmptyStatement() {
        this._eat(';')
        return {
            type: 'EmptyStatement'
        }
    }

    /**
     * BlockStatement
     *      : '{' OptStatementList '}'
     *      ;
     */
    BlockStatement() {
        this._eat('{')
        const body = this._lookahead.type !== '}' ?
            this.StatementList('}') : []
        this._eat('}')
        return {
            type: 'BlockStatement',
            body
        }
    }
} 

003 Different AST formats

// src/Parser.js
/**
 * Letter parser: recursive descent implementation
 */
const {Tokenizer} = require('./Tokenizer')

// Default AST node factories
const DefaultFactory = {
    Program(body) {
        return {
            type: 'Program',
            body
        }
    },

    EmptyStatement() {
        return {
            type: 'EmptyStatement'
        }
    },

    BlockStatement(body) {
        return {
            type: 'BlockStatement',
            body
        }
    },

    ExpressionStatement(expression) {
        return {
            type: 'ExpressionStatement',
            expression
        }
    },

    StringLiteral(value) {
        return {
            type: 'StringLiteral',
            value
        }
    },

    NumericLiteral(value) {
        return {
            type: 'NumericLiteral',
            value
        }
    }
}

// S-expression AST node factories
const SExpressionFactory = {
    Program(body) {
        return ['begin', body]
    },

    EmptyStatement() {
        return {}
    },

    BlockStatement(body) {
        return ['begin', body]
    },

    ExpressionStatement(expression) {
        return expression
    },

    StringLiteral(value) {
        return `${value}`
    },

    NumericLiteral(value) {
        return value
    }
}

const AST_MODE = 'default'
const factory =
    AST_MODE === 'default' ?
        DefaultFactory : SExpressionFactory

class Parser {
    // ...

    /**
     * Main entry point
     *
     * Program
     *      : Literal
     *      ;
     */
    Program() {
        return factory.Program(this.StatementList())
    }

    // ...

    /**
     * EmptyStatement
     *      : ';'
     *      ;
     */
    EmptyStatement() {
        this._eat(';')
        return factory.EmptyStatement()
    }

    /**
     * BlockStatement
     *      : '{' OptStatementList '}'
     *      ;
     */
    BlockStatement() {
        this._eat('{')
        const body = this._lookahead.type !== '}' ?
            this.StatementList('}') : []
        this._eat('}')
        return factory.BlockStatement(body)
    }

    /**
     * ExpressionStatement
     *      : Expression ';'
     *      ;
     */
    ExpressionStatement() {
        const expression = this.Expression()
        this._eat(';')
        return factory.ExpressionStatement(expression)
    }

    // ...

    /**
     * StringLiteral
     *      : STRING
     *      ;
     */
    StringLiteral() {
        const token = this._eat('STRING')
        return factory.StringLiteral(token.value.slice(1, -1))
    }

    /**
     * NumericLiteral
     *      : NUMBER
     *      ;
     */
    NumericLiteral() {
        // _eat 消耗当前token, 让解析器前移
        const token = this._eat('NUMBER')
        return factory.NumericLiteral(Number(token.value))
    }

    // ...
}

module.exports = {
    Parser
}

004 Binary Expressions


  目录