01 - Basic expressions and Tokenizer
001 Tokenizer Parser
// __test__/run.js
const {Parser} = require('../src/Parser')
const parser = new Parser()
const program = `42`
const ast = parser.parse(program)
console.log(JSON.stringify(ast, null, 2))
// src/Parser.js
/**
* Letter parser: recursive descent implementation
*/
class Parser {
/**
* Parses a string into an AST
*/
parse(string) {
this._string = string
// Parse recursively starting from the main
// entry point, the Program
return this.Program()
}
/**
* Main entry point
*
* Program
* : NumericLiteral
* ;
*/
Program() {
return this.NumericLiteral()
}
/**
* NumericLiteral
* : NUMBER
* ;
*/
NumericLiteral() {
return {
type: 'NumericLiteral',
value: Number(this._string)
}
}
}
module.exports = {
Parser
}
002 Numbers Strings
// src/Tokenizer.js
/**
* Tokenizer class.
*
* Lazily pulls a token from a stream.
*/
class Tokenizer {
/**
* Initializes the string.
*/
init(string) {
this._string = string
// 指针, 指向当前解析字符位置
this._cursor = 0
}
hasMoreTokens() {
return this._cursor < this._string.length;
}
/**
* Whther the tokenizer reached EOF.
*/
isEOF() {
return this._cursor === this._string.length;
}
/**
* Obtains next token.
*/
getNextToken() {
if (!this.hasMoreTokens()) {
return null
}
// 从当前指针处开始
const string = this._string.slice(this._cursor)
// Numbers
if (!Number.isNaN(Number(string[0]))) {
let number = ''
while (!Number.isNaN(Number(string[this._cursor]))) {
number += string[this._cursor++]
}
return {
type: 'NUMBER',
value: number
}
}
// String
if (string[0] === '"') {
let s = ''
do {
s += string[this._cursor++]
} while (string[this._cursor] !== '"' && !this.isEOF())
s += this._cursor++ // skip "
return {
type: 'STRING',
value: s
}
}
}
}
module.exports = {
Tokenizer
}
// src/Parser.js
/**
* Letter parser: recursive descent implementation
*/
const {Tokenizer} = require('./Tokenizer')
class Parser {
/**
* Initializes the parser.
*/
constructor() {
this._string = ''
this._tokenizer = new Tokenizer()
}
/**
* Parses a string into an AST
*/
parse(string) {
this._string = string
this._tokenizer.init(string)
// Prime the tokenizer to obtain the frist
// token which is our lookahead.The lookahead is
// used for predictive parsing.
// 预测解析 向后看一个char LL(1)
this._lookahead = this._tokenizer.getNextToken()
// Parse recursively starting from the main
// entry point, the Program
return this.Program()
}
/**
* Main entry point
*
* Program
* : Literal
* ;
*/
Program() {
return {
type: 'Program',
body: this.Literal()
}
}
/**
* Literal
* : NumericLiteral
* : StringLiteral
* ;
*/
Literal() {
switch (this._lookahead.type) {
case 'NUMBER':
return this.NumericLiteral()
case 'STRING':
return this.StringLiteral()
}
throw new SyntaxError(`Literal: unexpected literal production`)
}
/**
* StringLiteral
* : STRING
* ;
*/
StringLiteral() {
const token = this._eat('STRING')
return {
type: 'StringLiteral',
// 去除 " "
value: token.value.slice(1, -1)
}
}
/**
* NumericLiteral
* : NUMBER
* ;
*/
NumericLiteral() {
// _eat 消耗当前token, 让解析器前移
const token = this._eat('NUMBER')
return {
type: 'NumericLiteral',
value: Number(token.value)
}
}
/**
* Expects a token of a given type.
*/
_eat(tokenType) {
const token = this._lookahead
if (token == null) {
throw new SyntaxError(
`Unexpected end of input, expected: "${tokenType}"`
)
}
if (token.type !== tokenType) {
throw new SyntaxError(
`Unexpected token: "${token.value}", expected: "${tokenType}"`
)
}
// Advance to next token.
this._lookahead = this._tokenizer.getNextToken()
return token
}
}
module.exports = {
Parser
}
003 From State Machines to Regular Expressions
// src/Tokenizer.js
/**
* Tokenizer spec.
*/
const Spec = [
// Whitespace:
[/^\s+/, null],
// Comments
// Skip single-line comments:
[/^\/\/.*/, null],
// Skip multi-line comments:
[/^\/\*[\s\S]*?\*\//, null],
// Numbers:
[/^\d+/, 'NUMBER'],
// STRING
[/^"[^"]*"/, 'STRING'],
[/^'[^']*'/, 'STRING']
]
/**
* Tokenizer class.
*
* Lazily pulls a token from a stream.
*/
class Tokenizer {
// ...
/**
* Obtains next token.
*/
getNextToken() {
if (!this.hasMoreTokens()) {
return null
}
// 从当前指针处开始
const string = this._string.slice(this._cursor)
for (const [regexp, tokenType] of Spec) {
const tokenValue = this._match(regexp, string)
// Coun't match this rule,continue.
if (tokenValue == null) {
continue
}
// Should skip token,e.g.whitespace.
if (tokenType == null) {
return this.getNextToken()
}
return {
type: tokenType,
value: tokenValue
}
}
throw new SyntaxError(`Unexpected token: "${string[0]}"`)
}
/**
* Matches a token for a regular expression.
*/
_match(regexp, string) {
const matched = regexp.exec(string)
if (matched == null) {
return null
}
// 推进cursor
this._cursor += matched[0].length
return matched[0]
}
}
module.exports = {
Tokenizer
}
02 - Program structure
001 Statements and Statement list
// __test__/run.js
const {Parser} = require('../src/Parser')
const assert = require("assert");
const parser = new Parser()
/**
* List of tests
*/
const tests = [
require('./literals-test'),
require('./statement-list-test'),
]
/**
* For manual tests
*/
function exec() {
const program = `
/**
* start
*/
// number
42;
// string
"a";
'b';
`
const ast = parser.parse(program)
console.log(JSON.stringify(ast, null, 2))
}
// Manual test
// exec()
/**
* test function
*/
function test(program, expected) {
const ast = parser.parse(program)
assert.deepEqual(ast, expected)
}
tests.forEach(testRun => testRun(test))
console.log('All assertions passed!')
// src/Parser.js
class Parser {
// ...
Program() {
return {
type: 'Program',
body: this.StatementList()
}
}
/**
* 涉及到左递归,但是递归下降的无法实现
* StatementList
* : Statement
* 重构为右递归
* : StatementList Statement -> Statement Statement Statement Statement
* ;
*/
StatementList() {
const statementList = [this.Statement()]
while (this._lookahead != null) {
statementList.push(this.Statement())
}
return statementList
}
/**
* Statement
* : ExpressionStatement
* ;
*/
Statement() {
return this.ExpressionStatement()
}
/**
* ExpressionStatement
* : Expression ';'
* ;
*/
ExpressionStatement() {
const expression = this.Expression()
this._eat(';')
return {
type: 'ExpressionStatement',
expression
}
}
/**
* Expression
* : Literal
* ;
*/
Expression() {
return this.Literal()
}
}
// src/Tokenizer.js
/**
* Tokenizer spec.
*/
const Spec = [
// ...
// Symbols,delimiters:
[/^;/, ';'],
// ...
]
// __test__/statement-list-test.js
module.exports = test => {
test(`
/**
* start
*/
// number
42;
// string
"a";
'b';
`, {
"type": "Program",
"body": [
{
"type": "ExpressionStatement",
"expression": {
"type": "NumericLiteral",
"value": 42
}
},
{
"type": "ExpressionStatement",
"expression": {
"type": "StringLiteral",
"value": "a"
}
},
{
"type": "ExpressionStatement",
"expression": {
"type": "StringLiteral",
"value": "b"
}
}
]
})
}
// __test__/literals-test.js
module.exports = test => {
// NumericLiteral
test(`42;`, {
"type": "Program",
"body": [
{
expression: {
type: 'NumericLiteral',
value: 42
},
type: 'ExpressionStatement'
}
],
}
)
// StringLiteral
test(`"a";`, {
"type": "Program",
"body": [
{
expression: {
type: 'StringLiteral',
value: 'a'
},
type: 'ExpressionStatement'
}
],
}
)
test(`'b';`, {
"type": "Program",
"body": [
{
expression: {
type: 'StringLiteral',
value: 'b'
},
type: 'ExpressionStatement'
}
],
}
)
}
002 Blocks nested scopes
// __test__/empty-statement-test.js
module.exports = test => {
test(';', {
type: 'Program',
body: [
{
type: 'EmptyStatement'
}
]
})
}
// __test__/block-test.js
module.exports = test => {
test(`
{
42;
"hello";
}
`, {
type: 'Program',
body: [
{
type: 'BlockStatement',
body: [
{
type: 'ExpressionStatement',
expression: {
type: 'NumericLiteral',
value: 42
},
},
{
type: 'ExpressionStatement',
expression: {
type: 'StringLiteral',
value: 'hello'
},
}
]
}
]
})
// empty block
test(`
{
}
`, {
type: 'Program',
body: [
{
type: 'BlockStatement',
body: []
}
]
})
// nest block
test(`
{
42;
{
"hello";
}
}
`, {
type: 'Program',
body: [
{
type: 'BlockStatement',
body: [
{
expression: {
type: 'NumericLiteral',
value: 42
},
type: 'ExpressionStatement'
},
{
body: [
{
expression: {
type: 'StringLiteral',
value: 'hello'
},
type: 'ExpressionStatement'
}
],
type: 'BlockStatement'
}
],
}]
})
}
// src/Tokenizer.js
/**
* Tokenizer spec.
*/
const Spec = [
// ...
// Symbols,delimiters:
[/^;/, ';'],
[/^\{/, '{'],
[/^\}/, '}'],
// ...
]
// src/Parser.js
class Parser {
// ...
StatementList(stopLookahead = null) {
const statementList = [this.Statement()]
while (this._lookahead != null &&
this._lookahead.type !== stopLookahead) {
statementList.push(this.Statement())
}
return statementList
}
/**
* Statement
* : ExpressionStatement
* | BlockStatement
* | EmptyStatement
* ;
*/
Statement() {
switch (this._lookahead.type) {
case ';':
return this.EmptyStatement()
case '{':
return this.BlockStatement()
default:
return this.ExpressionStatement()
}
}
/**
* EmptyStatement
* : ';'
* ;
*/
EmptyStatement() {
this._eat(';')
return {
type: 'EmptyStatement'
}
}
/**
* BlockStatement
* : '{' OptStatementList '}'
* ;
*/
BlockStatement() {
this._eat('{')
const body = this._lookahead.type !== '}' ?
this.StatementList('}') : []
this._eat('}')
return {
type: 'BlockStatement',
body
}
}
}
003 Different AST formats
// src/Parser.js
/**
* Letter parser: recursive descent implementation
*/
const {Tokenizer} = require('./Tokenizer')
// Default AST node factories
const DefaultFactory = {
Program(body) {
return {
type: 'Program',
body
}
},
EmptyStatement() {
return {
type: 'EmptyStatement'
}
},
BlockStatement(body) {
return {
type: 'BlockStatement',
body
}
},
ExpressionStatement(expression) {
return {
type: 'ExpressionStatement',
expression
}
},
StringLiteral(value) {
return {
type: 'StringLiteral',
value
}
},
NumericLiteral(value) {
return {
type: 'NumericLiteral',
value
}
}
}
// S-expression AST node factories
const SExpressionFactory = {
Program(body) {
return ['begin', body]
},
EmptyStatement() {
return {}
},
BlockStatement(body) {
return ['begin', body]
},
ExpressionStatement(expression) {
return expression
},
StringLiteral(value) {
return `${value}`
},
NumericLiteral(value) {
return value
}
}
const AST_MODE = 'default'
const factory =
AST_MODE === 'default' ?
DefaultFactory : SExpressionFactory
class Parser {
// ...
/**
* Main entry point
*
* Program
* : Literal
* ;
*/
Program() {
return factory.Program(this.StatementList())
}
// ...
/**
* EmptyStatement
* : ';'
* ;
*/
EmptyStatement() {
this._eat(';')
return factory.EmptyStatement()
}
/**
* BlockStatement
* : '{' OptStatementList '}'
* ;
*/
BlockStatement() {
this._eat('{')
const body = this._lookahead.type !== '}' ?
this.StatementList('}') : []
this._eat('}')
return factory.BlockStatement(body)
}
/**
* ExpressionStatement
* : Expression ';'
* ;
*/
ExpressionStatement() {
const expression = this.Expression()
this._eat(';')
return factory.ExpressionStatement(expression)
}
// ...
/**
* StringLiteral
* : STRING
* ;
*/
StringLiteral() {
const token = this._eat('STRING')
return factory.StringLiteral(token.value.slice(1, -1))
}
/**
* NumericLiteral
* : NUMBER
* ;
*/
NumericLiteral() {
// _eat 消耗当前token, 让解析器前移
const token = this._eat('NUMBER')
return factory.NumericLiteral(Number(token.value))
}
// ...
}
module.exports = {
Parser
}