Building Custom JS String Parsers Full Tutorial
A full tutorial on building custom string parsers in JavaScript. Covers tokenizing, recursive descent parsing, building an AST, parsing arithmetic expressions, parsing JSON manually, error handling with line/column tracking, and when to use custom parsers vs regex or existing libraries.
Parsing transforms a flat string into structured data. While regex handles simple patterns, complex formats (nested brackets, operator precedence, recursive structures) need a proper parser. This guide builds parsers step by step, from simple tokenizers to recursive descent parsers that produce abstract syntax trees.
When to Write a Custom Parser
| Approach | Best For |
|---|---|
| String.split() / String.match() | Simple delimited data |
| Regex | Flat patterns without nesting |
| Custom tokenizer | Token-based formats, DSLs |
| Recursive descent parser | Nested or recursive structures |
| Parser combinator library | Complex grammars, reusable components |
Part 1: Tokenizing (Lexical Analysis)
A tokenizer breaks a string into meaningful chunks called tokens:
/**
 * Breaks an arithmetic expression string into a flat list of tokens.
 *
 * Recognized tokens:
 *   - NUMBER:     digits with at most one decimal point; `value` is a number
 *   - OPERATOR:   one of + - * / ( ); `value` is the character
 *   - IDENTIFIER: a letter or underscore followed by letters, digits, or underscores
 * Whitespace between tokens is skipped.
 *
 * @param {string} input - Source text to tokenize.
 * @returns {Array<{type: string, value: (number|string)}>} Tokens in source order.
 * @throws {Error} On a character that starts no token, or on a number with
 *   more than one decimal point.
 */
function tokenize(input) {
  // Character classes are created once per call instead of once per character.
  const WHITESPACE = /\s/;
  const DIGIT = /\d/;
  const NUMBER_CHAR = /[\d.]/;
  const IDENTIFIER_START = /[a-zA-Z_]/;
  const IDENTIFIER_CHAR = /[a-zA-Z0-9_]/;
  const OPERATORS = "+-*/()";

  const tokens = [];
  let i = 0;

  while (i < input.length) {
    const char = input[i];

    // Skip whitespace
    if (WHITESPACE.test(char)) {
      i++;
      continue;
    }

    // Numbers (integers and decimals)
    if (DIGIT.test(char)) {
      const start = i;
      let dotCount = 0;
      while (i < input.length && NUMBER_CHAR.test(input[i])) {
        if (input[i] === ".") {
          dotCount++;
        }
        i++;
      }
      const lexeme = input.slice(start, i);
      // parseFloat stops at the second dot ("1.2.3" -> 1.2), which would
      // silently discard input, so malformed numbers are rejected instead.
      if (dotCount > 1) {
        throw new Error(`Invalid number: '${lexeme}' at position ${start}`);
      }
      tokens.push({ type: "NUMBER", value: parseFloat(lexeme) });
      continue;
    }

    // Operators
    if (OPERATORS.includes(char)) {
      tokens.push({ type: "OPERATOR", value: char });
      i++;
      continue;
    }

    // Identifiers (variable names)
    if (IDENTIFIER_START.test(char)) {
      const start = i;
      while (i < input.length && IDENTIFIER_CHAR.test(input[i])) {
        i++;
      }
      tokens.push({ type: "IDENTIFIER", value: input.slice(start, i) });
      continue;
    }

    throw new Error(`Unexpected character: '${char}' at position ${i}`);
  }

  return tokens;
}
// Demo: tokenize a sample expression and print the resulting token list.
console.log(tokenize("3 + 42 * (x - 1)"));
// [
// { type: "NUMBER", value: 3 },
// { type: "OPERATOR", value: "+" },
// { type: "NUMBER", value: 42 },
// { type: "OPERATOR", value: "*" },
// { type: "OPERATOR", value: "(" },
// { type: "IDENTIFIER", value: "x" },
// { type: "OPERATOR", value: "-" },
// { type: "NUMBER", value: 1 },
// { type: "OPERATOR", value: ")" },
// ]
Part 2: Recursive Descent Parser
A recursive descent parser uses one function per grammar rule. Here is a parser for arithmetic expressions with correct operator precedence:
Grammar
expression = term (('+' | '-') term)*
term = factor (('*' | '/') factor)*
factor = NUMBER | '(' expression ')'
Implementation
/**
 * Recursive descent parser for arithmetic expressions.
 *
 * Grammar (one method per rule):
 *   expression = term (('+' | '-') term)*
 *   term       = factor (('*' | '/') factor)*
 *   factor     = NUMBER | '(' expression ')'
 *
 * Input is an array of `{ type, value }` tokens; output is an AST built from
 * `NumberLiteral` and `BinaryExpression` nodes.
 */
class ArithmeticParser {
  /**
   * @param {Array<{type: string, value: *}>} tokens - Tokens to parse.
   */
  constructor(tokens) {
    this.tokens = tokens;
    this.pos = 0;
  }

  /**
   * Returns the current token without advancing.
   * @returns {?{type: string, value: *}} The current token, or null at end of input.
   */
  peek() {
    return this.tokens[this.pos] ?? null;
  }

  /**
   * Returns the current token and advances past it.
   * @param {*} [expectedValue] - When given, the token's value must equal it.
   * @returns {{type: string, value: *}} The consumed token.
   * @throws {Error} At end of input, or when the token value does not match.
   */
  consume(expectedValue) {
    const token = this.tokens[this.pos];
    if (!token) {
      throw new Error("Unexpected end of input");
    }
    if (expectedValue != null && token.value !== expectedValue) {
      throw new Error(`Expected '${expectedValue}', got '${token.value}'`);
    }
    this.pos++;
    return token;
  }

  // expression = term (('+' | '-') term)*
  parseExpression() {
    let left = this.parseTerm();
    // Folding each new operand into `left` makes the operators left-associative.
    while (this.peek()?.value === "+" || this.peek()?.value === "-") {
      const operator = this.consume().value;
      const right = this.parseTerm();
      left = { type: "BinaryExpression", operator, left, right };
    }
    return left;
  }

  // term = factor (('*' | '/') factor)*
  parseTerm() {
    let left = this.parseFactor();
    while (this.peek()?.value === "*" || this.peek()?.value === "/") {
      const operator = this.consume().value;
      const right = this.parseFactor();
      left = { type: "BinaryExpression", operator, left, right };
    }
    return left;
  }

  // factor = NUMBER | '(' expression ')'
  parseFactor() {
    const token = this.peek();
    // Report truncated input ("3 +", "", "(") the same way consume() does,
    // rather than as "Unexpected token: null".
    if (token === null) {
      throw new Error("Unexpected end of input");
    }
    if (token.type === "NUMBER") {
      this.consume();
      return { type: "NumberLiteral", value: token.value };
    }
    if (token.value === "(") {
      this.consume("(");
      const expr = this.parseExpression();
      this.consume(")");
      return expr;
    }
    throw new Error(`Unexpected token: ${JSON.stringify(token)}`);
  }

  /**
   * Parses the whole token list as a single expression.
   * @returns {object} Root AST node.
   * @throws {Error} On malformed input or tokens left over after the expression.
   */
  parse() {
    const ast = this.parseExpression();
    if (this.pos < this.tokens.length) {
      throw new Error(`Unexpected token after expression: ${JSON.stringify(this.peek())}`);
    }
    return ast;
  }
}
// Using the Parser
// End-to-end example: tokenize the input, parse the tokens, and print the AST as indented JSON.
const tokens = tokenize("3 + 4 * 2");
const parser = new ArithmeticParser(tokens);
const ast = parser.parse();
console.log(JSON.stringify(ast, null, 2));
// {
// "type": "BinaryExpression",
// "operator": "+",
// "left": { "type": "NumberLiteral", "value": 3 },
// "right": {
// "type": "BinaryExpression",
// "operator": "*",
// "left": { "type": "NumberLiteral", "value": 4 },
// "right": { "type": "NumberLiteral", "value": 2 }
// }
// }
Multiplication is nested deeper than addition, correctly representing operator precedence.
Part 3: Evaluating the AST
/**
 * Computes the numeric value of an arithmetic AST by walking it recursively.
 *
 * @param {object} node - A `NumberLiteral` or `BinaryExpression` node.
 * @returns {number} The value of the (sub)expression rooted at `node`.
 * @throws {Error} When the node type or the binary operator is not recognized.
 */
function evaluate(node) {
  switch (node.type) {
    case "NumberLiteral":
      return node.value;

    case "BinaryExpression": {
      // Both operands are evaluated (left first) before the operator is inspected.
      const lhs = evaluate(node.left);
      const rhs = evaluate(node.right);
      const symbol = node.operator;
      if (symbol === "+") return lhs + rhs;
      if (symbol === "-") return lhs - rhs;
      if (symbol === "*") return lhs * rhs;
      if (symbol === "/") return lhs / rhs;
      throw new Error(`Unknown operator: ${symbol}`);
    }

    default:
      throw new Error(`Unknown node type: ${node.type}`);
  }
}
// Tokenize and parse a parenthesized expression so it can be evaluated.
const tokens2 = tokenize("(2 + 3) * 4");
const ast2 = new ArithmeticParser(tokens2).parse();
console.log(evaluate(ast2)); // 20
Part 4: Parsing Key-Value Configs
A practical real-world example parsing .env-style files:
/**
 * Parses .env-style text (`KEY=value` per line) into a plain object.
 *
 * Blank lines and lines starting with `#` are ignored. Keys and values are
 * trimmed; a value wrapped in a matching pair of single or double quotes has
 * the quotes removed. Only the first `=` on a line separates key from value.
 * When a key appears more than once, the last occurrence wins.
 *
 * @param {string} input - Contents of the env file.
 * @returns {Object<string, string>} Map of keys to string values.
 * @throws {Error} When a non-empty, non-comment line contains no `=`.
 */
function parseEnvFile(input) {
  const result = {};

  for (const [index, rawLine] of input.split("\n").entries()) {
    const line = rawLine.trim();

    // Skip empty lines and comments
    if (!line || line.startsWith("#")) continue;

    const eqIndex = line.indexOf("=");
    if (eqIndex === -1) {
      throw new Error(`Invalid syntax at line ${index + 1}: missing '='`);
    }

    const key = line.slice(0, eqIndex).trim();
    let value = line.slice(eqIndex + 1).trim();

    // Strip surrounding quotes. The length check keeps a lone quote character
    // (e.g. KEY=") from counting as both the opening and the closing quote.
    const quote = value[0];
    if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1);
    }

    // Define the property explicitly: plain assignment to a key named
    // "__proto__" would go through the prototype setter and drop the entry.
    Object.defineProperty(result, key, {
      value,
      writable: true,
      enumerable: true,
      configurable: true,
    });
  }

  return result;
}
// Example: parse an inline .env-style document containing a comment line,
// unquoted values, and double- and single-quoted values, then print the result.
const env = parseEnvFile(`
# Database config
DB_HOST=localhost
DB_PORT=5432
DB_NAME="my_database"
API_KEY='secret-key-123'
`);
console.log(env);
// { DB_HOST: "localhost", DB_PORT: "5432", DB_NAME: "my_database", API_KEY: "secret-key-123" }
Error Handling Best Practices
| Technique | Purpose |
|---|---|
| Position tracking | Report exact character/line of error |
| Expected-vs-got messages | Clear diagnostic for the developer |
| Recovery points | Skip to next statement after error |
| Error collection | Report all errors, not just the first |
/**
 * Error describing a parse failure at a specific source position.
 *
 * The position is appended to the message ("<message> at line L, column C")
 * and is also exposed through the `line` and `column` properties.
 */
class ParseError extends Error {
  /**
   * @param {string} message - Description of what went wrong.
   * @param {number} line - Line where the error was detected.
   * @param {number} column - Column where the error was detected.
   */
  constructor(message, line, column) {
    super(`${message} at line ${line}, column ${column}`);
    // Without this, instances report themselves as a generic "Error".
    this.name = "ParseError";
    this.line = line;
    this.column = column;
  }
}
// Rune AI
Key Insights
- Tokenizing is the first step: Break the input string into typed tokens (numbers, operators, identifiers) before parsing
- Recursive descent maps grammar rules to functions: Each production rule (expression, term, factor) becomes a parser method
- Operator precedence is handled by nesting: Lower-precedence operators are parsed at higher levels; higher-precedence operators at deeper levels
- The AST separates parsing from evaluation: Parse once into a tree, then traverse the tree for evaluation, optimization, or code generation
- Error messages must include position: Track line and column numbers during tokenizing so parse errors are actionable
Frequently Asked Questions
When should I write a parser instead of using regex?
How do I handle string literals with escape sequences?
What is a parser combinator?
Can I parse HTML with a recursive descent parser?
How do I add support for variables in the arithmetic parser?
Conclusion
Custom parsers give you full control over how strings become structured data. Start with a tokenizer that breaks input into typed tokens. Then build a recursive descent parser with one function per grammar rule. The resulting AST can be evaluated, transformed, or compiled. For string formatting with tagged templates (a simpler form of DSL), see JavaScript tagged template literals deep dive. For the module system to organize parser code, see JavaScript named exports a complete tutorial.
More in this topic
OffscreenCanvas API in JS for UI Performance
Master the OffscreenCanvas API to offload rendering from the main thread. Covers worker-based 2D and WebGL rendering, animation loops inside workers, bitmap transfer, double buffering, chart rendering pipelines, image processing, and performance measurement strategies.
Advanced Web Workers for High Performance JS
Master Web Workers for truly parallel JavaScript execution. Covers dedicated and shared workers, structured cloning, transferable objects, SharedArrayBuffer with Atomics, worker pools, task scheduling, Comlink RPC patterns, module workers, and performance profiling strategies.
JavaScript Macros and Abstract Code Generation
Master JavaScript code generation techniques for compile-time and runtime metaprogramming. Covers AST manipulation, Babel plugin authorship, tagged template literals as macros, code generation pipelines, source-to-source transformation, compile-time evaluation, and safe eval alternatives.