Building Custom JS String Parsers Full Tutorial

A full tutorial on building custom string parsers in JavaScript. Covers tokenizing, recursive descent parsing, building an AST, parsing arithmetic expressions, parsing JSON manually, error handling with line/column tracking, and when to use custom parsers vs regex or existing libraries.

JavaScript · Intermediate

RuneHub Team

March 2, 2026

14 min read

RuneHub Team

Mar 2, 2026

14 min read

Parsing transforms a flat string into structured data. While regex handles simple patterns, complex formats (nested brackets, operator precedence, recursive structures) need a proper parser. This guide builds parsers step by step, from simple tokenizers to recursive descent parsers that produce abstract syntax trees.

When to Write a Custom Parser

Approach	Best For
`String.split()` / `String.match()`	Simple delimited data
Regex	Flat patterns without nesting
Custom tokenizer	Token-based formats, DSLs
Recursive descent parser	Nested or recursive structures
Parser combinator library	Complex grammars, reusable components

Part 1: Tokenizing (Lexical Analysis)

A tokenizer breaks a string into meaningful chunks called tokens:

javascript

/**
 * Splits an arithmetic source string into a flat list of tokens.
 *
 * Recognized tokens:
 *   - NUMBER:     integers and decimals (`3`, `4.25`, `5.`), value is a number
 *   - OPERATOR:   one of `+ - * / ( )`, value is the character
 *   - IDENTIFIER: a letter or underscore followed by letters, digits, or
 *                 underscores, value is the name
 * Whitespace between tokens is skipped.
 *
 * @param {string} input - Source text to tokenize.
 * @returns {Array<{type: string, value: (number|string)}>} Tokens in source order.
 * @throws {Error} On a malformed number (e.g. `1.2.3`) or any character that
 *   does not start a known token; the message includes the 0-based position.
 */
function tokenize(input) {
  const tokens = [];
  let i = 0;

  while (i < input.length) {
    const char = input[i];

    // Skip whitespace
    if (/\s/.test(char)) {
      i++;
      continue;
    }

    // Numbers (integers and decimals)
    if (/\d/.test(char)) {
      const start = i;
      let num = "";
      while (i < input.length && /[\d.]/.test(input[i])) {
        num += input[i++];
      }
      // parseFloat stops at the first character it cannot use, so "1.2.3"
      // would silently become 1.2. Reject lexemes with more than one dot.
      if (!/^\d+(\.\d*)?$/.test(num)) {
        throw new Error(`Invalid number: '${num}' at position ${start}`);
      }
      tokens.push({ type: "NUMBER", value: parseFloat(num) });
      continue;
    }

    // Operators (parentheses are reported as OPERATOR tokens too)
    if ("+-*/()".includes(char)) {
      tokens.push({ type: "OPERATOR", value: char });
      i++;
      continue;
    }

    // Identifiers (variable names)
    if (/[a-zA-Z_]/.test(char)) {
      let id = "";
      while (i < input.length && /[a-zA-Z0-9_]/.test(input[i])) {
        id += input[i++];
      }
      tokens.push({ type: "IDENTIFIER", value: id });
      continue;
    }

    throw new Error(`Unexpected character: '${char}' at position ${i}`);
  }

  return tokens;
}
 
// Example: tokenize an expression that mixes numbers, operators, parentheses,
// and an identifier. Whitespace is dropped; parentheses come back as OPERATOR tokens.
console.log(tokenize("3 + 42 * (x - 1)"));
// [
//   { type: "NUMBER", value: 3 },
//   { type: "OPERATOR", value: "+" },
//   { type: "NUMBER", value: 42 },
//   { type: "OPERATOR", value: "*" },
//   { type: "OPERATOR", value: "(" },
//   { type: "IDENTIFIER", value: "x" },
//   { type: "OPERATOR", value: "-" },
//   { type: "NUMBER", value: 1 },
//   { type: "OPERATOR", value: ")" },
// ]

Part 2: Recursive Descent Parser

A recursive descent parser uses one function per grammar rule. Here is a parser for arithmetic expressions with correct operator precedence:

Grammar

Code

expression  = term (('+' | '-') term)*
term        = factor (('*' | '/') factor)*
factor      = NUMBER | '(' expression ')'

Implementation

javascript

/**
 * Recursive descent parser for arithmetic expressions.
 *
 * Grammar (one method per rule):
 *   expression = term (('+' | '-') term)*
 *   term       = factor (('*' | '/') factor)*
 *   factor     = NUMBER | '(' expression ')'
 *
 * Input is an array of `{ type, value }` tokens. Output is an AST made of
 * `{ type: "NumberLiteral", value }` and
 * `{ type: "BinaryExpression", operator, left, right }` nodes.
 */
class ArithmeticParser {
  /**
   * @param {Array<{type: string, value: (number|string)}>} tokens - Tokens to parse.
   */
  constructor(tokens) {
    this.tokens = tokens;
    this.pos = 0;
  }

  /**
   * Returns the current token without advancing.
   * @returns {?{type: string, value: (number|string)}} The token, or null at end of input.
   */
  peek() {
    return this.tokens[this.pos] || null;
  }

  /**
   * Returns the current token and advances past it.
   * @param {string} [expectedValue] - If given, the token's value must equal it.
   * @returns {{type: string, value: (number|string)}} The consumed token.
   * @throws {Error} At end of input, or when the token value does not match.
   */
  consume(expectedValue) {
    const token = this.tokens[this.pos];
    if (!token) throw new Error("Unexpected end of input");
    if (expectedValue && token.value !== expectedValue) {
      throw new Error(`Expected '${expectedValue}', got '${token.value}'`);
    }
    this.pos++;
    return token;
  }

  // expression = term (('+' | '-') term)*
  parseExpression() {
    let left = this.parseTerm();

    // Folding into `left` on each iteration makes the operators left-associative.
    while (this.peek()?.value === "+" || this.peek()?.value === "-") {
      const op = this.consume().value;
      const right = this.parseTerm();
      left = { type: "BinaryExpression", operator: op, left, right };
    }

    return left;
  }

  // term = factor (('*' | '/') factor)*
  parseTerm() {
    let left = this.parseFactor();

    while (this.peek()?.value === "*" || this.peek()?.value === "/") {
      const op = this.consume().value;
      const right = this.parseFactor();
      left = { type: "BinaryExpression", operator: op, left, right };
    }

    return left;
  }

  // factor = NUMBER | '(' expression ')'
  parseFactor() {
    const token = this.peek();

    // Report truncated input ("3 +", "") the same way consume() does instead
    // of the less helpful "Unexpected token: null".
    if (token === null) {
      throw new Error("Unexpected end of input");
    }

    if (token.type === "NUMBER") {
      this.consume();
      return { type: "NumberLiteral", value: token.value };
    }

    if (token.value === "(") {
      this.consume("(");
      const expr = this.parseExpression();
      this.consume(")");
      // Parentheses only affect grouping; no dedicated AST node is produced.
      return expr;
    }

    throw new Error(`Unexpected token: ${JSON.stringify(token)}`);
  }

  /**
   * Parses the whole token list as a single expression.
   * @returns {object} Root AST node.
   * @throws {Error} On a syntax error or when tokens remain after the expression.
   */
  parse() {
    const ast = this.parseExpression();
    if (this.pos < this.tokens.length) {
      throw new Error(`Unexpected token after expression: ${JSON.stringify(this.peek())}`);
    }
    return ast;
  }
}

Using the Parser

javascript

// Example: tokenize, parse, and pretty-print the AST for "3 + 4 * 2".
const tokens = tokenize("3 + 4 * 2");
const parser = new ArithmeticParser(tokens);
const ast = parser.parse();
 
// The "*" node ends up as the right child of "+", so it is evaluated first.
console.log(JSON.stringify(ast, null, 2));
// {
//   "type": "BinaryExpression",
//   "operator": "+",
//   "left": { "type": "NumberLiteral", "value": 3 },
//   "right": {
//     "type": "BinaryExpression",
//     "operator": "*",
//     "left": { "type": "NumberLiteral", "value": 4 },
//     "right": { "type": "NumberLiteral", "value": 2 }
//   }
// }

Multiplication is nested deeper than addition, correctly representing operator precedence.

Part 3: Evaluating the AST

javascript

/**
 * Computes the numeric value of an arithmetic AST by walking it recursively.
 *
 * @param {object} node - A `NumberLiteral` node (`{ type, value }`) or a
 *   `BinaryExpression` node (`{ type, operator, left, right }`).
 * @returns {number} The value of the expression rooted at `node`.
 * @throws {Error} If the node type or the operator is not recognized.
 */
function evaluate(node) {
  switch (node.type) {
    case "NumberLiteral":
      return node.value;

    case "BinaryExpression": {
      // Both operands are evaluated before the operator is inspected.
      const lhs = evaluate(node.left);
      const rhs = evaluate(node.right);
      const { operator } = node;

      if (operator === "+") return lhs + rhs;
      if (operator === "-") return lhs - rhs;
      if (operator === "*") return lhs * rhs;
      if (operator === "/") return lhs / rhs;
      throw new Error(`Unknown operator: ${operator}`);
    }

    default:
      throw new Error(`Unknown node type: ${node.type}`);
  }
}
 
// Example: full pipeline (tokenize -> parse -> evaluate). The parentheses force
// the addition to happen before the multiplication.
const tokens2 = tokenize("(2 + 3) * 4");
const ast2 = new ArithmeticParser(tokens2).parse();
console.log(evaluate(ast2)); // 20

Part 4: Parsing Key-Value Configs

A practical real-world example parsing .env-style files:

javascript

/**
 * Parses `.env`-style text (`KEY=value`, one entry per line) into an object.
 *
 * - Blank lines and lines starting with `#` are skipped.
 * - Each line is split at the first `=`, so values may themselves contain `=`.
 * - Keys and values are trimmed; a value wrapped in a matching pair of single
 *   or double quotes has that pair removed.
 * - All values are returned as strings.
 *
 * @param {string} input - Full text of the config file.
 * @returns {Object<string, string>} Map of keys to values; later duplicates win.
 * @throws {Error} If a non-comment line has no `=` or has nothing before it;
 *   the message includes the 1-based line number.
 */
function parseEnvFile(input) {
  const result = {};
  const lines = input.split("\n");

  for (let lineNum = 0; lineNum < lines.length; lineNum++) {
    // trim() also removes the trailing "\r" of CRLF line endings.
    const line = lines[lineNum].trim();

    // Skip empty lines and comments
    if (!line || line.startsWith("#")) continue;

    const eqIndex = line.indexOf("=");
    if (eqIndex === -1) {
      throw new Error(`Invalid syntax at line ${lineNum + 1}: missing '='`);
    }

    const key = line.slice(0, eqIndex).trim();
    if (key === "") {
      throw new Error(`Invalid syntax at line ${lineNum + 1}: missing key before '='`);
    }

    let value = line.slice(eqIndex + 1).trim();

    // Strip surrounding quotes. The length check keeps a lone quote character
    // from matching itself as both the opening and the closing quote.
    const quote = value[0];
    const isQuoted =
      value.length >= 2 &&
      (quote === '"' || quote === "'") &&
      value.endsWith(quote);
    if (isQuoted) {
      value = value.slice(1, -1);
    }

    result[key] = value;
  }

  return result;
}
 
// Example: parse a small config containing a comment, bare values, and both quote styles.
const env = parseEnvFile(`
# Database config
DB_HOST=localhost
DB_PORT=5432
DB_NAME="my_database"
API_KEY='secret-key-123'
`);
 
// Every value is a string (note DB_PORT) and the surrounding quotes are removed.
console.log(env);
// { DB_HOST: "localhost", DB_PORT: "5432", DB_NAME: "my_database", API_KEY: "secret-key-123" }

Error Handling Best Practices

Technique	Purpose
Position tracking	Report exact character/line of error
Expected-vs-got messages	Clear diagnostic for the developer
Recovery points	Skip to next statement after error
Error collection	Report all errors, not just the first

javascript

/**
 * Error raised for a syntax problem at a known source location.
 *
 * The location is appended to the message and also exposed as the `line` and
 * `column` properties so callers can report it programmatically.
 */
class ParseError extends Error {
  /**
   * @param {string} message - Description of the problem, without the location.
   * @param {number} line - Line where the problem was found.
   * @param {number} column - Column where the problem was found.
   */
  constructor(message, line, column) {
    super(`${message} at line ${line}, column ${column}`);
    // Without this, `name` is inherited as "Error", so logs and
    // `err.name` checks cannot tell a ParseError from a generic Error.
    this.name = "ParseError";
    this.line = line;
    this.column = column;
  }
}

Rune AI

Key Insights

Tokenizing is the first step: Break the input string into typed tokens (numbers, operators, identifiers) before parsing
Recursive descent maps grammar rules to functions: Each production rule (expression, term, factor) becomes a parser method
Operator precedence is handled by nesting: Lower-precedence operators are parsed at higher levels; higher-precedence operators at deeper levels
The AST separates parsing from evaluation: Parse once into a tree, then traverse the tree for evaluation, optimization, or code generation
Error messages must include position: Track line and column numbers during tokenizing so parse errors are actionable

Frequently Asked Questions

When should I write a parser instead of using regex?

When the input has nesting (brackets, tags), recursive structure, or operator precedence. Regex cannot handle arbitrary nesting. For flat delimited text, regex or `split()` is fine.

How do I handle string literals with escape sequences?

In the tokenizer, when you encounter a quote character, consume characters until the matching close quote. Handle `\"` by checking for backslash-escaped characters during consumption.

What is a parser combinator?

A function that takes small parsers and combines them into larger ones. Libraries like Parsimmon or Chevrotain use this pattern. They are more maintainable than hand-written parsers for large grammars.

Can I parse HTML with a recursive descent parser?

Technically yes, but real-world HTML is messy (self-closing tags, optional end tags, attribute variations). Use a battle-tested parser like `htmlparser2` for production HTML parsing.

How do I add support for variables in the arithmetic parser?

Add an `IDENTIFIER` case to `parseFactor()` that returns a `{ type: "Variable", name: token.value }` node. During evaluation, look up the variable in a scope object passed as an argument.

Conclusion

Custom parsers give you full control over how strings become structured data. Start with a tokenizer that breaks input into typed tokens. Then build a recursive descent parser with one function per grammar rule. The resulting AST can be evaluated, transformed, or compiled. For string formatting with tagged templates (a simpler form of DSL), see JavaScript tagged template literals deep dive. For the module system to organize parser code, see JavaScript named exports a complete tutorial.

Building Custom JS String Parsers Full Tutorial

When to Write a Custom Parser

Part 1: Tokenizing (Lexical Analysis)

Part 2: Recursive Descent Parser

Grammar

Implementation

Using the Parser

Part 3: Evaluating the AST

Part 4: Parsing Key-Value Configs

Error Handling Best Practices

Rune AI

Frequently Asked Questions

When should I write a parser instead of using regex?

How do I handle string literals with escape sequences?

What is a parser combinator?

Can I parse HTML with a recursive descent parser?

How do I add support for variables in the arithmetic parser?

Conclusion

Tags

More in this topic

OffscreenCanvas API in JS for UI Performance

Advanced Web Workers for High Performance JS

JavaScript Macros and Abstract Code Generation

Stay Updated