Building Custom JS String Parsers Full Tutorial

A full tutorial on building custom string parsers in JavaScript. Covers tokenizing, recursive descent parsing, building an AST, parsing arithmetic expressions, parsing JSON manually, error handling with line/column tracking, and when to use custom parsers vs regex or existing libraries.

JavaScript · Intermediate
14 min read

Parsing transforms a flat string into structured data. While regex handles simple patterns, complex formats (nested brackets, operator precedence, recursive structures) need a proper parser. This guide builds parsers step by step, from simple tokenizers to recursive descent parsers that produce abstract syntax trees.

When to Write a Custom Parser

| Approach | Best For |
| --- | --- |
| `String.split()` / `String.match()` | Simple delimited data |
| Regex | Flat patterns without nesting |
| Custom tokenizer | Token-based formats, DSLs |
| Recursive descent parser | Nested or recursive structures |
| Parser combinator library | Complex grammars, reusable components |

Part 1: Tokenizing (Lexical Analysis)

A tokenizer breaks a string into meaningful chunks called tokens:

javascriptjavascript
/**
 * Splits an arithmetic source string into a flat array of tokens.
 *
 * Recognized tokens:
 *   - NUMBER:     digits with an optional single decimal point ("42", "3.14", "3.")
 *   - OPERATOR:   one of + - * / ( )
 *   - IDENTIFIER: a letter or underscore followed by letters, digits, or underscores
 * Whitespace between tokens is skipped.
 *
 * @param {string} input - The source text to tokenize.
 * @returns {Array<{type: string, value: (number|string)}>} Tokens in source order.
 * @throws {Error} If a number is malformed (for example "1.2.3") or a character
 *   does not start any known token; the message includes the 0-based position.
 */
function tokenize(input) {
  // Patterns are created once per call instead of once per scanned character.
  const whitespace = /\s/;
  const digit = /\d/;
  const numberChar = /[\d.]/;
  const validNumber = /^\d+(\.\d*)?$/;
  const identifierStart = /[a-zA-Z_]/;
  const identifierChar = /[a-zA-Z0-9_]/;
  const operators = "+-*/()";

  const tokens = [];
  let i = 0;

  while (i < input.length) {
    const char = input[i];

    // Skip whitespace
    if (whitespace.test(char)) {
      i++;
      continue;
    }

    // Numbers (integers and decimals)
    if (digit.test(char)) {
      const start = i;
      let num = "";
      while (i < input.length && numberChar.test(input[i])) {
        num += input[i++];
      }
      // parseFloat stops at the first character it cannot use, so "1.2.3"
      // would silently become 1.2. Reject such lexemes explicitly instead.
      if (!validNumber.test(num)) {
        throw new Error(`Invalid number: '${num}' at position ${start}`);
      }
      tokens.push({ type: "NUMBER", value: Number.parseFloat(num) });
      continue;
    }

    // Operators
    if (operators.includes(char)) {
      tokens.push({ type: "OPERATOR", value: char });
      i++;
      continue;
    }

    // Identifiers (variable names)
    if (identifierStart.test(char)) {
      let id = "";
      while (i < input.length && identifierChar.test(input[i])) {
        id += input[i++];
      }
      tokens.push({ type: "IDENTIFIER", value: id });
      continue;
    }

    throw new Error(`Unexpected character: '${char}' at position ${i}`);
  }

  return tokens;
}
 
// Demo: tokenize a sample expression and print the token array.
// The expected output is shown in the comment below.
console.log(tokenize("3 + 42 * (x - 1)"));
// [
//   { type: "NUMBER", value: 3 },
//   { type: "OPERATOR", value: "+" },
//   { type: "NUMBER", value: 42 },
//   { type: "OPERATOR", value: "*" },
//   { type: "OPERATOR", value: "(" },
//   { type: "IDENTIFIER", value: "x" },
//   { type: "OPERATOR", value: "-" },
//   { type: "NUMBER", value: 1 },
//   { type: "OPERATOR", value: ")" },
// ]

Part 2: Recursive Descent Parser

A recursive descent parser uses one function per grammar rule. Here is a parser for arithmetic expressions with correct operator precedence:

Grammar

CodeCode
expression  = term (('+' | '-') term)*
term        = factor (('*' | '/') factor)*
factor      = NUMBER | '(' expression ')'

Implementation

javascriptjavascript
/**
 * Recursive descent parser for arithmetic expressions.
 *
 * Grammar (one method per rule):
 *   expression = term (('+' | '-') term)*
 *   term       = factor (('*' | '/') factor)*
 *   factor     = NUMBER | '(' expression ')'
 *
 * Input is an array of `{ type, value }` tokens. Output is an AST made of
 * `{ type: "NumberLiteral", value }` and
 * `{ type: "BinaryExpression", operator, left, right }` nodes.
 */
class ArithmeticParser {
  /**
   * @param {Array<{type: string, value: (number|string)}>} tokens - Tokens to parse.
   */
  constructor(tokens) {
    this.tokens = tokens;
    this.pos = 0;
  }

  /**
   * Returns the current token without advancing.
   * @returns {?{type: string, value: (number|string)}} The token, or null at end of input.
   */
  peek() {
    return this.tokens[this.pos] || null;
  }

  /**
   * Returns the current token and advances past it.
   * @param {string} [expectedValue] - If given, the token's value must equal it.
   * @returns {{type: string, value: (number|string)}} The consumed token.
   * @throws {Error} At end of input, or when the token value does not match.
   */
  consume(expectedValue) {
    const token = this.tokens[this.pos];
    if (!token) throw new Error("Unexpected end of input");
    if (expectedValue && token.value !== expectedValue) {
      throw new Error(`Expected '${expectedValue}', got '${token.value}'`);
    }
    this.pos++;
    return token;
  }

  // expression = term (('+' | '-') term)*
  parseExpression() {
    let left = this.parseTerm();

    // Folding into `left` on each iteration makes the operators left-associative.
    while (this.peek()?.value === "+" || this.peek()?.value === "-") {
      const op = this.consume().value;
      const right = this.parseTerm();
      left = { type: "BinaryExpression", operator: op, left, right };
    }

    return left;
  }

  // term = factor (('*' | '/') factor)*
  parseTerm() {
    let left = this.parseFactor();

    while (this.peek()?.value === "*" || this.peek()?.value === "/") {
      const op = this.consume().value;
      const right = this.parseFactor();
      left = { type: "BinaryExpression", operator: op, left, right };
    }

    return left;
  }

  // factor = NUMBER | '(' expression ')'
  parseFactor() {
    const token = this.peek();

    // Report truncated input ("", "3 +", "(") the same way consume() does,
    // rather than as the opaque "Unexpected token: null".
    if (!token) {
      throw new Error("Unexpected end of input");
    }

    if (token.type === "NUMBER") {
      this.consume();
      return { type: "NumberLiteral", value: token.value };
    }

    if (token.value === "(") {
      this.consume("(");
      const expr = this.parseExpression();
      this.consume(")");
      return expr;
    }

    throw new Error(`Unexpected token: ${JSON.stringify(token)}`);
  }

  /**
   * Parses the whole token array as a single expression.
   * @returns {object} The root AST node.
   * @throws {Error} On a syntax error or when tokens remain after the expression.
   */
  parse() {
    const ast = this.parseExpression();
    if (this.pos < this.tokens.length) {
      throw new Error(`Unexpected token after expression: ${JSON.stringify(this.peek())}`);
    }
    return ast;
  }
}

Using the Parser

javascriptjavascript
// Demo: tokenize the source text, feed the tokens to the parser, and
// pretty-print the resulting AST.
const tokens = tokenize("3 + 4 * 2");
const parser = new ArithmeticParser(tokens);
const ast = parser.parse();
 
// Serialize with 2-space indentation; the expected output is shown below.
console.log(JSON.stringify(ast, null, 2));
// {
//   "type": "BinaryExpression",
//   "operator": "+",
//   "left": { "type": "NumberLiteral", "value": 3 },
//   "right": {
//     "type": "BinaryExpression",
//     "operator": "*",
//     "left": { "type": "NumberLiteral", "value": 4 },
//     "right": { "type": "NumberLiteral", "value": 2 }
//   }
// }

Multiplication is nested deeper than addition, correctly representing operator precedence.

Part 3: Evaluating the AST

javascriptjavascript
/**
 * Computes the numeric value of an arithmetic AST.
 *
 * @param {object} node - A `NumberLiteral` node (`{ type, value }`) or a
 *   `BinaryExpression` node (`{ type, operator, left, right }`).
 * @returns {number} The literal's value, or the result of applying the
 *   operator to the evaluated left and right subtrees.
 * @throws {Error} For an unrecognized node type or operator.
 */
function evaluate(node) {
  const kind = node.type;

  // Leaf: a literal evaluates to itself.
  if (kind === "NumberLiteral") {
    return node.value;
  }

  // Anything other than a binary expression is not understood.
  if (kind !== "BinaryExpression") {
    throw new Error(`Unknown node type: ${kind}`);
  }

  // Both operands are evaluated (left first) before the operator is inspected.
  const lhs = evaluate(node.left);
  const rhs = evaluate(node.right);
  const symbol = node.operator;

  if (symbol === "+") return lhs + rhs;
  if (symbol === "-") return lhs - rhs;
  if (symbol === "*") return lhs * rhs;
  if (symbol === "/") return lhs / rhs;

  throw new Error(`Unknown operator: ${symbol}`);
}
 
// Demo: run the full pipeline (tokenize -> parse -> evaluate) on a
// parenthesized expression and print the result.
const tokens2 = tokenize("(2 + 3) * 4");
const ast2 = new ArithmeticParser(tokens2).parse();
console.log(evaluate(ast2)); // 20

Part 4: Parsing Key-Value Configs

A practical real-world example parsing .env-style files:

javascriptjavascript
/**
 * Parses .env-style text (`KEY=value` per line) into a plain object.
 *
 * Blank lines and lines starting with `#` are ignored. Keys and values are
 * trimmed, and the line is split at the first `=`, so values may themselves
 * contain `=`. A value wrapped in a matching pair of single or double quotes
 * has that pair removed. All values are returned as strings.
 *
 * @param {string} input - The file contents.
 * @returns {Object<string, string>} Map of key to value; later duplicates win.
 * @throws {Error} If a non-comment line has no `=` or has nothing before it;
 *   the message includes the 1-based line number.
 */
function parseEnvFile(input) {
  const result = {};
  const lines = input.split("\n");

  for (const [index, rawLine] of lines.entries()) {
    const line = rawLine.trim();

    // Skip empty lines and comments
    if (!line || line.startsWith("#")) continue;

    const eqIndex = line.indexOf("=");
    if (eqIndex === -1) {
      throw new Error(`Invalid syntax at line ${index + 1}: missing '='`);
    }

    const key = line.slice(0, eqIndex).trim();
    if (key === "") {
      // "=value" would otherwise be stored silently under the empty-string key.
      throw new Error(`Invalid syntax at line ${index + 1}: missing key before '='`);
    }

    let value = line.slice(eqIndex + 1).trim();

    // Strip surrounding quotes. The length check matters: a lone quote
    // character both starts and ends the string, and must not be stripped.
    const quote = value[0];
    if (value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1);
    }

    result[key] = value;
  }

  return result;
}
 
// Demo: parse a small inline .env document containing a comment, bare
// values, and both quote styles.
const env = parseEnvFile(`
# Database config
DB_HOST=localhost
DB_PORT=5432
DB_NAME="my_database"
API_KEY='secret-key-123'
`);
 
// Every value comes back as a string (note DB_PORT) with surrounding quotes removed.
console.log(env);
// { DB_HOST: "localhost", DB_PORT: "5432", DB_NAME: "my_database", API_KEY: "secret-key-123" }

Error Handling Best Practices

| Technique | Purpose |
| --- | --- |
| Position tracking | Report exact character/line of error |
| Expected-vs-got messages | Clear diagnostic for the developer |
| Recovery points | Skip to next statement after error |
| Error collection | Report all errors, not just the first |
javascriptjavascript
/**
 * Error describing a parse failure at a specific source location.
 *
 * The location is appended to the message and is also exposed as the
 * `line` and `column` properties for programmatic use.
 */
class ParseError extends Error {
  /**
   * @param {string} message - Description of what went wrong.
   * @param {number} line - Line of the error location.
   * @param {number} column - Column of the error location.
   */
  constructor(message, line, column) {
    super(`${message} at line ${line}, column ${column}`);
    // Without this, `name` is inherited as "Error", so logs and
    // `err.name` checks cannot tell this type from a generic Error.
    this.name = "ParseError";
    this.line = line;
    this.column = column;
  }
}
Rune AI

Rune AI

Key Insights

  • Tokenizing is the first step: Break the input string into typed tokens (numbers, operators, identifiers) before parsing
  • Recursive descent maps grammar rules to functions: Each production rule (expression, term, factor) becomes a parser method
  • Operator precedence is handled by nesting: Lower-precedence operators are parsed at higher levels; higher-precedence operators at deeper levels
  • The AST separates parsing from evaluation: Parse once into a tree, then traverse the tree for evaluation, optimization, or code generation
  • Error messages must include position: Track line and column numbers during tokenizing so parse errors are actionable
RunePowered by Rune AI

Frequently Asked Questions

When should I write a parser instead of using regex?

When the input has nesting (brackets, tags), recursive structure, or operator precedence. Regex cannot handle arbitrary nesting. For flat delimited text, regex or `split()` is fine.

How do I handle string literals with escape sequences?

In the tokenizer, when you encounter a quote character, consume characters until the matching close quote. Handle `\"` by checking for backslash-escaped characters during consumption.

What is a parser combinator?

A function that takes small parsers and combines them into larger ones. Libraries like Parsimmon or Chevrotain use this pattern. They are more maintainable than hand-written parsers for large grammars.

Can I parse HTML with a recursive descent parser?

Technically yes, but real-world HTML is messy (self-closing tags, optional end tags, attribute variations). Use a battle-tested parser like `htmlparser2` for production HTML parsing.

How do I add support for variables in the arithmetic parser?

Add an `IDENTIFIER` case to `parseFactor()` that returns a `{ type: "Variable", name: token.value }` node. During evaluation, look up the variable in a scope object passed as an argument.

Conclusion

Custom parsers give you full control over how strings become structured data. Start with a tokenizer that breaks input into typed tokens. Then build a recursive descent parser with one function per grammar rule. The resulting AST can be evaluated, transformed, or compiled. For string formatting with tagged templates (a simpler form of DSL), see JavaScript tagged template literals deep dive. For the module system to organize parser code, see JavaScript named exports a complete tutorial.