// quartz-org-roam/quartz/util/base/compiler/lexer.ts

import { Position, Span } from "./ast"
import { Diagnostic } from "./errors"
import {
  Operator,
  Punctuation,
  Token,
  StringToken,
  RegexToken,
  NumberToken,
  BooleanToken,
  NullToken,
  ThisToken,
  IdentifierToken,
  OperatorToken,
  PunctuationToken,
  EofToken,
} from "./tokens"

/** Outcome of a lexing pass: the tokens produced plus the diagnostics recorded along the way. */
type LexResult = {
  tokens: Token[]
  diagnostics: Diagnostic[]
}

/** Every operator lexeme the lexer recognizes (two-character forms first, then one-character forms). */
const operatorTokens: Operator[] = [
  // two-character operators
  "==",
  "!=",
  ">=",
  "<=",
  "&&",
  "||",
  // one-character operators
  "+",
  "-",
  "*",
  "/",
  "%",
  "!",
  ">",
  "<",
]

/** Every punctuation lexeme the lexer recognizes. */
const punctuationTokens: Punctuation[] = [".", ",", "(", ")", "[", "]"]

// Membership sets built once from the tables above so the guards are plain lookups.
const operatorLookup = new Set<string>(operatorTokens)
const punctuationLookup = new Set<string>(punctuationTokens)

/** Type guard: true when `value` is exactly one of the entries of `operatorTokens`. */
const isOperator = (value: string): value is Operator => operatorLookup.has(value)

/** Type guard: true when `value` is exactly one of the entries of `punctuationTokens`. */
const isPunctuation = (value: string): value is Punctuation => punctuationLookup.has(value)

/**
 * Converts an expression source string into a flat list of tokens.
 *
 * Recognizes string literals (single- or double-quoted, with backslash
 * escapes), regex literals with `gimsuy` flags, decimal numbers, the keywords
 * `true` / `false` / `null` / `this`, ASCII identifiers, operators and
 * punctuation. Whitespace (space, tab, CR, LF) is skipped.
 *
 * The lexer never throws on malformed input: problems are recorded as
 * diagnostics of kind `"lex"` and scanning continues, so the returned token
 * list always ends with an `eof` token.
 *
 * Positions carry a 0-based `offset` and 1-based `line` / `column`, all
 * counted in UTF-16 code units.
 *
 * @param input - Source text to tokenize.
 * @param file - Optional file name copied onto every span.
 * @returns The tokens (terminated by `eof`) and the diagnostics collected.
 */
export function lex(input: string, file?: string): LexResult {
  const tokens: Token[] = []
  const diagnostics: Diagnostic[] = []
  let index = 0
  let line = 1
  let column = 1
  // A "/" starts a regex literal only where an operand may begin; otherwise it
  // is lexed as the division operator.
  let canStartRegex = true

  // Hoisted so they are not rebuilt for every character scanned.
  const regexFlagPattern = /^[gimsuy]$/
  const namedEscapes = new Map<string, string>([
    ["n", "\n"],
    ["t", "\t"],
    ["r", "\r"],
  ])

  const currentPosition = (): Position => ({ offset: index, line, column })

  /** Span running from `start` to the current scan position. */
  const spanFrom = (start: Position): Span => ({ start, end: currentPosition(), file })

  /** Consumes one code unit, keeping line/column in sync. Callers must ensure input remains. */
  const advance = (): string => {
    const ch = input.charAt(index)
    index += 1
    if (ch === "\n") {
      line += 1
      column = 1
    } else {
      column += 1
    }
    return ch
  }

  /** Looks ahead without consuming; yields "" past the end of input. */
  const peek = (offset = 0): string => input[index + offset] ?? ""

  const addDiagnostic = (message: string, span: Span): void => {
    diagnostics.push({ kind: "lex", message, span })
  }

  /** Appends a token and updates whether the next "/" may open a regex literal. */
  const pushToken = (token: Token): void => {
    tokens.push(token)
    if (token.type === "operator") {
      canStartRegex = true
    } else if (token.type === "punctuation") {
      // After an opening bracket or a comma an operand is expected.
      canStartRegex = token.value === "(" || token.value === "[" || token.value === ","
    } else {
      canStartRegex = false
    }
  }

  const isWhitespace = (ch: string) => ch === " " || ch === "\t" || ch === "\n" || ch === "\r"
  const isDigit = (ch: string) => ch >= "0" && ch <= "9"
  const isIdentStart = (ch: string) =>
    (ch >= "a" && ch <= "z") || (ch >= "A" && ch <= "Z") || ch === "_"
  const isIdentContinue = (ch: string) => isIdentStart(ch) || isDigit(ch)

  while (index < input.length) {
    const ch = peek()

    if (isWhitespace(ch)) {
      advance()
      continue
    }

    const start = currentPosition()

    // "=>" (optionally with whitespace between "=" and ">") is reported as an
    // unsupported arrow function and skipped as a whole.
    if (ch === "=" && peek(1) !== "=") {
      let offset = 1
      while (isWhitespace(peek(offset))) {
        offset += 1
      }
      if (peek(offset) === ">") {
        // Consume "=", the whitespace in between, and the ">".
        for (let consumed = 0; consumed <= offset; consumed += 1) {
          advance()
        }
        addDiagnostic(
          "arrow functions are not supported, use list.filter(expression)",
          spanFrom(start),
        )
        continue
      }
    }

    // String literal.
    if (ch === '"' || ch === "'") {
      const quote = advance()
      let value = ""
      let closed = false

      while (index < input.length) {
        const curr = advance()
        if (curr === quote) {
          closed = true
          break
        }
        // Only treat "\" as an escape when a character follows it; a trailing
        // backslash at end of input is kept literally instead of reading past
        // the end of the string.
        if (curr === "\\" && index < input.length) {
          const escaped = advance()
          // \n, \t, \r map to control characters; any other escaped character
          // (including quotes and backslash) stands for itself.
          value += namedEscapes.get(escaped) ?? escaped
          continue
        }
        value += curr
      }

      const span = spanFrom(start)
      if (!closed) addDiagnostic("unterminated string literal", span)
      const token: StringToken = { type: "string", value, span }
      pushToken(token)
      continue
    }

    // Regex literal. "//" and a lone trailing "/" fall through to operator handling.
    if (ch === "/" && canStartRegex) {
      const next = peek(1)
      if (next !== "/" && next !== "") {
        advance()
        let pattern = ""
        let closed = false
        let inClass = false
        while (index < input.length) {
          const curr = advance()
          if (curr === "\\" && index < input.length) {
            // Keep escapes verbatim so the pattern can be compiled later.
            const escaped = advance()
            pattern += `\\${escaped}`
            continue
          }
          if (curr === "[" && !inClass) inClass = true
          if (curr === "]" && inClass) inClass = false
          // A "/" inside a character class does not terminate the literal.
          if (curr === "/" && !inClass) {
            closed = true
            break
          }
          pattern += curr
        }
        let flags = ""
        while (index < input.length && regexFlagPattern.test(peek())) {
          flags += advance()
        }
        const span = spanFrom(start)
        if (!closed) addDiagnostic("unterminated regex literal", span)
        const token: RegexToken = { type: "regex", pattern, flags, span }
        pushToken(token)
        continue
      }
    }

    // Number literal: digits with an optional fractional part.
    if (isDigit(ch)) {
      let num = ""
      while (index < input.length && isDigit(peek())) {
        num += advance()
      }
      // Only take the "." when a digit follows, so "1.foo" stays a member access.
      if (peek() === "." && isDigit(peek(1))) {
        num += advance()
        while (index < input.length && isDigit(peek())) {
          num += advance()
        }
      }
      const token: NumberToken = { type: "number", value: Number(num), span: spanFrom(start) }
      pushToken(token)
      continue
    }

    // Identifier or keyword.
    if (isIdentStart(ch)) {
      let ident = ""
      while (index < input.length && isIdentContinue(peek())) {
        ident += advance()
      }
      const span = spanFrom(start)
      if (ident === "true" || ident === "false") {
        const token: BooleanToken = { type: "boolean", value: ident === "true", span }
        pushToken(token)
      } else if (ident === "null") {
        const token: NullToken = { type: "null", span }
        pushToken(token)
      } else if (ident === "this") {
        const token: ThisToken = { type: "this", span }
        pushToken(token)
      } else {
        const token: IdentifierToken = { type: "identifier", value: ident, span }
        pushToken(token)
      }
      continue
    }

    // Operators: try the two-character form before the one-character form.
    const twoChar = ch + peek(1)
    if (isOperator(twoChar)) {
      advance()
      advance()
      const token: OperatorToken = { type: "operator", value: twoChar, span: spanFrom(start) }
      pushToken(token)
      continue
    }

    if (isOperator(ch)) {
      advance()
      const token: OperatorToken = { type: "operator", value: ch, span: spanFrom(start) }
      pushToken(token)
      continue
    }

    if (isPunctuation(ch)) {
      advance()
      const token: PunctuationToken = { type: "punctuation", value: ch, span: spanFrom(start) }
      pushToken(token)
      continue
    }

    // Anything else is unexpected. Consume a whole code point so a character
    // outside the BMP (a surrogate pair) yields one diagnostic rather than two
    // diagnostics each holding half of the character.
    const codePoint = input.codePointAt(index)
    const unexpected =
      codePoint !== undefined && codePoint > 0xffff ? String.fromCodePoint(codePoint) : ch
    for (let consumed = 0; consumed < unexpected.length; consumed += 1) {
      advance()
    }
    addDiagnostic(`unexpected character: ${unexpected}`, spanFrom(start))
  }

  // Zero-width end-of-input marker.
  const eofPos = currentPosition()
  const eofToken: EofToken = { type: "eof", span: { start: eofPos, end: eofPos, file } }
  tokens.push(eofToken)

  return { tokens, diagnostics }
}