Files
quartz-org-roam/quartz/util/base/compiler/lexer.ts
2026-01-30 02:25:53 -05:00

301 lines
7.7 KiB
TypeScript

import { Position, Span } from "./ast"
import { Diagnostic } from "./errors"
import {
Operator,
Punctuation,
Token,
StringToken,
RegexToken,
NumberToken,
BooleanToken,
NullToken,
ThisToken,
IdentifierToken,
OperatorToken,
PunctuationToken,
EofToken,
} from "./tokens"
/** Result of a lexing pass: the token stream plus any problems found along the way. */
type LexResult = {
  tokens: Token[]
  diagnostics: Diagnostic[]
}

/**
 * Every operator spelling the lexer recognises. Two-character operators are
 * listed before the single-character ones.
 */
const operatorTokens: Operator[] = [
  // two-character operators
  "==",
  "!=",
  ">=",
  "<=",
  "&&",
  "||",
  // single-character operators
  "+",
  "-",
  "*",
  "/",
  "%",
  "!",
  ">",
  "<",
]

/** Every punctuation character the lexer recognises. */
const punctuationTokens: Punctuation[] = [".", ",", "(", ")", "[", "]"]

/** Type guard: true when `value` is exactly one of the spellings in `operatorTokens`. */
function isOperator(value: string): value is Operator {
  for (const candidate of operatorTokens) {
    if (candidate === value) return true
  }
  return false
}

/** Type guard: true when `value` is exactly one of the characters in `punctuationTokens`. */
function isPunctuation(value: string): value is Punctuation {
  for (const candidate of punctuationTokens) {
    if (candidate === value) return true
  }
  return false
}
/**
 * Tokenizes an expression string.
 *
 * Scans `input` left to right and produces string, regex, number, boolean,
 * null, this, identifier, operator and punctuation tokens, followed by exactly
 * one trailing `eof` token. Lexing never throws: every problem is recorded as
 * a diagnostic of kind "lex" and scanning continues with the next character.
 *
 * Positions carry a 0-based `offset` (in UTF-16 code units) and 1-based
 * `line` / `column` values; a "\n" starts a new line.
 *
 * @param input - Source text to tokenize.
 * @param file - Optional file name copied onto every span.
 * @returns The tokens (always ending in `eof`) and the collected diagnostics.
 */
export function lex(input: string, file?: string): LexResult {
  const tokens: Token[] = []
  const diagnostics: Diagnostic[] = []
  let index = 0
  let line = 1
  let column = 1
  // A "/" opens a regex literal only where an operand may begin; elsewhere it
  // is the division operator. Updated after every emitted token.
  let canStartRegex = true
  // Hoisted so the flag loop does not rebuild the pattern for every character.
  const regexFlag = /^[gimsuy]$/

  const makePosition = (offset: number, lineValue: number, columnValue: number): Position => ({
    offset,
    line: lineValue,
    column: columnValue,
  })
  const currentPosition = (): Position => makePosition(index, line, column)
  const makeSpan = (start: Position, end: Position): Span => ({ start, end, file })

  // Consumes one code unit and keeps line/column in sync. At end of input it
  // returns "" and leaves the cursor where it is, so offsets never run past
  // the end of `input`.
  const advance = (): string => {
    if (index >= input.length) return ""
    const ch = input.charAt(index)
    index += 1
    if (ch === "\n") {
      line += 1
      column = 1
    } else {
      column += 1
    }
    return ch
  }

  // Looks ahead without consuming; "" means "past the end of input".
  const peek = (offset = 0): string => input[index + offset] ?? ""

  const addDiagnostic = (message: string, span: Span) => {
    diagnostics.push({ kind: "lex", message, span })
  }

  // After an operator, or after "(", "[" or ",", the next "/" starts a regex.
  // After any other token it is treated as division.
  const updateRegexState = (token: Token) => {
    if (token.type === "operator") {
      canStartRegex = true
    } else if (token.type === "punctuation") {
      canStartRegex = token.value === "(" || token.value === "[" || token.value === ","
    } else {
      canStartRegex = false
    }
  }

  // Single exit point for emitted tokens so the regex state can never be forgotten.
  const pushToken = (token: Token) => {
    tokens.push(token)
    updateRegexState(token)
  }

  const isWhitespace = (ch: string) => ch === " " || ch === "\t" || ch === "\n" || ch === "\r"
  const isDigit = (ch: string) => ch >= "0" && ch <= "9"
  const isIdentStart = (ch: string) =>
    (ch >= "a" && ch <= "z") || (ch >= "A" && ch <= "Z") || ch === "_"
  const isIdentContinue = (ch: string) => isIdentStart(ch) || isDigit(ch)

  while (index < input.length) {
    const ch = peek()
    if (isWhitespace(ch)) {
      advance()
      continue
    }
    const start = currentPosition()

    // "=>" (whitespace allowed between the two characters): report it as an
    // unsupported arrow function and emit no token. A lone "=" that is not
    // part of an arrow falls through to the "unexpected character" case.
    if (ch === "=" && peek(1) !== "=") {
      let offset = 1
      while (isWhitespace(peek(offset))) {
        offset += 1
      }
      if (peek(offset) === ">") {
        // Consume "=", the whitespace in between, and ">".
        for (let step = 0; step <= offset; step += 1) {
          advance()
        }
        addDiagnostic(
          "arrow functions are not supported, use list.filter(expression)",
          makeSpan(start, currentPosition()),
        )
        continue
      }
    }

    // String literal in single or double quotes.
    if (ch === '"' || ch === "'") {
      const quote = advance()
      let value = ""
      let closed = false
      while (index < input.length) {
        const curr = advance()
        if (curr === quote) {
          closed = true
          break
        }
        if (curr !== "\\") {
          value += curr
          continue
        }
        if (index >= input.length) {
          // Dangling backslash at end of input: there is nothing to escape, so
          // keep it literally (same as the regex branch) rather than reading
          // past the end.
          value += curr
          break
        }
        const next = advance()
        if (next === "n") value += "\n"
        else if (next === "t") value += "\t"
        else if (next === "r") value += "\r"
        // A backslash, a quote, or any unrecognised escape yields the character itself.
        else value += next
      }
      const span = makeSpan(start, currentPosition())
      if (!closed) addDiagnostic("unterminated string literal", span)
      const token: StringToken = { type: "string", value, span }
      pushToken(token)
      continue
    }

    // Regex literal. "//" and a "/" at the very end of input are not regexes.
    if (ch === "/" && canStartRegex) {
      const next = peek(1)
      if (next !== "/" && next !== "") {
        advance()
        let pattern = ""
        let closed = false
        // Inside a character class ("[...]") an unescaped "/" does not end the literal.
        let inClass = false
        while (index < input.length) {
          const curr = advance()
          if (curr === "\\" && index < input.length) {
            // Keep escapes verbatim in the pattern text.
            pattern += `\\${advance()}`
            continue
          }
          if (curr === "[" && !inClass) inClass = true
          if (curr === "]" && inClass) inClass = false
          if (curr === "/" && !inClass) {
            closed = true
            break
          }
          pattern += curr
        }
        let flags = ""
        while (index < input.length && regexFlag.test(peek())) {
          flags += advance()
        }
        const span = makeSpan(start, currentPosition())
        if (!closed) addDiagnostic("unterminated regex literal", span)
        const token: RegexToken = { type: "regex", pattern, flags, span }
        pushToken(token)
        continue
      }
    }

    // Number: digits with an optional fractional part. The "." is consumed
    // only when a digit follows it, so "1.foo" lexes as 1, ".", foo.
    if (isDigit(ch)) {
      let num = ""
      while (index < input.length && isDigit(peek())) {
        num += advance()
      }
      if (peek() === "." && isDigit(peek(1))) {
        num += advance()
        while (index < input.length && isDigit(peek())) {
          num += advance()
        }
      }
      const span = makeSpan(start, currentPosition())
      const token: NumberToken = { type: "number", value: Number(num), span }
      pushToken(token)
      continue
    }

    // Identifier or keyword (true / false / null / this).
    if (isIdentStart(ch)) {
      let ident = ""
      while (index < input.length && isIdentContinue(peek())) {
        ident += advance()
      }
      const span = makeSpan(start, currentPosition())
      if (ident === "true" || ident === "false") {
        const token: BooleanToken = { type: "boolean", value: ident === "true", span }
        pushToken(token)
      } else if (ident === "null") {
        const token: NullToken = { type: "null", span }
        pushToken(token)
      } else if (ident === "this") {
        const token: ThisToken = { type: "this", span }
        pushToken(token)
      } else {
        const token: IdentifierToken = { type: "identifier", value: ident, span }
        pushToken(token)
      }
      continue
    }

    // Operators: try the two-character form first so "==" is not split into "=" "=".
    const twoChar = ch + peek(1)
    if (isOperator(twoChar)) {
      advance()
      advance()
      const span = makeSpan(start, currentPosition())
      const token: OperatorToken = { type: "operator", value: twoChar, span }
      pushToken(token)
      continue
    }
    if (isOperator(ch)) {
      advance()
      const span = makeSpan(start, currentPosition())
      const token: OperatorToken = { type: "operator", value: ch, span }
      pushToken(token)
      continue
    }

    if (isPunctuation(ch)) {
      advance()
      const span = makeSpan(start, currentPosition())
      const token: PunctuationToken = { type: "punctuation", value: ch, span }
      pushToken(token)
      continue
    }

    // Anything else is reported and skipped. A surrogate pair is consumed as a
    // whole so an astral character yields one diagnostic with a well-formed
    // message instead of two diagnostics holding lone surrogates.
    let unexpected = advance()
    const lead = unexpected.charCodeAt(0)
    const trail = peek().charCodeAt(0) // NaN at end of input, which fails the range test
    if (lead >= 0xd800 && lead <= 0xdbff && trail >= 0xdc00 && trail <= 0xdfff) {
      unexpected += advance()
    }
    addDiagnostic(`unexpected character: ${unexpected}`, makeSpan(start, currentPosition()))
  }

  // Zero-width end-of-input marker.
  const eofPos = currentPosition()
  const eofToken: EofToken = { type: "eof", span: makeSpan(eofPos, eofPos) }
  pushToken(eofToken)
  return { tokens, diagnostics }
}