feat: semantic search (1/n)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
This commit is contained in:
Aaron Pham
2025-10-05 19:50:52 -04:00
parent f14260b2ba
commit f533902c75
23 changed files with 2955 additions and 502 deletions

View File

@@ -40,7 +40,7 @@ export const NotFoundPage: QuartzEmitterPlugin = () => {
description: notFound,
frontmatter: { title: notFound, tags: [] },
})
const externalResources = pageResources(path, resources)
const externalResources = pageResources(path, resources, ctx.cfg.configuration)
const componentData: QuartzComponentProps = {
ctx,
fileData: vfile.data,

View File

@@ -1,5 +1,8 @@
import { FullSlug, joinSegments } from "../../util/path"
import { QuartzEmitterPlugin } from "../types"
import path from "path"
import fs from "node:fs/promises"
import { globby } from "globby"
// @ts-ignore
import spaRouterScript from "../../components/scripts/spa.inline"
@@ -16,7 +19,7 @@ import {
processGoogleFonts,
} from "../../util/theme"
import { Features, transform } from "lightningcss"
import { transform as transpile } from "esbuild"
import { transform as transpile, build as bundle } from "esbuild"
import { write } from "./helpers"
type ComponentResources = {
@@ -357,7 +360,47 @@ export const ComponentResources: QuartzEmitterPlugin = () => {
ext: ".js",
content: postscript,
})
// Bundle all worker files
const workerFiles = await globby(["quartz/**/*.worker.ts"])
for (const src of workerFiles) {
const result = await bundle({
entryPoints: [src],
bundle: true,
minify: true,
platform: "browser",
format: "esm",
write: false,
})
const code = result.outputFiles[0].text
const name = path.basename(src).replace(/\.ts$/, "")
yield write({ ctx, slug: name as FullSlug, ext: ".js", content: code })
}
},
async *partialEmit(ctx, _content, _resources, changeEvents) {
  // Incremental build: only worker entry points (`*.worker.ts`) touched by this
  // batch of change events are re-bundled or removed; every other event is ignored.
  const workerSourcePattern = /\.worker\.ts$/
  for (const event of changeEvents) {
    const isWorkerSource = workerSourcePattern.test(event.path)
    if (!isWorkerSource) continue

    // `foo.worker.ts` -> `foo.worker`; used both as the emitted slug and the output file stem.
    const workerName = path.basename(event.path).replace(/\.ts$/, "")

    if (event.type === "delete") {
      const emittedFile = joinSegments(ctx.argv.output, `${workerName}.js`)
      try {
        await fs.unlink(emittedFile)
      } catch {
        // Best-effort cleanup: failures to remove the stale bundle are ignored.
      }
      continue
    }

    // Added or changed worker: bundle it in memory and emit the result as `<name>.js`.
    const { outputFiles } = await bundle({
      entryPoints: [event.path],
      bundle: true,
      minify: true,
      platform: "browser",
      format: "esm",
      write: false,
    })
    yield write({
      ctx,
      slug: workerName as FullSlug,
      ext: ".js",
      content: outputFiles[0].text,
    })
  }
},
async *partialEmit() {},
}
}

View File

@@ -25,7 +25,7 @@ async function processContent(
) {
const slug = fileData.slug!
const cfg = ctx.cfg.configuration
const externalResources = pageResources(pathToRoot(slug), resources)
const externalResources = pageResources(pathToRoot(slug), resources, ctx.cfg.configuration)
const componentData: QuartzComponentProps = {
ctx,
fileData,

View File

@@ -38,7 +38,7 @@ async function* processFolderInfo(
const slug = joinSegments(folder, "index") as FullSlug
const [tree, file] = folderContent
const cfg = ctx.cfg.configuration
const externalResources = pageResources(pathToRoot(slug), resources)
const externalResources = pageResources(pathToRoot(slug), resources, ctx.cfg.configuration)
const componentData: QuartzComponentProps = {
ctx,
fileData: file.data,

View File

@@ -1,7 +1,7 @@
export { ContentPage } from "./contentPage"
export { TagPage } from "./tagPage"
export { FolderPage } from "./folderPage"
export { ContentIndex as ContentIndex } from "./contentIndex"
export { ContentIndex } from "./contentIndex"
export { AliasRedirects } from "./aliases"
export { Assets } from "./assets"
export { Static } from "./static"
@@ -10,3 +10,4 @@ export { ComponentResources } from "./componentResources"
export { NotFoundPage } from "./404"
export { CNAME } from "./cname"
export { CustomOgImages } from "./ogImage"
export { SemanticIndex } from "./semantic"

View File

@@ -0,0 +1,235 @@
import { write } from "./helpers"
import { QuartzEmitterPlugin } from "../types"
import { FilePath, FullSlug, joinSegments, QUARTZ } from "../../util/path"
import { ReadTimeResults } from "reading-time"
import { GlobalConfiguration } from "../../cfg"
import { spawn } from "child_process"
/** Model identifier used when the plugin options do not name one. */
const DEFAULT_MODEL_ID = "onnx-community/Qwen3-Embedding-0.6B-ONNX"

/**
 * Read a positive integer from an environment variable.
 *
 * An unset or empty variable yields `fallback`. A value that does not parse to a
 * positive integer (e.g. "abc", "0", "-4") also yields `fallback`, with a warning,
 * instead of letting `NaN` or a non-positive number flow into the embedding
 * command line.
 *
 * @param name - environment variable to read
 * @param fallback - value returned when the variable is unset, empty or invalid
 * @returns the parsed positive integer, or `fallback`
 */
function readPositiveIntEnv(name: string, fallback: number): number {
  const raw = process.env[name]
  if (!raw) return fallback
  const parsed = Number.parseInt(raw, 10)
  if (Number.isInteger(parsed) && parsed > 0) return parsed
  console.warn(`Ignoring invalid ${name}=${JSON.stringify(raw)}; using default ${fallback}`)
  return fallback
}

/**
 * Baseline semantic-search settings. Values supplied to the plugin take
 * precedence over these; the vLLM settings can also be seeded from the
 * environment (VLLM_URL / VLLM_EMBED_URL, VLLM_CONCURRENCY, VLLM_BATCH_SIZE).
 */
const defaults: GlobalConfiguration["semanticSearch"] = {
  enable: true,
  model: DEFAULT_MODEL_ID,
  aot: false,
  dims: 1024,
  dtype: "fp32",
  shardSizeRows: 1024,
  hnsw: { M: 16, efConstruction: 200 },
  chunking: {
    chunkSize: 512,
    chunkOverlap: 128,
    noChunking: false,
  },
  vllm: {
    enable: false,
    // `||` (not `??`) on purpose: an empty environment variable counts as unset.
    vllmUrl:
      process.env.VLLM_URL || process.env.VLLM_EMBED_URL || "http://127.0.0.1:8000/v1/embeddings",
    concurrency: readPositiveIntEnv("VLLM_CONCURRENCY", 8),
    batchSize: readPositiveIntEnv("VLLM_BATCH_SIZE", 64),
  },
}
/** Per-page record gathered by the emitter before the embedding text is serialised. */
interface ContentDetails {
  /** Slug of the page the record was built from. */
  slug: string
  /** Page title (the emitter falls back to the slug when frontmatter has none). */
  title: string
  /** Path of the source file the page came from. */
  filePath: FilePath
  /** Text of the page; this is what gets written out for embedding. */
  content: string
  /** Reading-time data attached to the page, when present. */
  readingTime?: Partial<ReadTimeResults>
}
/**
 * Probe for the `uv` command by running `uv --version` through a shell.
 *
 * @returns a promise that resolves to `true` only when the probe process closes
 *   with exit code 0, and to `false` when spawning fails or the exit code is
 *   anything else. The promise never rejects.
 */
function checkUvInstalled(): Promise<boolean> {
  return new Promise<boolean>((settle) => {
    const probe = spawn("uv", ["--version"], { shell: true })

    const onSpawnError = (): void => settle(false)
    const onClose = (exitCode: number | null): void => settle(exitCode === 0)

    probe.on("error", onSpawnError)
    probe.on("close", onClose)
  })
}
/**
 * Run the Python embedding build script (`embed_build.py`) via `uv run`.
 * The script uses PEP 723 inline metadata for dependency management.
 *
 * `uv` is spawned directly, without an intermediate shell, so every argument is
 * delivered verbatim: paths, model ids or URLs containing spaces or shell
 * metacharacters are neither split nor interpreted.
 *
 * @param jsonlPath - path of the JSONL text dump, passed as `--jsonl`
 * @param outDir - directory passed as `--out`
 * @param opts - model, dtype, dimension, sharding, chunking and vLLM settings
 *   that are translated into command-line flags
 * @returns a promise that resolves when the script exits with code 0 and rejects
 *   when `uv` cannot be spawned, the script exits non-zero, or it is terminated
 *   by a signal
 */
function runEmbedBuild(
  jsonlPath: string,
  outDir: string,
  opts: {
    model: string
    dtype: string
    dims: number
    shardSizeRows: number
    chunking: { chunkSize: number; chunkOverlap: number; noChunking: boolean }
    vllm: { enable: boolean; vllmUrl?: string; concurrency: number; batchSize: number }
  },
): Promise<void> {
  return new Promise((resolve, reject) => {
    const scriptPath = joinSegments(QUARTZ, "embed_build.py")
    const args = [
      "run",
      scriptPath,
      "--jsonl",
      jsonlPath,
      "--model",
      opts.model,
      "--out",
      outDir,
      "--dtype",
      opts.dtype,
      "--dims",
      String(opts.dims),
      "--shard-size",
      String(opts.shardSizeRows),
      "--chunk-size",
      String(opts.chunking.chunkSize),
      "--chunk-overlap",
      String(opts.chunking.chunkOverlap),
    ]
    if (opts.chunking.noChunking) {
      args.push("--no-chunking")
    }
    if (opts.vllm.enable) {
      args.push("--use-vllm")
      if (opts.vllm.vllmUrl) {
        args.push("--vllm-url", opts.vllm.vllmUrl)
      }
      args.push("--concurrency", String(opts.vllm.concurrency))
      args.push("--batch-size", String(opts.vllm.batchSize))
    }

    console.log("\nRunning embedding generation:")
    console.log(`  uv ${args.join(" ")}`)

    // Mirror the vLLM switch into the child's environment unless the caller already set it.
    const env = { ...process.env }
    if (opts.vllm.enable && !env.USE_VLLM) {
      env.USE_VLLM = "1"
    }

    // No `shell: true`: with a shell, Node joins the arguments into one unquoted
    // command string, which breaks on whitespace and allows shell injection.
    const proc = spawn("uv", args, {
      stdio: "inherit",
      env,
    })

    proc.on("error", (err) => {
      reject(new Error(`Failed to spawn uv: ${err.message}`))
    })

    proc.on("close", (code, signal) => {
      if (code === 0) {
        console.log("Embedding generation completed successfully")
        resolve()
      } else if (code === null) {
        // A null exit code means the child was killed by a signal rather than exiting.
        reject(new Error(`embed_build.py was terminated by signal ${signal}`))
      } else {
        reject(new Error(`embed_build.py exited with code ${code}`))
      }
    })
  })
}
/**
 * Emitter plugin for semantic search.
 *
 * On a full emit it writes `embeddings-text.jsonl` (one `{ slug, title, text }`
 * object per page that has text) and then, unless `aot` is true, runs the
 * embedding build over that file with output going to `<output>/embeddings`.
 * Incremental (partial) emits do nothing.
 *
 * @param opts - optional overrides for the default semantic-search settings;
 *   fields that are missing or `undefined` fall back to the defaults
 * @throws Error at construction time when no model identifier is available
 */
export const SemanticIndex: QuartzEmitterPlugin<Partial<GlobalConfiguration["semanticSearch"]>> = (
  opts,
) => {
  // Resolve every field with `??` instead of spreading `opts` over `defaults`:
  // a spread lets an explicit `undefined` (e.g. `{ dims: undefined }`) overwrite
  // the default and reach the command line as the string "undefined".
  const settings = {
    enable: opts?.enable ?? defaults.enable!,
    model: opts?.model ?? defaults.model!,
    aot: opts?.aot ?? defaults.aot!,
    dims: opts?.dims ?? defaults.dims!,
    dtype: opts?.dtype ?? defaults.dtype!,
    shardSizeRows: opts?.shardSizeRows ?? defaults.shardSizeRows!,
    hnsw: {
      M: opts?.hnsw?.M ?? defaults.hnsw!.M!,
      efConstruction: opts?.hnsw?.efConstruction ?? defaults.hnsw!.efConstruction!,
      efSearch: (opts?.hnsw ?? defaults.hnsw)?.efSearch,
    },
    chunking: {
      chunkSize: opts?.chunking?.chunkSize ?? defaults.chunking!.chunkSize!,
      chunkOverlap: opts?.chunking?.chunkOverlap ?? defaults.chunking!.chunkOverlap!,
      noChunking: opts?.chunking?.noChunking ?? defaults.chunking!.noChunking!,
    },
    vllm: {
      enable: opts?.vllm?.enable ?? defaults.vllm!.enable!,
      vllmUrl: opts?.vllm?.vllmUrl ?? defaults.vllm!.vllmUrl,
      concurrency: opts?.vllm?.concurrency ?? defaults.vllm!.concurrency!,
      batchSize: opts?.vllm?.batchSize ?? defaults.vllm!.batchSize!,
    },
  }

  // An empty string is still rejected here, since `??` does not replace it.
  if (!settings.model) {
    throw new Error("Semantic search requires a model identifier")
  }

  return {
    name: "SemanticIndex",
    getQuartzComponents() {
      return []
    },
    // Embeddings are only produced on full builds.
    async *partialEmit() {},
    async *emit(ctx, content, _resources) {
      if (!settings.enable) return

      // Collect one record per page that actually has text.
      const docs: ContentDetails[] = []
      for (const [_, file] of content) {
        const slug = file.data.slug!
        const title = file.data.frontmatter?.title ?? slug
        const text = file.data.text
        if (text) {
          docs.push({
            slug,
            title,
            filePath: file.data.filePath!,
            content: text,
            readingTime: file.data.readingTime,
          })
        }
      }

      // Emit JSONL with the exact text used for embeddings
      const jsonl = docs
        .map((doc) => JSON.stringify({ slug: doc.slug, title: doc.title, text: doc.content }))
        .join("\n")

      yield write({
        ctx,
        slug: "embeddings-text" as FullSlug,
        ext: ".jsonl",
        content: jsonl,
      })

      const outDir = joinSegments(ctx.argv.output, "embeddings")

      if (settings.aot) {
        // Report the real output location rather than assuming the output dir is `public`.
        console.log(
          `\nSkipping embedding generation (aot=true). Expecting pre-generated embeddings in ${outDir}/`,
        )
        return
      }

      // aot is false: generate the embeddings now, which requires `uv`.
      console.log("\nGenerating embeddings (aot=false)...")

      const hasUv = await checkUvInstalled()
      if (!hasUv) {
        throw new Error(
          "uv is required for embedding generation. Install it from https://docs.astral.sh/uv/",
        )
      }

      // NOTE(review): assumes `write` placed the file at <output>/embeddings-text.jsonl — confirm against the helper.
      const jsonlPath = joinSegments(ctx.argv.output, "embeddings-text.jsonl")

      try {
        await runEmbedBuild(jsonlPath, outDir, settings)
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err)
        throw new Error(`Embedding generation failed: ${message}`)
      }
    },
    externalResources(_ctx) {
      return {}
    },
  }
}

View File

@@ -73,7 +73,7 @@ async function processTagPage(
const slug = joinSegments("tags", tag) as FullSlug
const [tree, file] = tagContent
const cfg = ctx.cfg.configuration
const externalResources = pageResources(pathToRoot(slug), resources)
const externalResources = pageResources(pathToRoot(slug), resources, ctx.cfg.configuration)
const componentData: QuartzComponentProps = {
ctx,
fileData: file.data,