feat: semantic search (1/n)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-10-05 19:50:52 -04:00
parent f14260b2ba
commit f533902c75
23 changed files with 2955 additions and 502 deletions
--- a/quartz/plugins/emitters/404.tsx
+++ b/quartz/plugins/emitters/404.tsx
@@ -40,7 +40,7 @@ export const NotFoundPage: QuartzEmitterPlugin = () => {
        description: notFound,
        frontmatter: { title: notFound, tags: [] },
      })
-      const externalResources = pageResources(path, resources)
+      const externalResources = pageResources(path, resources, ctx.cfg.configuration)
      const componentData: QuartzComponentProps = {
        ctx,
        fileData: vfile.data,
--- a/quartz/plugins/emitters/componentResources.ts
+++ b/quartz/plugins/emitters/componentResources.ts
@@ -1,5 +1,8 @@
 import { FullSlug, joinSegments } from "../../util/path"
 import { QuartzEmitterPlugin } from "../types"
+import path from "path"
+import fs from "node:fs/promises"
+import { globby } from "globby"

 // @ts-ignore
 import spaRouterScript from "../../components/scripts/spa.inline"
@@ -16,7 +19,7 @@ import {
  processGoogleFonts,
 } from "../../util/theme"
 import { Features, transform } from "lightningcss"
-import { transform as transpile } from "esbuild"
+import { transform as transpile, build as bundle } from "esbuild"
 import { write } from "./helpers"

 type ComponentResources = {
@@ -357,7 +360,47 @@ export const ComponentResources: QuartzEmitterPlugin = () => {
        ext: ".js",
        content: postscript,
      })
+
+      // Bundle all worker files
+      const workerFiles = await globby(["quartz/**/*.worker.ts"])
+      for (const src of workerFiles) {
+        const result = await bundle({
+          entryPoints: [src],
+          bundle: true,
+          minify: true,
+          platform: "browser",
+          format: "esm",
+          write: false,
+        })
+        const code = result.outputFiles[0].text
+        const name = path.basename(src).replace(/\.ts$/, "")
+        yield write({ ctx, slug: name as FullSlug, ext: ".js", content: code })
+      }
+    },
+    async *partialEmit(ctx, _content, _resources, changeEvents) {
+      // Incremental builds: re-bundle changed worker sources, remove bundles of deleted ones
+      for (const changeEvent of changeEvents) {
+        if (!/\.worker\.ts$/.test(changeEvent.path)) continue
+        const name = path.basename(changeEvent.path).replace(/\.ts$/, "")
+        if (changeEvent.type === "delete") {
+          const dest = joinSegments(ctx.argv.output, `${name}.js`)
+          try {
+            await fs.rm(dest, { force: true }) // force: an already-missing bundle is fine
+          } catch (err) {
+            console.warn(`Failed to remove stale worker bundle ${dest}:`, err)
+          }
+          continue
+        }
+        const { outputFiles } = await bundle({
+          entryPoints: [changeEvent.path],
+          bundle: true,
+          minify: true,
+          platform: "browser",
+          format: "esm",
+          write: false,
+        })
+        yield write({ ctx, slug: name as FullSlug, ext: ".js", content: outputFiles[0].text })
+      }
    },
-    async *partialEmit() {},
  }
 }
--- a/quartz/plugins/emitters/contentPage.tsx
+++ b/quartz/plugins/emitters/contentPage.tsx
@@ -25,7 +25,7 @@ async function processContent(
 ) {
  const slug = fileData.slug!
  const cfg = ctx.cfg.configuration
-  const externalResources = pageResources(pathToRoot(slug), resources)
+  const externalResources = pageResources(pathToRoot(slug), resources, ctx.cfg.configuration)
  const componentData: QuartzComponentProps = {
    ctx,
    fileData,
--- a/quartz/plugins/emitters/folderPage.tsx
+++ b/quartz/plugins/emitters/folderPage.tsx
@@ -38,7 +38,7 @@ async function* processFolderInfo(
    const slug = joinSegments(folder, "index") as FullSlug
    const [tree, file] = folderContent
    const cfg = ctx.cfg.configuration
-    const externalResources = pageResources(pathToRoot(slug), resources)
+    const externalResources = pageResources(pathToRoot(slug), resources, ctx.cfg.configuration)
    const componentData: QuartzComponentProps = {
      ctx,
      fileData: file.data,
--- a/quartz/plugins/emitters/index.ts
+++ b/quartz/plugins/emitters/index.ts
@@ -1,7 +1,7 @@
 export { ContentPage } from "./contentPage"
 export { TagPage } from "./tagPage"
 export { FolderPage } from "./folderPage"
-export { ContentIndex as ContentIndex } from "./contentIndex"
+export { ContentIndex } from "./contentIndex"
 export { AliasRedirects } from "./aliases"
 export { Assets } from "./assets"
 export { Static } from "./static"
@@ -10,3 +10,4 @@ export { ComponentResources } from "./componentResources"
 export { NotFoundPage } from "./404"
 export { CNAME } from "./cname"
 export { CustomOgImages } from "./ogImage"
+export { SemanticIndex } from "./semantic"
--- a/quartz/plugins/emitters/semantic.ts
+++ b/quartz/plugins/emitters/semantic.ts
@@ -0,0 +1,235 @@
+import { write } from "./helpers"
+import { QuartzEmitterPlugin } from "../types"
+import { FilePath, FullSlug, joinSegments, QUARTZ } from "../../util/path"
+import { ReadTimeResults } from "reading-time"
+import { GlobalConfiguration } from "../../cfg"
+import { spawn } from "child_process"
+
+const DEFAULT_MODEL_ID = "onnx-community/Qwen3-Embedding-0.6B-ONNX"
+// Baseline options; vllmUrl, concurrency and batchSize read VLLM_* environment variables.
+const defaults: GlobalConfiguration["semanticSearch"] = {
+  enable: true,
+  model: DEFAULT_MODEL_ID,
+  aot: false,
+  dims: 1024,
+  dtype: "fp32",
+  shardSizeRows: 1024,
+  hnsw: { M: 16, efConstruction: 200 },
+  chunking: {
+    chunkSize: 512,
+    chunkOverlap: 128,
+    noChunking: false,
+  },
+  vllm: {
+    enable: false,
+    vllmUrl:
+      process.env.VLLM_URL || process.env.VLLM_EMBED_URL || "http://127.0.0.1:8000/v1/embeddings",
+    concurrency: parseInt(process.env.VLLM_CONCURRENCY ?? "", 10) || 8, // unset, NaN or 0 -> 8
+    batchSize: parseInt(process.env.VLLM_BATCH_SIZE ?? "", 10) || 64, // unset, NaN or 0 -> 64
+  },
+}
+/** One record per content file with text, collected by SemanticIndex.emit for the JSONL dump. */
+type ContentDetails = {
+  slug: string
+  title: string // frontmatter title, or the slug when none is set
+  filePath: FilePath
+  content: string // the file's `text`; becomes the `text` field of the JSONL record
+  readingTime?: Partial<ReadTimeResults>
+}
+
+/** Probe for the `uv` CLI; resolves true only if `uv --version` exits 0 and never rejects. */
+function checkUvInstalled(): Promise<boolean> {
+  return new Promise((resolve) => {
+    // Spawned directly (no shell): avoids Node's DEP0190 warning for args + `shell: true`,
+    // and a missing binary surfaces as an "error" event, which maps to false below.
+    const proc = spawn("uv", ["--version"], { stdio: "ignore" })
+    proc.once("error", () => resolve(false))
+    proc.once("close", (code) => resolve(code === 0))
+  })
+}
+
+/**
+ * Run `embed_build.py` through `uv run`; the script declares its dependencies with
+ * PEP 723 inline metadata. uv is spawned directly rather than through a shell, so
+ * paths, model ids and URLs containing spaces or shell metacharacters are passed
+ * to the script verbatim as single arguments.
+ *
+ * @param jsonlPath - forwarded to the script as `--jsonl`
+ * @param outDir - forwarded to the script as `--out`
+ * @param opts - model, dtype, dims, shard size, chunking and vLLM settings, forwarded
+ *   as CLI flags; the vLLM flags are only added when `opts.vllm.enable` is true
+ * @returns resolves when the script exits with code 0; rejects when uv cannot be
+ *   spawned, or the script exits non-zero or is terminated by a signal
+ */
+function runEmbedBuild(
+  jsonlPath: string,
+  outDir: string,
+  opts: {
+    model: string
+    dtype: string
+    dims: number
+    shardSizeRows: number
+    chunking: { chunkSize: number; chunkOverlap: number; noChunking: boolean }
+    vllm: { enable: boolean; vllmUrl?: string; concurrency: number; batchSize: number }
+  },
+): Promise<void> {
+  return new Promise((resolve, reject) => {
+    const scriptPath = joinSegments(QUARTZ, "embed_build.py")
+    const args = [
+      "run",
+      scriptPath,
+      "--jsonl",
+      jsonlPath,
+      "--model",
+      opts.model,
+      "--out",
+      outDir,
+      "--dtype",
+      opts.dtype,
+      "--dims",
+      String(opts.dims),
+      "--shard-size",
+      String(opts.shardSizeRows),
+      "--chunk-size",
+      String(opts.chunking.chunkSize),
+      "--chunk-overlap",
+      String(opts.chunking.chunkOverlap),
+    ]
+
+    if (opts.chunking.noChunking) args.push("--no-chunking")
+
+    if (opts.vllm.enable) {
+      args.push("--use-vllm")
+      if (opts.vllm.vllmUrl) args.push("--vllm-url", opts.vllm.vllmUrl)
+      args.push("--concurrency", String(opts.vllm.concurrency))
+      args.push("--batch-size", String(opts.vllm.batchSize))
+    }
+
+    console.log("\nRunning embedding generation:")
+    console.log(`  uv ${args.join(" ")}`)
+
+    const env = { ...process.env }
+    if (opts.vllm.enable && !env.USE_VLLM) env.USE_VLLM = "1"
+
+    // No `shell: true`: Node would join the args unescaped into one command line
+    const proc = spawn("uv", args, { stdio: "inherit", env })
+
+    proc.on("error", (err) => reject(new Error(`Failed to spawn uv: ${err.message}`)))
+
+    proc.on("close", (code, signal) => {
+      if (code === 0) {
+        console.log("Embedding generation completed successfully")
+        resolve()
+      } else {
+        const status =
+          code === null ? `was terminated by signal ${signal}` : `exited with code ${code}`
+        reject(new Error(`embed_build.py ${status}`))
+      }
+    })
+  })
+}
+
+/**
+ * Emitter that writes one JSONL record (slug, title, text) per content file that has text
+ * to `embeddings-text.jsonl` and, unless `aot` is true, runs `embed_build.py` through uv;
+ * `emit` throws when uv is missing or the build fails. `partialEmit` yields nothing.
+ *
+ * @param opts - overrides for `defaults`; `hnsw`, `chunking` and `vllm` fall back per key
+ * @throws Error when the merged options contain no model identifier
+ */
+export const SemanticIndex: QuartzEmitterPlugin<Partial<GlobalConfiguration["semanticSearch"]>> = (
+  opts,
+) => {
+  const merged = { ...defaults, ...opts }
+  const o = {
+    enable: merged.enable!,
+    model: merged.model!,
+    aot: merged.aot!,
+    dims: merged.dims!,
+    dtype: merged.dtype!,
+    shardSizeRows: merged.shardSizeRows!,
+    hnsw: {
+      M: merged.hnsw?.M ?? defaults.hnsw!.M!,
+      efConstruction: merged.hnsw?.efConstruction ?? defaults.hnsw!.efConstruction!,
+      efSearch: merged.hnsw?.efSearch,
+    },
+    chunking: {
+      chunkSize: merged.chunking?.chunkSize ?? defaults.chunking!.chunkSize!,
+      chunkOverlap: merged.chunking?.chunkOverlap ?? defaults.chunking!.chunkOverlap!,
+      noChunking: merged.chunking?.noChunking ?? defaults.chunking!.noChunking!,
+    },
+    vllm: {
+      enable: merged.vllm?.enable ?? defaults.vllm!.enable!,
+      vllmUrl: merged.vllm?.vllmUrl ?? defaults.vllm!.vllmUrl,
+      concurrency: merged.vllm?.concurrency ?? defaults.vllm!.concurrency!,
+      batchSize: merged.vllm?.batchSize ?? defaults.vllm!.batchSize!,
+    },
+  }
+
+  if (!o.model) {
+    throw new Error("Semantic search requires a model identifier")
+  }
+
+  return {
+    name: "SemanticIndex",
+    getQuartzComponents() {
+      return []
+    },
+    async *partialEmit() {},
+    async *emit(ctx, content, _resources) {
+      if (!o.enable) return
+
+      const docs: ContentDetails[] = []
+      for (const [_, file] of content) {
+        const slug = file.data.slug!
+        const title = file.data.frontmatter?.title ?? slug
+        const text = file.data.text
+        if (!text) continue
+        docs.push({
+          slug,
+          title,
+          filePath: file.data.filePath!,
+          content: text,
+          readingTime: file.data.readingTime,
+        })
+      }
+
+      // Emit JSONL with the exact text used for embeddings
+      const jsonl = docs
+        .map((doc) => JSON.stringify({ slug: doc.slug, title: doc.title, text: doc.content }))
+        .join("\n")
+
+      yield write({ ctx, slug: "embeddings-text" as FullSlug, ext: ".jsonl", content: jsonl })
+
+      const jsonlPath = joinSegments(ctx.argv.output, "embeddings-text.jsonl")
+      const outDir = joinSegments(ctx.argv.output, "embeddings")
+
+      // If aot is false, run the embedding generation script
+      if (!o.aot) {
+        console.log("\nGenerating embeddings (aot=false)...")
+
+        const hasUv = await checkUvInstalled()
+        if (!hasUv) {
+          throw new Error(
+            "uv is required for embedding generation. Install it from https://docs.astral.sh/uv/",
+          )
+        }
+
+        try {
+          await runEmbedBuild(jsonlPath, outDir, o)
+        } catch (err) {
+          const message = err instanceof Error ? err.message : String(err)
+          throw new Error(`Embedding generation failed: ${message}`)
+        }
+      } else {
+        // Report the configured output directory rather than a hard-coded "public/"
+        console.log(
+          `\nSkipping embedding generation (aot=true). Expecting pre-generated embeddings in ${outDir}/`,
+        )
+      }
+    },
+    externalResources(_ctx) {
+      return {}
+    },
+  }
+}
--- a/quartz/plugins/emitters/tagPage.tsx
+++ b/quartz/plugins/emitters/tagPage.tsx
@@ -73,7 +73,7 @@ async function processTagPage(
  const slug = joinSegments("tags", tag) as FullSlug
  const [tree, file] = tagContent
  const cfg = ctx.cfg.configuration
-  const externalResources = pageResources(pathToRoot(slug), resources)
+  const externalResources = pageResources(pathToRoot(slug), resources, ctx.cfg.configuration)
  const componentData: QuartzComponentProps = {
    ctx,
    fileData: file.data,