feat: semantic search (1/n)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-10-05 19:50:52 -04:00
parent f14260b2ba
commit f533902c75
23 changed files with 2955 additions and 502 deletions
--- a/quartz/plugins/emitters/semantic.ts
+++ b/quartz/plugins/emitters/semantic.ts
@@ -0,0 +1,235 @@
+import { write } from "./helpers"
+import { QuartzEmitterPlugin } from "../types"
+import { FilePath, FullSlug, joinSegments, QUARTZ } from "../../util/path"
+import { ReadTimeResults } from "reading-time"
+import { GlobalConfiguration } from "../../cfg"
+import { spawn } from "child_process"
+
+// Model identifier used when the plugin options do not name one.
+const DEFAULT_MODEL_ID = "onnx-community/Qwen3-Embedding-0.6B-ONNX"
+
+// Environment-derived defaults for the vLLM backend, read once at module load.
+// `||` (not `??`) is deliberate: an empty environment variable falls through
+// to the next candidate / the built-in fallback.
+const envVllmUrl =
+  process.env.VLLM_URL || process.env.VLLM_EMBED_URL || "http://127.0.0.1:8000/v1/embeddings"
+const envVllmConcurrency = Number.parseInt(process.env.VLLM_CONCURRENCY || "8", 10)
+const envVllmBatchSize = Number.parseInt(process.env.VLLM_BATCH_SIZE || "64", 10)
+
+// Baseline semantic-search configuration; user-supplied plugin options are
+// layered on top of these values.
+const defaults: GlobalConfiguration["semanticSearch"] = {
+  enable: true,
+  model: DEFAULT_MODEL_ID,
+  aot: false,
+  dims: 1024,
+  dtype: "fp32",
+  shardSizeRows: 1024,
+  hnsw: { M: 16, efConstruction: 200 },
+  chunking: { chunkSize: 512, chunkOverlap: 128, noChunking: false },
+  vllm: {
+    enable: false,
+    vllmUrl: envVllmUrl,
+    concurrency: envVllmConcurrency,
+    batchSize: envVllmBatchSize,
+  },
+}
+
+/**
+ * Per-page record gathered by the emitter before the embedding input
+ * (JSONL) is serialised.
+ */
+interface ContentDetails {
+  /** Slug of the page (`file.data.slug`). */
+  slug: string
+  /** Frontmatter title, or the slug when the page has no title. */
+  title: string
+  /** Source file path of the page (`file.data.filePath`). */
+  filePath: FilePath
+  /** Text of the page (`file.data.text`); written to the JSONL as `text`. */
+  content: string
+  /** Reading-time data copied from the page, when present; not written to the JSONL. */
+  readingTime?: Partial<ReadTimeResults>
+}
+
+/**
+ * Probe for the `uv` executable by running `uv --version`.
+ *
+ * @returns a promise that resolves to `true` when the probe exits with
+ * status 0, and to `false` when it exits with any other status or cannot be
+ * spawned. The promise never rejects.
+ */
+function checkUvInstalled(): Promise<boolean> {
+  return new Promise<boolean>((settle) => {
+    const probe = spawn("uv", ["--version"], { shell: true })
+
+    // A spawn failure is treated the same as "not installed".
+    probe.on("error", () => {
+      settle(false)
+    })
+
+    // Only a clean exit counts as "installed".
+    probe.on("close", (exitCode) => {
+      settle(exitCode === 0)
+    })
+  })
+}
+
+/**
+ * Run the Python embedding build script (`embed_build.py`) using `uv run`.
+ * Script uses PEP 723 inline metadata for dependency management.
+ *
+ * `uv` is spawned directly rather than through a shell, so every argument
+ * reaches the script verbatim: paths containing spaces and URLs or model ids
+ * containing shell metacharacters (`&`, `;`, `$`, ...) are neither split nor
+ * interpreted.
+ *
+ * @param jsonlPath - path of the JSONL input, forwarded as `--jsonl`
+ * @param outDir - output directory, forwarded as `--out`
+ * @param opts - model, dtype, dimension, sharding, chunking and vLLM settings
+ * that are translated into the script's command-line flags
+ * @returns a promise that resolves once the script exits with status 0
+ * @throws (rejects) when `uv` cannot be spawned, when the script exits with a
+ * non-zero status, or when it is terminated by a signal
+ */
+function runEmbedBuild(
+  jsonlPath: string,
+  outDir: string,
+  opts: {
+    model: string
+    dtype: string
+    dims: number
+    shardSizeRows: number
+    chunking: { chunkSize: number; chunkOverlap: number; noChunking: boolean }
+    vllm: { enable: boolean; vllmUrl?: string; concurrency: number; batchSize: number }
+  },
+): Promise<void> {
+  return new Promise((resolve, reject) => {
+    const scriptPath = joinSegments(QUARTZ, "embed_build.py")
+    const args = [
+      "run",
+      scriptPath,
+      "--jsonl",
+      jsonlPath,
+      "--model",
+      opts.model,
+      "--out",
+      outDir,
+      "--dtype",
+      opts.dtype,
+      "--dims",
+      String(opts.dims),
+      "--shard-size",
+      String(opts.shardSizeRows),
+      "--chunk-size",
+      String(opts.chunking.chunkSize),
+      "--chunk-overlap",
+      String(opts.chunking.chunkOverlap),
+    ]
+
+    if (opts.chunking.noChunking) {
+      args.push("--no-chunking")
+    }
+
+    if (opts.vllm.enable) {
+      args.push("--use-vllm")
+      if (opts.vllm.vllmUrl) {
+        args.push("--vllm-url", opts.vllm.vllmUrl)
+      }
+      args.push("--concurrency", String(opts.vllm.concurrency))
+      args.push("--batch-size", String(opts.vllm.batchSize))
+    }
+
+    // Logged for diagnostics only; the arguments are not shell-quoted here.
+    console.log("\nRunning embedding generation:")
+    console.log(`  uv ${args.join(" ")}`)
+
+    // Mirror the vLLM switch into the child's environment unless the caller
+    // has already set USE_VLLM themselves.
+    const env = { ...process.env }
+    if (opts.vllm.enable && !env.USE_VLLM) {
+      env.USE_VLLM = "1"
+    }
+
+    // No `shell: true`: with a shell the argument array is joined into a
+    // single unquoted command line, which breaks on spaces and lets shell
+    // metacharacters in paths, URLs or model ids be interpreted.
+    // NOTE(review): assumes `uv` resolves to a real executable on PATH on
+    // Windows (not a .cmd/.bat shim) - confirm for your install method.
+    const proc = spawn("uv", args, {
+      stdio: "inherit",
+      env,
+    })
+
+    proc.on("error", (err) => {
+      reject(new Error(`Failed to spawn uv: ${err.message}`))
+    })
+
+    proc.on("close", (code, signal) => {
+      if (code === 0) {
+        console.log("Embedding generation completed successfully")
+        resolve()
+      } else if (code === null) {
+        // A null exit code means the child was killed by a signal.
+        reject(new Error(`embed_build.py was terminated by signal ${signal}`))
+      } else {
+        reject(new Error(`embed_build.py exited with code ${code}`))
+      }
+    })
+  })
+}
+
+/**
+ * Emitter plugin that prepares the input for semantic search.
+ *
+ * On every full emit it writes `embeddings-text.jsonl` (one JSON object with
+ * `slug`, `title` and `text` per page that has text). Unless `aot` is set, it
+ * then runs the embedding build script through `uv`, targeting the
+ * `embeddings` directory inside the build output.
+ *
+ * @param opts - partial semantic-search configuration; top-level keys
+ * override the module defaults, and missing fields of `hnsw`, `chunking` and
+ * `vllm` fall back to the defaults individually
+ * @throws when the resolved configuration has no model identifier
+ */
+export const SemanticIndex: QuartzEmitterPlugin<Partial<GlobalConfiguration["semanticSearch"]>> = (
+  opts,
+) => {
+  const merged = { ...defaults, ...opts }
+  const hnswOpts = merged.hnsw
+  const chunkOpts = merged.chunking
+  const vllmOpts = merged.vllm
+
+  // Fully resolved settings: the top-level spread is shallow, so nested
+  // fields are back-filled from the defaults one by one.
+  const settings = {
+    enable: merged.enable!,
+    model: merged.model!,
+    aot: merged.aot!,
+    dims: merged.dims!,
+    dtype: merged.dtype!,
+    shardSizeRows: merged.shardSizeRows!,
+    hnsw: {
+      M: hnswOpts?.M ?? defaults.hnsw!.M!,
+      efConstruction: hnswOpts?.efConstruction ?? defaults.hnsw!.efConstruction!,
+      efSearch: hnswOpts?.efSearch,
+    },
+    chunking: {
+      chunkSize: chunkOpts?.chunkSize ?? defaults.chunking!.chunkSize!,
+      chunkOverlap: chunkOpts?.chunkOverlap ?? defaults.chunking!.chunkOverlap!,
+      noChunking: chunkOpts?.noChunking ?? defaults.chunking!.noChunking!,
+    },
+    vllm: {
+      enable: vllmOpts?.enable ?? defaults.vllm!.enable!,
+      vllmUrl: vllmOpts?.vllmUrl ?? defaults.vllm!.vllmUrl,
+      concurrency: vllmOpts?.concurrency ?? defaults.vllm!.concurrency!,
+      batchSize: vllmOpts?.batchSize ?? defaults.vllm!.batchSize!,
+    },
+  }
+
+  if (!settings.model) {
+    throw new Error("Semantic search requires a model identifier")
+  }
+
+  return {
+    name: "SemanticIndex",
+    getQuartzComponents() {
+      return []
+    },
+    // Incremental rebuilds emit nothing.
+    async *partialEmit() {},
+    async *emit(ctx, content, _resources) {
+      if (!settings.enable) return
+
+      // Collect every page that has text; pages without text are skipped.
+      const docs: ContentDetails[] = []
+      for (const entry of content) {
+        const file = entry[1]
+        const slug = file.data.slug!
+        const title = file.data.frontmatter?.title ?? slug
+        const text = file.data.text
+        if (!text) continue
+
+        docs.push({
+          slug,
+          title,
+          filePath: file.data.filePath!,
+          content: text,
+          readingTime: file.data.readingTime,
+        })
+      }
+
+      // Emit JSONL with the exact text used for embeddings
+      const jsonl = docs
+        .map((doc) => JSON.stringify({ slug: doc.slug, title: doc.title, text: doc.content }))
+        .join("\n")
+
+      yield write({
+        ctx,
+        slug: "embeddings-text" as FullSlug,
+        ext: ".jsonl",
+        content: jsonl,
+      })
+
+      // With aot enabled the embeddings are expected to exist already.
+      if (settings.aot) {
+        console.log(
+          "\nSkipping embedding generation (aot=true). Expecting pre-generated embeddings in public/embeddings/",
+        )
+        return
+      }
+
+      console.log("\nGenerating embeddings (aot=false)...")
+
+      // The build script is launched through uv, so uv must be available.
+      if (!(await checkUvInstalled())) {
+        throw new Error(
+          "uv is required for embedding generation. Install it from https://docs.astral.sh/uv/",
+        )
+      }
+
+      const outputRoot = ctx.argv.output
+      const jsonlPath = joinSegments(outputRoot, "embeddings-text.jsonl")
+      const outDir = joinSegments(outputRoot, "embeddings")
+
+      try {
+        await runEmbedBuild(jsonlPath, outDir, settings)
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err)
+        throw new Error(`Embedding generation failed: ${message}`)
+      }
+    },
+    externalResources(_ctx) {
+      return {}
+    },
+  }
+}