Files
quartz-org-roam/quartz/plugins/emitters/semantic.ts
Aaron Pham f533902c75 feat: semantic search (1/n)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-10-05 19:50:52 -04:00

236 lines
6.4 KiB
TypeScript

import { write } from "./helpers"
import { QuartzEmitterPlugin } from "../types"
import { FilePath, FullSlug, joinSegments, QUARTZ } from "../../util/path"
import { ReadTimeResults } from "reading-time"
import { GlobalConfiguration } from "../../cfg"
import { spawn } from "child_process"
const DEFAULT_MODEL_ID = "onnx-community/Qwen3-Embedding-0.6B-ONNX"
const defaults: GlobalConfiguration["semanticSearch"] = {
enable: true,
model: DEFAULT_MODEL_ID,
aot: false,
dims: 1024,
dtype: "fp32",
shardSizeRows: 1024,
hnsw: { M: 16, efConstruction: 200 },
chunking: {
chunkSize: 512,
chunkOverlap: 128,
noChunking: false,
},
vllm: {
enable: false,
vllmUrl:
process.env.VLLM_URL || process.env.VLLM_EMBED_URL || "http://127.0.0.1:8000/v1/embeddings",
concurrency: parseInt(process.env.VLLM_CONCURRENCY || "8", 10),
batchSize: parseInt(process.env.VLLM_BATCH_SIZE || "64", 10),
},
}
type ContentDetails = {
slug: string
title: string
filePath: FilePath
content: string
readingTime?: Partial<ReadTimeResults>
}
/**
* Check if uv is installed
*/
function checkUvInstalled(): Promise<boolean> {
return new Promise((resolve) => {
const proc = spawn("uv", ["--version"], { shell: true })
proc.on("error", () => resolve(false))
proc.on("close", (code) => resolve(code === 0))
})
}
/**
* Run the Python embedding build script using uv
* Script uses PEP 723 inline metadata for dependency management
*/
function runEmbedBuild(
jsonlPath: string,
outDir: string,
opts: {
model: string
dtype: string
dims: number
shardSizeRows: number
chunking: { chunkSize: number; chunkOverlap: number; noChunking: boolean }
vllm: { enable: boolean; vllmUrl?: string; concurrency: number; batchSize: number }
},
): Promise<void> {
return new Promise((resolve, reject) => {
const scriptPath = joinSegments(QUARTZ, "embed_build.py")
const args = [
"run",
scriptPath,
"--jsonl",
jsonlPath,
"--model",
opts.model,
"--out",
outDir,
"--dtype",
opts.dtype,
"--dims",
String(opts.dims),
"--shard-size",
String(opts.shardSizeRows),
"--chunk-size",
String(opts.chunking.chunkSize),
"--chunk-overlap",
String(opts.chunking.chunkOverlap),
]
if (opts.chunking.noChunking) {
args.push("--no-chunking")
}
if (opts.vllm.enable) {
args.push("--use-vllm")
if (opts.vllm.vllmUrl) {
args.push("--vllm-url", opts.vllm.vllmUrl)
}
args.push("--concurrency", String(opts.vllm.concurrency))
args.push("--batch-size", String(opts.vllm.batchSize))
}
console.log("\nRunning embedding generation:")
console.log(` uv ${args.join(" ")}`)
const env = { ...process.env }
if (opts.vllm.enable && !env.USE_VLLM) {
env.USE_VLLM = "1"
}
const proc = spawn("uv", args, {
stdio: "inherit",
shell: true,
env,
})
proc.on("error", (err) => {
reject(new Error(`Failed to spawn uv: ${err.message}`))
})
proc.on("close", (code) => {
if (code === 0) {
console.log("Embedding generation completed successfully")
resolve()
} else {
reject(new Error(`embed_build.py exited with code ${code}`))
}
})
})
}
export const SemanticIndex: QuartzEmitterPlugin<Partial<GlobalConfiguration["semanticSearch"]>> = (
opts,
) => {
const merged = { ...defaults, ...opts }
const o = {
enable: merged.enable!,
model: merged.model!,
aot: merged.aot!,
dims: merged.dims!,
dtype: merged.dtype!,
shardSizeRows: merged.shardSizeRows!,
hnsw: {
M: merged.hnsw?.M ?? defaults.hnsw!.M!,
efConstruction: merged.hnsw?.efConstruction ?? defaults.hnsw!.efConstruction!,
efSearch: merged.hnsw?.efSearch,
},
chunking: {
chunkSize: merged.chunking?.chunkSize ?? defaults.chunking!.chunkSize!,
chunkOverlap: merged.chunking?.chunkOverlap ?? defaults.chunking!.chunkOverlap!,
noChunking: merged.chunking?.noChunking ?? defaults.chunking!.noChunking!,
},
vllm: {
enable: merged.vllm?.enable ?? defaults.vllm!.enable!,
vllmUrl: merged.vllm?.vllmUrl ?? defaults.vllm!.vllmUrl,
concurrency: merged.vllm?.concurrency ?? defaults.vllm!.concurrency!,
batchSize: merged.vllm?.batchSize ?? defaults.vllm!.batchSize!,
},
}
if (!o.model) {
throw new Error("Semantic search requires a model identifier")
}
return {
name: "SemanticIndex",
getQuartzComponents() {
return []
},
async *partialEmit() {},
async *emit(ctx, content, _resources) {
if (!o.enable) return
const docs: ContentDetails[] = []
for (const [_, file] of content) {
const slug = file.data.slug!
const title = file.data.frontmatter?.title ?? slug
const text = file.data.text
if (text) {
docs.push({
slug,
title,
filePath: file.data.filePath!,
content: text,
readingTime: file.data.readingTime,
})
}
}
// Emit JSONL with the exact text used for embeddings
const jsonl = docs
.map((d) => ({ slug: d.slug, title: d.title, text: d.content }))
.map((o) => JSON.stringify(o))
.join("\n")
const jsonlSlug = "embeddings-text" as FullSlug
yield write({
ctx,
slug: jsonlSlug,
ext: ".jsonl",
content: jsonl,
})
// If aot is false, run the embedding generation script
if (!o.aot) {
console.log("\nGenerating embeddings (aot=false)...")
// Check for uv
const hasUv = await checkUvInstalled()
if (!hasUv) {
throw new Error(
"uv is required for embedding generation. Install it from https://docs.astral.sh/uv/",
)
}
const jsonlPath = joinSegments(ctx.argv.output, "embeddings-text.jsonl")
const outDir = joinSegments(ctx.argv.output, "embeddings")
try {
await runEmbedBuild(jsonlPath, outDir, o)
} catch (err) {
const message = err instanceof Error ? err.message : String(err)
throw new Error(`Embedding generation failed: ${message}`)
}
} else {
console.log(
"\nSkipping embedding generation (aot=true). Expecting pre-generated embeddings in public/embeddings/",
)
}
},
externalResources(_ctx) {
return {}
},
}
}