Compare commits
48 Commits
feat/seman
...
0ea5808cd2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0ea5808cd2 | ||
|
|
d913138726 | ||
|
|
3deec2d011 | ||
|
|
a907d6513b | ||
|
|
9ecc9336da | ||
|
|
511b003da8 | ||
|
|
ec1426241c | ||
|
|
692f23bc36 | ||
|
|
86eaaae945 | ||
|
|
ec00a40aef | ||
|
|
25a6747d7d | ||
|
|
b4fb0e6682 | ||
|
|
f346a01296 | ||
|
|
c2dcc63b5f | ||
|
|
c2bea8a4c4 | ||
|
|
fa8d87a23a | ||
|
|
65c5b27041 | ||
|
|
5208a96a37 | ||
|
|
31ea7852fd | ||
|
|
7dc826be0a | ||
|
|
11ab6da80c | ||
|
|
de1e7505ba | ||
|
|
9c042dd717 | ||
|
|
bacd19c4ea | ||
|
|
722277b202 | ||
|
|
e6cc9ba368 | ||
|
|
643aca5ffa | ||
|
|
ec26ebcc9e | ||
|
|
19e324d914 | ||
|
|
368203cf85 | ||
|
|
13ff64db97 | ||
|
|
87f7f4804e | ||
|
|
c99c8070f2 | ||
|
|
e7d2a57aad | ||
|
|
ef29c69828 | ||
|
|
86a30ad150 | ||
|
|
45d2ef8690 | ||
|
|
0ecb859d2d | ||
|
|
2fdc8129b6 | ||
|
|
8bc6cb9061 | ||
|
|
af5773f0e4 | ||
|
|
4260214a07 | ||
|
|
0c4386dce1 | ||
|
|
08c861707b | ||
|
|
1377004fca | ||
|
|
519d56c132 | ||
|
|
52460f376f | ||
|
|
b4805a1031 |
8
.github/workflows/build-preview.yaml
vendored
8
.github/workflows/build-preview.yaml
vendored
@@ -11,17 +11,17 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
name: Build Preview
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Node
|
||||
uses: actions/setup-node@v5
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 22
|
||||
|
||||
- name: Cache dependencies
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: ~/.npm
|
||||
key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
run: npx quartz build -d docs -v
|
||||
|
||||
- name: Upload build artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: preview-build
|
||||
path: public
|
||||
|
||||
10
.github/workflows/ci.yaml
vendored
10
.github/workflows/ci.yaml
vendored
@@ -19,17 +19,17 @@ jobs:
|
||||
permissions:
|
||||
contents: write
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Node
|
||||
uses: actions/setup-node@v5
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 22
|
||||
|
||||
- name: Cache dependencies
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: ~/.npm
|
||||
key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
|
||||
@@ -53,11 +53,11 @@ jobs:
|
||||
permissions:
|
||||
contents: write
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Setup Node
|
||||
uses: actions/setup-node@v5
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 22
|
||||
- name: Get package version
|
||||
|
||||
2
.github/workflows/deploy-preview.yaml
vendored
2
.github/workflows/deploy-preview.yaml
vendored
@@ -18,7 +18,7 @@ jobs:
|
||||
name: Deploy Preview to Cloudflare Pages
|
||||
steps:
|
||||
- name: Download build artifact
|
||||
uses: actions/download-artifact@v5
|
||||
uses: actions/download-artifact@v7
|
||||
id: preview-build-artifact
|
||||
with:
|
||||
name: preview-build
|
||||
|
||||
6
.github/workflows/docker-build-push.yaml
vendored
6
.github/workflows/docker-build-push.yaml
vendored
@@ -21,11 +21,11 @@ jobs:
|
||||
echo "OWNER_LOWERCASE=${OWNER,,}" >> ${GITHUB_ENV}
|
||||
env:
|
||||
OWNER: "${{ github.repository_owner }}"
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 1
|
||||
- name: Inject slug/short variables
|
||||
uses: rlespinasse/github-slug-action@v5.2.0
|
||||
uses: rlespinasse/github-slug-action@v5.4.0
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
- name: Set up Docker Buildx
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
network=host
|
||||
- name: Install cosign
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: sigstore/cosign-installer@v3.10.0
|
||||
uses: sigstore/cosign-installer@v4.0.0
|
||||
- name: Login to GitHub Container Registry
|
||||
uses: docker/login-action@v3
|
||||
if: github.event_name != 'pull_request'
|
||||
|
||||
13
.gitignore
vendored
13
.gitignore
vendored
@@ -9,3 +9,16 @@ tsconfig.tsbuildinfo
|
||||
private/
|
||||
.replit
|
||||
replit.nix
|
||||
erl_crash.dump
|
||||
# content/ is generated by the export script; only keep the placeholder
|
||||
content/*
|
||||
!content/.gitkeep
|
||||
# static/ox-hugo/ is populated by ox-hugo during export
|
||||
static/ox-hugo/
|
||||
# Elixir/Mix build artifacts for the pipeline project
|
||||
scripts/pipeline/_build/
|
||||
scripts/pipeline/deps/
|
||||
scripts/pipeline/erl_crash.dump
|
||||
# Test helpers (not needed in production)
|
||||
scripts/test.bib
|
||||
scripts/test_pipeline.exs
|
||||
|
||||
361
AGENTS.md
Normal file
361
AGENTS.md
Normal file
@@ -0,0 +1,361 @@
|
||||
# AGENTS.md - Coding Agent Instructions
|
||||
|
||||
This document provides essential information for AI coding agents working in this repository.
|
||||
|
||||
## Project Overview
|
||||
|
||||
**Quartz** is a static site generator for publishing digital gardens and notes as websites.
|
||||
Built with TypeScript, Preact, and unified/remark/rehype for markdown processing.
|
||||
|
||||
| Stack | Technology |
|
||||
| ------------- | ----------------------------------------- |
|
||||
| Language | TypeScript 5.x (strict mode) |
|
||||
| Runtime | Node.js >=22 (v22.16.0 pinned) |
|
||||
| Package Mgr | npm >=10.9.2 |
|
||||
| Module System | ES Modules (`"type": "module"`) |
|
||||
| UI Framework | Preact 10.x (JSX with `react-jsx` pragma) |
|
||||
| Build Tool | esbuild |
|
||||
| Styling | SCSS via esbuild-sass-plugin |
|
||||
|
||||
## Environment
|
||||
|
||||
This is a Nix project. Use the provided `flake.nix` to enter a dev shell with Node.js 22 and npm:
|
||||
|
||||
```bash
|
||||
nix develop
|
||||
```
|
||||
|
||||
All `npm` commands below must be run inside the dev shell.
|
||||
|
||||
## Build, Lint, and Test Commands
|
||||
|
||||
```bash
|
||||
# Type check and format check (CI validation)
|
||||
npm run check
|
||||
|
||||
# Auto-format code with Prettier
|
||||
npm run format
|
||||
|
||||
# Run all tests
|
||||
npm run test
|
||||
|
||||
# Run a single test file
|
||||
npx tsx --test quartz/util/path.test.ts
|
||||
|
||||
# Run tests matching a pattern (use --test-name-pattern)
|
||||
npx tsx --test --test-name-pattern="typeguards" quartz/util/path.test.ts
|
||||
|
||||
# Build the static site
|
||||
npx quartz build
|
||||
|
||||
# Build and serve with hot reload
|
||||
npx quartz build --serve
|
||||
|
||||
# Profile build performance
|
||||
npm run profile
|
||||
```
|
||||
|
||||
### Test Files Location
|
||||
|
||||
Tests use Node.js native test runner via `tsx`. Test files follow the `*.test.ts` pattern:
|
||||
|
||||
- `quartz/util/path.test.ts`
|
||||
- `quartz/util/fileTrie.test.ts`
|
||||
- `quartz/components/scripts/search.test.ts`
|
||||
|
||||
## Code Style Guidelines
|
||||
|
||||
### Prettier Configuration (`.prettierrc`)
|
||||
|
||||
```json
|
||||
{
|
||||
"printWidth": 100,
|
||||
"tabWidth": 2,
|
||||
"semi": false,
|
||||
"trailingComma": "all",
|
||||
"quoteProps": "as-needed"
|
||||
}
|
||||
```
|
||||
|
||||
**No ESLint** - only Prettier for formatting. Run `npm run format` before committing.
|
||||
|
||||
### TypeScript Configuration
|
||||
|
||||
- **Strict mode enabled** (`strict: true`)
|
||||
- `noUnusedLocals: true` - no unused variables
|
||||
- `noUnusedParameters: true` - no unused function parameters
|
||||
- JSX configured for Preact (`jsxImportSource: "preact"`)
|
||||
|
||||
### Import Conventions
|
||||
|
||||
```typescript
|
||||
// 1. External packages first
|
||||
import { PluggableList } from "unified"
|
||||
import { visit } from "unist-util-visit"
|
||||
|
||||
// 2. Internal utilities/types (relative paths)
|
||||
import { QuartzTransformerPlugin } from "../types"
|
||||
import { FilePath, slugifyFilePath } from "../../util/path"
|
||||
import { i18n } from "../../i18n"
|
||||
```
|
||||
|
||||
### Naming Conventions
|
||||
|
||||
| Element | Convention | Example |
|
||||
| ---------------- | ------------ | ----------------------------------- |
|
||||
| Files (utils) | camelCase | `path.ts`, `fileTrie.ts` |
|
||||
| Files (comps) | PascalCase | `TableOfContents.tsx`, `Search.tsx` |
|
||||
| Types/Interfaces | PascalCase | `QuartzComponent`, `FullSlug` |
|
||||
| Type Guards | `is*` prefix | `isFilePath()`, `isFullSlug()` |
|
||||
| Constants | UPPER_CASE | `QUARTZ`, `UPSTREAM_NAME` |
|
||||
| Options types | `Options` | `interface Options { ... }` |
|
||||
|
||||
### Branded Types Pattern
|
||||
|
||||
This codebase uses branded types for type-safe path handling:
|
||||
|
||||
```typescript
|
||||
type SlugLike<T> = string & { __brand: T }
|
||||
export type FilePath = SlugLike<"filepath">
|
||||
export type FullSlug = SlugLike<"full">
|
||||
export type SimpleSlug = SlugLike<"simple">
|
||||
|
||||
// Always validate with type guards before using
|
||||
export function isFilePath(s: string): s is FilePath { ... }
|
||||
```
|
||||
|
||||
### Component Pattern (Preact)
|
||||
|
||||
Components use a factory function pattern with attached static properties:
|
||||
|
||||
```typescript
|
||||
export default ((userOpts?: Partial<Options>) => {
|
||||
const opts: Options = { ...defaultOptions, ...userOpts }
|
||||
|
||||
const ComponentName: QuartzComponent = ({ cfg, displayClass }: QuartzComponentProps) => {
|
||||
return <div class={classNames(displayClass, "component-name")}>...</div>
|
||||
}
|
||||
|
||||
ComponentName.css = style // SCSS styles
|
||||
ComponentName.afterDOMLoaded = script // Client-side JS
|
||||
return ComponentName
|
||||
}) satisfies QuartzComponentConstructor
|
||||
```
|
||||
|
||||
### Plugin Pattern
|
||||
|
||||
Three plugin types: transformers, filters, and emitters.
|
||||
|
||||
```typescript
|
||||
export const PluginName: QuartzTransformerPlugin<Partial<Options>> = (userOpts) => {
|
||||
const opts = { ...defaultOptions, ...userOpts }
|
||||
return {
|
||||
name: "PluginName",
|
||||
markdownPlugins(ctx) { return [...] },
|
||||
htmlPlugins(ctx) { return [...] },
|
||||
externalResources(ctx) { return { js: [], css: [] } },
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Testing Pattern
|
||||
|
||||
Use Node.js native test runner with `assert`:
|
||||
|
||||
```typescript
|
||||
import test, { describe, beforeEach } from "node:test"
|
||||
import assert from "node:assert"
|
||||
|
||||
describe("FeatureName", () => {
|
||||
test("should do something", () => {
|
||||
assert.strictEqual(actual, expected)
|
||||
assert.deepStrictEqual(actualObj, expectedObj)
|
||||
assert(condition) // truthy assertion
|
||||
assert(!condition) // falsy assertion
|
||||
})
|
||||
})
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
- Use `try/catch` for critical operations (file I/O, parsing)
|
||||
- Custom `trace` utility for error reporting with stack traces
|
||||
- `process.exit(1)` for fatal errors
|
||||
- `console.warn()` for non-fatal issues
|
||||
|
||||
### Async Patterns
|
||||
|
||||
- Prefer `async/await` over raw promises
|
||||
- Use async generators (`async *emit()`) for streaming file output
|
||||
- Use `async-mutex` for concurrent build protection
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
quartz/
|
||||
├── bootstrap-cli.mjs # CLI entry point
|
||||
├── build.ts # Build orchestration
|
||||
├── cfg.ts # Configuration types
|
||||
├── components/ # Preact UI components
|
||||
│ ├── *.tsx # Components
|
||||
│ ├── scripts/ # Client-side scripts (*.inline.ts)
|
||||
│ └── styles/ # Component SCSS
|
||||
├── plugins/
|
||||
│ ├── transformers/ # Markdown AST transformers
|
||||
│ ├── filters/ # Content filters
|
||||
│ ├── emitters/ # Output generators
|
||||
│ └── types.ts # Plugin type definitions
|
||||
├── processors/ # Build pipeline (parse/filter/emit)
|
||||
├── util/ # Utility functions
|
||||
└── i18n/ # Internationalization (30+ locales)
|
||||
```
|
||||
|
||||
## Branch Workflow
|
||||
|
||||
This is a fork of [jackyzha0/quartz](https://github.com/jackyzha0/quartz) with org-roam customizations.
|
||||
|
||||
| Branch | Purpose |
|
||||
| ----------- | ------------------------------------------------ |
|
||||
| `main` | Clean mirror of upstream quartz — no custom code |
|
||||
| `org-roam` | Default branch — all customizations live here |
|
||||
| `feature/*` | Short-lived branches off `org-roam` |
|
||||
|
||||
### Pulling Upstream Updates
|
||||
|
||||
```bash
|
||||
git checkout main
|
||||
git fetch upstream
|
||||
git merge upstream/main
|
||||
git checkout org-roam
|
||||
git merge main
|
||||
# Resolve conflicts if any, then commit
|
||||
```
|
||||
|
||||
### Working on Features
|
||||
|
||||
```bash
|
||||
git checkout org-roam
|
||||
git checkout -b feature/my-feature
|
||||
# ... work ...
|
||||
git checkout org-roam
|
||||
git merge feature/my-feature
|
||||
git branch -d feature/my-feature
|
||||
```
|
||||
|
||||
**Merge direction:** `upstream → main → org-roam → feature/*`
|
||||
|
||||
## Org-Roam Workflow
|
||||
|
||||
Notes live in a **separate directory** outside this repo. The export pipeline
|
||||
converts them to Markdown via ox-hugo, applies post-processing transforms, then
|
||||
Quartz builds the site.
|
||||
|
||||
### Tooling
|
||||
|
||||
The dev shell (`nix develop`) provides:
|
||||
|
||||
- `nodejs_22` — Quartz build
|
||||
- `elixir` — runs the export script and pipeline
|
||||
- `emacs` + `ox-hugo` — performs the org → markdown conversion
|
||||
|
||||
### Export and build
|
||||
|
||||
```bash
|
||||
# Export only (wipes content/, exports all .org files, runs pipeline)
|
||||
NOTES_DIR=/path/to/notes npm run export
|
||||
|
||||
# Export then build the site
|
||||
NOTES_DIR=/path/to/notes npm run build:notes
|
||||
|
||||
# Positional arg also works
|
||||
elixir scripts/export.exs /path/to/notes
|
||||
```
|
||||
|
||||
Optional env vars for the pipeline:
|
||||
|
||||
| Var | Default | Purpose |
|
||||
| --------------- | ------------------------ | ----------------------------------------- |
|
||||
| `BIBTEX_FILE` | — | Path to `.bib` file for citation fallback |
|
||||
| `ZOTERO_URL` | `http://localhost:23119` | Zotero Better BibTeX base URL |
|
||||
| `CITATION_MODE` | `warn` | `silent` / `warn` / `strict` |
|
||||
|
||||
### Export pipeline phases
|
||||
|
||||
`scripts/export.exs` runs four phases in sequence:
|
||||
|
||||
1. **Wipe** `content/` (preserving `.gitkeep`)
|
||||
2. **Export** each `.org` file via `emacs --batch` + `ox-hugo` → `content/**/*.md`
|
||||
3. **Pipeline** — run Elixir transform modules over every `.md` file
|
||||
4. **Index** — generate a fallback `content/index.md` if none was exported
|
||||
|
||||
The export uses TOML frontmatter (`+++`) and per-file mode (not per-subtree).
|
||||
|
||||
### Markdown pipeline (`scripts/pipeline/`)
|
||||
|
||||
A standalone Mix project that post-processes `content/*.md` after ox-hugo.
|
||||
It is compiled automatically on first run; subsequent runs use the `_build/`
|
||||
cache and are fast.
|
||||
|
||||
**Architecture:**
|
||||
|
||||
```
|
||||
scripts/pipeline/
|
||||
├── mix.exs # deps: req, jason
|
||||
└── lib/
|
||||
├── pipeline.ex # Generic runner (fold transforms over .md files)
|
||||
├── pipeline/
|
||||
│ ├── application.ex # OTP app — starts Finch HTTP pool
|
||||
│ ├── transform.ex # Behaviour: init/1, apply/3, teardown/1
|
||||
│ ├── transforms/
|
||||
│ │ └── citations.ex # Resolves cite:key → [Label](url)
|
||||
│ └── resolvers/
|
||||
│ ├── zotero.ex # JSON-RPC to Zotero Better BibTeX
|
||||
│ ├── bibtex.ex # Parses local .bib file
|
||||
│ └── doi.ex # Bare-key fallback (always succeeds)
|
||||
```
|
||||
|
||||
**Adding a new transform:**
|
||||
|
||||
1. Create `scripts/pipeline/lib/pipeline/transforms/my_transform.ex`
|
||||
2. Implement the `Pipeline.Transform` behaviour (`init/1`, `apply/3`)
|
||||
3. Append the module to `transforms` in `scripts/export.exs`
|
||||
|
||||
```elixir
|
||||
transforms = [
|
||||
Pipeline.Transforms.Citations,
|
||||
Pipeline.Transforms.MyTransform, # new
|
||||
]
|
||||
```
|
||||
|
||||
### Citation resolution (`Pipeline.Transforms.Citations`)
|
||||
|
||||
Handles org-citar syntax that passes through ox-hugo unchanged:
|
||||
|
||||
| Syntax | Example |
|
||||
| ---------------- | -------------------- |
|
||||
| org-cite / citar | `[cite:@key]` |
|
||||
| multiple keys | `[cite:@key1;@key2]` |
|
||||
| bare (legacy) | `cite:key` |
|
||||
|
||||
Resolution chain (first success wins):
|
||||
|
||||
1. **Zotero** — JSON-RPC to `localhost:23119/better-bibtex/json-rpc`
|
||||
- Calls `item.search` to find the item, then `item.attachments` to get
|
||||
the PDF link (`zotero://open-pdf/library/items/KEY`)
|
||||
- Falls back to `zotero://select/library/items/KEY` if no PDF attachment
|
||||
- Probe uses a JSON-RPC call, **not** `/better-bibtex/cayw`
|
||||
(that endpoint blocks waiting for interactive input)
|
||||
2. **BibTeX** — parses `BIBTEX_FILE`; extracts authors, year, DOI/URL
|
||||
3. **DOI fallback** — always succeeds; renders bare key or `https://doi.org/...`
|
||||
|
||||
**Zotero JSON-RPC gotcha:** `Req 0.5` does not allow combining `:finch` and
|
||||
`:connect_options` in the same call. Use `:receive_timeout` only.
|
||||
|
||||
## Important Notes
|
||||
|
||||
- **Client-side scripts**: Use `.inline.ts` suffix, bundled via esbuild
|
||||
- **Isomorphic code**: `quartz/util/path.ts` must not use Node.js APIs
|
||||
- **Incremental builds**: Plugins can implement `partialEmit` for efficiency
|
||||
- **Markdown flavors**: Supports Obsidian (`ofm.ts`) and Roam (`roam.ts`) syntax
|
||||
- **Pipeline build artifacts**: `scripts/pipeline/_build/` and `scripts/pipeline/deps/`
|
||||
are gitignored — run `mix deps.get` inside `scripts/pipeline/` after a fresh clone
|
||||
90
README.md
90
README.md
@@ -1,14 +1,96 @@
|
||||
# Quartz v4
|
||||
# Quartz v4 — org-roam edition
|
||||
|
||||
> “[One] who works with the door open gets all kinds of interruptions, but [they] also occasionally gets clues as to what the world is and what might be important.” — Richard Hamming
|
||||
> "[One] who works with the door open gets all kinds of interruptions, but [they] also occasionally gets clues as to what the world is and what might be important." — Richard Hamming
|
||||
|
||||
Quartz is a set of tools that helps you publish your [digital garden](https://jzhao.xyz/posts/networked-thought) and notes as a website for free.
|
||||
Quartz v4 features a from-the-ground rewrite focusing on end-user extensibility and ease-of-use.
|
||||
|
||||
🔗 Read the documentation and get started: https://quartz.jzhao.xyz/
|
||||
This fork adds first-class support for [org-roam](https://www.orgroam.com/) notes via [ox-hugo](https://ox-hugo.scripter.co/).
|
||||
|
||||
🔗 Upstream documentation: https://quartz.jzhao.xyz/
|
||||
|
||||
[Join the Discord Community](https://discord.gg/cRFFHYye7t)
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
This project uses Nix. Enter the development shell, which provides Node.js 22, Elixir, and Emacs with ox-hugo:
|
||||
|
||||
```bash
|
||||
nix develop
|
||||
```
|
||||
|
||||
All commands below must be run inside this shell.
|
||||
|
||||
```bash
|
||||
npm install
|
||||
```
|
||||
|
||||
### Building from org-roam notes
|
||||
|
||||
Your org-roam notes live in a separate directory. Point `NOTES_DIR` at it:
|
||||
|
||||
```bash
|
||||
# Export notes to content/ and build the site
|
||||
NOTES_DIR=/path/to/notes npm run build:notes
|
||||
|
||||
# Export, build, and serve with hot reload
|
||||
NOTES_DIR=/path/to/notes npm run serve:notes
|
||||
|
||||
# Export only (wipes content/ and re-exports all .org files)
|
||||
NOTES_DIR=/path/to/notes npm run export
|
||||
```
|
||||
|
||||
The export pipeline runs in four phases:
|
||||
|
||||
1. **Wipe** `content/` clean
|
||||
2. **Export** every `.org` file via `emacs --batch` + ox-hugo → Markdown
|
||||
3. **Transform** — post-process the Markdown (citation resolution, etc.)
|
||||
4. **Index** — generate a fallback `index.md` if none was exported
|
||||
|
||||
#### Citations (org-citar → Zotero links)
|
||||
|
||||
org-citar references (`[cite:@key]`) are resolved to clickable Zotero links.
|
||||
With Zotero running and the [Better BibTeX](https://retorque.re/zotero-better-bibtex/)
|
||||
plugin installed, no extra configuration is needed — the pipeline detects it
|
||||
automatically and links directly to the PDF in your library.
|
||||
|
||||
```bash
|
||||
# Use a local .bib file as fallback when Zotero is not running
|
||||
BIBTEX_FILE=/path/to/refs.bib NOTES_DIR=/path/to/notes npm run export
|
||||
|
||||
# Control warning verbosity for unresolved keys
|
||||
CITATION_MODE=strict NOTES_DIR=/path/to/notes npm run export
|
||||
```
|
||||
|
||||
| Env var | Default | Purpose |
|
||||
| --------------- | ------------------------ | ----------------------------------------- |
|
||||
| `BIBTEX_FILE` | — | Path to `.bib` file for citation fallback |
|
||||
| `ZOTERO_URL` | `http://localhost:23119` | Zotero Better BibTeX base URL |
|
||||
| `CITATION_MODE` | `warn` | `silent` / `warn` / `strict` |
|
||||
|
||||
### Building without org-roam notes
|
||||
|
||||
If you manage `content/` directly with Markdown files:
|
||||
|
||||
```bash
|
||||
# Build the site
|
||||
npx quartz build
|
||||
|
||||
# Build and serve with hot reload
|
||||
npx quartz build --serve
|
||||
```
|
||||
|
||||
The site is generated in `public/`. When serving, visit http://localhost:8080.
|
||||
|
||||
### Development
|
||||
|
||||
```bash
|
||||
npm run check # type check + format check
|
||||
npm run format # auto-format with Prettier
|
||||
npm run test # run tests
|
||||
```
|
||||
|
||||
## Sponsors
|
||||
|
||||
<p align="center">
|
||||
|
||||
@@ -36,6 +36,7 @@ This part of the configuration concerns anything that can affect the whole site.
|
||||
- `{provider: 'clarity', projectId: '<your-clarity-id-code' }`: use [Microsoft clarity](https://clarity.microsoft.com/). The project id can be found on top of the overview page.
|
||||
- `{ provider: 'matomo', siteId: '<your-matomo-id-code', host: 'matomo.example.com' }`: use [Matomo](https://matomo.org/), without protocol.
|
||||
- `{ provider: 'vercel' }`: use [Vercel Web Analytics](https://vercel.com/docs/concepts/analytics).
|
||||
- `{ provider: 'rybbit', siteId: 'my-rybbit-id' }` (managed) or `{ provider: 'rybbit', siteId: 'my-rybbit-id', host: 'my-rybbit-domain.com' }` (self-hosted) use [Rybbit](https://rybbit.com);
|
||||
- `locale`: used for [[i18n]] and date formatting
|
||||
- `baseUrl`: this is used for sitemaps and RSS feeds that require an absolute URL to know where the canonical 'home' of your site lives. This is normally the deployed URL of your site (e.g. `quartz.jzhao.xyz` for this site). Do not include the protocol (i.e. `https://`) or any leading or trailing slashes.
|
||||
- This should also include the subpath if you are [[hosting]] on GitHub pages without a custom domain. For example, if my repository is `jackyzha0/quartz`, GitHub pages would deploy to `https://jackyzha0.github.io/quartz` and the `baseUrl` would be `jackyzha0.github.io/quartz`.
|
||||
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -5,3 +5,7 @@ You can run the below one-liner to run Quartz in Docker.
|
||||
```sh
|
||||
docker run --rm -itp 8080:8080 -p 3001:3001 -v ./content:/usr/src/app/content $(docker build -q .)
|
||||
```
|
||||
|
||||
> [!warning] Not to be used for production
|
||||
> Serve mode is intended for local previews only.
|
||||
> For production workloads, see the page on [[hosting]].
|
||||
|
||||
@@ -162,7 +162,7 @@ You can access the tags of a file by `node.data.tags`.
|
||||
Component.Explorer({
|
||||
filterFn: (node) => {
|
||||
// exclude files with the tag "explorerexclude"
|
||||
return node.data.tags?.includes("explorerexclude") !== true
|
||||
return node.data?.tags?.includes("explorerexclude") !== true
|
||||
},
|
||||
})
|
||||
```
|
||||
|
||||
@@ -8,7 +8,7 @@ By default, Quartz only fetches previews for pages inside your vault due to [COR
|
||||
|
||||
When [[creating components|creating your own components]], you can include this `popover-hint` class to also include it in the popover.
|
||||
|
||||
Similar to Obsidian, [[quartz layout.png|images referenced using wikilinks]] can also be viewed as popups.
|
||||
Similar to Obsidian, [[quartz-layout-desktop.png|images referenced using wikilinks]] can also be viewed as popups.
|
||||
|
||||
## Configuration
|
||||
|
||||
|
||||
61
flake.lock
generated
Normal file
61
flake.lock
generated
Normal file
@@ -0,0 +1,61 @@
|
||||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1731533236,
|
||||
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1771008912,
|
||||
"narHash": "sha256-gf2AmWVTs8lEq7z/3ZAsgnZDhWIckkb+ZnAo5RzSxJg=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "a82ccc39b39b621151d6732718e3e250109076fa",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
99
flake.nix
Normal file
99
flake.nix
Normal file
@@ -0,0 +1,99 @@
|
||||
{
|
||||
description = "Quartz org-roam dev shell and build app";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
};
|
||||
|
||||
outputs = { self, nixpkgs, flake-utils }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
pkgs = import nixpkgs { inherit system; };
|
||||
|
||||
# Emacs with ox-hugo — shared between devShell and buildApp
|
||||
emacsWithOxHugo = (pkgs.emacsPackagesFor pkgs.emacs-nox).emacsWithPackages
|
||||
(epkgs: [ epkgs.ox-hugo ]);
|
||||
|
||||
# Pre-fetched npm dependency tree (node_modules)
|
||||
quartzDeps = pkgs.buildNpmPackage {
|
||||
pname = "quartz-deps";
|
||||
version = "4.5.2";
|
||||
src = ./.;
|
||||
npmDepsHash = "sha256-7u+VlIx44B3/ivM9vLMIOn+e4TL4eS6B682vhS+Ikb4=";
|
||||
dontBuild = true;
|
||||
installPhase = ''
|
||||
mkdir -p $out
|
||||
cp -r node_modules $out/node_modules
|
||||
'';
|
||||
};
|
||||
|
||||
# Pre-fetched Hex/Mix dependencies for scripts/pipeline
|
||||
pipelineMixDeps = pkgs.beamPackages.fetchMixDeps {
|
||||
pname = "pipeline-mix-deps";
|
||||
version = "0.1.0";
|
||||
src = ./scripts/pipeline;
|
||||
sha256 = "sha256-E79X+nUy86G1Jrwv3T7dXekoGv8Hd14ZgJSKWjvlmAw=";
|
||||
};
|
||||
|
||||
# The build application wrapper script
|
||||
buildApp = pkgs.writeShellApplication {
|
||||
name = "build";
|
||||
runtimeInputs = [ pkgs.nodejs_22 pkgs.elixir emacsWithOxHugo ];
|
||||
text = ''
|
||||
NOTES_DIR="''${1:?Usage: build <path-to-notes-dir>}"
|
||||
NOTES_DIR=$(realpath "$NOTES_DIR")
|
||||
ORIG_CWD=$(pwd)
|
||||
|
||||
# Set up a writable working copy of the repo in a temp dir
|
||||
WORK=$(mktemp -d)
|
||||
trap 'rm -rf "$WORK"' EXIT
|
||||
cp -r ${self}/. "$WORK/repo"
|
||||
chmod -R u+w "$WORK/repo"
|
||||
|
||||
# Drop in pre-built node_modules
|
||||
ln -s ${quartzDeps}/node_modules "$WORK/repo/node_modules"
|
||||
|
||||
# Drop in pre-fetched Mix deps so mix compile runs offline
|
||||
cp -r ${pipelineMixDeps} "$WORK/repo/scripts/pipeline/deps"
|
||||
chmod -R u+w "$WORK/repo/scripts/pipeline/deps"
|
||||
|
||||
# ox-hugo requires static/ to exist before it can copy image assets
|
||||
mkdir -p "$WORK/repo/static"
|
||||
|
||||
# Run the export pipeline (org → md, citations transform)
|
||||
NOTES_DIR="$NOTES_DIR" elixir "$WORK/repo/scripts/export.exs"
|
||||
|
||||
# Build the static site from within the repo copy so relative paths
|
||||
# (e.g. ./package.json in constants.js) resolve correctly.
|
||||
# --output is absolute so the result lands in the caller's cwd.
|
||||
cd "$WORK/repo"
|
||||
node quartz/bootstrap-cli.mjs build \
|
||||
--directory "$WORK/repo/content" \
|
||||
--output "$ORIG_CWD/public"
|
||||
'';
|
||||
};
|
||||
in
|
||||
{
|
||||
devShells.default = pkgs.mkShell {
|
||||
buildInputs = [
|
||||
pkgs.nodejs_22
|
||||
pkgs.elixir
|
||||
emacsWithOxHugo
|
||||
pkgs.mcp-nixos
|
||||
];
|
||||
|
||||
shellHook = ''
|
||||
echo "Node $(node --version) / npm $(npm --version)"
|
||||
elixir --version 2>/dev/null | head -1 || true
|
||||
echo "Emacs $(emacs --version | head -1)"
|
||||
'';
|
||||
};
|
||||
|
||||
packages.default = buildApp;
|
||||
packages.build = buildApp;
|
||||
|
||||
apps.default = { type = "app"; program = "${buildApp}/bin/build"; };
|
||||
apps.build = { type = "app"; program = "${buildApp}/bin/build"; };
|
||||
});
|
||||
}
|
||||
1
index.d.ts
vendored
1
index.d.ts
vendored
@@ -13,4 +13,3 @@ interface CustomEventMap {
|
||||
|
||||
type ContentIndex = Record<FullSlug, ContentDetails>
|
||||
declare const fetchData: Promise<ContentIndex>
|
||||
declare const semanticCfg: import("./quartz/cfg").GlobalConfiguration["semanticSearch"]
|
||||
|
||||
BIN
notes-external/external-location-image.png
Normal file
BIN
notes-external/external-location-image.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 8.2 KiB |
16
notes/bus/emt-madrid.org
Normal file
16
notes/bus/emt-madrid.org
Normal file
@@ -0,0 +1,16 @@
|
||||
:PROPERTIES:
|
||||
:ID: emt-madrid
|
||||
:END:
|
||||
#+title: EMT Madrid (urban bus)
|
||||
|
||||
Empresa Municipal de Transportes (EMT) operates the urban bus network
|
||||
within the municipality of Madrid — around 200 lines.
|
||||
|
||||
* Notable lines
|
||||
- *Line 27* — connects Embajadores with Barrio de la Concepción, one of the
|
||||
oldest routes in the network.
|
||||
- *Line 34* — Argüelles to Carabanchel, crossing the city centre via Gran Vía.
|
||||
- *Búho (owl) lines* — night buses running from Cibeles from midnight to 6 am.
|
||||
|
||||
* See also
|
||||
- [[id:madrid-transport][Madrid Public Transport]]
|
||||
13
notes/example-citation.org
Normal file
13
notes/example-citation.org
Normal file
@@ -0,0 +1,13 @@
|
||||
#+title: Example: Citation Reference
|
||||
|
||||
This file demonstrates how org-citar citations pass through ox-hugo into
|
||||
markdown, where the pipeline transform resolves them.
|
||||
|
||||
The methodology described in [cite:@podlovics2021journalArticle] provides a
|
||||
useful framework for analysis.
|
||||
|
||||
Multiple citations can appear together:
|
||||
[cite:@podlovics2021journalArticle]
|
||||
|
||||
Older bare-cite style (org-roam v1 / older citar) also works:
|
||||
cite:podlovics2021journalArticle
|
||||
33
notes/example-images.org
Normal file
33
notes/example-images.org
Normal file
@@ -0,0 +1,33 @@
|
||||
:PROPERTIES:
|
||||
:ID: example-images
|
||||
:END:
|
||||
#+title: Example: Image References
|
||||
|
||||
This note demonstrates the three image reference scenarios that the pipeline
|
||||
must handle.
|
||||
|
||||
* Scenario 1: External image (URL)
|
||||
|
||||
An image hosted on the web — ox-hugo passes the URL through as-is and no
|
||||
local file handling is needed.
|
||||
|
||||
#+attr_html: :link "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSkzsTuLOt8esM6enoKwkzqA52G3p9hldlf2g&s"
|
||||
[[file:quartz-logo-external.png]]
|
||||
|
||||
* Scenario 2: Local image (same notes directory)
|
||||
|
||||
An image sitting next to this .org file inside the notes directory.
|
||||
ox-hugo copies files referenced with a relative path into the Hugo =static/=
|
||||
assets tree automatically.
|
||||
|
||||
#+CAPTION: Quartz logo (local, same notes dir)
|
||||
[[file:quartz-logo.png]]
|
||||
|
||||
* Scenario 3: External image (outside notes directory)
|
||||
|
||||
An image that lives outside the notes directory entirely — for example a
|
||||
shared assets folder or a system path. ox-hugo still copies it into =static/=
|
||||
and rewrites the reference.
|
||||
|
||||
#+CAPTION: Quartz logo (outside notes dir)
|
||||
[[file:../notes-external/external-location-image.png]]
|
||||
17
notes/madrid-transport.org
Normal file
17
notes/madrid-transport.org
Normal file
@@ -0,0 +1,17 @@
|
||||
:PROPERTIES:
|
||||
:ID: madrid-transport
|
||||
:END:
|
||||
#+title: Madrid Public Transport
|
||||
|
||||
Madrid has one of the most extensive public transport networks in Europe,
|
||||
operated primarily by [[id:crtm][Consorcio Regional de Transportes de Madrid]] (CRTM).
|
||||
|
||||
* Modes
|
||||
- [[id:metro-madrid][Metro de Madrid]] — 13 lines, ~300 km of track
|
||||
- [[id:emt-madrid][EMT Bus]] — urban buses within the city
|
||||
- Cercanías — suburban rail run by Renfe
|
||||
- Interurbano — regional buses to the wider Community of Madrid
|
||||
|
||||
* Ticketing
|
||||
A single [[https://www.crtm.es][tarjeta transporte]] (transport card) works across all modes.
|
||||
The Multi card covers zones A–C and is topped up at any metro station.
|
||||
18
notes/metro/metro-madrid.org
Normal file
18
notes/metro/metro-madrid.org
Normal file
@@ -0,0 +1,18 @@
|
||||
:PROPERTIES:
|
||||
:ID: metro-madrid
|
||||
:END:
|
||||
#+title: Metro de Madrid
|
||||
|
||||
The Madrid Metro is the main rapid transit network in the city, opened in 1919.
|
||||
It is the second oldest metro in the Iberian Peninsula after Barcelona.
|
||||
|
||||
* Key Lines
|
||||
| Line | Name | Colour | Terminals |
|
||||
|------+-----------------+--------+------------------------------|
|
||||
| L1 | Pinar de Chamartín–Valdecarros | Blue | Pinar de Chamartín / Valdecarros |
|
||||
| L6 | Circular | Grey | Circular (loop) |
|
||||
| L10 | — | Dark blue | Hospital Infanta Sofía / Tres Olivos |
|
||||
|
||||
* See also
|
||||
- [[id:madrid-transport][Madrid Public Transport]]
|
||||
- [[id:sol-interchange][Sol interchange]]
|
||||
12
notes/metro/sol-interchange.org
Normal file
12
notes/metro/sol-interchange.org
Normal file
@@ -0,0 +1,12 @@
|
||||
:PROPERTIES:
|
||||
:ID: sol-interchange
|
||||
:END:
|
||||
#+title: Sol (interchange)
|
||||
|
||||
Sol is the busiest interchange station in the Madrid Metro, sitting beneath
|
||||
Puerta del Sol in the city centre.
|
||||
|
||||
Lines serving Sol: [[id:metro-madrid][L1]], L2, L3.
|
||||
|
||||
It also connects to the Cercanías hub underneath, making it the de-facto
|
||||
zero point of Madrid's public transport.
|
||||
BIN
notes/quartz-logo.png
Normal file
BIN
notes/quartz-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 8.2 KiB |
22
notes/roads/crtm.org
Normal file
22
notes/roads/crtm.org
Normal file
@@ -0,0 +1,22 @@
|
||||
:PROPERTIES:
|
||||
:ID: crtm
|
||||
:END:
|
||||
#+title: CRTM — Consorcio Regional de Transportes de Madrid
|
||||
|
||||
The CRTM is the regional authority that coordinates public transport across
|
||||
the Community of Madrid. It does not operate services directly but sets
|
||||
fares, zones, and integration policy.
|
||||
|
||||
* Fare zones
|
||||
| Zone | Coverage |
|
||||
|-------+-----------------------------|
|
||||
| A | Municipality of Madrid |
|
||||
| B1 | Inner ring municipalities |
|
||||
| B2 | Outer ring municipalities |
|
||||
| B3 | Further suburban area |
|
||||
| C1–C2 | Commuter belt |
|
||||
|
||||
* Related
|
||||
- [[id:madrid-transport][Madrid Public Transport]]
|
||||
- [[id:metro-madrid][Metro de Madrid]]
|
||||
- [[id:emt-madrid][EMT Madrid]]
|
||||
19
notes/roads/m30.org
Normal file
19
notes/roads/m30.org
Normal file
@@ -0,0 +1,19 @@
|
||||
:PROPERTIES:
|
||||
:ID: m30
|
||||
:END:
|
||||
#+title: M-30
|
||||
|
||||
The M-30 is Madrid's innermost ring road, circling the city centre at a
|
||||
radius of roughly 3–5 km from Puerta del Sol.
|
||||
|
||||
It runs mostly underground through the Madrid Río tunnel section along the
|
||||
Manzanares river, built during the 2004–2007 renovation that reclaimed the
|
||||
riverbank as a public park.
|
||||
|
||||
* Key junctions
|
||||
- Nudo Norte — connects to A-1 (Burgos) and A-6 (La Coruña)
|
||||
- Nudo Sur — connects to A-4 (Cádiz) and A-42 (Toledo)
|
||||
|
||||
* See also
|
||||
- [[id:crtm][CRTM]]
|
||||
- [[id:madrid-transport][Madrid Public Transport]]
|
||||
10
opencode.json
Normal file
10
opencode.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"$schema": "https://opencode.ai/config.json",
|
||||
"mcp": {
|
||||
"nixos": {
|
||||
"type": "local",
|
||||
"command": ["mcp-nixos"],
|
||||
"enabled": true
|
||||
}
|
||||
}
|
||||
}
|
||||
2091
package-lock.json
generated
2091
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
49
package.json
49
package.json
@@ -17,7 +17,10 @@
|
||||
"check": "tsc --noEmit && npx prettier . --check",
|
||||
"format": "npx prettier . --write",
|
||||
"test": "tsx --test",
|
||||
"profile": "0x -D prof ./quartz/bootstrap-cli.mjs build --concurrency=1"
|
||||
"profile": "0x -D prof ./quartz/bootstrap-cli.mjs build --concurrency=1",
|
||||
"export": "elixir scripts/export.exs",
|
||||
"build:notes": "elixir scripts/export.exs && npx quartz build",
|
||||
"serve:notes": "elixir scripts/export.exs && npx quartz build --serve"
|
||||
},
|
||||
"engines": {
|
||||
"npm": ">=10.9.2",
|
||||
@@ -37,35 +40,33 @@
|
||||
"dependencies": {
|
||||
"@clack/prompts": "^0.11.0",
|
||||
"@floating-ui/dom": "^1.7.4",
|
||||
"@huggingface/transformers": "^3.7.5",
|
||||
"@myriaddreamin/rehype-typst": "^0.6.0",
|
||||
"@napi-rs/simple-git": "0.1.22",
|
||||
"@tweenjs/tween.js": "^25.0.0",
|
||||
"ansi-truncate": "^1.4.0",
|
||||
"async-mutex": "^0.5.0",
|
||||
"chokidar": "^4.0.3",
|
||||
"chokidar": "^5.0.0",
|
||||
"cli-spinner": "^0.2.10",
|
||||
"d3": "^7.9.0",
|
||||
"esbuild-sass-plugin": "^3.3.1",
|
||||
"esbuild-sass-plugin": "^3.6.0",
|
||||
"flexsearch": "^0.8.205",
|
||||
"github-slugger": "^2.0.0",
|
||||
"globby": "^15.0.0",
|
||||
"globby": "^16.1.0",
|
||||
"gray-matter": "^4.0.3",
|
||||
"hast-util-to-html": "^9.0.5",
|
||||
"hast-util-to-jsx-runtime": "^2.3.6",
|
||||
"hast-util-to-string": "^3.0.1",
|
||||
"is-absolute-url": "^5.0.0",
|
||||
"js-yaml": "^4.1.0",
|
||||
"lightningcss": "^1.30.2",
|
||||
"js-yaml": "^4.1.1",
|
||||
"lightningcss": "^1.31.1",
|
||||
"mdast-util-find-and-replace": "^3.0.2",
|
||||
"mdast-util-to-hast": "^13.2.0",
|
||||
"mdast-util-to-hast": "^13.2.1",
|
||||
"mdast-util-to-string": "^4.0.0",
|
||||
"micromorph": "^0.4.5",
|
||||
"minimatch": "^10.0.3",
|
||||
"onnxruntime-web": "^1.23.0",
|
||||
"pixi.js": "^8.13.2",
|
||||
"preact": "^10.27.2",
|
||||
"preact-render-to-string": "^6.6.1",
|
||||
"minimatch": "^10.1.1",
|
||||
"pixi.js": "^8.15.0",
|
||||
"preact": "^10.28.2",
|
||||
"preact-render-to-string": "^6.6.5",
|
||||
"pretty-bytes": "^7.1.0",
|
||||
"pretty-time": "^1.1.0",
|
||||
"reading-time": "^1.5.0",
|
||||
@@ -85,32 +86,32 @@
|
||||
"remark-rehype": "^11.1.2",
|
||||
"remark-smartypants": "^3.0.2",
|
||||
"rfdc": "^1.4.1",
|
||||
"satori": "^0.18.3",
|
||||
"satori": "^0.19.1",
|
||||
"serve-handler": "^6.1.6",
|
||||
"sharp": "^0.34.4",
|
||||
"sharp": "^0.34.5",
|
||||
"shiki": "^1.26.2",
|
||||
"source-map-support": "^0.5.21",
|
||||
"to-vfile": "^8.0.0",
|
||||
"toml": "^3.0.0",
|
||||
"unified": "^11.0.5",
|
||||
"unist-util-visit": "^5.0.0",
|
||||
"unist-util-visit": "^5.1.0",
|
||||
"vfile": "^6.0.3",
|
||||
"workerpool": "^9.3.4",
|
||||
"ws": "^8.18.3",
|
||||
"workerpool": "^10.0.1",
|
||||
"ws": "^8.19.0",
|
||||
"yargs": "^18.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/d3": "^7.4.3",
|
||||
"@types/hast": "^3.0.4",
|
||||
"@types/js-yaml": "^4.0.9",
|
||||
"@types/node": "^24.6.0",
|
||||
"@types/node": "^25.0.10",
|
||||
"@types/pretty-time": "^1.1.5",
|
||||
"@types/source-map-support": "^0.5.10",
|
||||
"@types/ws": "^8.18.1",
|
||||
"@types/yargs": "^17.0.33",
|
||||
"esbuild": "^0.25.10",
|
||||
"prettier": "^3.6.2",
|
||||
"tsx": "^4.20.6",
|
||||
"typescript": "^5.9.2"
|
||||
"@types/yargs": "^17.0.35",
|
||||
"esbuild": "^0.27.2",
|
||||
"prettier": "^3.8.1",
|
||||
"tsx": "^4.21.0",
|
||||
"typescript": "^5.9.3"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,18 +1,6 @@
|
||||
import { GlobalConfiguration, QuartzConfig } from "./quartz/cfg"
|
||||
import { QuartzConfig } from "./quartz/cfg"
|
||||
import * as Plugin from "./quartz/plugins"
|
||||
|
||||
const semanticSearch: GlobalConfiguration["semanticSearch"] = {
|
||||
enable: true,
|
||||
model: "onnx-community/embeddinggemma-300m-ONNX",
|
||||
aot: true,
|
||||
dims: 768,
|
||||
dtype: "fp32",
|
||||
shardSizeRows: 1024,
|
||||
hnsw: { M: 16, efConstruction: 200 },
|
||||
chunking: { chunkSize: 256, chunkOverlap: 64 },
|
||||
vllm: { enable: true, concurrency: 16, batchSize: 128 },
|
||||
}
|
||||
|
||||
/**
|
||||
* Quartz 4 Configuration
|
||||
*
|
||||
@@ -64,11 +52,10 @@ const config: QuartzConfig = {
|
||||
},
|
||||
},
|
||||
},
|
||||
semanticSearch,
|
||||
},
|
||||
plugins: {
|
||||
transformers: [
|
||||
Plugin.FrontMatter(),
|
||||
Plugin.FrontMatter({ delimiters: "+++", language: "toml" }),
|
||||
Plugin.CreatedModifiedDate({
|
||||
priority: ["frontmatter", "git", "filesystem"],
|
||||
}),
|
||||
@@ -79,7 +66,11 @@ const config: QuartzConfig = {
|
||||
},
|
||||
keepBackground: false,
|
||||
}),
|
||||
Plugin.ObsidianFlavoredMarkdown({ enableInHtmlEmbed: false }),
|
||||
// OxHugoFlavouredMarkdown must come before GitHubFlavoredMarkdown.
|
||||
// Note: not compatible with ObsidianFlavoredMarkdown — use one or the other.
|
||||
// If ox-hugo exports TOML frontmatter, change FrontMatter to:
|
||||
// Plugin.FrontMatter({ delims: "+++", language: "toml" })
|
||||
Plugin.OxHugoFlavouredMarkdown(),
|
||||
Plugin.GitHubFlavoredMarkdown(),
|
||||
Plugin.TableOfContents(),
|
||||
Plugin.CrawlLinks({ markdownLinkResolution: "shortest" }),
|
||||
@@ -97,7 +88,6 @@ const config: QuartzConfig = {
|
||||
enableSiteMap: true,
|
||||
enableRSS: true,
|
||||
}),
|
||||
Plugin.SemanticIndex(semanticSearch),
|
||||
Plugin.Assets(),
|
||||
Plugin.Static(),
|
||||
Plugin.Favicon(),
|
||||
|
||||
@@ -71,7 +71,7 @@ async function buildQuartz(argv: Argv, mut: Mutex, clientRefresh: () => void) {
|
||||
console.log(`Cleaned output directory \`${output}\` in ${perf.timeSince("clean")}`)
|
||||
|
||||
perf.addEvent("glob")
|
||||
const allFiles = await glob("**/*.*", argv.directory, cfg.configuration.ignorePatterns)
|
||||
const allFiles = await glob("**/*.*", argv.directory, cfg.configuration.ignorePatterns, false)
|
||||
const markdownPaths = allFiles.filter((fp) => fp.endsWith(".md")).sort()
|
||||
console.log(
|
||||
`Found ${markdownPaths.length} input files from \`${argv.directory}\` in ${perf.timeSince("glob")}`,
|
||||
@@ -143,6 +143,7 @@ async function startWatching(
|
||||
}
|
||||
|
||||
const watcher = chokidar.watch(".", {
|
||||
awaitWriteFinish: { stabilityThreshold: 250 },
|
||||
persistent: true,
|
||||
cwd: argv.directory,
|
||||
ignoreInitial: true,
|
||||
|
||||
@@ -50,6 +50,11 @@ export type Analytics =
|
||||
| {
|
||||
provider: "vercel"
|
||||
}
|
||||
| {
|
||||
provider: "rybbit"
|
||||
siteId: string
|
||||
host?: string
|
||||
}
|
||||
|
||||
export interface GlobalConfiguration {
|
||||
pageTitle: string
|
||||
@@ -78,34 +83,6 @@ export interface GlobalConfiguration {
|
||||
* Region Codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
|
||||
*/
|
||||
locale: ValidLocale
|
||||
/** Semantic search configuration */
|
||||
semanticSearch?: {
|
||||
enable: boolean
|
||||
model: string
|
||||
aot: boolean
|
||||
dtype: "fp32" | "fp16"
|
||||
dims: number
|
||||
shardSizeRows: number
|
||||
manifestUrl?: string
|
||||
manifestBaseUrl?: string
|
||||
disableCache?: boolean
|
||||
hnsw: {
|
||||
M: number
|
||||
efConstruction: number
|
||||
efSearch?: number
|
||||
}
|
||||
chunking: {
|
||||
chunkSize: number
|
||||
chunkOverlap: number
|
||||
noChunking?: boolean
|
||||
}
|
||||
vllm?: {
|
||||
enable: boolean
|
||||
vllmUrl?: string
|
||||
concurrency: number
|
||||
batchSize: number
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export interface QuartzConfig {
|
||||
|
||||
@@ -7,8 +7,8 @@ import fs from "fs"
|
||||
export function escapePath(fp) {
|
||||
return fp
|
||||
.replace(/\\ /g, " ") // unescape spaces
|
||||
.replace(/^".*"$/, "$1")
|
||||
.replace(/^'.*"$/, "$1")
|
||||
.replace(/^"(.*)"$/, "$1")
|
||||
.replace(/^'(.*)'$/, "$1")
|
||||
.trim()
|
||||
}
|
||||
|
||||
|
||||
@@ -7,12 +7,10 @@ import { i18n } from "../i18n"
|
||||
|
||||
export interface SearchOptions {
|
||||
enablePreview: boolean
|
||||
includeButton: boolean
|
||||
}
|
||||
|
||||
const defaultOptions: SearchOptions = {
|
||||
enablePreview: true,
|
||||
includeButton: true,
|
||||
}
|
||||
|
||||
export default ((userOpts?: Partial<SearchOptions>) => {
|
||||
@@ -31,54 +29,19 @@ export default ((userOpts?: Partial<SearchOptions>) => {
|
||||
</svg>
|
||||
<p>{i18n(cfg.locale).components.search.title}</p>
|
||||
</button>
|
||||
<search class="search-container">
|
||||
<form class="search-space">
|
||||
<div class="input-container">
|
||||
<input
|
||||
autocomplete="off"
|
||||
class="search-bar"
|
||||
name="search"
|
||||
type="text"
|
||||
aria-label={searchPlaceholder}
|
||||
placeholder={searchPlaceholder}
|
||||
/>
|
||||
<div class="search-mode-toggle" role="radiogroup" aria-label="Search mode">
|
||||
<button
|
||||
type="button"
|
||||
class="mode-option"
|
||||
data-mode="lexical"
|
||||
aria-pressed="true"
|
||||
aria-label="Full-text search"
|
||||
>
|
||||
<svg viewBox="0 0 20 20" role="img" aria-hidden="true">
|
||||
<g fill="none" stroke="currentColor" stroke-width="1.5" stroke-linecap="round">
|
||||
<path d="M4 6h12M4 10h8M4 14h6" />
|
||||
</g>
|
||||
</svg>
|
||||
<span class="sr-only">Full-text</span>
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
class="mode-option"
|
||||
data-mode="semantic"
|
||||
aria-pressed="false"
|
||||
aria-label="Semantic search"
|
||||
>
|
||||
<svg viewBox="0 0 20 20" role="img" aria-hidden="true">
|
||||
<g fill="none" stroke="currentColor" stroke-width="1.5" stroke-linecap="round">
|
||||
<circle cx="5.2" cy="10" r="2.4" />
|
||||
<circle cx="14.8" cy="4.8" r="2.1" />
|
||||
<circle cx="14.8" cy="15.2" r="2.1" />
|
||||
<path d="M7.1 8.7l5.2-2.4M7.1 11.3l5.2 2.4M14.8 6.9v6.2" />
|
||||
</g>
|
||||
</svg>
|
||||
<span class="sr-only">Semantic</span>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<output class="search-layout" data-preview={opts.enablePreview} />
|
||||
</form>
|
||||
</search>
|
||||
<div class="search-container">
|
||||
<div class="search-space">
|
||||
<input
|
||||
autocomplete="off"
|
||||
class="search-bar"
|
||||
name="search"
|
||||
type="text"
|
||||
aria-label={searchPlaceholder}
|
||||
placeholder={searchPlaceholder}
|
||||
/>
|
||||
<div class="search-layout" data-preview={opts.enablePreview}></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ import { visit } from "unist-util-visit"
|
||||
import { Root, Element, ElementContent } from "hast"
|
||||
import { GlobalConfiguration } from "../cfg"
|
||||
import { i18n } from "../i18n"
|
||||
import { styleText } from "util"
|
||||
|
||||
interface RenderComponents {
|
||||
head: QuartzComponent
|
||||
@@ -25,7 +26,6 @@ const headerRegex = new RegExp(/h[1-6]/)
|
||||
export function pageResources(
|
||||
baseDir: FullSlug | RelativeURL,
|
||||
staticResources: StaticResources,
|
||||
cfg?: GlobalConfiguration,
|
||||
): StaticResources {
|
||||
const contentIndexPath = joinSegments(baseDir, "static/contentIndex.json")
|
||||
const contentIndexScript = `const fetchData = fetch("${contentIndexPath}").then(data => data.json())`
|
||||
@@ -49,12 +49,6 @@ export function pageResources(
|
||||
spaPreserve: true,
|
||||
script: contentIndexScript,
|
||||
},
|
||||
{
|
||||
loadTime: "beforeDOMReady",
|
||||
contentType: "inline",
|
||||
spaPreserve: true,
|
||||
script: `const semanticCfg = ${JSON.stringify(cfg?.semanticSearch ?? {})};`,
|
||||
},
|
||||
...staticResources.js,
|
||||
],
|
||||
additionalHead: staticResources.additionalHead,
|
||||
@@ -75,6 +69,7 @@ function renderTranscludes(
|
||||
cfg: GlobalConfiguration,
|
||||
slug: FullSlug,
|
||||
componentData: QuartzComponentProps,
|
||||
visited: Set<FullSlug>,
|
||||
) {
|
||||
// process transcludes in componentData
|
||||
visit(root, "element", (node, _index, _parent) => {
|
||||
@@ -83,6 +78,30 @@ function renderTranscludes(
|
||||
if (classNames.includes("transclude")) {
|
||||
const inner = node.children[0] as Element
|
||||
const transcludeTarget = (inner.properties["data-slug"] ?? slug) as FullSlug
|
||||
if (visited.has(transcludeTarget)) {
|
||||
console.warn(
|
||||
styleText(
|
||||
"yellow",
|
||||
`Warning: Skipping circular transclusion: ${slug} -> ${transcludeTarget}`,
|
||||
),
|
||||
)
|
||||
node.children = [
|
||||
{
|
||||
type: "element",
|
||||
tagName: "p",
|
||||
properties: { style: "color: var(--secondary);" },
|
||||
children: [
|
||||
{
|
||||
type: "text",
|
||||
value: `Circular transclusion detected: ${transcludeTarget}`,
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
return
|
||||
}
|
||||
visited.add(transcludeTarget)
|
||||
|
||||
const page = componentData.allFiles.find((f) => f.slug === transcludeTarget)
|
||||
if (!page) {
|
||||
return
|
||||
@@ -203,7 +222,8 @@ export function renderPage(
|
||||
// make a deep copy of the tree so we don't remove the transclusion references
|
||||
// for the file cached in contentMap in build.ts
|
||||
const root = clone(componentData.tree) as Root
|
||||
renderTranscludes(root, cfg, slug, componentData)
|
||||
const visited = new Set<FullSlug>([slug])
|
||||
renderTranscludes(root, cfg, slug, componentData, visited)
|
||||
|
||||
// set componentData.tree to the edited html that has transclusions rendered
|
||||
componentData.tree = root
|
||||
@@ -274,7 +294,7 @@ export function renderPage(
|
||||
</body>
|
||||
{pageResources.js
|
||||
.filter((resource) => resource.loadTime === "afterDOMReady")
|
||||
.map((res) => JSResourceToScriptElement(res))}
|
||||
.map((res) => JSResourceToScriptElement(res, true))}
|
||||
</html>
|
||||
)
|
||||
|
||||
|
||||
@@ -111,6 +111,10 @@ function createFolderNode(
|
||||
const folderPath = node.slug
|
||||
folderContainer.dataset.folderpath = folderPath
|
||||
|
||||
if (currentSlug === folderPath) {
|
||||
folderContainer.classList.add("active")
|
||||
}
|
||||
|
||||
if (opts.folderClickBehavior === "link") {
|
||||
// Replace button with link for link behavior
|
||||
const button = titleContainer.querySelector(".folder-button") as HTMLElement
|
||||
|
||||
@@ -29,17 +29,31 @@ class DiagramPanZoom {
|
||||
const mouseDownHandler = this.onMouseDown.bind(this)
|
||||
const mouseMoveHandler = this.onMouseMove.bind(this)
|
||||
const mouseUpHandler = this.onMouseUp.bind(this)
|
||||
|
||||
// Touch drag events
|
||||
const touchStartHandler = this.onTouchStart.bind(this)
|
||||
const touchMoveHandler = this.onTouchMove.bind(this)
|
||||
const touchEndHandler = this.onTouchEnd.bind(this)
|
||||
|
||||
const resizeHandler = this.resetTransform.bind(this)
|
||||
|
||||
this.container.addEventListener("mousedown", mouseDownHandler)
|
||||
document.addEventListener("mousemove", mouseMoveHandler)
|
||||
document.addEventListener("mouseup", mouseUpHandler)
|
||||
|
||||
this.container.addEventListener("touchstart", touchStartHandler, { passive: false })
|
||||
document.addEventListener("touchmove", touchMoveHandler, { passive: false })
|
||||
document.addEventListener("touchend", touchEndHandler)
|
||||
|
||||
window.addEventListener("resize", resizeHandler)
|
||||
|
||||
this.cleanups.push(
|
||||
() => this.container.removeEventListener("mousedown", mouseDownHandler),
|
||||
() => document.removeEventListener("mousemove", mouseMoveHandler),
|
||||
() => document.removeEventListener("mouseup", mouseUpHandler),
|
||||
() => this.container.removeEventListener("touchstart", touchStartHandler),
|
||||
() => document.removeEventListener("touchmove", touchMoveHandler),
|
||||
() => document.removeEventListener("touchend", touchEndHandler),
|
||||
() => window.removeEventListener("resize", resizeHandler),
|
||||
)
|
||||
}
|
||||
@@ -99,6 +113,30 @@ class DiagramPanZoom {
|
||||
this.container.style.cursor = "grab"
|
||||
}
|
||||
|
||||
private onTouchStart(e: TouchEvent) {
|
||||
if (e.touches.length !== 1) return
|
||||
this.isDragging = true
|
||||
const touch = e.touches[0]
|
||||
this.startPan = { x: touch.clientX - this.currentPan.x, y: touch.clientY - this.currentPan.y }
|
||||
}
|
||||
|
||||
private onTouchMove(e: TouchEvent) {
|
||||
if (!this.isDragging || e.touches.length !== 1) return
|
||||
e.preventDefault() // Prevent scrolling
|
||||
|
||||
const touch = e.touches[0]
|
||||
this.currentPan = {
|
||||
x: touch.clientX - this.startPan.x,
|
||||
y: touch.clientY - this.startPan.y,
|
||||
}
|
||||
|
||||
this.updateTransform()
|
||||
}
|
||||
|
||||
private onTouchEnd() {
|
||||
this.isDragging = false
|
||||
}
|
||||
|
||||
private zoom(delta: number) {
|
||||
const newScale = Math.min(Math.max(this.scale + delta, this.MIN_SCALE), this.MAX_SCALE)
|
||||
|
||||
@@ -120,11 +158,15 @@ class DiagramPanZoom {
|
||||
}
|
||||
|
||||
private resetTransform() {
|
||||
this.scale = 1
|
||||
const svg = this.content.querySelector("svg")!
|
||||
const rect = svg.getBoundingClientRect()
|
||||
const width = rect.width / this.scale
|
||||
const height = rect.height / this.scale
|
||||
|
||||
this.scale = 1
|
||||
this.currentPan = {
|
||||
x: svg.getBoundingClientRect().width / 2,
|
||||
y: svg.getBoundingClientRect().height / 2,
|
||||
x: (this.container.clientWidth - width) / 2,
|
||||
y: (this.container.clientHeight - height) / 2,
|
||||
}
|
||||
this.updateTransform()
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import FlexSearch, { DefaultDocumentSearchResults, Id } from "flexsearch"
|
||||
import FlexSearch, { DefaultDocumentSearchResults } from "flexsearch"
|
||||
import { ContentDetails } from "../../plugins/emitters/contentIndex"
|
||||
import { SemanticClient, type SemanticResult } from "./semantic.inline"
|
||||
import { registerEscapeHandler, removeAllChildren, fetchCanonical } from "./util"
|
||||
import { registerEscapeHandler, removeAllChildren } from "./util"
|
||||
import { FullSlug, normalizeRelativeURLs, resolveRelative } from "../../util/path"
|
||||
|
||||
interface Item {
|
||||
@@ -15,46 +14,81 @@ interface Item {
|
||||
|
||||
// Can be expanded with things like "term" in the future
|
||||
type SearchType = "basic" | "tags"
|
||||
type SearchMode = "lexical" | "semantic"
|
||||
const SEARCH_MODE_STORAGE_KEY = "quartz:search:mode"
|
||||
|
||||
const loadStoredSearchMode = (): SearchMode | null => {
|
||||
if (typeof window === "undefined") {
|
||||
return null
|
||||
}
|
||||
|
||||
try {
|
||||
const stored = window.localStorage.getItem(SEARCH_MODE_STORAGE_KEY)
|
||||
return stored === "lexical" || stored === "semantic" ? stored : null
|
||||
} catch (err) {
|
||||
console.warn("[Search] failed to read stored search mode:", err)
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
const persistSearchMode = (mode: SearchMode) => {
|
||||
if (typeof window === "undefined") {
|
||||
return
|
||||
}
|
||||
|
||||
try {
|
||||
window.localStorage.setItem(SEARCH_MODE_STORAGE_KEY, mode)
|
||||
} catch (err) {
|
||||
console.warn("[Search] failed to persist search mode:", err)
|
||||
}
|
||||
}
|
||||
|
||||
let searchMode: SearchMode = "lexical"
|
||||
let searchType: SearchType = "basic"
|
||||
let currentSearchTerm: string = ""
|
||||
let rawSearchTerm: string = ""
|
||||
let semantic: SemanticClient | null = null
|
||||
let semanticReady = false
|
||||
let semanticInitFailed = false
|
||||
type SimilarityResult = { item: Item; similarity: number }
|
||||
let chunkMetadata: Record<string, { parentSlug: string; chunkId: number }> = {}
|
||||
let manifestIds: string[] = []
|
||||
const encoder = (str: string): string[] => {
|
||||
const tokens: string[] = []
|
||||
let bufferStart = -1
|
||||
let bufferEnd = -1
|
||||
const lower = str.toLowerCase()
|
||||
|
||||
let i = 0
|
||||
for (const char of lower) {
|
||||
const code = char.codePointAt(0)!
|
||||
|
||||
const isCJK =
|
||||
(code >= 0x3040 && code <= 0x309f) ||
|
||||
(code >= 0x30a0 && code <= 0x30ff) ||
|
||||
(code >= 0x4e00 && code <= 0x9fff) ||
|
||||
(code >= 0xac00 && code <= 0xd7af) ||
|
||||
(code >= 0x20000 && code <= 0x2a6df)
|
||||
|
||||
const isWhitespace = code === 32 || code === 9 || code === 10 || code === 13
|
||||
|
||||
if (isCJK) {
|
||||
if (bufferStart !== -1) {
|
||||
tokens.push(lower.slice(bufferStart, bufferEnd))
|
||||
bufferStart = -1
|
||||
}
|
||||
tokens.push(char)
|
||||
} else if (isWhitespace) {
|
||||
if (bufferStart !== -1) {
|
||||
tokens.push(lower.slice(bufferStart, bufferEnd))
|
||||
bufferStart = -1
|
||||
}
|
||||
} else {
|
||||
if (bufferStart === -1) bufferStart = i
|
||||
bufferEnd = i + char.length
|
||||
}
|
||||
|
||||
i += char.length
|
||||
}
|
||||
|
||||
if (bufferStart !== -1) {
|
||||
tokens.push(lower.slice(bufferStart))
|
||||
}
|
||||
|
||||
return tokens
|
||||
}
|
||||
|
||||
let index = new FlexSearch.Document<Item>({
|
||||
encode: encoder,
|
||||
document: {
|
||||
id: "id",
|
||||
tag: "tags",
|
||||
index: [
|
||||
{
|
||||
field: "title",
|
||||
tokenize: "forward",
|
||||
},
|
||||
{
|
||||
field: "content",
|
||||
tokenize: "forward",
|
||||
},
|
||||
{
|
||||
field: "tags",
|
||||
tokenize: "forward",
|
||||
},
|
||||
],
|
||||
},
|
||||
})
|
||||
|
||||
const p = new DOMParser()
|
||||
const fetchContentCache: Map<FullSlug, Element[]> = new Map()
|
||||
const contextWindowWords = 30
|
||||
const numSearchResults = 8
|
||||
const numTagResults = 5
|
||||
|
||||
const tokenizeTerm = (term: string) => {
|
||||
const tokens = term.split(/\s+/).filter((t) => t.trim() !== "")
|
||||
const tokenLen = tokens.length
|
||||
@@ -112,102 +146,6 @@ function highlight(searchTerm: string, text: string, trim?: boolean) {
|
||||
}`
|
||||
}
|
||||
|
||||
// To be used with search and everything else with flexsearch
|
||||
const encoder = (str: string) =>
|
||||
str
|
||||
.toLowerCase()
|
||||
.split(/\s+/)
|
||||
.filter((token) => token.length > 0)
|
||||
|
||||
/**
|
||||
* Get parent document slug for a chunk ID
|
||||
*/
|
||||
function getParentSlug(slug: string): string {
|
||||
const meta = chunkMetadata[slug]
|
||||
return meta ? meta.parentSlug : slug
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregate semantic search results from chunks to documents using RRF
|
||||
* @param results Raw semantic results (chunk-level)
|
||||
* @param slugToDocIndex Map from document slug to index in idDataMap
|
||||
* @returns Object with rrfScores (for ranking) and maxScores (for display)
|
||||
*/
|
||||
function aggregateChunkResults(
|
||||
results: SemanticResult[],
|
||||
slugToDocIndex: Map<FullSlug, number>,
|
||||
): { rrfScores: Map<number, number>; maxScores: Map<number, number> } {
|
||||
// Group chunks by parent document
|
||||
const docChunks = new Map<string, Array<{ score: number }>>()
|
||||
|
||||
results.forEach(({ id, score }) => {
|
||||
// id is an index into manifestIds (the chunk IDs from embeddings)
|
||||
const chunkSlug = manifestIds[id]
|
||||
if (!chunkSlug) return
|
||||
|
||||
// Get parent document slug
|
||||
const parentSlug = getParentSlug(chunkSlug)
|
||||
|
||||
if (!docChunks.has(parentSlug)) {
|
||||
docChunks.set(parentSlug, [])
|
||||
}
|
||||
|
||||
docChunks.get(parentSlug)!.push({ score })
|
||||
})
|
||||
|
||||
// Apply RRF for ranking and track max similarity for display
|
||||
const rrfScores = new Map<number, number>()
|
||||
const maxScores = new Map<number, number>()
|
||||
const RRF_K = 60
|
||||
|
||||
for (const [parentSlug, chunks] of docChunks) {
|
||||
const docIdx = slugToDocIndex.get(parentSlug as FullSlug)
|
||||
if (typeof docIdx !== "number") continue
|
||||
|
||||
// Sort chunks by score descending to assign per-document ranks
|
||||
chunks.sort((a, b) => b.score - a.score)
|
||||
|
||||
// RRF formula: sum(1 / (k + rank)) across all chunks, using per-document ranks
|
||||
const rrfScore = chunks.reduce((sum, _, rank) => sum + 1.0 / (RRF_K + rank), 0)
|
||||
|
||||
// Max similarity score for display (original 0-1 range)
|
||||
const maxScore = chunks[0].score
|
||||
|
||||
rrfScores.set(docIdx, rrfScore)
|
||||
maxScores.set(docIdx, maxScore)
|
||||
}
|
||||
|
||||
return { rrfScores, maxScores }
|
||||
}
|
||||
|
||||
// Initialize the FlexSearch Document instance with the appropriate configuration
|
||||
const index = new FlexSearch.Document<Item>({
|
||||
tokenize: "forward",
|
||||
encode: encoder,
|
||||
document: {
|
||||
id: "id",
|
||||
tag: "tags",
|
||||
index: [
|
||||
{
|
||||
field: "title",
|
||||
tokenize: "forward",
|
||||
},
|
||||
{
|
||||
field: "content",
|
||||
tokenize: "forward",
|
||||
},
|
||||
{
|
||||
field: "tags",
|
||||
tokenize: "forward",
|
||||
},
|
||||
],
|
||||
},
|
||||
})
|
||||
|
||||
const p = new DOMParser()
|
||||
const fetchContentCache: Map<FullSlug, Element[]> = new Map()
|
||||
const numSearchResults = 10
|
||||
const numTagResults = 10
|
||||
function highlightHTML(searchTerm: string, el: HTMLElement) {
|
||||
const p = new DOMParser()
|
||||
const tokenizedTerms = tokenizeTerm(searchTerm)
|
||||
@@ -249,11 +187,7 @@ function highlightHTML(searchTerm: string, el: HTMLElement) {
|
||||
return html.body
|
||||
}
|
||||
|
||||
async function setupSearch(
|
||||
searchElement: HTMLDivElement,
|
||||
currentSlug: FullSlug,
|
||||
data: ContentIndex,
|
||||
) {
|
||||
async function setupSearch(searchElement: Element, currentSlug: FullSlug, data: ContentIndex) {
|
||||
const container = searchElement.querySelector(".search-container") as HTMLElement
|
||||
if (!container) return
|
||||
|
||||
@@ -268,183 +202,12 @@ async function setupSearch(
|
||||
const searchLayout = searchElement.querySelector(".search-layout") as HTMLElement
|
||||
if (!searchLayout) return
|
||||
|
||||
const searchSpace = searchElement?.querySelector(".search-space") as HTMLFormElement
|
||||
if (!searchSpace) return
|
||||
|
||||
// Create semantic search progress bar
|
||||
const progressBar = document.createElement("div")
|
||||
progressBar.className = "semantic-search-progress"
|
||||
progressBar.style.cssText = `
|
||||
position: absolute;
|
||||
bottom: 0;
|
||||
left: 0;
|
||||
height: 2px;
|
||||
width: 0;
|
||||
background: var(--secondary);
|
||||
transition: width 0.3s ease, opacity 0.3s ease;
|
||||
opacity: 0;
|
||||
z-index: 9999;
|
||||
`
|
||||
searchBar.parentElement?.appendChild(progressBar)
|
||||
|
||||
const startSemanticProgress = () => {
|
||||
progressBar.style.opacity = "1"
|
||||
progressBar.style.width = "0"
|
||||
setTimeout(() => {
|
||||
progressBar.style.width = "100%"
|
||||
}, 10)
|
||||
}
|
||||
|
||||
const completeSemanticProgress = () => {
|
||||
progressBar.style.opacity = "0"
|
||||
setTimeout(() => {
|
||||
progressBar.style.width = "0"
|
||||
}, 300)
|
||||
}
|
||||
|
||||
const resetProgressBar = () => {
|
||||
progressBar.style.opacity = "0"
|
||||
progressBar.style.width = "0"
|
||||
}
|
||||
|
||||
const idDataMap = Object.keys(data) as FullSlug[]
|
||||
const slugToIndex = new Map<FullSlug, number>()
|
||||
idDataMap.forEach((slug, idx) => slugToIndex.set(slug, idx))
|
||||
const modeToggle = searchSpace.querySelector(".search-mode-toggle") as HTMLDivElement | null
|
||||
const modeButtons = modeToggle
|
||||
? Array.from(modeToggle.querySelectorAll<HTMLButtonElement>(".mode-option"))
|
||||
: []
|
||||
|
||||
const appendLayout = (el: HTMLElement) => {
|
||||
searchLayout.appendChild(el)
|
||||
}
|
||||
|
||||
const enablePreview = searchLayout.dataset.preview === "true"
|
||||
if (!semantic && !semanticInitFailed) {
|
||||
const client = new SemanticClient(semanticCfg)
|
||||
try {
|
||||
await client.ensureReady()
|
||||
semantic = client
|
||||
semanticReady = true
|
||||
|
||||
// Load chunk metadata and IDs from manifest
|
||||
try {
|
||||
const manifestUrl = "/embeddings/manifest.json"
|
||||
const res = await fetch(manifestUrl)
|
||||
if (res.ok) {
|
||||
const manifest = await res.json()
|
||||
chunkMetadata = manifest.chunkMetadata || {}
|
||||
manifestIds = manifest.ids || []
|
||||
console.debug(
|
||||
`[Search] Loaded manifest: ${manifestIds.length} chunks, ${Object.keys(chunkMetadata).length} chunked documents`,
|
||||
)
|
||||
}
|
||||
} catch (err) {
|
||||
console.warn("[Search] failed to load chunk metadata:", err)
|
||||
chunkMetadata = {}
|
||||
manifestIds = []
|
||||
}
|
||||
} catch (err) {
|
||||
console.warn("[SemanticClient] initialization failed:", err)
|
||||
client.dispose()
|
||||
semantic = null
|
||||
semanticReady = false
|
||||
semanticInitFailed = true
|
||||
}
|
||||
} else if (semantic && !semanticReady) {
|
||||
try {
|
||||
await semantic.ensureReady()
|
||||
semanticReady = true
|
||||
} catch (err) {
|
||||
console.warn("[SemanticClient] became unavailable:", err)
|
||||
semantic.dispose()
|
||||
semantic = null
|
||||
semanticReady = false
|
||||
semanticInitFailed = true
|
||||
}
|
||||
}
|
||||
const storedMode = loadStoredSearchMode()
|
||||
if (storedMode === "semantic") {
|
||||
if (semanticReady) {
|
||||
searchMode = storedMode
|
||||
}
|
||||
} else if (storedMode === "lexical") {
|
||||
searchMode = storedMode
|
||||
}
|
||||
if (!semanticReady && searchMode === "semantic") {
|
||||
searchMode = "lexical"
|
||||
}
|
||||
let searchSeq = 0
|
||||
let runSearchTimer: number | null = null
|
||||
let lastInputAt = 0
|
||||
searchLayout.dataset.mode = searchMode
|
||||
|
||||
const updateModeUI = (mode: SearchMode) => {
|
||||
modeButtons.forEach((button) => {
|
||||
const btnMode = (button.dataset.mode as SearchMode) ?? "lexical"
|
||||
const isActive = btnMode === mode
|
||||
button.classList.toggle("active", isActive)
|
||||
button.setAttribute("aria-pressed", String(isActive))
|
||||
})
|
||||
if (modeToggle) {
|
||||
modeToggle.dataset.mode = mode
|
||||
}
|
||||
searchLayout.dataset.mode = mode
|
||||
}
|
||||
|
||||
const computeDebounceDelay = (term: string): number => {
|
||||
const trimmed = term.trim()
|
||||
const lastTerm = currentSearchTerm
|
||||
const isExtension =
|
||||
lastTerm.length > 0 && trimmed.length > lastTerm.length && trimmed.startsWith(lastTerm)
|
||||
const isRetraction = lastTerm.length > trimmed.length
|
||||
const isReplacement =
|
||||
lastTerm.length > 0 && !trimmed.startsWith(lastTerm) && !lastTerm.startsWith(trimmed)
|
||||
const baseFullQueryDelay = 200
|
||||
const semanticPenalty = searchMode === "semantic" ? 60 : 0
|
||||
|
||||
if (isExtension && trimmed.length > 2) {
|
||||
return baseFullQueryDelay + semanticPenalty
|
||||
}
|
||||
|
||||
if (isReplacement && trimmed.length > 3) {
|
||||
return Math.max(90, baseFullQueryDelay - 80)
|
||||
}
|
||||
|
||||
if (isRetraction) {
|
||||
return 90
|
||||
}
|
||||
|
||||
return baseFullQueryDelay + (searchMode === "semantic" ? 40 : 0)
|
||||
}
|
||||
|
||||
const triggerSearchWithMode = (mode: SearchMode) => {
|
||||
if (mode === "semantic" && !semanticReady) {
|
||||
return
|
||||
}
|
||||
if (searchMode === mode) return
|
||||
searchMode = mode
|
||||
updateModeUI(mode)
|
||||
persistSearchMode(searchMode)
|
||||
if (rawSearchTerm.trim() !== "") {
|
||||
searchLayout.classList.add("display-results")
|
||||
const token = ++searchSeq
|
||||
void runSearch(rawSearchTerm, token)
|
||||
}
|
||||
}
|
||||
|
||||
updateModeUI(searchMode)
|
||||
|
||||
modeButtons.forEach((button) => {
|
||||
const btnMode = (button.dataset.mode as SearchMode) ?? "lexical"
|
||||
if (btnMode === "semantic") {
|
||||
button.disabled = !semanticReady
|
||||
button.setAttribute("aria-disabled", String(!semanticReady))
|
||||
}
|
||||
const handler = () => triggerSearchWithMode(btnMode)
|
||||
button.addEventListener("click", handler)
|
||||
window.addCleanup(() => button.removeEventListener("click", handler))
|
||||
})
|
||||
let preview: HTMLDivElement | undefined = undefined
|
||||
let previewInner: HTMLDivElement | undefined = undefined
|
||||
const results = document.createElement("div")
|
||||
@@ -466,23 +229,20 @@ async function setupSearch(
|
||||
removeAllChildren(preview)
|
||||
}
|
||||
searchLayout.classList.remove("display-results")
|
||||
searchType = "basic" // reset search type after closing
|
||||
searchButton.focus()
|
||||
resetProgressBar()
|
||||
}
|
||||
|
||||
function showSearch(type: SearchType) {
|
||||
function showSearch(searchTypeNew: SearchType) {
|
||||
searchType = searchTypeNew
|
||||
if (sidebar) sidebar.style.zIndex = "1"
|
||||
container.classList.add("active")
|
||||
if (type === "tags") {
|
||||
searchBar.value = "#"
|
||||
rawSearchTerm = "#"
|
||||
}
|
||||
searchBar.focus()
|
||||
}
|
||||
|
||||
let currentHover: HTMLInputElement | null = null
|
||||
|
||||
async function shortcutHandler(e: HTMLElementEventMap["keydown"]) {
|
||||
if ((e.key === "/" || e.key === "k") && (e.ctrlKey || e.metaKey) && !e.shiftKey) {
|
||||
if (e.key === "k" && (e.ctrlKey || e.metaKey) && !e.shiftKey) {
|
||||
e.preventDefault()
|
||||
const searchBarOpen = container.classList.contains("active")
|
||||
searchBarOpen ? hideSearch() : showSearch("basic")
|
||||
@@ -492,6 +252,9 @@ async function setupSearch(
|
||||
e.preventDefault()
|
||||
const searchBarOpen = container.classList.contains("active")
|
||||
searchBarOpen ? hideSearch() : showSearch("tags")
|
||||
|
||||
// add "#" prefix for tag search
|
||||
searchBar.value = "#"
|
||||
return
|
||||
}
|
||||
|
||||
@@ -501,29 +264,20 @@ async function setupSearch(
|
||||
|
||||
// If search is active, then we will render the first result and display accordingly
|
||||
if (!container.classList.contains("active")) return
|
||||
if (e.key === "Enter") {
|
||||
if (e.key === "Enter" && !e.isComposing) {
|
||||
// If result has focus, navigate to that one, otherwise pick first result
|
||||
let anchor: HTMLAnchorElement | undefined
|
||||
if (results.contains(document.activeElement)) {
|
||||
anchor = document.activeElement as HTMLAnchorElement
|
||||
if (anchor.classList.contains("no-match")) return
|
||||
await displayPreview(anchor)
|
||||
e.preventDefault()
|
||||
anchor.click()
|
||||
const active = document.activeElement as HTMLInputElement
|
||||
if (active.classList.contains("no-match")) return
|
||||
await displayPreview(active)
|
||||
active.click()
|
||||
} else {
|
||||
anchor = document.getElementsByClassName("result-card")[0] as HTMLAnchorElement
|
||||
const anchor = document.getElementsByClassName("result-card")[0] as HTMLInputElement | null
|
||||
if (!anchor || anchor.classList.contains("no-match")) return
|
||||
await displayPreview(anchor)
|
||||
e.preventDefault()
|
||||
anchor.click()
|
||||
}
|
||||
if (anchor !== undefined)
|
||||
window.spaNavigate(new URL(new URL(anchor.href).pathname, window.location.toString()))
|
||||
} else if (
|
||||
e.key === "ArrowUp" ||
|
||||
(e.shiftKey && e.key === "Tab") ||
|
||||
(e.ctrlKey && e.key === "p")
|
||||
) {
|
||||
} else if (e.key === "ArrowUp" || (e.shiftKey && e.key === "Tab")) {
|
||||
e.preventDefault()
|
||||
if (results.contains(document.activeElement)) {
|
||||
// If an element in results-container already has focus, focus previous one
|
||||
@@ -536,7 +290,7 @@ async function setupSearch(
|
||||
if (prevResult) currentHover = prevResult
|
||||
await displayPreview(prevResult)
|
||||
}
|
||||
} else if (e.key === "ArrowDown" || e.key === "Tab" || (e.ctrlKey && e.key === "n")) {
|
||||
} else if (e.key === "ArrowDown" || e.key === "Tab") {
|
||||
e.preventDefault()
|
||||
// The results should already been focused, so we need to find the next one.
|
||||
// The activeElement is the search bar, so we need to find the first result and focus it.
|
||||
@@ -553,33 +307,25 @@ async function setupSearch(
|
||||
}
|
||||
}
|
||||
|
||||
const formatForDisplay = (term: string, id: number, renderType: SearchType) => {
|
||||
const formatForDisplay = (term: string, id: number) => {
|
||||
const slug = idDataMap[id]
|
||||
|
||||
// Check if query contains title words (for boosting exact matches)
|
||||
const queryTokens = tokenizeTerm(term)
|
||||
const titleTokens = tokenizeTerm(data[slug].title ?? "")
|
||||
const titleMatch = titleTokens.some((t) => queryTokens.includes(t))
|
||||
|
||||
return {
|
||||
id,
|
||||
slug,
|
||||
title: renderType === "tags" ? data[slug].title : highlight(term, data[slug].title ?? ""),
|
||||
title: searchType === "tags" ? data[slug].title : highlight(term, data[slug].title ?? ""),
|
||||
content: highlight(term, data[slug].content ?? "", true),
|
||||
tags: highlightTags(term, data[slug].tags, renderType),
|
||||
titleMatch, // Add title match flag for boosting
|
||||
tags: highlightTags(term.substring(1), data[slug].tags),
|
||||
}
|
||||
}
|
||||
|
||||
function highlightTags(term: string, tags: string[], renderType: SearchType) {
|
||||
if (!tags || renderType !== "tags") {
|
||||
function highlightTags(term: string, tags: string[]) {
|
||||
if (!tags || searchType !== "tags") {
|
||||
return []
|
||||
}
|
||||
|
||||
const tagTerm = term.toLowerCase()
|
||||
return tags
|
||||
.map((tag) => {
|
||||
if (tag.toLowerCase().includes(tagTerm)) {
|
||||
if (tag.toLowerCase().includes(term.toLowerCase())) {
|
||||
return `<li><p class="match-tag">#${tag}</p></li>`
|
||||
} else {
|
||||
return `<li><p>#${tag}</p></li>`
|
||||
@@ -592,40 +338,24 @@ async function setupSearch(
|
||||
return new URL(resolveRelative(currentSlug, slug), location.toString())
|
||||
}
|
||||
|
||||
const resultToHTML = ({ item, percent }: { item: Item; percent: number | null }) => {
|
||||
const { slug, title, content, tags, target } = item
|
||||
const resultToHTML = ({ slug, title, content, tags }: Item) => {
|
||||
const htmlTags = tags.length > 0 ? `<ul class="tags">${tags.join("")}</ul>` : ``
|
||||
const itemTile = document.createElement("a")
|
||||
const titleContent = target ? highlight(currentSearchTerm, target) : title
|
||||
const subscript = target ? `<b>${slug}</b>` : ``
|
||||
let percentLabel = "—"
|
||||
let percentAttr = ""
|
||||
if (percent !== null && Number.isFinite(percent)) {
|
||||
const bounded = Math.max(0, Math.min(100, percent))
|
||||
percentLabel = `${bounded.toFixed(1)}%`
|
||||
percentAttr = bounded.toFixed(3)
|
||||
}
|
||||
itemTile.classList.add("result-card")
|
||||
itemTile.id = slug
|
||||
itemTile.href = resolveUrl(slug).toString()
|
||||
itemTile.innerHTML = `<hgroup>
|
||||
<h3>${titleContent}</h3>
|
||||
${subscript}${htmlTags}
|
||||
${searchMode === "semantic" ? `<span class="result-likelihood" title="match likelihood"> ${percentLabel}</span>` : ""}
|
||||
${enablePreview && window.innerWidth > 600 ? "" : `<p>${content}</p>`}
|
||||
</hgroup>`
|
||||
if (percentAttr) itemTile.dataset.scorePercent = percentAttr
|
||||
else delete itemTile.dataset.scorePercent
|
||||
itemTile.innerHTML = `
|
||||
<h3 class="card-title">${title}</h3>
|
||||
${htmlTags}
|
||||
<p class="card-description">${content}</p>
|
||||
`
|
||||
itemTile.addEventListener("click", (event) => {
|
||||
if (event.altKey || event.ctrlKey || event.metaKey || event.shiftKey) return
|
||||
hideSearch()
|
||||
})
|
||||
|
||||
const handler = (evt: MouseEvent) => {
|
||||
if (evt.altKey || evt.ctrlKey || evt.metaKey || evt.shiftKey) return
|
||||
const anchor = evt.currentTarget as HTMLAnchorElement | null
|
||||
if (!anchor) return
|
||||
evt.preventDefault()
|
||||
const href = anchor.getAttribute("href")
|
||||
if (!href) return
|
||||
const url = new URL(href, window.location.toString())
|
||||
window.spaNavigate(url)
|
||||
const handler = (event: MouseEvent) => {
|
||||
if (event.altKey || event.ctrlKey || event.metaKey || event.shiftKey) return
|
||||
hideSearch()
|
||||
}
|
||||
|
||||
@@ -643,22 +373,15 @@ async function setupSearch(
|
||||
return itemTile
|
||||
}
|
||||
|
||||
async function displayResults(finalResults: SimilarityResult[]) {
|
||||
async function displayResults(finalResults: Item[]) {
|
||||
removeAllChildren(results)
|
||||
if (finalResults.length === 0) {
|
||||
results.innerHTML = `<a class="result-card no-match">
|
||||
<h3>No results.</h3>
|
||||
<p>Try another search term?</p>
|
||||
</a>`
|
||||
currentHover = null
|
||||
} else {
|
||||
const decorated = finalResults.map(({ item, similarity }) => {
|
||||
if (!Number.isFinite(similarity)) return { item, percent: null }
|
||||
const bounded = Math.max(-1, Math.min(1, similarity))
|
||||
const percent = ((bounded + 1) / 2) * 100
|
||||
return { item, percent }
|
||||
})
|
||||
results.append(...decorated.map(resultToHTML))
|
||||
results.append(...finalResults.map(resultToHTML))
|
||||
}
|
||||
|
||||
if (finalResults.length === 0 && preview) {
|
||||
@@ -678,8 +401,8 @@ async function setupSearch(
|
||||
return fetchContentCache.get(slug) as Element[]
|
||||
}
|
||||
|
||||
const targetUrl = resolveUrl(slug)
|
||||
const contents = await fetchCanonical(targetUrl)
|
||||
const targetUrl = resolveUrl(slug).toString()
|
||||
const contents = await fetch(targetUrl)
|
||||
.then((res) => res.text())
|
||||
.then((contents) => {
|
||||
if (contents === undefined) {
|
||||
@@ -709,296 +432,73 @@ async function setupSearch(
|
||||
const highlights = [...preview.getElementsByClassName("highlight")].sort(
|
||||
(a, b) => b.innerHTML.length - a.innerHTML.length,
|
||||
)
|
||||
if (highlights.length > 0) {
|
||||
const highlight = highlights[0]
|
||||
const container = preview
|
||||
if (container && highlight) {
|
||||
// Get the relative positions
|
||||
const containerRect = container.getBoundingClientRect()
|
||||
const highlightRect = highlight.getBoundingClientRect()
|
||||
// Calculate the scroll position relative to the container
|
||||
const relativeTop = highlightRect.top - containerRect.top + container.scrollTop - 20 // 20px buffer
|
||||
// Smoothly scroll the container
|
||||
container.scrollTo({
|
||||
top: relativeTop,
|
||||
behavior: "smooth",
|
||||
})
|
||||
}
|
||||
}
|
||||
highlights[0]?.scrollIntoView({ block: "start" })
|
||||
}
|
||||
|
||||
async function runSearch(rawTerm: string, token: number) {
|
||||
async function onType(e: HTMLElementEventMap["input"]) {
|
||||
if (!searchLayout || !index) return
|
||||
const trimmed = rawTerm.trim()
|
||||
if (trimmed === "") {
|
||||
removeAllChildren(results)
|
||||
if (preview) {
|
||||
removeAllChildren(preview)
|
||||
}
|
||||
currentHover = null
|
||||
searchLayout.classList.remove("display-results")
|
||||
resetProgressBar()
|
||||
return
|
||||
}
|
||||
currentSearchTerm = (e.target as HTMLInputElement).value
|
||||
searchLayout.classList.toggle("display-results", currentSearchTerm !== "")
|
||||
searchType = currentSearchTerm.startsWith("#") ? "tags" : "basic"
|
||||
|
||||
const modeForRanking: SearchMode = searchMode
|
||||
const initialType: SearchType = trimmed.startsWith("#") ? "tags" : "basic"
|
||||
let workingType: SearchType = initialType
|
||||
let highlightTerm = trimmed
|
||||
let tagTerm = ""
|
||||
let searchResults: DefaultDocumentSearchResults<Item> = []
|
||||
|
||||
if (initialType === "tags") {
|
||||
tagTerm = trimmed.substring(1).trim()
|
||||
const separatorIndex = tagTerm.indexOf(" ")
|
||||
if (separatorIndex !== -1) {
|
||||
const tag = tagTerm.substring(0, separatorIndex).trim()
|
||||
const query = tagTerm.substring(separatorIndex + 1).trim()
|
||||
const results = await index.searchAsync({
|
||||
query,
|
||||
let searchResults: DefaultDocumentSearchResults<Item>
|
||||
if (searchType === "tags") {
|
||||
currentSearchTerm = currentSearchTerm.substring(1).trim()
|
||||
const separatorIndex = currentSearchTerm.indexOf(" ")
|
||||
if (separatorIndex != -1) {
|
||||
// search by title and content index and then filter by tag (implemented in flexsearch)
|
||||
const tag = currentSearchTerm.substring(0, separatorIndex)
|
||||
const query = currentSearchTerm.substring(separatorIndex + 1).trim()
|
||||
searchResults = await index.searchAsync({
|
||||
query: query,
|
||||
// return at least 10000 documents, so it is enough to filter them by tag (implemented in flexsearch)
|
||||
limit: Math.max(numSearchResults, 10000),
|
||||
index: ["title", "content"],
|
||||
tag: { tags: tag },
|
||||
})
|
||||
if (token !== searchSeq) return
|
||||
searchResults = Object.values(results)
|
||||
workingType = "basic"
|
||||
highlightTerm = query
|
||||
for (let searchResult of searchResults) {
|
||||
searchResult.result = searchResult.result.slice(0, numSearchResults)
|
||||
}
|
||||
// set search type to basic and remove tag from term for proper highlightning and scroll
|
||||
searchType = "basic"
|
||||
currentSearchTerm = query
|
||||
} else {
|
||||
const results = await index.searchAsync({
|
||||
query: tagTerm,
|
||||
// default search by tags index
|
||||
searchResults = await index.searchAsync({
|
||||
query: currentSearchTerm,
|
||||
limit: numSearchResults,
|
||||
index: ["tags"],
|
||||
})
|
||||
if (token !== searchSeq) return
|
||||
searchResults = Object.values(results)
|
||||
highlightTerm = tagTerm
|
||||
}
|
||||
} else {
|
||||
const results = await index.searchAsync({
|
||||
query: highlightTerm,
|
||||
} else if (searchType === "basic") {
|
||||
searchResults = await index.searchAsync({
|
||||
query: currentSearchTerm,
|
||||
limit: numSearchResults,
|
||||
index: ["title", "content"],
|
||||
})
|
||||
if (token !== searchSeq) return
|
||||
searchResults = Object.values(results)
|
||||
}
|
||||
|
||||
const coerceIds = (hit?: DefaultDocumentSearchResults<Item>[number]): number[] => {
|
||||
if (!hit) return []
|
||||
return hit.result
|
||||
.map((value: Id) => {
|
||||
if (typeof value === "number") {
|
||||
return value
|
||||
}
|
||||
const parsed = Number.parseInt(String(value), 10)
|
||||
return Number.isNaN(parsed) ? null : parsed
|
||||
})
|
||||
.filter((value): value is number => value !== null)
|
||||
}
|
||||
|
||||
const getByField = (field: string): number[] => {
|
||||
const hit = searchResults.find((x) => x.field === field)
|
||||
return coerceIds(hit)
|
||||
const results = searchResults.filter((x) => x.field === field)
|
||||
return results.length === 0 ? [] : ([...results[0].result] as number[])
|
||||
}
|
||||
|
||||
// order titles ahead of content
|
||||
const allIds: Set<number> = new Set([
|
||||
...getByField("title"),
|
||||
...getByField("content"),
|
||||
...getByField("tags"),
|
||||
])
|
||||
|
||||
currentSearchTerm = highlightTerm
|
||||
|
||||
const candidateItems = new Map<string, Item>()
|
||||
const ensureItem = (id: number): Item | null => {
|
||||
const slug = idDataMap[id]
|
||||
if (!slug) return null
|
||||
const cached = candidateItems.get(slug)
|
||||
if (cached) return cached
|
||||
const item = formatForDisplay(highlightTerm, id, workingType)
|
||||
if (item) {
|
||||
candidateItems.set(slug, item)
|
||||
return item
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
const baseIndices: number[] = []
|
||||
for (const id of allIds) {
|
||||
const item = ensureItem(id)
|
||||
if (!item) continue
|
||||
const idx = slugToIndex.get(item.slug)
|
||||
if (typeof idx === "number") {
|
||||
baseIndices.push(idx)
|
||||
}
|
||||
}
|
||||
|
||||
let semanticIds: number[] = []
|
||||
const semanticSimilarity = new Map<number, number>()
|
||||
|
||||
const integrateIds = (ids: number[]) => {
|
||||
ids.forEach((docId) => {
|
||||
ensureItem(docId)
|
||||
})
|
||||
}
|
||||
|
||||
const orchestrator = semanticReady && semantic ? semantic : null
|
||||
|
||||
const resolveSimilarity = (item: Item): number => {
|
||||
const semanticHit = semanticSimilarity.get(item.id)
|
||||
return semanticHit ?? Number.NaN
|
||||
}
|
||||
|
||||
const render = async () => {
|
||||
if (token !== searchSeq) return
|
||||
const useSemantic = semanticReady && semanticIds.length > 0
|
||||
const weights =
|
||||
modeForRanking === "semantic" && useSemantic
|
||||
? { base: 0.3, semantic: 1.0 }
|
||||
: { base: 1.0, semantic: useSemantic ? 0.3 : 0 }
|
||||
const rrf = new Map<string, number>()
|
||||
const push = (ids: number[], weight: number, applyTitleBoost: boolean = false) => {
|
||||
if (!ids.length || weight <= 0) return
|
||||
ids.forEach((docId, rank) => {
|
||||
const slug = idDataMap[docId]
|
||||
if (!slug) return
|
||||
const item = ensureItem(docId)
|
||||
if (!item) return
|
||||
|
||||
// Apply title boost for FlexSearch results (1.5x boost for exact title matches)
|
||||
let effectiveWeight = weight
|
||||
if (applyTitleBoost && item.titleMatch) {
|
||||
effectiveWeight *= 1.5
|
||||
}
|
||||
|
||||
const prev = rrf.get(slug) ?? 0
|
||||
rrf.set(slug, prev + effectiveWeight / (1 + rank))
|
||||
})
|
||||
}
|
||||
|
||||
push(baseIndices, weights.base, true) // FlexSearch with title boost
|
||||
push(semanticIds, weights.semantic, false) // Semantic without boost
|
||||
|
||||
const rankedEntries = Array.from(candidateItems.values())
|
||||
.map((item) => ({ item, score: rrf.get(item.slug) ?? 0 }))
|
||||
.sort((a, b) => b.score - a.score)
|
||||
.slice(0, numSearchResults)
|
||||
|
||||
const displayEntries: SimilarityResult[] = []
|
||||
for (const entry of rankedEntries) {
|
||||
const similarity = resolveSimilarity(entry.item)
|
||||
displayEntries.push({ item: entry.item, similarity })
|
||||
}
|
||||
|
||||
await displayResults(displayEntries)
|
||||
}
|
||||
|
||||
await render()
|
||||
|
||||
if (workingType === "tags" || !orchestrator || !semanticReady || highlightTerm.length < 2) {
|
||||
return
|
||||
}
|
||||
|
||||
const showProgress = modeForRanking === "semantic"
|
||||
if (showProgress) {
|
||||
startSemanticProgress()
|
||||
}
|
||||
|
||||
try {
|
||||
const { semantic: semRes } = await orchestrator.search(
|
||||
highlightTerm,
|
||||
numSearchResults * 3, // Request more chunks to ensure good document coverage
|
||||
)
|
||||
if (token !== searchSeq) {
|
||||
if (showProgress) completeSemanticProgress()
|
||||
return
|
||||
}
|
||||
|
||||
// Aggregate chunk results to document level using RRF
|
||||
const { rrfScores: semRrfScores, maxScores: semMaxScores } = aggregateChunkResults(
|
||||
semRes,
|
||||
slugToIndex,
|
||||
)
|
||||
|
||||
// Use RRF scores for ranking
|
||||
semanticIds = Array.from(semRrfScores.entries())
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.slice(0, numSearchResults)
|
||||
.map(([docIdx]) => docIdx)
|
||||
|
||||
// Use max chunk similarity for display (0-1 range)
|
||||
semanticSimilarity.clear()
|
||||
semMaxScores.forEach((score, docIdx) => {
|
||||
semanticSimilarity.set(docIdx, score)
|
||||
})
|
||||
|
||||
integrateIds(semanticIds)
|
||||
if (showProgress) completeSemanticProgress()
|
||||
} catch (err) {
|
||||
console.warn("[SemanticClient] search failed:", err)
|
||||
if (showProgress) completeSemanticProgress()
|
||||
orchestrator.dispose()
|
||||
semantic = null
|
||||
semanticReady = false
|
||||
semanticInitFailed = true
|
||||
if (searchMode === "semantic") {
|
||||
searchMode = "lexical"
|
||||
updateModeUI(searchMode)
|
||||
}
|
||||
modeButtons.forEach((button) => {
|
||||
if ((button.dataset.mode as SearchMode) === "semantic") {
|
||||
button.disabled = true
|
||||
button.setAttribute("aria-disabled", "true")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
await render()
|
||||
}
|
||||
|
||||
function onType(e: HTMLElementEventMap["input"]) {
|
||||
if (!searchLayout || !index) return
|
||||
rawSearchTerm = (e.target as HTMLInputElement).value
|
||||
const hasQuery = rawSearchTerm.trim() !== ""
|
||||
searchLayout.classList.toggle("display-results", hasQuery)
|
||||
const term = rawSearchTerm
|
||||
const token = ++searchSeq
|
||||
if (runSearchTimer !== null) {
|
||||
window.clearTimeout(runSearchTimer)
|
||||
runSearchTimer = null
|
||||
}
|
||||
if (!hasQuery) {
|
||||
void runSearch("", token)
|
||||
return
|
||||
}
|
||||
const now = performance.now()
|
||||
lastInputAt = now
|
||||
const delay = computeDebounceDelay(term)
|
||||
const scheduledAt = lastInputAt
|
||||
runSearchTimer = window.setTimeout(() => {
|
||||
if (scheduledAt !== lastInputAt) {
|
||||
return
|
||||
}
|
||||
runSearchTimer = null
|
||||
void runSearch(term, token)
|
||||
}, delay)
|
||||
const finalResults = [...allIds].map((id) => formatForDisplay(currentSearchTerm, id))
|
||||
await displayResults(finalResults)
|
||||
}
|
||||
|
||||
document.addEventListener("keydown", shortcutHandler)
|
||||
window.addCleanup(() => document.removeEventListener("keydown", shortcutHandler))
|
||||
const openHandler = () => showSearch("basic")
|
||||
searchButton.addEventListener("click", openHandler)
|
||||
window.addCleanup(() => searchButton.removeEventListener("click", openHandler))
|
||||
searchButton.addEventListener("click", () => showSearch("basic"))
|
||||
window.addCleanup(() => searchButton.removeEventListener("click", () => showSearch("basic")))
|
||||
searchBar.addEventListener("input", onType)
|
||||
window.addCleanup(() => searchBar.removeEventListener("input", onType))
|
||||
window.addCleanup(() => {
|
||||
if (runSearchTimer !== null) {
|
||||
window.clearTimeout(runSearchTimer)
|
||||
runSearchTimer = null
|
||||
}
|
||||
resetProgressBar()
|
||||
})
|
||||
|
||||
registerEscapeHandler(container, hideSearch)
|
||||
await fillDocument(data)
|
||||
@@ -1006,17 +506,17 @@ async function setupSearch(
|
||||
|
||||
/**
|
||||
* Fills flexsearch document with data
|
||||
* @param index index to fill
|
||||
* @param data data to fill index with
|
||||
*/
|
||||
let indexPopulated = false
|
||||
async function fillDocument(data: ContentIndex) {
|
||||
if (indexPopulated) return
|
||||
let id = 0
|
||||
const promises = []
|
||||
const promises: Array<Promise<unknown>> = []
|
||||
for (const [slug, fileData] of Object.entries<ContentDetails>(data)) {
|
||||
promises.push(
|
||||
//@ts-ignore
|
||||
index.addAsync({
|
||||
index.addAsync(id++, {
|
||||
id,
|
||||
slug: slug as FullSlug,
|
||||
title: fileData.title,
|
||||
@@ -1024,7 +524,6 @@ async function fillDocument(data: ContentIndex) {
|
||||
tags: fileData.tags,
|
||||
}),
|
||||
)
|
||||
id++
|
||||
}
|
||||
|
||||
await Promise.all(promises)
|
||||
@@ -1034,9 +533,7 @@ async function fillDocument(data: ContentIndex) {
|
||||
document.addEventListener("nav", async (e: CustomEventMap["nav"]) => {
|
||||
const currentSlug = e.detail.url
|
||||
const data = await fetchData
|
||||
const searchElement = document.getElementsByClassName(
|
||||
"search",
|
||||
) as HTMLCollectionOf<HTMLDivElement>
|
||||
const searchElement = document.getElementsByClassName("search")
|
||||
for (const element of searchElement) {
|
||||
await setupSearch(element, currentSlug, data)
|
||||
}
|
||||
|
||||
163
quartz/components/scripts/search.test.ts
Normal file
163
quartz/components/scripts/search.test.ts
Normal file
@@ -0,0 +1,163 @@
|
||||
import test, { describe } from "node:test"
|
||||
import assert from "node:assert"
|
||||
|
||||
// Inline the encoder function from search.inline.ts for testing
|
||||
const encoder = (str: string): string[] => {
|
||||
const tokens: string[] = []
|
||||
let bufferStart = -1
|
||||
let bufferEnd = -1
|
||||
const lower = str.toLowerCase()
|
||||
|
||||
let i = 0
|
||||
for (const char of lower) {
|
||||
const code = char.codePointAt(0)!
|
||||
|
||||
const isCJK =
|
||||
(code >= 0x3040 && code <= 0x309f) ||
|
||||
(code >= 0x30a0 && code <= 0x30ff) ||
|
||||
(code >= 0x4e00 && code <= 0x9fff) ||
|
||||
(code >= 0xac00 && code <= 0xd7af) ||
|
||||
(code >= 0x20000 && code <= 0x2a6df)
|
||||
|
||||
const isWhitespace = code === 32 || code === 9 || code === 10 || code === 13
|
||||
|
||||
if (isCJK) {
|
||||
if (bufferStart !== -1) {
|
||||
tokens.push(lower.slice(bufferStart, bufferEnd))
|
||||
bufferStart = -1
|
||||
}
|
||||
tokens.push(char)
|
||||
} else if (isWhitespace) {
|
||||
if (bufferStart !== -1) {
|
||||
tokens.push(lower.slice(bufferStart, bufferEnd))
|
||||
bufferStart = -1
|
||||
}
|
||||
} else {
|
||||
if (bufferStart === -1) bufferStart = i
|
||||
bufferEnd = i + char.length
|
||||
}
|
||||
|
||||
i += char.length
|
||||
}
|
||||
|
||||
if (bufferStart !== -1) {
|
||||
tokens.push(lower.slice(bufferStart))
|
||||
}
|
||||
|
||||
return tokens
|
||||
}
|
||||
|
||||
describe("search encoder", () => {
|
||||
describe("English text", () => {
|
||||
test("should tokenize simple English words", () => {
|
||||
const result = encoder("hello world")
|
||||
assert.deepStrictEqual(result, ["hello", "world"])
|
||||
})
|
||||
|
||||
test("should handle multiple spaces", () => {
|
||||
const result = encoder("hello world")
|
||||
assert.deepStrictEqual(result, ["hello", "world"])
|
||||
})
|
||||
|
||||
test("should handle tabs and newlines", () => {
|
||||
const result = encoder("hello\tworld\ntest")
|
||||
assert.deepStrictEqual(result, ["hello", "world", "test"])
|
||||
})
|
||||
|
||||
test("should lowercase all text", () => {
|
||||
const result = encoder("Hello WORLD Test")
|
||||
assert.deepStrictEqual(result, ["hello", "world", "test"])
|
||||
})
|
||||
})
|
||||
|
||||
describe("CJK text", () => {
|
||||
test("should tokenize Japanese Hiragana character by character", () => {
|
||||
const result = encoder("こんにちは")
|
||||
assert.deepStrictEqual(result, ["こ", "ん", "に", "ち", "は"])
|
||||
})
|
||||
|
||||
test("should tokenize Japanese Katakana character by character", () => {
|
||||
const result = encoder("コントロール")
|
||||
assert.deepStrictEqual(result, ["コ", "ン", "ト", "ロ", "ー", "ル"])
|
||||
})
|
||||
|
||||
test("should tokenize Japanese Kanji character by character", () => {
|
||||
const result = encoder("日本語")
|
||||
assert.deepStrictEqual(result, ["日", "本", "語"])
|
||||
})
|
||||
|
||||
test("should tokenize Korean Hangul character by character", () => {
|
||||
const result = encoder("안녕하세요")
|
||||
assert.deepStrictEqual(result, ["안", "녕", "하", "세", "요"])
|
||||
})
|
||||
|
||||
test("should tokenize Chinese characters character by character", () => {
|
||||
const result = encoder("你好世界")
|
||||
assert.deepStrictEqual(result, ["你", "好", "世", "界"])
|
||||
})
|
||||
|
||||
test("should handle mixed Hiragana/Katakana/Kanji", () => {
|
||||
const result = encoder("て以来")
|
||||
assert.deepStrictEqual(result, ["て", "以", "来"])
|
||||
})
|
||||
})
|
||||
|
||||
describe("Mixed CJK and English", () => {
|
||||
test("should handle Japanese with English words", () => {
|
||||
const result = encoder("hello 世界")
|
||||
assert.deepStrictEqual(result, ["hello", "世", "界"])
|
||||
})
|
||||
|
||||
test("should handle English with Japanese words", () => {
|
||||
const result = encoder("世界 hello world")
|
||||
assert.deepStrictEqual(result, ["世", "界", "hello", "world"])
|
||||
})
|
||||
|
||||
test("should handle complex mixed content", () => {
|
||||
const result = encoder("これはtest文章です")
|
||||
assert.deepStrictEqual(result, ["こ", "れ", "は", "test", "文", "章", "で", "す"])
|
||||
})
|
||||
|
||||
test("should handle mixed Korean and English", () => {
|
||||
const result = encoder("hello 안녕 world")
|
||||
assert.deepStrictEqual(result, ["hello", "안", "녕", "world"])
|
||||
})
|
||||
|
||||
test("should handle mixed Chinese and English", () => {
|
||||
const result = encoder("你好 world")
|
||||
assert.deepStrictEqual(result, ["你", "好", "world"])
|
||||
})
|
||||
})
|
||||
|
||||
describe("Edge cases", () => {
|
||||
test("should handle empty string", () => {
|
||||
const result = encoder("")
|
||||
assert.deepStrictEqual(result, [])
|
||||
})
|
||||
|
||||
test("should handle only whitespace", () => {
|
||||
const result = encoder(" \t\n ")
|
||||
assert.deepStrictEqual(result, [])
|
||||
})
|
||||
|
||||
test("should handle single character", () => {
|
||||
const result = encoder("a")
|
||||
assert.deepStrictEqual(result, ["a"])
|
||||
})
|
||||
|
||||
test("should handle single CJK character", () => {
|
||||
const result = encoder("あ")
|
||||
assert.deepStrictEqual(result, ["あ"])
|
||||
})
|
||||
|
||||
test("should handle CJK with trailing whitespace", () => {
|
||||
const result = encoder("日本語 ")
|
||||
assert.deepStrictEqual(result, ["日", "本", "語"])
|
||||
})
|
||||
|
||||
test("should handle English with trailing whitespace", () => {
|
||||
const result = encoder("hello ")
|
||||
assert.deepStrictEqual(result, ["hello"])
|
||||
})
|
||||
})
|
||||
})
|
||||
@@ -1,182 +0,0 @@
|
||||
export type SemanticResult = { id: number; score: number }
|
||||
|
||||
type ProgressMessage = {
|
||||
type: "progress"
|
||||
loadedRows: number
|
||||
totalRows: number
|
||||
}
|
||||
|
||||
type ReadyMessage = { type: "ready" }
|
||||
|
||||
type ResultMessage = {
|
||||
type: "search-result"
|
||||
seq: number
|
||||
semantic: SemanticResult[]
|
||||
}
|
||||
|
||||
type ErrorMessage = { type: "error"; seq?: number; message: string }
|
||||
|
||||
type SearchPayload = {
|
||||
semantic: SemanticResult[]
|
||||
}
|
||||
|
||||
type PendingResolver = {
|
||||
resolve: (payload: SearchPayload) => void
|
||||
reject: (err: Error) => void
|
||||
}
|
||||
|
||||
export class SemanticClient {
|
||||
private ready: Promise<void>
|
||||
private resolveReady!: () => void
|
||||
private worker: Worker | null = null
|
||||
private pending = new Map<number, PendingResolver>()
|
||||
private seq = 0
|
||||
private disposed = false
|
||||
private readySettled = false
|
||||
private configured = false
|
||||
private lastError: Error | null = null
|
||||
|
||||
constructor(private cfg?: any) {
|
||||
this.ready = new Promise((resolve) => {
|
||||
this.resolveReady = () => {
|
||||
if (this.readySettled) return
|
||||
this.readySettled = true
|
||||
resolve()
|
||||
}
|
||||
})
|
||||
|
||||
if (this.cfg?.enable === false) {
|
||||
this.lastError = new Error("semantic search disabled by configuration")
|
||||
this.resolveReady()
|
||||
return
|
||||
}
|
||||
|
||||
this.boot()
|
||||
}
|
||||
|
||||
private boot() {
|
||||
try {
|
||||
this.worker = new Worker("/semantic.worker.js", { type: "module" })
|
||||
} catch (err) {
|
||||
this.handleFatal(err)
|
||||
return
|
||||
}
|
||||
this.setupWorker()
|
||||
this.startInit()
|
||||
}
|
||||
|
||||
private setupWorker() {
|
||||
if (!this.worker) return
|
||||
this.worker.onmessage = (
|
||||
event: MessageEvent<ProgressMessage | ReadyMessage | ResultMessage | ErrorMessage>,
|
||||
) => {
|
||||
const msg = event.data
|
||||
if (msg.type === "progress") {
|
||||
// Progress updates during initialization - can be logged if needed
|
||||
return
|
||||
}
|
||||
if (msg.type === "ready") {
|
||||
this.configured = true
|
||||
this.lastError = null
|
||||
this.resolveReady()
|
||||
return
|
||||
}
|
||||
if (msg.type === "search-result") {
|
||||
const pending = this.pending.get(msg.seq)
|
||||
if (pending) {
|
||||
this.pending.delete(msg.seq)
|
||||
pending.resolve({ semantic: msg.semantic ?? [] })
|
||||
}
|
||||
return
|
||||
}
|
||||
if (msg.type === "error") {
|
||||
if (typeof msg.seq === "number") {
|
||||
const pending = this.pending.get(msg.seq)
|
||||
if (pending) {
|
||||
this.pending.delete(msg.seq)
|
||||
pending.reject(new Error(msg.message))
|
||||
}
|
||||
} else {
|
||||
this.handleFatal(msg.message)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private startInit() {
|
||||
if (!this.worker) return
|
||||
const manifestUrl =
|
||||
typeof this.cfg?.manifestUrl === "string" && this.cfg.manifestUrl.length > 0
|
||||
? this.cfg.manifestUrl
|
||||
: "/embeddings/manifest.json"
|
||||
const disableCache = Boolean(this.cfg?.disableCache)
|
||||
const baseUrl =
|
||||
typeof this.cfg?.manifestBaseUrl === "string" ? this.cfg.manifestBaseUrl : undefined
|
||||
this.worker.postMessage({
|
||||
type: "init",
|
||||
cfg: this.cfg,
|
||||
manifestUrl,
|
||||
baseUrl,
|
||||
disableCache,
|
||||
})
|
||||
}
|
||||
|
||||
private rejectAll(err: Error, fatal = false) {
|
||||
for (const [id, pending] of this.pending.entries()) {
|
||||
pending.reject(err)
|
||||
this.pending.delete(id)
|
||||
}
|
||||
if (fatal) {
|
||||
this.lastError = err
|
||||
this.configured = false
|
||||
if (!this.readySettled) {
|
||||
this.resolveReady()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private handleFatal(err: unknown) {
|
||||
const error = err instanceof Error ? err : new Error(String(err))
|
||||
console.error("[SemanticClient] initialization failure:", error)
|
||||
this.rejectAll(error, true)
|
||||
if (this.worker) {
|
||||
this.worker.postMessage({ type: "reset" })
|
||||
this.worker.terminate()
|
||||
this.worker = null
|
||||
}
|
||||
}
|
||||
|
||||
async ensureReady() {
|
||||
await this.ready
|
||||
if (!this.configured) {
|
||||
throw this.lastError ?? new Error("semantic search unavailable")
|
||||
}
|
||||
}
|
||||
|
||||
async search(text: string, k: number): Promise<SearchPayload> {
|
||||
if (this.disposed) {
|
||||
throw new Error("semantic client has been disposed")
|
||||
}
|
||||
await this.ensureReady()
|
||||
if (!this.worker || !this.configured) {
|
||||
throw this.lastError ?? new Error("worker unavailable")
|
||||
}
|
||||
return new Promise<SearchPayload>((resolve, reject) => {
|
||||
const seq = ++this.seq
|
||||
this.pending.set(seq, { resolve, reject })
|
||||
this.worker?.postMessage({ type: "search", text, k, seq })
|
||||
})
|
||||
}
|
||||
|
||||
dispose() {
|
||||
if (this.disposed) return
|
||||
this.disposed = true
|
||||
this.rejectAll(new Error("semantic client disposed"))
|
||||
if (this.worker) {
|
||||
this.worker.postMessage({ type: "reset" })
|
||||
this.worker.terminate()
|
||||
}
|
||||
this.worker = null
|
||||
this.configured = false
|
||||
}
|
||||
}
|
||||
@@ -115,9 +115,9 @@ async function _navigate(url: URL, isBack: boolean = false) {
|
||||
}
|
||||
|
||||
// now, patch head, re-executing scripts
|
||||
const elementsToRemove = document.head.querySelectorAll(":not([spa-preserve])")
|
||||
const elementsToRemove = document.head.querySelectorAll(":not([data-persist])")
|
||||
elementsToRemove.forEach((el) => el.remove())
|
||||
const elementsToAdd = html.head.querySelectorAll(":not([spa-preserve])")
|
||||
const elementsToAdd = html.head.querySelectorAll(":not([data-persist])")
|
||||
elementsToAdd.forEach((el) => document.head.appendChild(el))
|
||||
|
||||
// delay setting the url until now
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
background: none;
|
||||
border: none;
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
height: 32px;
|
||||
margin: 0;
|
||||
text-align: inherit;
|
||||
flex-shrink: 0;
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
& > :not(.sidebar.left:has(.explorer)) {
|
||||
transition: transform 300ms ease-in-out;
|
||||
}
|
||||
|
||||
&.lock-scroll > :not(.sidebar.left:has(.explorer)) {
|
||||
transform: translateX(100dvw);
|
||||
transition: transform 300ms ease-in-out;
|
||||
@@ -33,8 +34,10 @@
|
||||
|
||||
min-height: 1.2rem;
|
||||
flex: 0 1 auto;
|
||||
|
||||
&.collapsed {
|
||||
flex: 0 1 1.2rem;
|
||||
|
||||
& .fold {
|
||||
transform: rotateZ(-90deg);
|
||||
}
|
||||
@@ -118,7 +121,10 @@ button.desktop-explorer {
|
||||
list-style: none;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
overscroll-behavior: contain;
|
||||
|
||||
&.explorer-ul {
|
||||
overscroll-behavior: contain;
|
||||
}
|
||||
|
||||
& li > a {
|
||||
color: var(--dark);
|
||||
@@ -133,12 +139,16 @@ button.desktop-explorer {
|
||||
}
|
||||
|
||||
.folder-outer {
|
||||
visibility: collapse;
|
||||
display: grid;
|
||||
grid-template-rows: 0fr;
|
||||
transition: grid-template-rows 0.3s ease-in-out;
|
||||
transition-property: grid-template-rows, visibility;
|
||||
transition-duration: 0.3s;
|
||||
transition-timing-function: ease-in-out;
|
||||
}
|
||||
|
||||
.folder-outer.open {
|
||||
visibility: visible;
|
||||
grid-template-rows: 1fr;
|
||||
}
|
||||
|
||||
@@ -265,6 +275,8 @@ li:has(> .folder-outer:not(.open)) > .folder-container > svg {
|
||||
|
||||
.mobile-no-scroll {
|
||||
@media all and ($mobile) {
|
||||
overscroll-behavior: none;
|
||||
.explorer-content > .explorer-ul {
|
||||
overscroll-behavior: contain;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,7 +65,6 @@ pre {
|
||||
overflow: hidden;
|
||||
|
||||
& > .mermaid-content {
|
||||
padding: 2rem;
|
||||
position: relative;
|
||||
transform-origin: 0 0;
|
||||
transition: transform 0.1s ease;
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
background: none;
|
||||
border: none;
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
height: 32px;
|
||||
margin: 0;
|
||||
text-align: inherit;
|
||||
flex-shrink: 0;
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
& > p {
|
||||
display: inline;
|
||||
color: var(--gray);
|
||||
text-wrap: unset;
|
||||
}
|
||||
|
||||
& svg {
|
||||
@@ -77,97 +78,16 @@
|
||||
margin-bottom: 2em;
|
||||
}
|
||||
|
||||
& > .input-container {
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
position: relative;
|
||||
& > input {
|
||||
box-sizing: border-box;
|
||||
padding: 0.5em 1em;
|
||||
font-family: var(--bodyFont);
|
||||
color: var(--dark);
|
||||
font-size: 1.1em;
|
||||
border: 1px solid var(--lightgray);
|
||||
|
||||
.search-bar {
|
||||
flex: 1 1 auto;
|
||||
min-width: 0;
|
||||
box-sizing: border-box;
|
||||
padding: 0.5em 1em;
|
||||
font-family: var(--bodyFont);
|
||||
color: var(--dark);
|
||||
font-size: 1.1em;
|
||||
border: none;
|
||||
background: transparent;
|
||||
|
||||
&:focus {
|
||||
outline: none;
|
||||
}
|
||||
}
|
||||
|
||||
.semantic-search-progress {
|
||||
position: absolute;
|
||||
bottom: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
height: 2px;
|
||||
background-color: var(--secondary);
|
||||
width: 0;
|
||||
opacity: 0;
|
||||
transition:
|
||||
width 0.3s ease,
|
||||
opacity 0.2s ease;
|
||||
pointer-events: none;
|
||||
}
|
||||
|
||||
.search-mode-toggle {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
border-radius: 9999px;
|
||||
height: 1.4rem;
|
||||
background-color: color-mix(in srgb, var(--darkgray) 12%, transparent);
|
||||
margin-right: 1rem;
|
||||
|
||||
.mode-option {
|
||||
border: none;
|
||||
background: transparent;
|
||||
font: inherit;
|
||||
color: var(--gray);
|
||||
border-radius: 9999px;
|
||||
cursor: pointer;
|
||||
transition:
|
||||
background-color 0.2s ease,
|
||||
color 0.2s ease;
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
width: 1.5rem;
|
||||
height: 1.5rem;
|
||||
position: relative;
|
||||
|
||||
&:focus-visible {
|
||||
outline: 2px solid var(--tertiary);
|
||||
outline-offset: 2px;
|
||||
}
|
||||
|
||||
&.active {
|
||||
background-color: var(--secondary);
|
||||
color: var(--light);
|
||||
}
|
||||
|
||||
svg {
|
||||
width: 18px;
|
||||
height: 18px;
|
||||
}
|
||||
|
||||
.sr-only {
|
||||
position: absolute;
|
||||
width: 1px;
|
||||
height: 1px;
|
||||
padding: 0;
|
||||
margin: -1px;
|
||||
overflow: hidden;
|
||||
clip: rect(0, 0, 0, 0);
|
||||
white-space: nowrap;
|
||||
border: 0;
|
||||
}
|
||||
}
|
||||
&:focus {
|
||||
outline: none;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,542 +0,0 @@
|
||||
# /// script
|
||||
# requires-python = ">=3.11"
|
||||
# dependencies = [
|
||||
# "langchain-text-splitters",
|
||||
# "numpy",
|
||||
# "openai",
|
||||
# "sentence-transformers",
|
||||
# "tiktoken",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os, json, argparse, hashlib, math, random, logging
|
||||
|
||||
from pathlib import Path
|
||||
from functools import lru_cache
|
||||
from collections.abc import Iterable
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
import tiktoken, numpy as np
|
||||
|
||||
from openai import OpenAI
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
DEFAULT_VLLM_URL = os.environ.get("VLLM_URL") or os.environ.get("VLLM_EMBED_URL") or "http://127.0.0.1:8000/v1"
|
||||
|
||||
|
||||
def resolve_vllm_base_url(url: str) -> str:
|
||||
if not url:
|
||||
raise ValueError("vLLM URL must be non-empty")
|
||||
|
||||
trimmed = url.rstrip("/")
|
||||
if trimmed.endswith("/v1/embeddings"):
|
||||
trimmed = trimmed[: -len("/embeddings")]
|
||||
elif trimmed.endswith("/embeddings"):
|
||||
trimmed = trimmed[: trimmed.rfind("/")]
|
||||
|
||||
if not trimmed.endswith("/v1"):
|
||||
trimmed = f"{trimmed}/v1"
|
||||
|
||||
return trimmed
|
||||
|
||||
|
||||
def load_jsonl(fp: str) -> Iterable[dict]:
|
||||
with open(fp, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
yield json.loads(line)
|
||||
|
||||
|
||||
def l2_normalize_rows(x: np.ndarray) -> np.ndarray:
|
||||
# x: [N, D]
|
||||
norms = np.linalg.norm(x, ord=2, axis=1, keepdims=True)
|
||||
norms[norms == 0] = 1.0
|
||||
return x / norms
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def get_tiktoken_encoder():
|
||||
# Get the o200k_base tokenizer (GPT-4o) with caching
|
||||
# change this if you want something else.
|
||||
return tiktoken.get_encoding("o200k_base")
|
||||
|
||||
|
||||
def count_tokens(text: str) -> int:
|
||||
# Count tokens using o200k_base encoding
|
||||
encoder = get_tiktoken_encoder()
|
||||
return len(encoder.encode(text))
|
||||
|
||||
|
||||
def get_text_splitter(chunk_size: int, overlap: int):
|
||||
encoder = get_tiktoken_encoder()
|
||||
return RecursiveCharacterTextSplitter(
|
||||
chunk_size=chunk_size * 4, # character approximation
|
||||
chunk_overlap=overlap * 4,
|
||||
separators=["\n\n", "\n", ". ", " ", ""],
|
||||
length_function=lambda t: len(encoder.encode(t)),
|
||||
is_separator_regex=False,
|
||||
)
|
||||
|
||||
|
||||
def chunk_document(
|
||||
doc: dict, max_tokens: int = 512, overlap_tokens: int = 128, min_chunk_size: int = 100
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Chunk a document if it exceeds max_tokens
|
||||
|
||||
Args:
|
||||
doc: {'slug': str, 'title': str, 'text': str}
|
||||
max_tokens: Maximum tokens per chunk
|
||||
overlap_tokens: Overlap between chunks
|
||||
min_chunk_size: Minimum chunk size (avoid tiny chunks)
|
||||
|
||||
Returns:
|
||||
List of chunk dicts with metadata
|
||||
"""
|
||||
text = doc["text"]
|
||||
token_count = count_tokens(text)
|
||||
|
||||
# No chunking needed
|
||||
if token_count <= max_tokens:
|
||||
return [
|
||||
{
|
||||
"slug": doc["slug"],
|
||||
"title": doc.get("title", doc["slug"]),
|
||||
"text": text,
|
||||
"chunk_id": 0,
|
||||
"parent_slug": doc["slug"],
|
||||
"is_chunked": False,
|
||||
}
|
||||
]
|
||||
|
||||
# Apply chunking
|
||||
splitter = get_text_splitter(max_tokens, overlap_tokens)
|
||||
raw_chunks = splitter.split_text(text)
|
||||
|
||||
# Filter out tiny chunks
|
||||
valid_chunks = [c for c in raw_chunks if count_tokens(c) >= min_chunk_size]
|
||||
|
||||
return [
|
||||
{
|
||||
"slug": f"{doc['slug']}#chunk{i}",
|
||||
"title": doc.get("title", doc["slug"]),
|
||||
"text": chunk,
|
||||
"chunk_id": i,
|
||||
"parent_slug": doc["slug"],
|
||||
"is_chunked": True,
|
||||
}
|
||||
for i, chunk in enumerate(valid_chunks)
|
||||
]
|
||||
|
||||
|
||||
def write_shards(vectors: np.ndarray, shard_size: int, dtype: str, out_dir: Path) -> list[dict]:
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
rows, dims = vectors.shape
|
||||
shards_meta: list[dict] = []
|
||||
np_dtype = np.float16 if dtype == "fp16" else np.float32
|
||||
bytes_per_value = np.dtype(np_dtype).itemsize
|
||||
row_offset = 0
|
||||
for si, start in enumerate(range(0, rows, shard_size)):
|
||||
end = min(start + shard_size, rows)
|
||||
shard = vectors[start:end] # [n, dims]
|
||||
bin_path = out_dir / f"vectors-{si:03d}.bin"
|
||||
payload = shard.astype(np_dtype, copy=False).tobytes(order="C")
|
||||
digest = hashlib.sha256(payload).hexdigest()
|
||||
with open(bin_path, "wb") as f:
|
||||
f.write(payload)
|
||||
shard_rows = int(shard.shape[0])
|
||||
shards_meta.append(
|
||||
{
|
||||
"path": f"/embeddings/{bin_path.name}",
|
||||
"rows": shard_rows,
|
||||
"rowOffset": row_offset,
|
||||
"byteLength": len(payload),
|
||||
"sha256": digest,
|
||||
"byteStride": dims * bytes_per_value,
|
||||
},
|
||||
)
|
||||
row_offset += shard_rows
|
||||
return shards_meta
|
||||
|
||||
|
||||
def write_hnsw_graph(levels: list[list[list[int]]], rows: int, out_path: Path) -> tuple[list[dict], str]:
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
offset = 0
|
||||
meta: list[dict] = []
|
||||
digest = hashlib.sha256()
|
||||
with open(out_path, "wb") as f:
|
||||
for lvl in levels:
|
||||
indptr = np.zeros(rows + 1, dtype=np.uint32)
|
||||
edge_accum: list[int] = []
|
||||
for idx in range(rows):
|
||||
neighbors = lvl[idx] if idx < len(lvl) else []
|
||||
indptr[idx + 1] = indptr[idx] + len(neighbors)
|
||||
edge_accum.extend(neighbors)
|
||||
indptr_bytes = indptr.tobytes(order="C")
|
||||
indptr_offset = offset
|
||||
f.write(indptr_bytes)
|
||||
digest.update(indptr_bytes)
|
||||
offset += len(indptr_bytes)
|
||||
|
||||
if edge_accum:
|
||||
indices = np.asarray(edge_accum, dtype=np.uint32)
|
||||
indices_bytes = indices.tobytes(order="C")
|
||||
else:
|
||||
indices = np.zeros(0, dtype=np.uint32)
|
||||
indices_bytes = indices.tobytes(order="C")
|
||||
indices_offset = offset
|
||||
f.write(indices_bytes)
|
||||
digest.update(indices_bytes)
|
||||
offset += len(indices_bytes)
|
||||
|
||||
meta.append(
|
||||
{
|
||||
"level": len(meta),
|
||||
"indptr": {
|
||||
"offset": indptr_offset,
|
||||
"elements": int(indptr.shape[0]),
|
||||
"byteLength": len(indptr_bytes),
|
||||
},
|
||||
"indices": {
|
||||
"offset": indices_offset,
|
||||
"elements": int(indices.shape[0]),
|
||||
"byteLength": len(indices_bytes),
|
||||
},
|
||||
},
|
||||
)
|
||||
return meta, digest.hexdigest()
|
||||
|
||||
|
||||
|
||||
def embed_vllm(
|
||||
texts: list[str],
|
||||
model_id: str,
|
||||
vllm_url: str,
|
||||
batch_size: int = 64,
|
||||
concurrency: int = 8,
|
||||
) -> np.ndarray:
|
||||
base_url = resolve_vllm_base_url(vllm_url)
|
||||
api_key = os.environ.get("VLLM_API_KEY") or os.environ.get("OPENAI_API_KEY") or "not-set"
|
||||
client = OpenAI(base_url=base_url, api_key=api_key, timeout=300)
|
||||
|
||||
def list_available_models() -> list[str]:
|
||||
models: list[str] = []
|
||||
page = client.models.list()
|
||||
models.extend(model.id for model in page.data)
|
||||
while getattr(page, "has_more", False) and page.data:
|
||||
cursor = page.data[-1].id
|
||||
page = client.models.list(after=cursor)
|
||||
models.extend(model.id for model in page.data)
|
||||
return models
|
||||
|
||||
try:
|
||||
available_models = list_available_models()
|
||||
except Exception as exc:
|
||||
raise RuntimeError(f"failed to query {base_url}/models: {exc}") from exc
|
||||
|
||||
if model_id not in available_models:
|
||||
suggestions = ", ".join(sorted(available_models)) if available_models else "<none>"
|
||||
logger.warning(
|
||||
"model '%s' not served by vLLM at %s. Available models: %s. Use the first model, results may differ during semantic search (you can omit this message if your weights is a ONNX checkpoint of the same model.)", model_id, base_url, suggestions,
|
||||
)
|
||||
model_id = available_models[0]
|
||||
|
||||
# Apply model-specific prefixes for documents (asymmetric search)
|
||||
model_lower = model_id.lower()
|
||||
if "e5" in model_lower:
|
||||
# E5 models: use "passage:" prefix for documents
|
||||
prefixed = [f"passage: {t}" for t in texts]
|
||||
elif "qwen" in model_lower and "embedding" in model_lower:
|
||||
# Qwen3-Embedding: documents use plain text (no prefix)
|
||||
prefixed = texts
|
||||
elif "embeddinggemma" in model_lower:
|
||||
# embeddinggemma: use "title: none | text:" prefix for documents
|
||||
prefixed = [f"title: none | text: {t}" for t in texts]
|
||||
else:
|
||||
# Default: no prefix for unknown models
|
||||
prefixed = texts
|
||||
|
||||
print(
|
||||
"Embedding"
|
||||
f" {len(prefixed)} texts with vLLM"
|
||||
f" (model={model_id}, batch_size={batch_size}, concurrency={concurrency})",
|
||||
)
|
||||
|
||||
# Create batches
|
||||
batches = []
|
||||
for i in range(0, len(prefixed), batch_size):
|
||||
batch = prefixed[i : i + batch_size]
|
||||
batches.append((i, batch))
|
||||
|
||||
# Function to send a single batch request
|
||||
def send_batch(batch_info: tuple[int, list[str]]) -> tuple[int, list[np.ndarray]]:
|
||||
idx, batch = batch_info
|
||||
response = client.embeddings.create(model=model_id, input=batch)
|
||||
embeddings = [np.asarray(item.embedding, dtype=np.float32) for item in response.data]
|
||||
return (idx, embeddings)
|
||||
|
||||
# Send batches concurrently (or sequentially if only 1 batch)
|
||||
results: dict[int, list[np.ndarray]] = {}
|
||||
if len(batches) == 1:
|
||||
# Single batch - no need for threading
|
||||
idx, embeddings = send_batch(batches[0])
|
||||
results[idx] = embeddings
|
||||
else:
|
||||
# Multiple batches - use concurrent requests
|
||||
with ThreadPoolExecutor(max_workers=concurrency) as executor:
|
||||
futures = {executor.submit(send_batch, batch_info): batch_info[0] for batch_info in batches}
|
||||
completed = 0
|
||||
for future in as_completed(futures):
|
||||
idx, embeddings = future.result()
|
||||
results[idx] = embeddings
|
||||
completed += 1
|
||||
if completed % max(1, len(batches) // 10) == 0 or completed == len(batches):
|
||||
print(f" Completed {completed}/{len(batches)} batches ({completed * 100 // len(batches)}%)")
|
||||
|
||||
# Reconstruct in order
|
||||
out: list[np.ndarray] = []
|
||||
for i in sorted(results.keys()):
|
||||
out.extend(results[i])
|
||||
|
||||
return np.stack(out, axis=0)
|
||||
|
||||
|
||||
def embed_hf(texts: list[str], model_id: str, device: str) -> np.ndarray:
|
||||
# Prefer sentence-transformers for E5 and similar embed models
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
model = SentenceTransformer(model_id, device=device)
|
||||
|
||||
# Apply model-specific prefixes for documents (asymmetric search)
|
||||
model_lower = model_id.lower()
|
||||
if "e5" in model_lower:
|
||||
# E5 models: use "passage:" prefix for documents
|
||||
prefixed = [f"passage: {t}" for t in texts]
|
||||
elif "qwen" in model_lower and "embedding" in model_lower:
|
||||
# Qwen3-Embedding: documents use plain text (no prefix)
|
||||
prefixed = texts
|
||||
elif "embeddinggemma" in model_lower:
|
||||
# embeddinggemma: use "title: none | text:" prefix for documents
|
||||
prefixed = [f"title: none | text: {t}" for t in texts]
|
||||
else:
|
||||
# Default: no prefix for unknown models
|
||||
prefixed = texts
|
||||
|
||||
vecs = model.encode(
|
||||
prefixed,
|
||||
batch_size=64,
|
||||
normalize_embeddings=True,
|
||||
convert_to_numpy=True,
|
||||
show_progress_bar=True,
|
||||
)
|
||||
return vecs.astype(np.float32, copy=False)
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--jsonl", default="public/embeddings-text.jsonl")
|
||||
ap.add_argument("--model", default=os.environ.get("SEM_MODEL", "intfloat/multilingual-e5-large"))
|
||||
ap.add_argument("--dims", type=int, default=int(os.environ.get("SEM_DIMS", "1024")))
|
||||
ap.add_argument("--dtype", choices=["fp16", "fp32"], default=os.environ.get("SEM_DTYPE", "fp32"))
|
||||
ap.add_argument("--shard-size", type=int, default=int(os.environ.get("SEM_SHARD", "1024")))
|
||||
ap.add_argument("--out", default="public/embeddings")
|
||||
ap.add_argument("--use-vllm", action="store_true", default=bool(os.environ.get("USE_VLLM", "")))
|
||||
ap.add_argument(
|
||||
"--vllm-url",
|
||||
default=DEFAULT_VLLM_URL,
|
||||
help="Base URL for the vLLM OpenAI-compatible server (accepts either /v1 or /v1/embeddings)",
|
||||
)
|
||||
ap.add_argument("--chunk-size", type=int, default=512, help="Max tokens per chunk")
|
||||
ap.add_argument("--chunk-overlap", type=int, default=128, help="Overlap tokens between chunks")
|
||||
ap.add_argument("--no-chunking", action="store_true", help="Disable chunking (embed full docs)")
|
||||
ap.add_argument(
|
||||
"--concurrency",
|
||||
type=int,
|
||||
default=int(os.environ.get("VLLM_CONCURRENCY", "8")),
|
||||
help="Number of concurrent requests to vLLM (default: 8)",
|
||||
)
|
||||
ap.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=int(os.environ.get("VLLM_BATCH_SIZE", "64")),
|
||||
help="Batch size for vLLM requests (default: 64)",
|
||||
)
|
||||
args = ap.parse_args()
|
||||
|
||||
recs = list(load_jsonl(args.jsonl))
|
||||
if not recs:
|
||||
print("No input found in public/embeddings-text.jsonl; run the site build first to emit JSONL.")
|
||||
return
|
||||
|
||||
# Apply chunking
|
||||
if args.no_chunking:
|
||||
chunks = recs
|
||||
chunk_metadata = {}
|
||||
print(f"Chunking disabled. Processing {len(chunks)} full documents")
|
||||
else:
|
||||
chunks = []
|
||||
chunk_metadata = {}
|
||||
for rec in recs:
|
||||
doc_chunks = chunk_document(rec, max_tokens=args.chunk_size, overlap_tokens=args.chunk_overlap)
|
||||
chunks.extend(doc_chunks)
|
||||
# Build chunk metadata map
|
||||
for chunk in doc_chunks:
|
||||
if chunk["is_chunked"]:
|
||||
chunk_metadata[chunk["slug"]] = {
|
||||
"parentSlug": chunk["parent_slug"],
|
||||
"chunkId": chunk["chunk_id"],
|
||||
}
|
||||
chunked_count = sum(1 for c in chunks if c.get("is_chunked", False))
|
||||
print(f"Chunked {len(recs)} documents into {len(chunks)} chunks ({chunked_count} chunked, {len(chunks) - chunked_count} unchanged)")
|
||||
print(f" Chunk size: {args.chunk_size} tokens, overlap: {args.chunk_overlap} tokens")
|
||||
|
||||
ids = [c["slug"] for c in chunks]
|
||||
titles = [c.get("title", c["slug"]) for c in chunks]
|
||||
texts = [c["text"] for c in chunks]
|
||||
|
||||
if args.use_vllm:
|
||||
vecs = embed_vllm(
|
||||
texts,
|
||||
args.model,
|
||||
args.vllm_url,
|
||||
batch_size=args.batch_size,
|
||||
concurrency=args.concurrency,
|
||||
)
|
||||
else:
|
||||
device = "cuda" if os.environ.get("CUDA_VISIBLE_DEVICES") else "cpu"
|
||||
vecs = embed_hf(texts, args.model, device)
|
||||
|
||||
# Coerce dims and re-normalize
|
||||
if vecs.shape[1] != args.dims:
|
||||
if vecs.shape[1] > args.dims:
|
||||
vecs = vecs[:, : args.dims]
|
||||
else:
|
||||
vecs = np.pad(vecs, ((0, 0), (0, args.dims - vecs.shape[1])))
|
||||
vecs = l2_normalize_rows(vecs.astype(np.float32, copy=False))
|
||||
|
||||
out_dir = Path(args.out)
|
||||
shards = write_shards(vecs, args.shard_size, args.dtype, out_dir)
|
||||
|
||||
# Build a lightweight HNSW graph and store it in a compact binary layout
|
||||
def hnsw_build(data: np.ndarray, M: int = 16, efC: int = 200, seed: int = 0) -> dict:
|
||||
rng = random.Random(seed)
|
||||
N, D = data.shape
|
||||
levels: list[list[list[int]]] = [] # levels[L][i] = neighbors of node i at level L
|
||||
|
||||
# random level assignment using 1/e distribution
|
||||
node_levels = []
|
||||
for _ in range(N):
|
||||
lvl = 0
|
||||
while rng.random() < 1 / math.e:
|
||||
lvl += 1
|
||||
node_levels.append(lvl)
|
||||
max_level = max(node_levels) if N > 0 else 0
|
||||
for _ in range(max_level + 1):
|
||||
levels.append([[] for _ in range(N)])
|
||||
|
||||
def sim(i: int, j: int) -> float:
|
||||
return float((data[i] * data[j]).sum())
|
||||
|
||||
entry = 0 if N > 0 else -1
|
||||
|
||||
def search_layer(q: int, ep: int, ef: int, L: int) -> list[int]:
|
||||
if ep < 0:
|
||||
return []
|
||||
visited = set()
|
||||
cand: list[tuple[float, int]] = []
|
||||
top: list[tuple[float, int]] = []
|
||||
def push(node: int):
|
||||
if node in visited:
|
||||
return
|
||||
visited.add(node)
|
||||
cand.append((sim(q, node), node))
|
||||
push(ep)
|
||||
while cand:
|
||||
cand.sort(reverse=True)
|
||||
s, v = cand.pop(0)
|
||||
if len(top) >= ef and s <= top[-1][0]:
|
||||
break
|
||||
top.append((s, v))
|
||||
for u in levels[L][v]:
|
||||
push(u)
|
||||
top.sort(reverse=True)
|
||||
return [n for _, n in top]
|
||||
|
||||
for i in range(N):
|
||||
if i == 0:
|
||||
continue
|
||||
lvl = node_levels[i]
|
||||
ep = entry
|
||||
for L in range(max_level, lvl, -1):
|
||||
c = search_layer(i, ep, 1, L)
|
||||
if c:
|
||||
ep = c[0]
|
||||
for L in range(min(max_level, lvl), -1, -1):
|
||||
W = search_layer(i, ep, efC, L)
|
||||
# Select top M by similarity
|
||||
neigh = sorted(((sim(i, j), j) for j in W if j != i), reverse=True)[:M]
|
||||
for _, e in neigh:
|
||||
if e not in levels[L][i]:
|
||||
levels[L][i].append(e)
|
||||
if i not in levels[L][e]:
|
||||
levels[L][e].append(i)
|
||||
|
||||
# trim neighbors to M
|
||||
for L in range(len(levels)):
|
||||
for i in range(N):
|
||||
if len(levels[L][i]) > M:
|
||||
# keep top M by sim
|
||||
nb = levels[L][i]
|
||||
nb = sorted(nb, key=lambda j: sim(i, j), reverse=True)[:M]
|
||||
levels[L][i] = nb
|
||||
|
||||
return {
|
||||
"M": M,
|
||||
"efConstruction": efC,
|
||||
"entryPoint": entry,
|
||||
"maxLevel": max_level,
|
||||
"levels": levels,
|
||||
}
|
||||
|
||||
hnsw = hnsw_build(vecs, M=16, efC=200)
|
||||
hnsw_meta, hnsw_sha = write_hnsw_graph(hnsw["levels"], int(vecs.shape[0]), out_dir / "hnsw.bin")
|
||||
|
||||
manifest = {
|
||||
"version": 2,
|
||||
"dims": args.dims,
|
||||
"dtype": args.dtype,
|
||||
"normalized": True,
|
||||
"rows": int(vecs.shape[0]),
|
||||
"shardSizeRows": args.shard_size,
|
||||
"vectors": {
|
||||
"dtype": args.dtype,
|
||||
"rows": int(vecs.shape[0]),
|
||||
"dims": args.dims,
|
||||
"shards": shards,
|
||||
},
|
||||
"ids": ids,
|
||||
"titles": titles,
|
||||
"chunkMetadata": chunk_metadata,
|
||||
"hnsw": {
|
||||
"M": hnsw["M"],
|
||||
"efConstruction": hnsw["efConstruction"],
|
||||
"entryPoint": hnsw["entryPoint"],
|
||||
"maxLevel": hnsw["maxLevel"],
|
||||
"graph": {
|
||||
"path": "/embeddings/hnsw.bin",
|
||||
"sha256": hnsw_sha,
|
||||
"levels": hnsw_meta,
|
||||
},
|
||||
},
|
||||
}
|
||||
(out_dir / "manifest.json").write_text(json.dumps(manifest, ensure_ascii=False), encoding="utf-8")
|
||||
print(f"Wrote {len(shards)} vector shard(s), HNSW graph, and manifest to {out_dir}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -27,6 +27,8 @@ import lt from "./locales/lt-LT"
|
||||
import fi from "./locales/fi-FI"
|
||||
import no from "./locales/nb-NO"
|
||||
import id from "./locales/id-ID"
|
||||
import kk from "./locales/kk-KZ"
|
||||
import he from "./locales/he-IL"
|
||||
|
||||
export const TRANSLATIONS = {
|
||||
"en-US": enUs,
|
||||
@@ -78,6 +80,8 @@ export const TRANSLATIONS = {
|
||||
"fi-FI": fi,
|
||||
"nb-NO": no,
|
||||
"id-ID": id,
|
||||
"kk-KZ": kk,
|
||||
"he-IL": he,
|
||||
} as const
|
||||
|
||||
export const defaultTranslation = "en-US"
|
||||
|
||||
88
quartz/i18n/locales/he-IL.ts
Normal file
88
quartz/i18n/locales/he-IL.ts
Normal file
@@ -0,0 +1,88 @@
|
||||
import { Translation } from "./definition"
|
||||
|
||||
export default {
|
||||
propertyDefaults: {
|
||||
title: "ללא כותרת",
|
||||
description: "לא סופק תיאור",
|
||||
},
|
||||
direction: "rtl" as const,
|
||||
components: {
|
||||
callout: {
|
||||
note: "הערה",
|
||||
abstract: "תקציר",
|
||||
info: "מידע",
|
||||
todo: "לעשות",
|
||||
tip: "טיפ",
|
||||
success: "הצלחה",
|
||||
question: "שאלה",
|
||||
warning: "אזהרה",
|
||||
failure: "כשלון",
|
||||
danger: "סכנה",
|
||||
bug: "באג",
|
||||
example: "דוגמה",
|
||||
quote: "ציטוט",
|
||||
},
|
||||
backlinks: {
|
||||
title: "קישורים חוזרים",
|
||||
noBacklinksFound: "לא נמצאו קישורים חוזרים",
|
||||
},
|
||||
themeToggle: {
|
||||
lightMode: "מצב בהיר",
|
||||
darkMode: "מצב כהה",
|
||||
},
|
||||
readerMode: {
|
||||
title: "מצב קריאה",
|
||||
},
|
||||
explorer: {
|
||||
title: "סייר",
|
||||
},
|
||||
footer: {
|
||||
createdWith: "נוצר באמצעות",
|
||||
},
|
||||
graph: {
|
||||
title: "מבט גרף",
|
||||
},
|
||||
recentNotes: {
|
||||
title: "הערות אחרונות",
|
||||
seeRemainingMore: ({ remaining }) => `עיין ב ${remaining} נוספים →`,
|
||||
},
|
||||
transcludes: {
|
||||
transcludeOf: ({ targetSlug }) => `מצוטט מ ${targetSlug}`,
|
||||
linkToOriginal: "קישור למקורי",
|
||||
},
|
||||
search: {
|
||||
title: "חיפוש",
|
||||
searchBarPlaceholder: "חפשו משהו",
|
||||
},
|
||||
tableOfContents: {
|
||||
title: "תוכן עניינים",
|
||||
},
|
||||
contentMeta: {
|
||||
readingTime: ({ minutes }) => `${minutes} דקות קריאה`,
|
||||
},
|
||||
},
|
||||
pages: {
|
||||
rss: {
|
||||
recentNotes: "הערות אחרונות",
|
||||
lastFewNotes: ({ count }) => `${count} הערות אחרונות`,
|
||||
},
|
||||
error: {
|
||||
title: "לא נמצא",
|
||||
notFound: "העמוד הזה פרטי או לא קיים.",
|
||||
home: "חזרה לעמוד הבית",
|
||||
},
|
||||
folderContent: {
|
||||
folder: "תיקייה",
|
||||
itemsUnderFolder: ({ count }) =>
|
||||
count === 1 ? "פריט אחד תחת תיקייה זו." : `${count} פריטים תחת תיקייה זו.`,
|
||||
},
|
||||
tagContent: {
|
||||
tag: "תגית",
|
||||
tagIndex: "מפתח התגיות",
|
||||
itemsUnderTag: ({ count }) =>
|
||||
count === 1 ? "פריט אחד עם תגית זו." : `${count} פריטים עם תגית זו.`,
|
||||
showingFirst: ({ count }) => `מראה את ה-${count} תגיות הראשונות.`,
|
||||
totalTags: ({ count }) => `${count} תגיות נמצאו סך הכל.`,
|
||||
},
|
||||
},
|
||||
} as const satisfies Translation
|
||||
@@ -8,7 +8,7 @@ export default {
|
||||
components: {
|
||||
callout: {
|
||||
note: "Nota",
|
||||
abstract: "Astratto",
|
||||
abstract: "Abstract",
|
||||
info: "Info",
|
||||
todo: "Da fare",
|
||||
tip: "Consiglio",
|
||||
@@ -17,7 +17,7 @@ export default {
|
||||
warning: "Attenzione",
|
||||
failure: "Errore",
|
||||
danger: "Pericolo",
|
||||
bug: "Bug",
|
||||
bug: "Problema",
|
||||
example: "Esempio",
|
||||
quote: "Citazione",
|
||||
},
|
||||
@@ -43,10 +43,11 @@ export default {
|
||||
},
|
||||
recentNotes: {
|
||||
title: "Note recenti",
|
||||
seeRemainingMore: ({ remaining }) => `Vedi ${remaining} altro →`,
|
||||
seeRemainingMore: ({ remaining }) =>
|
||||
remaining === 1 ? "Vedi 1 altra →" : `Vedi altre ${remaining} →`,
|
||||
},
|
||||
transcludes: {
|
||||
transcludeOf: ({ targetSlug }) => `Transclusione di ${targetSlug}`,
|
||||
transcludeOf: ({ targetSlug }) => `Inclusione di ${targetSlug}`,
|
||||
linkToOriginal: "Link all'originale",
|
||||
},
|
||||
search: {
|
||||
@@ -54,16 +55,16 @@ export default {
|
||||
searchBarPlaceholder: "Cerca qualcosa",
|
||||
},
|
||||
tableOfContents: {
|
||||
title: "Tabella dei contenuti",
|
||||
title: "Indice",
|
||||
},
|
||||
contentMeta: {
|
||||
readingTime: ({ minutes }) => `${minutes} minuti`,
|
||||
readingTime: ({ minutes }) => (minutes === 1 ? "1 minuto" : `${minutes} minuti`),
|
||||
},
|
||||
},
|
||||
pages: {
|
||||
rss: {
|
||||
recentNotes: "Note recenti",
|
||||
lastFewNotes: ({ count }) => `Ultime ${count} note`,
|
||||
lastFewNotes: ({ count }) => (count === 1 ? "Ultima nota" : `Ultime ${count} note`),
|
||||
},
|
||||
error: {
|
||||
title: "Non trovato",
|
||||
@@ -80,8 +81,9 @@ export default {
|
||||
tagIndex: "Indice etichette",
|
||||
itemsUnderTag: ({ count }) =>
|
||||
count === 1 ? "1 oggetto con questa etichetta." : `${count} oggetti con questa etichetta.`,
|
||||
showingFirst: ({ count }) => `Prime ${count} etichette.`,
|
||||
totalTags: ({ count }) => `Trovate ${count} etichette totali.`,
|
||||
showingFirst: ({ count }) => (count === 1 ? "Prima etichetta." : `Prime ${count} etichette.`),
|
||||
totalTags: ({ count }) =>
|
||||
count === 1 ? "Trovata 1 etichetta in totale." : `Trovate ${count} etichette totali.`,
|
||||
},
|
||||
},
|
||||
} as const satisfies Translation
|
||||
|
||||
87
quartz/i18n/locales/kk-KZ.ts
Normal file
87
quartz/i18n/locales/kk-KZ.ts
Normal file
@@ -0,0 +1,87 @@
|
||||
import { Translation } from "./definition"
|
||||
|
||||
export default {
|
||||
propertyDefaults: {
|
||||
title: "Атаусыз",
|
||||
description: "Сипаттама берілмеген",
|
||||
},
|
||||
components: {
|
||||
callout: {
|
||||
note: "Ескерту",
|
||||
abstract: "Аннотация",
|
||||
info: "Ақпарат",
|
||||
todo: "Істеу керек",
|
||||
tip: "Кеңес",
|
||||
success: "Сәттілік",
|
||||
question: "Сұрақ",
|
||||
warning: "Ескерту",
|
||||
failure: "Қате",
|
||||
danger: "Қауіп",
|
||||
bug: "Қате",
|
||||
example: "Мысал",
|
||||
quote: "Дәйексөз",
|
||||
},
|
||||
backlinks: {
|
||||
title: "Артқа сілтемелер",
|
||||
noBacklinksFound: "Артқа сілтемелер табылмады",
|
||||
},
|
||||
themeToggle: {
|
||||
lightMode: "Жарық режимі",
|
||||
darkMode: "Қараңғы режим",
|
||||
},
|
||||
readerMode: {
|
||||
title: "Оқу режимі",
|
||||
},
|
||||
explorer: {
|
||||
title: "Зерттеуші",
|
||||
},
|
||||
footer: {
|
||||
createdWith: "Құрастырылған құрал:",
|
||||
},
|
||||
graph: {
|
||||
title: "Граф көрінісі",
|
||||
},
|
||||
recentNotes: {
|
||||
title: "Соңғы жазбалар",
|
||||
seeRemainingMore: ({ remaining }) => `Тағы ${remaining} жазбаны қарау →`,
|
||||
},
|
||||
transcludes: {
|
||||
transcludeOf: ({ targetSlug }) => `${targetSlug} кірістіру`,
|
||||
linkToOriginal: "Бастапқыға сілтеме",
|
||||
},
|
||||
search: {
|
||||
title: "Іздеу",
|
||||
searchBarPlaceholder: "Бірдеңе іздеу",
|
||||
},
|
||||
tableOfContents: {
|
||||
title: "Мазмұны",
|
||||
},
|
||||
contentMeta: {
|
||||
readingTime: ({ minutes }) => `${minutes} мин оқу`,
|
||||
},
|
||||
},
|
||||
pages: {
|
||||
rss: {
|
||||
recentNotes: "Соңғы жазбалар",
|
||||
lastFewNotes: ({ count }) => `Соңғы ${count} жазба`,
|
||||
},
|
||||
error: {
|
||||
title: "Табылмады",
|
||||
notFound: "Бұл бет жеке немесе жоқ болуы мүмкін.",
|
||||
home: "Басты бетке оралу",
|
||||
},
|
||||
folderContent: {
|
||||
folder: "Қалта",
|
||||
itemsUnderFolder: ({ count }) =>
|
||||
count === 1 ? "Бұл қалтада 1 элемент бар." : `Бұл қалтада ${count} элемент бар.`,
|
||||
},
|
||||
tagContent: {
|
||||
tag: "Тег",
|
||||
tagIndex: "Тегтер индексі",
|
||||
itemsUnderTag: ({ count }) =>
|
||||
count === 1 ? "Бұл тегпен 1 элемент." : `Бұл тегпен ${count} элемент.`,
|
||||
showingFirst: ({ count }) => `Алғашқы ${count} тег көрсетілуде.`,
|
||||
totalTags: ({ count }) => `Барлығы ${count} тег табылды.`,
|
||||
},
|
||||
},
|
||||
} as const satisfies Translation
|
||||
@@ -3,85 +3,83 @@ import { Translation } from "./definition"
|
||||
export default {
|
||||
propertyDefaults: {
|
||||
title: "Không có tiêu đề",
|
||||
description: "Không có mô tả được cung cấp",
|
||||
description: "Không có mô tả",
|
||||
},
|
||||
components: {
|
||||
callout: {
|
||||
note: "Ghi Chú",
|
||||
abstract: "Tóm Tắt",
|
||||
note: "Ghi chú",
|
||||
abstract: "Tổng quan",
|
||||
info: "Thông tin",
|
||||
todo: "Cần Làm",
|
||||
tip: "Gợi Ý",
|
||||
success: "Thành Công",
|
||||
question: "Nghi Vấn",
|
||||
warning: "Cảnh Báo",
|
||||
failure: "Thất Bại",
|
||||
danger: "Nguy Hiểm",
|
||||
todo: "Cần phải làm",
|
||||
tip: "Gợi ý",
|
||||
success: "Thành công",
|
||||
question: "Câu hỏi",
|
||||
warning: "Cảnh báo",
|
||||
failure: "Thất bại",
|
||||
danger: "Nguy hiểm",
|
||||
bug: "Lỗi",
|
||||
example: "Ví Dụ",
|
||||
quote: "Trích Dẫn",
|
||||
example: "Ví dụ",
|
||||
quote: "Trích dẫn",
|
||||
},
|
||||
backlinks: {
|
||||
title: "Liên Kết Ngược",
|
||||
noBacklinksFound: "Không có liên kết ngược được tìm thấy",
|
||||
title: "Liên kết ngược",
|
||||
noBacklinksFound: "Không có liên kết ngược nào",
|
||||
},
|
||||
themeToggle: {
|
||||
lightMode: "Sáng",
|
||||
darkMode: "Tối",
|
||||
lightMode: "Chế độ sáng",
|
||||
darkMode: "Chế độ tối",
|
||||
},
|
||||
readerMode: {
|
||||
title: "Chế độ đọc",
|
||||
},
|
||||
explorer: {
|
||||
title: "Trong bài này",
|
||||
title: "Nội dung",
|
||||
},
|
||||
footer: {
|
||||
createdWith: "Được tạo bởi",
|
||||
createdWith: "Được tạo bằng",
|
||||
},
|
||||
graph: {
|
||||
title: "Biểu Đồ",
|
||||
title: "Sơ đồ",
|
||||
},
|
||||
recentNotes: {
|
||||
title: "Bài viết gần đây",
|
||||
seeRemainingMore: ({ remaining }) => `Xem ${remaining} thêm →`,
|
||||
title: "Ghi chú gần đây",
|
||||
seeRemainingMore: ({ remaining }) => `Xem thêm ${remaining} ghi chú →`,
|
||||
},
|
||||
transcludes: {
|
||||
transcludeOf: ({ targetSlug }) => `Bao gồm ${targetSlug}`,
|
||||
linkToOriginal: "Liên Kết Gốc",
|
||||
transcludeOf: ({ targetSlug }) => `Trích dẫn toàn bộ từ ${targetSlug}`,
|
||||
linkToOriginal: "Xem trang gốc",
|
||||
},
|
||||
search: {
|
||||
title: "Tìm Kiếm",
|
||||
title: "Tìm",
|
||||
searchBarPlaceholder: "Tìm kiếm thông tin",
|
||||
},
|
||||
tableOfContents: {
|
||||
title: "Bảng Nội Dung",
|
||||
title: "Mục lục",
|
||||
},
|
||||
contentMeta: {
|
||||
readingTime: ({ minutes }) => `đọc ${minutes} phút`,
|
||||
readingTime: ({ minutes }) => `${minutes} phút đọc`,
|
||||
},
|
||||
},
|
||||
pages: {
|
||||
rss: {
|
||||
recentNotes: "Những bài gần đây",
|
||||
lastFewNotes: ({ count }) => `${count} Bài gần đây`,
|
||||
recentNotes: "Ghi chú gần đây",
|
||||
lastFewNotes: ({ count }) => `${count} Trang gần đây`,
|
||||
},
|
||||
error: {
|
||||
title: "Không Tìm Thấy",
|
||||
notFound: "Trang này được bảo mật hoặc không tồn tại.",
|
||||
home: "Trở về trang chủ",
|
||||
title: "Không tìm thấy",
|
||||
notFound: "Trang này riêng tư hoặc không tồn tại.",
|
||||
home: "Về trang chủ",
|
||||
},
|
||||
folderContent: {
|
||||
folder: "Thư Mục",
|
||||
itemsUnderFolder: ({ count }) =>
|
||||
count === 1 ? "1 mục trong thư mục này." : `${count} mục trong thư mục này.`,
|
||||
folder: "Thư mục",
|
||||
itemsUnderFolder: ({ count }) => `Có ${count} trang trong thư mục này.`,
|
||||
},
|
||||
tagContent: {
|
||||
tag: "Thẻ",
|
||||
tagIndex: "Thẻ Mục Lục",
|
||||
itemsUnderTag: ({ count }) =>
|
||||
count === 1 ? "1 mục gắn thẻ này." : `${count} mục gắn thẻ này.`,
|
||||
showingFirst: ({ count }) => `Hiển thị trước ${count} thẻ.`,
|
||||
totalTags: ({ count }) => `Tìm thấy ${count} thẻ tổng cộng.`,
|
||||
tagIndex: "Danh sách thẻ",
|
||||
itemsUnderTag: ({ count }) => `Có ${count} trang gắn thẻ này.`,
|
||||
showingFirst: ({ count }) => `Đang hiển thị ${count} trang đầu tiên.`,
|
||||
totalTags: ({ count }) => `Có tổng cộng ${count} thẻ.`,
|
||||
},
|
||||
},
|
||||
} as const satisfies Translation
|
||||
|
||||
@@ -40,7 +40,7 @@ export const NotFoundPage: QuartzEmitterPlugin = () => {
|
||||
description: notFound,
|
||||
frontmatter: { title: notFound, tags: [] },
|
||||
})
|
||||
const externalResources = pageResources(path, resources, ctx.cfg.configuration)
|
||||
const externalResources = pageResources(path, resources)
|
||||
const componentData: QuartzComponentProps = {
|
||||
ctx,
|
||||
fileData: vfile.data,
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
import { FullSlug, joinSegments } from "../../util/path"
|
||||
import { QuartzEmitterPlugin } from "../types"
|
||||
import path from "path"
|
||||
import fs from "node:fs/promises"
|
||||
import { globby } from "globby"
|
||||
|
||||
// @ts-ignore
|
||||
import spaRouterScript from "../../components/scripts/spa.inline"
|
||||
@@ -19,7 +16,7 @@ import {
|
||||
processGoogleFonts,
|
||||
} from "../../util/theme"
|
||||
import { Features, transform } from "lightningcss"
|
||||
import { transform as transpile, build as bundle } from "esbuild"
|
||||
import { transform as transpile } from "esbuild"
|
||||
import { write } from "./helpers"
|
||||
|
||||
type ComponentResources = {
|
||||
@@ -244,6 +241,16 @@ function addGlobalPageResources(ctx: BuildCtx, componentResources: ComponentReso
|
||||
vercelInsightsScript.defer = true
|
||||
document.head.appendChild(vercelInsightsScript)
|
||||
`)
|
||||
} else if (cfg.analytics?.provider === "rybbit") {
|
||||
componentResources.afterDOMLoaded.push(`
|
||||
const rybbitScript = document.createElement("script");
|
||||
rybbitScript.src = "${cfg.analytics.host ?? "https://app.rybbit.io"}/api/script.js";
|
||||
rybbitScript.setAttribute("data-site-id", "${cfg.analytics.siteId}");
|
||||
rybbitScript.async = true;
|
||||
rybbitScript.defer = true;
|
||||
|
||||
document.head.appendChild(rybbitScript);
|
||||
`)
|
||||
}
|
||||
|
||||
if (cfg.enableSPA) {
|
||||
@@ -360,47 +367,7 @@ export const ComponentResources: QuartzEmitterPlugin = () => {
|
||||
ext: ".js",
|
||||
content: postscript,
|
||||
})
|
||||
|
||||
// Bundle all worker files
|
||||
const workerFiles = await globby(["quartz/**/*.worker.ts"])
|
||||
for (const src of workerFiles) {
|
||||
const result = await bundle({
|
||||
entryPoints: [src],
|
||||
bundle: true,
|
||||
minify: true,
|
||||
platform: "browser",
|
||||
format: "esm",
|
||||
write: false,
|
||||
})
|
||||
const code = result.outputFiles[0].text
|
||||
const name = path.basename(src).replace(/\.ts$/, "")
|
||||
yield write({ ctx, slug: name as FullSlug, ext: ".js", content: code })
|
||||
}
|
||||
},
|
||||
async *partialEmit(ctx, _content, _resources, changeEvents) {
|
||||
// Handle worker file changes in incremental builds
|
||||
for (const changeEvent of changeEvents) {
|
||||
if (!/\.worker\.ts$/.test(changeEvent.path)) continue
|
||||
if (changeEvent.type === "delete") {
|
||||
const name = path.basename(changeEvent.path).replace(/\.ts$/, "")
|
||||
const dest = joinSegments(ctx.argv.output, `${name}.js`)
|
||||
try {
|
||||
await fs.unlink(dest)
|
||||
} catch {}
|
||||
continue
|
||||
}
|
||||
const result = await bundle({
|
||||
entryPoints: [changeEvent.path],
|
||||
bundle: true,
|
||||
minify: true,
|
||||
platform: "browser",
|
||||
format: "esm",
|
||||
write: false,
|
||||
})
|
||||
const code = result.outputFiles[0].text
|
||||
const name = path.basename(changeEvent.path).replace(/\.ts$/, "")
|
||||
yield write({ ctx, slug: name as FullSlug, ext: ".js", content: code })
|
||||
}
|
||||
},
|
||||
async *partialEmit() {},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,7 +25,7 @@ async function processContent(
|
||||
) {
|
||||
const slug = fileData.slug!
|
||||
const cfg = ctx.cfg.configuration
|
||||
const externalResources = pageResources(pathToRoot(slug), resources, ctx.cfg.configuration)
|
||||
const externalResources = pageResources(pathToRoot(slug), resources)
|
||||
const componentData: QuartzComponentProps = {
|
||||
ctx,
|
||||
fileData,
|
||||
|
||||
@@ -38,7 +38,7 @@ async function* processFolderInfo(
|
||||
const slug = joinSegments(folder, "index") as FullSlug
|
||||
const [tree, file] = folderContent
|
||||
const cfg = ctx.cfg.configuration
|
||||
const externalResources = pageResources(pathToRoot(slug), resources, ctx.cfg.configuration)
|
||||
const externalResources = pageResources(pathToRoot(slug), resources)
|
||||
const componentData: QuartzComponentProps = {
|
||||
ctx,
|
||||
fileData: file.data,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
export { ContentPage } from "./contentPage"
|
||||
export { TagPage } from "./tagPage"
|
||||
export { FolderPage } from "./folderPage"
|
||||
export { ContentIndex } from "./contentIndex"
|
||||
export { ContentIndex as ContentIndex } from "./contentIndex"
|
||||
export { AliasRedirects } from "./aliases"
|
||||
export { Assets } from "./assets"
|
||||
export { Static } from "./static"
|
||||
@@ -10,4 +10,3 @@ export { ComponentResources } from "./componentResources"
|
||||
export { NotFoundPage } from "./404"
|
||||
export { CNAME } from "./cname"
|
||||
export { CustomOgImages } from "./ogImage"
|
||||
export { SemanticIndex } from "./semantic"
|
||||
|
||||
@@ -1,235 +0,0 @@
|
||||
import { write } from "./helpers"
|
||||
import { QuartzEmitterPlugin } from "../types"
|
||||
import { FilePath, FullSlug, joinSegments, QUARTZ } from "../../util/path"
|
||||
import { ReadTimeResults } from "reading-time"
|
||||
import { GlobalConfiguration } from "../../cfg"
|
||||
import { spawn } from "child_process"
|
||||
|
||||
const DEFAULT_MODEL_ID = "onnx-community/Qwen3-Embedding-0.6B-ONNX"
|
||||
|
||||
const defaults: GlobalConfiguration["semanticSearch"] = {
|
||||
enable: true,
|
||||
model: DEFAULT_MODEL_ID,
|
||||
aot: false,
|
||||
dims: 1024,
|
||||
dtype: "fp32",
|
||||
shardSizeRows: 1024,
|
||||
hnsw: { M: 16, efConstruction: 200 },
|
||||
chunking: {
|
||||
chunkSize: 512,
|
||||
chunkOverlap: 128,
|
||||
noChunking: false,
|
||||
},
|
||||
vllm: {
|
||||
enable: false,
|
||||
vllmUrl:
|
||||
process.env.VLLM_URL || process.env.VLLM_EMBED_URL || "http://127.0.0.1:8000/v1/embeddings",
|
||||
concurrency: parseInt(process.env.VLLM_CONCURRENCY || "8", 10),
|
||||
batchSize: parseInt(process.env.VLLM_BATCH_SIZE || "64", 10),
|
||||
},
|
||||
}
|
||||
|
||||
type ContentDetails = {
|
||||
slug: string
|
||||
title: string
|
||||
filePath: FilePath
|
||||
content: string
|
||||
readingTime?: Partial<ReadTimeResults>
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if uv is installed
|
||||
*/
|
||||
function checkUvInstalled(): Promise<boolean> {
|
||||
return new Promise((resolve) => {
|
||||
const proc = spawn("uv", ["--version"], { shell: true })
|
||||
proc.on("error", () => resolve(false))
|
||||
proc.on("close", (code) => resolve(code === 0))
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the Python embedding build script using uv
|
||||
* Script uses PEP 723 inline metadata for dependency management
|
||||
*/
|
||||
function runEmbedBuild(
|
||||
jsonlPath: string,
|
||||
outDir: string,
|
||||
opts: {
|
||||
model: string
|
||||
dtype: string
|
||||
dims: number
|
||||
shardSizeRows: number
|
||||
chunking: { chunkSize: number; chunkOverlap: number; noChunking: boolean }
|
||||
vllm: { enable: boolean; vllmUrl?: string; concurrency: number; batchSize: number }
|
||||
},
|
||||
): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const scriptPath = joinSegments(QUARTZ, "embed_build.py")
|
||||
const args = [
|
||||
"run",
|
||||
scriptPath,
|
||||
"--jsonl",
|
||||
jsonlPath,
|
||||
"--model",
|
||||
opts.model,
|
||||
"--out",
|
||||
outDir,
|
||||
"--dtype",
|
||||
opts.dtype,
|
||||
"--dims",
|
||||
String(opts.dims),
|
||||
"--shard-size",
|
||||
String(opts.shardSizeRows),
|
||||
"--chunk-size",
|
||||
String(opts.chunking.chunkSize),
|
||||
"--chunk-overlap",
|
||||
String(opts.chunking.chunkOverlap),
|
||||
]
|
||||
|
||||
if (opts.chunking.noChunking) {
|
||||
args.push("--no-chunking")
|
||||
}
|
||||
|
||||
if (opts.vllm.enable) {
|
||||
args.push("--use-vllm")
|
||||
if (opts.vllm.vllmUrl) {
|
||||
args.push("--vllm-url", opts.vllm.vllmUrl)
|
||||
}
|
||||
args.push("--concurrency", String(opts.vllm.concurrency))
|
||||
args.push("--batch-size", String(opts.vllm.batchSize))
|
||||
}
|
||||
|
||||
console.log("\nRunning embedding generation:")
|
||||
console.log(` uv ${args.join(" ")}`)
|
||||
|
||||
const env = { ...process.env }
|
||||
if (opts.vllm.enable && !env.USE_VLLM) {
|
||||
env.USE_VLLM = "1"
|
||||
}
|
||||
|
||||
const proc = spawn("uv", args, {
|
||||
stdio: "inherit",
|
||||
shell: true,
|
||||
env,
|
||||
})
|
||||
|
||||
proc.on("error", (err) => {
|
||||
reject(new Error(`Failed to spawn uv: ${err.message}`))
|
||||
})
|
||||
|
||||
proc.on("close", (code) => {
|
||||
if (code === 0) {
|
||||
console.log("Embedding generation completed successfully")
|
||||
resolve()
|
||||
} else {
|
||||
reject(new Error(`embed_build.py exited with code ${code}`))
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
export const SemanticIndex: QuartzEmitterPlugin<Partial<GlobalConfiguration["semanticSearch"]>> = (
|
||||
opts,
|
||||
) => {
|
||||
const merged = { ...defaults, ...opts }
|
||||
const o = {
|
||||
enable: merged.enable!,
|
||||
model: merged.model!,
|
||||
aot: merged.aot!,
|
||||
dims: merged.dims!,
|
||||
dtype: merged.dtype!,
|
||||
shardSizeRows: merged.shardSizeRows!,
|
||||
hnsw: {
|
||||
M: merged.hnsw?.M ?? defaults.hnsw!.M!,
|
||||
efConstruction: merged.hnsw?.efConstruction ?? defaults.hnsw!.efConstruction!,
|
||||
efSearch: merged.hnsw?.efSearch,
|
||||
},
|
||||
chunking: {
|
||||
chunkSize: merged.chunking?.chunkSize ?? defaults.chunking!.chunkSize!,
|
||||
chunkOverlap: merged.chunking?.chunkOverlap ?? defaults.chunking!.chunkOverlap!,
|
||||
noChunking: merged.chunking?.noChunking ?? defaults.chunking!.noChunking!,
|
||||
},
|
||||
vllm: {
|
||||
enable: merged.vllm?.enable ?? defaults.vllm!.enable!,
|
||||
vllmUrl: merged.vllm?.vllmUrl ?? defaults.vllm!.vllmUrl,
|
||||
concurrency: merged.vllm?.concurrency ?? defaults.vllm!.concurrency!,
|
||||
batchSize: merged.vllm?.batchSize ?? defaults.vllm!.batchSize!,
|
||||
},
|
||||
}
|
||||
|
||||
if (!o.model) {
|
||||
throw new Error("Semantic search requires a model identifier")
|
||||
}
|
||||
|
||||
return {
|
||||
name: "SemanticIndex",
|
||||
getQuartzComponents() {
|
||||
return []
|
||||
},
|
||||
async *partialEmit() {},
|
||||
async *emit(ctx, content, _resources) {
|
||||
if (!o.enable) return
|
||||
|
||||
const docs: ContentDetails[] = []
|
||||
for (const [_, file] of content) {
|
||||
const slug = file.data.slug!
|
||||
const title = file.data.frontmatter?.title ?? slug
|
||||
const text = file.data.text
|
||||
if (text) {
|
||||
docs.push({
|
||||
slug,
|
||||
title,
|
||||
filePath: file.data.filePath!,
|
||||
content: text,
|
||||
readingTime: file.data.readingTime,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Emit JSONL with the exact text used for embeddings
|
||||
const jsonl = docs
|
||||
.map((d) => ({ slug: d.slug, title: d.title, text: d.content }))
|
||||
.map((o) => JSON.stringify(o))
|
||||
.join("\n")
|
||||
|
||||
const jsonlSlug = "embeddings-text" as FullSlug
|
||||
yield write({
|
||||
ctx,
|
||||
slug: jsonlSlug,
|
||||
ext: ".jsonl",
|
||||
content: jsonl,
|
||||
})
|
||||
|
||||
// If aot is false, run the embedding generation script
|
||||
if (!o.aot) {
|
||||
console.log("\nGenerating embeddings (aot=false)...")
|
||||
|
||||
// Check for uv
|
||||
const hasUv = await checkUvInstalled()
|
||||
if (!hasUv) {
|
||||
throw new Error(
|
||||
"uv is required for embedding generation. Install it from https://docs.astral.sh/uv/",
|
||||
)
|
||||
}
|
||||
|
||||
const jsonlPath = joinSegments(ctx.argv.output, "embeddings-text.jsonl")
|
||||
const outDir = joinSegments(ctx.argv.output, "embeddings")
|
||||
|
||||
try {
|
||||
await runEmbedBuild(jsonlPath, outDir, o)
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : String(err)
|
||||
throw new Error(`Embedding generation failed: ${message}`)
|
||||
}
|
||||
} else {
|
||||
console.log(
|
||||
"\nSkipping embedding generation (aot=true). Expecting pre-generated embeddings in public/embeddings/",
|
||||
)
|
||||
}
|
||||
},
|
||||
externalResources(_ctx) {
|
||||
return {}
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -7,6 +7,7 @@ import { dirname } from "path"
|
||||
export const Static: QuartzEmitterPlugin = () => ({
|
||||
name: "Static",
|
||||
async *emit({ argv, cfg }) {
|
||||
// Copy Quartz's own internal static assets (quartz/static/) → output/static/
|
||||
const staticPath = joinSegments(QUARTZ, "static")
|
||||
const fps = await glob("**", staticPath, cfg.configuration.ignorePatterns)
|
||||
const outputStaticPath = joinSegments(argv.output, "static")
|
||||
@@ -18,6 +19,21 @@ export const Static: QuartzEmitterPlugin = () => ({
|
||||
await fs.promises.copyFile(src, dest)
|
||||
yield dest
|
||||
}
|
||||
|
||||
// Copy user-facing static assets (static/) → output/ preserving paths.
|
||||
// This mirrors Hugo's convention: static/ox-hugo/foo.png is served at /ox-hugo/foo.png,
|
||||
// which matches the src="/ox-hugo/..." paths that ox-hugo writes into exported markdown.
|
||||
const userStaticPath = "static"
|
||||
if (fs.existsSync(userStaticPath)) {
|
||||
const userFps = await glob("**", userStaticPath, cfg.configuration.ignorePatterns, false)
|
||||
for (const fp of userFps) {
|
||||
const src = joinSegments(userStaticPath, fp) as FilePath
|
||||
const dest = joinSegments(argv.output, fp) as FilePath
|
||||
await fs.promises.mkdir(dirname(dest), { recursive: true })
|
||||
await fs.promises.copyFile(src, dest)
|
||||
yield dest
|
||||
}
|
||||
}
|
||||
},
|
||||
async *partialEmit() {},
|
||||
})
|
||||
|
||||
@@ -73,7 +73,7 @@ async function processTagPage(
|
||||
const slug = joinSegments("tags", tag) as FullSlug
|
||||
const [tree, file] = tagContent
|
||||
const cfg = ctx.cfg.configuration
|
||||
const externalResources = pageResources(pathToRoot(slug), resources, ctx.cfg.configuration)
|
||||
const externalResources = pageResources(pathToRoot(slug), resources)
|
||||
const componentData: QuartzComponentProps = {
|
||||
ctx,
|
||||
fileData: file.data,
|
||||
|
||||
@@ -23,7 +23,16 @@ export const Citations: QuartzTransformerPlugin<Partial<Options>> = (userOpts) =
|
||||
name: "Citations",
|
||||
htmlPlugins(ctx) {
|
||||
const plugins: PluggableList = []
|
||||
|
||||
// per default, rehype-citations only supports en-US
|
||||
// see: https://github.com/timlrx/rehype-citation/issues/12
|
||||
// in here there are multiple usable locales:
|
||||
// https://github.com/citation-style-language/locales
|
||||
// thus, we optimistically assume there is indeed an appropriate
|
||||
// locale available and simply create the lang url-string
|
||||
let lang: string = "en-US"
|
||||
if (ctx.cfg.configuration.locale !== "en-US") {
|
||||
lang = `https://raw.githubusercontent.com/citation-stylelanguage/locales/refs/heads/master/locales-${ctx.cfg.configuration.locale}.xml`
|
||||
}
|
||||
// Add rehype-citation to the list of plugins
|
||||
plugins.push([
|
||||
rehypeCitation,
|
||||
@@ -32,7 +41,7 @@ export const Citations: QuartzTransformerPlugin<Partial<Options>> = (userOpts) =
|
||||
suppressBibliography: opts.suppressBibliography,
|
||||
linkCitations: opts.linkCitations,
|
||||
csl: opts.csl,
|
||||
lang: ctx.cfg.configuration.locale ?? "en-US",
|
||||
lang,
|
||||
},
|
||||
])
|
||||
|
||||
|
||||
@@ -103,7 +103,6 @@ export const FrontMatter: QuartzTransformerPlugin<Partial<Options>> = (userOpts)
|
||||
const created = coalesceAliases(data, ["created", "date"])
|
||||
if (created) {
|
||||
data.created = created
|
||||
data.modified ||= created // if modified is not set, use created
|
||||
}
|
||||
|
||||
const modified = coalesceAliases(data, [
|
||||
@@ -113,6 +112,8 @@ export const FrontMatter: QuartzTransformerPlugin<Partial<Options>> = (userOpts)
|
||||
"last-modified",
|
||||
])
|
||||
if (modified) data.modified = modified
|
||||
data.modified ||= created // if modified is not set, use created
|
||||
|
||||
const published = coalesceAliases(data, ["published", "publishDate", "date"])
|
||||
if (published) data.published = published
|
||||
|
||||
|
||||
@@ -17,8 +17,10 @@ interface Options {
|
||||
typstOptions: TypstOptions
|
||||
}
|
||||
|
||||
// mathjax macros
|
||||
export type Args = boolean | number | string | null
|
||||
interface MacroType {
|
||||
[key: string]: string
|
||||
[key: string]: string | Args[]
|
||||
}
|
||||
|
||||
export const Latex: QuartzTransformerPlugin<Partial<Options>> = (opts) => {
|
||||
@@ -37,11 +39,20 @@ export const Latex: QuartzTransformerPlugin<Partial<Options>> = (opts) => {
|
||||
case "typst": {
|
||||
return [[rehypeTypst, opts?.typstOptions ?? {}]]
|
||||
}
|
||||
default:
|
||||
case "mathjax": {
|
||||
return [[rehypeMathjax, { macros, ...(opts?.mathJaxOptions ?? {}) }]]
|
||||
}
|
||||
default: {
|
||||
return [[rehypeMathjax, { macros, ...(opts?.mathJaxOptions ?? {}) }]]
|
||||
return [
|
||||
[
|
||||
rehypeMathjax,
|
||||
{
|
||||
...(opts?.mathJaxOptions ?? {}),
|
||||
tex: {
|
||||
...(opts?.mathJaxOptions?.tex ?? {}),
|
||||
macros,
|
||||
},
|
||||
},
|
||||
],
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -57,7 +57,7 @@ export const CrawlLinks: QuartzTransformerPlugin<Partial<Options>> = (userOpts)
|
||||
) {
|
||||
let dest = node.properties.href as RelativeURL
|
||||
const classes = (node.properties.className ?? []) as string[]
|
||||
const isExternal = isAbsoluteUrl(dest)
|
||||
const isExternal = isAbsoluteUrl(dest, { httpOnly: false })
|
||||
classes.push(isExternal ? "external" : "internal")
|
||||
|
||||
if (isExternal && opts.externalLinkIcon) {
|
||||
@@ -99,7 +99,9 @@ export const CrawlLinks: QuartzTransformerPlugin<Partial<Options>> = (userOpts)
|
||||
}
|
||||
|
||||
// don't process external links or intra-document anchors
|
||||
const isInternal = !(isAbsoluteUrl(dest) || dest.startsWith("#"))
|
||||
const isInternal = !(
|
||||
isAbsoluteUrl(dest, { httpOnly: false }) || dest.startsWith("#")
|
||||
)
|
||||
if (isInternal) {
|
||||
dest = node.properties.href = transformLink(
|
||||
file.data.slug!,
|
||||
@@ -145,7 +147,7 @@ export const CrawlLinks: QuartzTransformerPlugin<Partial<Options>> = (userOpts)
|
||||
node.properties.loading = "lazy"
|
||||
}
|
||||
|
||||
if (!isAbsoluteUrl(node.properties.src)) {
|
||||
if (!isAbsoluteUrl(node.properties.src, { httpOnly: false })) {
|
||||
let dest = node.properties.src as RelativeURL
|
||||
dest = node.properties.src = transformLink(
|
||||
file.data.slug!,
|
||||
|
||||
@@ -27,7 +27,10 @@ const defaultOptions: Options = {
|
||||
const relrefRegex = new RegExp(/\[([^\]]+)\]\(\{\{< relref "([^"]+)" >\}\}\)/, "g")
|
||||
const predefinedHeadingIdRegex = new RegExp(/(.*) {#(?:.*)}/, "g")
|
||||
const hugoShortcodeRegex = new RegExp(/{{(.*)}}/, "g")
|
||||
const figureTagRegex = new RegExp(/< ?figure src="(.*)" ?>/, "g")
|
||||
// Matches the full Hugo {{< figure src="..." ... >}} shortcode and captures src.
|
||||
// Must run before the generic shortcode stripper to avoid partial-match issues
|
||||
// with captions that contain HTML (e.g. <span class="figure-number">).
|
||||
const figureShortcodeRegex = new RegExp(/{{<\s*figure\b[^}]*\bsrc="([^"]*)"[^}]*>}}/, "g")
|
||||
// \\\\\( -> matches \\(
|
||||
// (.+?) -> Lazy match for capturing the equation
|
||||
// \\\\\) -> matches \\)
|
||||
@@ -70,6 +73,14 @@ export const OxHugoFlavouredMarkdown: QuartzTransformerPlugin<Partial<Options>>
|
||||
})
|
||||
}
|
||||
|
||||
if (opts.replaceFigureWithMdImg) {
|
||||
src = src.toString()
|
||||
src = src.replaceAll(figureShortcodeRegex, (_value, ...capture) => {
|
||||
const [imgSrc] = capture
|
||||
return ``
|
||||
})
|
||||
}
|
||||
|
||||
if (opts.removeHugoShortcode) {
|
||||
src = src.toString()
|
||||
src = src.replaceAll(hugoShortcodeRegex, (_value, ...capture) => {
|
||||
@@ -78,14 +89,6 @@ export const OxHugoFlavouredMarkdown: QuartzTransformerPlugin<Partial<Options>>
|
||||
})
|
||||
}
|
||||
|
||||
if (opts.replaceFigureWithMdImg) {
|
||||
src = src.toString()
|
||||
src = src.replaceAll(figureTagRegex, (_value, ...capture) => {
|
||||
const [src] = capture
|
||||
return ``
|
||||
})
|
||||
}
|
||||
|
||||
if (opts.replaceOrgLatex) {
|
||||
src = src.toString()
|
||||
src = src.replaceAll(inlineLatexRegex, (_value, ...capture) => {
|
||||
|
||||
@@ -9,6 +9,10 @@ html {
|
||||
text-size-adjust: none;
|
||||
overflow-x: hidden;
|
||||
width: 100vw;
|
||||
|
||||
@media all and ($mobile) {
|
||||
scroll-padding-top: 4rem;
|
||||
}
|
||||
}
|
||||
|
||||
body {
|
||||
@@ -41,13 +45,17 @@ ul,
|
||||
.katex,
|
||||
.math,
|
||||
.typst-doc,
|
||||
.typst-doc * {
|
||||
g[class~="typst-text"] {
|
||||
color: var(--darkgray);
|
||||
fill: var(--darkgray);
|
||||
overflow-wrap: break-word;
|
||||
text-wrap: pretty;
|
||||
}
|
||||
|
||||
path[class~="typst-shape"] {
|
||||
stroke: var(--darkgray);
|
||||
}
|
||||
|
||||
.math {
|
||||
&.math-display {
|
||||
text-align: center;
|
||||
|
||||
@@ -123,13 +123,22 @@
|
||||
transform: rotateZ(-90deg);
|
||||
}
|
||||
|
||||
.callout-content > :first-child {
|
||||
transition:
|
||||
height 0.1s cubic-bezier(0.02, 0.01, 0.47, 1),
|
||||
margin 0.1s cubic-bezier(0.02, 0.01, 0.47, 1);
|
||||
overflow-y: clip;
|
||||
height: 0;
|
||||
margin-top: -1rem;
|
||||
.callout-content {
|
||||
& > * {
|
||||
transition:
|
||||
height 0.1s cubic-bezier(0.02, 0.01, 0.47, 1),
|
||||
margin 0.1s cubic-bezier(0.02, 0.01, 0.47, 1),
|
||||
padding 0.1s cubic-bezier(0.02, 0.01, 0.47, 1);
|
||||
overflow-y: clip;
|
||||
height: 0;
|
||||
margin-bottom: 0;
|
||||
margin-top: 0;
|
||||
padding-bottom: 0;
|
||||
padding-top: 0;
|
||||
}
|
||||
& > :first-child {
|
||||
margin-top: -1rem;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,12 +10,13 @@ export async function glob(
|
||||
pattern: string,
|
||||
cwd: string,
|
||||
ignorePatterns: string[],
|
||||
respectGitignore: boolean = true,
|
||||
): Promise<FilePath[]> {
|
||||
const fps = (
|
||||
await globby(pattern, {
|
||||
cwd,
|
||||
ignore: ignorePatterns,
|
||||
gitignore: true,
|
||||
gitignore: respectGitignore,
|
||||
})
|
||||
).map(toPosixPath)
|
||||
return fps as FilePath[]
|
||||
|
||||
@@ -26,9 +26,10 @@ export type CSSResource = {
|
||||
export function JSResourceToScriptElement(resource: JSResource, preserve?: boolean): JSX.Element {
|
||||
const scriptType = resource.moduleType ?? "application/javascript"
|
||||
const spaPreserve = preserve ?? resource.spaPreserve
|
||||
|
||||
if (resource.contentType === "external") {
|
||||
return (
|
||||
<script key={resource.src} src={resource.src} type={scriptType} spa-preserve={spaPreserve} />
|
||||
<script key={resource.src} src={resource.src} type={scriptType} data-persist={spaPreserve} />
|
||||
)
|
||||
} else {
|
||||
const content = resource.script
|
||||
@@ -36,7 +37,7 @@ export function JSResourceToScriptElement(resource: JSResource, preserve?: boole
|
||||
<script
|
||||
key={randomUUID()}
|
||||
type={scriptType}
|
||||
spa-preserve={spaPreserve}
|
||||
data-persist={spaPreserve}
|
||||
dangerouslySetInnerHTML={{ __html: content }}
|
||||
></script>
|
||||
)
|
||||
@@ -54,7 +55,7 @@ export function CSSResourceToStyleElement(resource: CSSResource, preserve?: bool
|
||||
href={resource.content}
|
||||
rel="stylesheet"
|
||||
type="text/css"
|
||||
spa-preserve={spaPreserve}
|
||||
data-persist={spaPreserve}
|
||||
/>
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1,548 +0,0 @@
|
||||
// Unified semantic search worker: handles data loading and query execution
|
||||
import { env, pipeline } from "@huggingface/transformers"
|
||||
import "onnxruntime-web/webgpu"
|
||||
import "onnxruntime-web/wasm"
|
||||
|
||||
export {}
|
||||
|
||||
// Describes one binary shard of the fp32 embedding matrix.
type VectorShardMeta = {
  path: string // relative or absolute URL of the shard file
  rows: number // number of embedding rows stored in this shard
  rowOffset: number // first global row index covered by this shard
  byteLength: number
  sha256?: string // content hash; used as the IndexedDB cache key
  byteStride: number
}

// One level of the HNSW graph in CSR form: indptr[i]..indptr[i+1] indexes
// into `indices` for node i's neighbor list.
type LevelSection = {
  level: number
  indptr: { offset: number; elements: number; byteLength: number }
  indices: { offset: number; elements: number; byteLength: number }
}

type ChunkMetadata = {
  parentSlug: string
  chunkId: number
}

// On-disk index manifest produced at build time.
type Manifest = {
  version: number
  dims: number
  dtype: string
  normalized: boolean
  rows: number
  shardSizeRows: number
  vectors: {
    dtype: string
    rows: number
    dims: number
    shards: VectorShardMeta[]
  }
  ids: string[]
  titles?: string[]
  chunkMetadata?: Record<string, ChunkMetadata>
  hnsw: {
    M: number
    efConstruction: number
    entryPoint: number
    maxLevel: number
    graph: {
      path: string
      sha256?: string
      levels: LevelSection[]
    }
  }
}

// Messages accepted by this worker.
type InitMessage = {
  type: "init"
  cfg: any
  manifestUrl: string
  baseUrl?: string
  disableCache?: boolean
}

type SearchMessage = { type: "search"; text: string; k: number; seq: number }
type ResetMessage = { type: "reset" }

type WorkerMessage = InitMessage | SearchMessage | ResetMessage

// Messages emitted by this worker.
type ReadyMessage = { type: "ready" }

type ProgressMessage = {
  type: "progress"
  loadedRows: number
  totalRows: number
}

type SearchHit = { id: number; score: number }

type SearchResultMessage = {
  type: "search-result"
  seq: number
  semantic: SearchHit[]
}

type ErrorMessage = { type: "error"; seq?: number; message: string }

type WorkerState = "idle" | "loading" | "ready" | "error"

// IndexedDB configuration
const DB_NAME = "semantic-search-cache"
const STORE_NAME = "assets"
const DB_VERSION = 1
// Feature detection: IndexedDB / SharedArrayBuffer are absent in some
// embedding contexts (e.g. without cross-origin isolation).
const hasIndexedDB = typeof indexedDB !== "undefined"
const supportsSharedArrayBuffer = typeof SharedArrayBuffer !== "undefined"

// State
let state: WorkerState = "idle"
let manifest: Manifest | null = null
let cfg: any = null
// Flat row-major embedding matrix (rows x dims), filled during init.
let vectorsView: Float32Array | null = null
let dims = 0
let rows = 0
// Lazily-created transformers.js feature-extraction pipeline.
let classifier: any = null
let envConfigured = false
// HNSW search parameters, populated from the manifest during init.
let entryPoint = -1
let maxLevel = 0
let efDefault = 128
let levelGraph: { indptr: Uint32Array; indices: Uint32Array }[] = []
// Cancels in-flight downloads on reset/re-init.
let abortController: AbortController | null = null
// Memoized IndexedDB connection (see openDatabase).
let dbPromise: Promise<IDBDatabase> | null = null
|
||||
|
||||
// IndexedDB helpers
|
||||
function openDatabase(): Promise<IDBDatabase> {
|
||||
if (!hasIndexedDB) {
|
||||
return Promise.reject(new Error("indexedDB unavailable"))
|
||||
}
|
||||
if (!dbPromise) {
|
||||
dbPromise = new Promise((resolve, reject) => {
|
||||
const req = indexedDB.open(DB_NAME, DB_VERSION)
|
||||
req.onupgradeneeded = () => {
|
||||
const db = req.result
|
||||
if (!db.objectStoreNames.contains(STORE_NAME)) {
|
||||
db.createObjectStore(STORE_NAME)
|
||||
}
|
||||
}
|
||||
req.onsuccess = () => resolve(req.result)
|
||||
req.onerror = () => reject(req.error ?? new Error("failed to open cache store"))
|
||||
})
|
||||
}
|
||||
return dbPromise
|
||||
}
|
||||
|
||||
async function readAsset(hash: string): Promise<ArrayBuffer | null> {
|
||||
if (!hasIndexedDB) {
|
||||
return null
|
||||
}
|
||||
const db = await openDatabase()
|
||||
return new Promise((resolve, reject) => {
|
||||
const tx = db.transaction(STORE_NAME, "readonly")
|
||||
const store = tx.objectStore(STORE_NAME)
|
||||
const req = store.get(hash)
|
||||
req.onsuccess = () => {
|
||||
const value = req.result
|
||||
if (value instanceof ArrayBuffer) {
|
||||
resolve(value)
|
||||
} else if (value && value.buffer instanceof ArrayBuffer) {
|
||||
resolve(value.buffer as ArrayBuffer)
|
||||
} else {
|
||||
resolve(null)
|
||||
}
|
||||
}
|
||||
req.onerror = () => reject(req.error ?? new Error("failed to read cached asset"))
|
||||
})
|
||||
}
|
||||
|
||||
async function writeAsset(hash: string, buffer: ArrayBuffer): Promise<void> {
|
||||
if (!hasIndexedDB) {
|
||||
return
|
||||
}
|
||||
const db = await openDatabase()
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
const tx = db.transaction(STORE_NAME, "readwrite")
|
||||
const store = tx.objectStore(STORE_NAME)
|
||||
const req = store.put(buffer, hash)
|
||||
req.onsuccess = () => resolve()
|
||||
req.onerror = () => reject(req.error ?? new Error("failed to cache asset"))
|
||||
})
|
||||
}
|
||||
|
||||
function toAbsolute(path: string, baseUrl?: string): string {
|
||||
if (path.startsWith("http://") || path.startsWith("https://")) {
|
||||
return path
|
||||
}
|
||||
const base = baseUrl ?? self.location.origin
|
||||
return new URL(path, base).toString()
|
||||
}
|
||||
|
||||
async function fetchBinary(
|
||||
path: string,
|
||||
disableCache: boolean,
|
||||
sha?: string,
|
||||
): Promise<ArrayBuffer> {
|
||||
if (!disableCache && sha && hasIndexedDB) {
|
||||
try {
|
||||
const cached = await readAsset(sha)
|
||||
if (cached) {
|
||||
return cached
|
||||
}
|
||||
} catch {
|
||||
// fall through to network fetch on cache errors
|
||||
}
|
||||
}
|
||||
const res = await fetch(path, { signal: abortController?.signal ?? undefined })
|
||||
if (!res.ok) {
|
||||
throw new Error(`failed to fetch ${path}: ${res.status} ${res.statusText}`)
|
||||
}
|
||||
const payload = await res.arrayBuffer()
|
||||
if (!disableCache && sha && hasIndexedDB) {
|
||||
try {
|
||||
await writeAsset(sha, payload)
|
||||
} catch {
|
||||
// ignore cache write failures
|
||||
}
|
||||
}
|
||||
return payload
|
||||
}
|
||||
|
||||
async function populateVectors(
|
||||
manifest: Manifest,
|
||||
baseUrl: string | undefined,
|
||||
disableCache: boolean | undefined,
|
||||
): Promise<{ buffer: Float32Array; rowsLoaded: number }> {
|
||||
if (manifest.vectors.dtype !== "fp32") {
|
||||
throw new Error(`unsupported embedding dtype '${manifest.vectors.dtype}', regenerate with fp32`)
|
||||
}
|
||||
const rows = manifest.rows
|
||||
const dims = manifest.dims
|
||||
const totalBytes = rows * dims * Float32Array.BYTES_PER_ELEMENT
|
||||
const buffer = supportsSharedArrayBuffer
|
||||
? new Float32Array(new SharedArrayBuffer(totalBytes))
|
||||
: new Float32Array(totalBytes)
|
||||
let loadedRows = 0
|
||||
for (const shard of manifest.vectors.shards) {
|
||||
const absolute = toAbsolute(shard.path, baseUrl)
|
||||
const payload = await fetchBinary(absolute, Boolean(disableCache), shard.sha256)
|
||||
const view = new Float32Array(payload)
|
||||
if (view.length !== shard.rows * dims) {
|
||||
throw new Error(
|
||||
`shard ${shard.path} has mismatched length (expected ${shard.rows * dims}, got ${view.length})`,
|
||||
)
|
||||
}
|
||||
buffer.set(view, shard.rowOffset * dims)
|
||||
loadedRows = Math.min(rows, shard.rowOffset + shard.rows)
|
||||
const progress: ProgressMessage = {
|
||||
type: "progress",
|
||||
loadedRows,
|
||||
totalRows: rows,
|
||||
}
|
||||
self.postMessage(progress)
|
||||
}
|
||||
return { buffer, rowsLoaded: loadedRows }
|
||||
}
|
||||
|
||||
async function populateGraph(
|
||||
manifest: Manifest,
|
||||
baseUrl: string | undefined,
|
||||
disableCache: boolean | undefined,
|
||||
): Promise<ArrayBuffer> {
|
||||
const graphMeta = manifest.hnsw.graph
|
||||
const absolute = toAbsolute(graphMeta.path, baseUrl)
|
||||
return await fetchBinary(absolute, Boolean(disableCache), graphMeta.sha256)
|
||||
}
|
||||
|
||||
function configureRuntimeEnv() {
|
||||
if (envConfigured) return
|
||||
env.allowLocalModels = false
|
||||
env.allowRemoteModels = true
|
||||
const wasmBackend = env.backends?.onnx?.wasm
|
||||
if (!wasmBackend) {
|
||||
throw new Error("transformers.js ONNX runtime backend unavailable")
|
||||
}
|
||||
const cdnBase = `https://cdn.jsdelivr.net/npm/@huggingface/transformers@${env.version}/dist/`
|
||||
wasmBackend.wasmPaths = cdnBase
|
||||
envConfigured = true
|
||||
}
|
||||
|
||||
async function ensureEncoder() {
|
||||
if (classifier) return
|
||||
if (!cfg?.model) {
|
||||
throw new Error("semantic worker missing model identifier")
|
||||
}
|
||||
configureRuntimeEnv()
|
||||
const dtype = typeof cfg?.dtype === "string" && cfg.dtype.length > 0 ? cfg.dtype : "fp32"
|
||||
const pipelineOpts: Record<string, unknown> = {
|
||||
device: "wasm",
|
||||
dtype,
|
||||
local_files_only: false,
|
||||
}
|
||||
classifier = await pipeline("feature-extraction", cfg.model, pipelineOpts)
|
||||
cfg.dtype = dtype
|
||||
}
|
||||
|
||||
function vectorSlice(id: number): Float32Array {
|
||||
if (!vectorsView) {
|
||||
throw new Error("vector buffer not configured")
|
||||
}
|
||||
const start = id * dims
|
||||
const end = start + dims
|
||||
return vectorsView.subarray(start, end)
|
||||
}
|
||||
|
||||
function dot(a: Float32Array, b: Float32Array): number {
|
||||
let s = 0
|
||||
for (let i = 0; i < dims; i++) {
|
||||
s += a[i] * b[i]
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
function neighborsFor(level: number, node: number): Uint32Array {
|
||||
const meta = levelGraph[level]
|
||||
if (!meta) return new Uint32Array()
|
||||
const { indptr, indices } = meta
|
||||
if (node < 0 || node + 1 >= indptr.length) return new Uint32Array()
|
||||
const start = indptr[node]
|
||||
const end = indptr[node + 1]
|
||||
return indices.subarray(start, end)
|
||||
}
|
||||
|
||||
function insertSortedDescending(arr: SearchHit[], item: SearchHit) {
|
||||
let idx = arr.length
|
||||
while (idx > 0 && arr[idx - 1].score < item.score) {
|
||||
idx -= 1
|
||||
}
|
||||
arr.splice(idx, 0, item)
|
||||
}
|
||||
|
||||
function bruteForceSearch(query: Float32Array, k: number): SearchHit[] {
|
||||
if (!vectorsView) return []
|
||||
const hits: SearchHit[] = []
|
||||
for (let id = 0; id < rows; id++) {
|
||||
const score = dot(query, vectorSlice(id))
|
||||
if (hits.length < k) {
|
||||
insertSortedDescending(hits, { id, score })
|
||||
} else if (score > hits[hits.length - 1].score) {
|
||||
insertSortedDescending(hits, { id, score })
|
||||
hits.length = k
|
||||
}
|
||||
}
|
||||
return hits
|
||||
}
|
||||
|
||||
/**
 * Approximate top-k nearest-neighbor search over the loaded HNSW graph.
 * Falls back to brute-force scanning when the graph/vectors are missing.
 *
 * Phase 1: greedy hill-climb from the entry point down through the upper
 * levels. Phase 2: best-first beam expansion at level 0 bounded by `ef`.
 *
 * @param query L2-normalized query embedding of length `dims`
 * @param k number of hits to return
 */
function hnswSearch(query: Float32Array, k: number): SearchHit[] {
  if (!manifest || !vectorsView || entryPoint < 0 || levelGraph.length === 0) {
    return bruteForceSearch(query, k)
  }
  // Beam width: at least efDefault, widened for larger k.
  const ef = Math.max(efDefault, k * 10)
  let ep = entryPoint
  let epScore = dot(query, vectorSlice(ep))
  // Greedy descent: at each upper level, move to the best-scoring neighbor
  // until no neighbor improves the score, then drop a level.
  for (let level = maxLevel; level > 0; level--) {
    let changed = true
    while (changed) {
      changed = false
      const neigh = neighborsFor(level, ep)
      for (let i = 0; i < neigh.length; i++) {
        const candidate = neigh[i]
        if (candidate >= rows) continue // guard against out-of-range ids
        const score = dot(query, vectorSlice(candidate))
        if (score > epScore) {
          epScore = score
          ep = candidate
          changed = true
        }
      }
    }
  }

  // Level-0 best-first expansion. `candidateQueue` and `best` are kept
  // sorted descending by score via insertSortedDescending.
  const visited = new Set<number>()
  const candidateQueue: SearchHit[] = []
  const best: SearchHit[] = []
  insertSortedDescending(candidateQueue, { id: ep, score: epScore })
  insertSortedDescending(best, { id: ep, score: epScore })
  visited.add(ep)

  while (candidateQueue.length > 0) {
    const current = candidateQueue.shift()!
    const worstBest = best.length >= ef ? best[best.length - 1].score : -Infinity
    // Terminate once the best remaining candidate cannot beat the current
    // worst retained hit and the result set is full.
    if (current.score < worstBest && best.length >= ef) {
      break
    }
    const neigh = neighborsFor(0, current.id)
    for (let i = 0; i < neigh.length; i++) {
      const candidate = neigh[i]
      if (candidate >= rows || visited.has(candidate)) continue
      visited.add(candidate)
      const score = dot(query, vectorSlice(candidate))
      const hit = { id: candidate, score }
      insertSortedDescending(candidateQueue, hit)
      if (best.length < ef || score > best[best.length - 1].score) {
        insertSortedDescending(best, hit)
        if (best.length > ef) {
          best.pop()
        }
      }
    }
  }

  best.sort((a, b) => b.score - a.score)
  return best.slice(0, k)
}
|
||||
|
||||
/**
 * Encode `text` into a Float32Array of length `dims` using the lazily-created
 * feature-extraction pipeline (mean pooling, normalized output).
 *
 * Applies model-specific query/passage prefixes for asymmetric retrieval
 * models (E5, Qwen3-Embedding, embeddinggemma); other models get plain text.
 *
 * @param text raw text to embed
 * @param isQuery true when embedding a search query (enables query prefixes)
 */
async function embed(text: string, isQuery: boolean = false): Promise<Float32Array> {
  await ensureEncoder()
  // Apply model-specific prefixes for asymmetric search
  let prefixedText = text
  if (cfg?.model) {
    const modelName = cfg.model.toLowerCase()
    switch (true) {
      case modelName.includes("e5"): {
        // E5 models require query: or passage: prefix
        prefixedText = isQuery ? `query: ${text}` : `passage: ${text}`
        break
      }
      case modelName.includes("qwen") && modelName.includes("embedding"): {
        // Qwen3-Embedding requires task instruction for queries only
        if (isQuery) {
          const task = "Given a web search query, retrieve relevant passages that answer the query"
          prefixedText = `Instruct: ${task}\nQuery: ${text}`
        }
        // Documents use plain text (no prefix)
        break
      }
      case modelName.includes("embeddinggemma"): {
        // embeddinggemma requires specific prefixes
        prefixedText = isQuery
          ? `task: search result | query: ${text}`
          : `title: none | text: ${text}`
        break
      }
      default:
        break
    }
  }
  const out = await classifier(prefixedText, { pooling: "mean", normalize: true })
  // Pipeline output may be a tensor-like object with .data or a plain array.
  const data = Array.from(out?.data ?? out) as number[]
  // Copy into a fixed-length vector; missing components are zero-filled.
  const vec = new Float32Array(dims)
  for (let i = 0; i < dims; i++) vec[i] = data[i] ?? 0
  return vec
}
|
||||
|
||||
/**
 * Load the manifest, vector shards, and HNSW graph, then signal readiness.
 *
 * Rejects re-initialization while loading or ready. On any failure the
 * worker transitions to the "error" state and rethrows (caller posts the
 * ErrorMessage).
 */
async function handleInit(msg: InitMessage) {
  if (state === "loading" || state === "ready") {
    throw new Error("worker already initialized or loading")
  }

  state = "loading"
  // Replace any stale controller so earlier downloads are cancelled.
  abortController?.abort()
  abortController = new AbortController()

  try {
    cfg = msg.cfg

    const manifestUrl = toAbsolute(msg.manifestUrl, msg.baseUrl)
    const response = await fetch(manifestUrl, { signal: abortController.signal })
    if (!response.ok) {
      throw new Error(
        `failed to fetch manifest ${manifestUrl}: ${response.status} ${response.statusText}`,
      )
    }
    manifest = (await response.json()) as Manifest

    // Only fp32 vectors are supported by this worker.
    if (manifest.vectors.dtype !== "fp32") {
      throw new Error(
        `unsupported embedding dtype '${manifest.vectors.dtype}', regenerate with fp32`,
      )
    }

    dims = manifest.dims
    rows = manifest.rows

    const { buffer: vectorBuffer } = await populateVectors(manifest, msg.baseUrl, msg.disableCache)
    vectorsView = vectorBuffer

    const graphBuffer = await populateGraph(manifest, msg.baseUrl, msg.disableCache)

    entryPoint = manifest.hnsw.entryPoint
    maxLevel = manifest.hnsw.maxLevel
    // Default beam width scales with graph connectivity M (floor of 64).
    efDefault = Math.max(64, manifest.hnsw.M * 4)
    // Build zero-copy CSR views into the downloaded graph buffer.
    levelGraph = manifest.hnsw.graph.levels.map((level) => {
      const indptr = new Uint32Array(graphBuffer, level.indptr.offset, level.indptr.elements)
      const indices = new Uint32Array(graphBuffer, level.indices.offset, level.indices.elements)
      return { indptr, indices }
    })

    state = "ready"
    const ready: ReadyMessage = { type: "ready" }
    self.postMessage(ready)
  } catch (err) {
    state = "error"
    throw err
  }
}
|
||||
|
||||
async function handleSearch(msg: SearchMessage) {
|
||||
if (state !== "ready") {
|
||||
throw new Error("worker not ready for search")
|
||||
}
|
||||
if (!manifest || !vectorsView) {
|
||||
throw new Error("semantic worker not configured")
|
||||
}
|
||||
|
||||
const queryVec = await embed(msg.text, true)
|
||||
const semanticHits = hnswSearch(queryVec, Math.max(1, msg.k))
|
||||
const message: SearchResultMessage = {
|
||||
type: "search-result",
|
||||
seq: msg.seq,
|
||||
semantic: semanticHits,
|
||||
}
|
||||
self.postMessage(message)
|
||||
}
|
||||
|
||||
function handleReset() {
|
||||
abortController?.abort()
|
||||
abortController = null
|
||||
state = "idle"
|
||||
manifest = null
|
||||
cfg = null
|
||||
vectorsView = null
|
||||
dims = 0
|
||||
rows = 0
|
||||
classifier = null
|
||||
envConfigured = false
|
||||
levelGraph = []
|
||||
entryPoint = -1
|
||||
maxLevel = 0
|
||||
}
|
||||
|
||||
self.onmessage = (event: MessageEvent<WorkerMessage>) => {
|
||||
const data = event.data
|
||||
|
||||
if (data.type === "reset") {
|
||||
handleReset()
|
||||
return
|
||||
}
|
||||
|
||||
if (data.type === "init") {
|
||||
void handleInit(data).catch((err: unknown) => {
|
||||
const message: ErrorMessage = {
|
||||
type: "error",
|
||||
message: err instanceof Error ? err.message : String(err),
|
||||
}
|
||||
self.postMessage(message)
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
if (data.type === "search") {
|
||||
void handleSearch(data).catch((err: unknown) => {
|
||||
const message: ErrorMessage = {
|
||||
type: "error",
|
||||
seq: data.seq,
|
||||
message: err instanceof Error ? err.message : String(err),
|
||||
}
|
||||
self.postMessage(message)
|
||||
})
|
||||
}
|
||||
}
|
||||
217
scripts/export.exs
Normal file
217
scripts/export.exs
Normal file
@@ -0,0 +1,217 @@
|
||||
#!/usr/bin/env elixir
# Export org-roam notes (per-file) to content/ via ox-hugo,
# then run the markdown transformation pipeline (citations, etc.).
#
# Usage:
#   NOTES_DIR=~/notes elixir scripts/export.exs
#   elixir scripts/export.exs /path/to/notes
#
# Optional env vars:
#   BIBTEX_FILE — path to a .bib file used as citation fallback
#   ZOTERO_URL — Zotero Better BibTeX base URL (default: http://localhost:23119)
#   CITATION_MODE — silent | warn (default) | strict
#
# The positional argument takes precedence over the NOTES_DIR env var.

# ---------------------------------------------------------------------------
# Load the pipeline Mix project so its modules are available in this script.
# ---------------------------------------------------------------------------
repo_root = __DIR__ |> Path.join("..") |> Path.expand()
pipeline_dir = Path.join(repo_root, "scripts/pipeline")

# Compile and load the pipeline project's modules into this runtime.
# Mix.install is NOT used here because we have a local Mix project — instead
# we compile it and push its beam files onto the code path.
#
# This runs `mix deps.get` + `mix compile` the first time; subsequent runs
# use the compiled artifacts from _build/ (fast, same as Mix caching).
# The {_, 0} match asserts a zero exit code and crashes the script otherwise.
{_, 0} =
  System.cmd("mix", ["deps.get", "--quiet"],
    cd: pipeline_dir,
    env: [{"MIX_ENV", "prod"}],
    into: IO.stream()
  )

{_, 0} =
  System.cmd("mix", ["compile", "--quiet"],
    cd: pipeline_dir,
    env: [{"MIX_ENV", "prod"}],
    into: IO.stream()
  )

# Add compiled beam files to the load path so we can call pipeline modules.
pipeline_build = Path.join(pipeline_dir, "_build/prod/lib")

pipeline_build
|> File.ls!()
|> Enum.each(fn app ->
  ebin = Path.join([pipeline_build, app, "ebin"])
  if File.dir?(ebin), do: Code.prepend_path(ebin)
end)

# Start the pipeline OTP application (which starts Finch for HTTP).
Application.ensure_all_started(:pipeline)

# ---------------------------------------------------------------------------
# Argument / env resolution
# ---------------------------------------------------------------------------

notes_dir =
  case System.argv() do
    [dir | _] -> dir
    [] ->
      System.get_env("NOTES_DIR") ||
        (IO.puts(:stderr, "Usage: NOTES_DIR=/path/to/notes elixir scripts/export.exs")
        System.halt(1))
  end

notes_dir = Path.expand(notes_dir)
content_dir = Path.join(repo_root, "content")

unless File.dir?(notes_dir) do
  IO.puts(:stderr, "Error: notes directory does not exist: #{notes_dir}")
  System.halt(1)
end

# ---------------------------------------------------------------------------
# Phase 1: Wipe content/
# ---------------------------------------------------------------------------
IO.puts("==> Wiping #{content_dir}")

# Remove everything except .gitkeep so the export starts from a clean slate.
content_dir
|> File.ls!()
|> Enum.reject(&(&1 == ".gitkeep"))
|> Enum.each(fn entry ->
  Path.join(content_dir, entry) |> File.rm_rf!()
end)

# ---------------------------------------------------------------------------
# Phase 2: Export org files via Emacs + ox-hugo
# ---------------------------------------------------------------------------
IO.puts("==> Exporting org files from #{notes_dir}")

org_files =
  Path.join(notes_dir, "**/*.org")
  |> Path.wildcard()

if org_files == [] do
  IO.puts("No .org files found in #{notes_dir}")
  System.halt(0)
end

results =
  Enum.map(org_files, fn orgfile ->
    IO.puts("  exporting: #{orgfile}")

    # The section mirrors the org file's directory relative to the notes root,
    # so the exported markdown keeps the same folder structure.
    section =
      orgfile
      |> Path.dirname()
      |> Path.relative_to(notes_dir)

    # Run Emacs in batch mode with ox-hugo to export a single file.
    {output, exit_code} =
      System.cmd(
        "emacs",
        [
          "--batch",
          "--eval", "(require 'ox-hugo)",
          "--eval", ~s[(setq org-hugo-base-dir "#{repo_root}")],
          "--eval", ~s[(setq org-hugo-default-section-directory "#{section}")],
          "--visit", orgfile,
          "--funcall", "org-hugo-export-to-md"
        ],
        stderr_to_stdout: true
      )

    # Suppress Emacs startup noise from the captured output.
    filtered =
      output
      |> String.split("\n")
      |> Enum.reject(&String.match?(&1, ~r/^Loading|^ad-handle|^For information/))
      |> Enum.join("\n")

    if filtered != "", do: IO.puts(filtered)

    {orgfile, exit_code}
  end)

failures = Enum.filter(results, fn {_, code} -> code != 0 end)

if failures != [] do
  IO.puts(:stderr, "\nFailed to export #{length(failures)} file(s):")
  Enum.each(failures, fn {f, code} -> IO.puts(:stderr, "  [exit #{code}] #{f}") end)
  System.halt(1)
end

# ---------------------------------------------------------------------------
# Phase 3: Markdown transformation pipeline
# ---------------------------------------------------------------------------
IO.puts("==> Running markdown pipeline")

pipeline_opts = %{
  zotero_url: System.get_env("ZOTERO_URL", "http://localhost:23119"),
  bibtex_file: System.get_env("BIBTEX_FILE"),
  citation_mode:
    case System.get_env("CITATION_MODE", "warn") do
      "silent" -> :silent
      "strict" -> :strict
      _ -> :warn
    end
}

transforms = [Pipeline.Transforms.Citations]

case Pipeline.run(content_dir, transforms, pipeline_opts) do
  {:ok, stats} ->
    Enum.each(stats, fn {mod, count} ->
      IO.puts("  #{inspect(mod)}: #{count} file(s) modified")
    end)

  {:error, reason} ->
    IO.puts(:stderr, "Pipeline error: #{inspect(reason)}")
    System.halt(1)
end

# ---------------------------------------------------------------------------
# Phase 4: Generate default index.md if none was exported
# ---------------------------------------------------------------------------
# NOTE(review): md_count is snapshotted BEFORE the index is generated, so the
# final summary may undercount by one when index.md is created below — confirm
# whether that is intended.
md_count =
  Path.join(content_dir, "**/*.md")
  |> Path.wildcard()
  |> length()

index_path = Path.join(content_dir, "index.md")

unless File.exists?(index_path) do
  IO.puts("==> Generating default index.md")

  pages =
    Path.join(content_dir, "**/*.md")
    |> Path.wildcard()
    |> Enum.map(fn path ->
      slug = Path.relative_to(path, content_dir) |> Path.rootname()

      # Extract the title from TOML front matter (title = "...");
      # falls back to the slug when no match is found.
      title =
        path
        |> File.read!()
        |> then(fn content ->
          case Regex.run(~r/^title\s*=\s*"(.+)"/m, content) do
            [_, t] -> t
            _ -> slug
          end
        end)

      {slug, title}
    end)
    |> Enum.sort_by(fn {_, title} -> title end)
    |> Enum.map(fn {slug, title} -> "- [#{title}](#{slug})" end)
    |> Enum.join("\n")

  File.write!(index_path, """
  ---
  title: Index
  ---

  #{pages}
  """)
end

IO.puts("==> Done. #{md_count} markdown files in #{content_dir}")
|
||||
83
scripts/pipeline/lib/pipeline.ex
Normal file
83
scripts/pipeline/lib/pipeline.ex
Normal file
@@ -0,0 +1,83 @@
|
||||
defmodule Pipeline do
  @moduledoc """
  Post-export markdown transformation pipeline.

  Applies a list of transform modules sequentially over every .md file
  in a content directory. Each transform module must implement:

      apply(content :: String.t(), opts :: map()) :: String.t()

  Transforms are applied in the order given. A file is rewritten only
  when at least one transform mutates its content (checked via equality).

  ## Usage

      opts = %{
        zotero_url: "http://localhost:23119",
        bibtex_file: System.get_env("BIBTEX_FILE"),
        citation_mode: :warn  # :silent | :warn | :strict
      }

      Pipeline.run(content_dir, [Pipeline.Transforms.Citations], opts)
  """

  require Logger

  @type transform :: module()
  @type opts :: map()

  @doc """
  Run all transforms over every .md file under `content_dir`.
  Returns `{:ok, stats}` where stats maps each transform to a count of files it changed.
  """
  @spec run(String.t(), [transform()], opts()) :: {:ok, map()}
  def run(content_dir, transforms, opts \\ %{}) do
    md_files =
      content_dir
      |> Path.join("**/*.md")
      |> Path.wildcard()

    if md_files == [] do
      Logger.warning("Pipeline: no .md files found in #{content_dir}")
      {:ok, %{}}
    else
      Logger.info("Pipeline: processing #{length(md_files)} markdown files with #{length(transforms)} transform(s)")

      # Initialise transforms (allows them to perform setup such as loading a .bib file).
      # Each transform module must implement the Pipeline.Transform behaviour.
      initialized =
        Enum.map(transforms, fn mod ->
          state = mod.init(opts)
          {mod, state}
        end)

      # Fold over all files, threading each file's content through every
      # transform and accumulating per-transform changed-file counts.
      stats =
        Enum.reduce(md_files, %{}, fn path, acc ->
          original = File.read!(path)

          {transformed, file_stats} =
            Enum.reduce(initialized, {original, %{}}, fn {mod, state}, {content, fstats} ->
              result = mod.apply(content, state, opts)
              # Count 1 for this transform when it changed this file, else 0.
              changed = result != content
              {result, Map.update(fstats, mod, (if changed, do: 1, else: 0), &(&1 + (if changed, do: 1, else: 0)))}
            end)

          # Rewrite the file only when some transform actually changed it.
          if transformed != original do
            File.write!(path, transformed)
            Logger.debug("Pipeline: updated #{Path.relative_to_cwd(path)}")
          end

          Map.merge(acc, file_stats, fn _k, a, b -> a + b end)
        end)

      Enum.each(initialized, fn {mod, state} ->
        # teardown/1 is optional in the behaviour
        if function_exported?(mod, :teardown, 1) do
          mod.teardown(state)
        end
      end)

      {:ok, stats}
    end
  end
end
|
||||
14
scripts/pipeline/lib/pipeline/application.ex
Normal file
14
scripts/pipeline/lib/pipeline/application.ex
Normal file
@@ -0,0 +1,14 @@
|
||||
defmodule Pipeline.Application do
  @moduledoc false
  use Application

  @impl true
  def start(_type, _args) do
    # Single supervised child: a Finch HTTP client pool used by resolvers
    # (e.g. Zotero lookups) under the name Pipeline.Finch.
    children = [
      {Finch, name: Pipeline.Finch}
    ]

    opts = [strategy: :one_for_one, name: Pipeline.Supervisor]
    Supervisor.start_link(children, opts)
  end
end
|
||||
178
scripts/pipeline/lib/pipeline/resolvers/bibtex.ex
Normal file
178
scripts/pipeline/lib/pipeline/resolvers/bibtex.ex
Normal file
@@ -0,0 +1,178 @@
|
||||
defmodule Pipeline.Resolvers.BibTeX do
  @moduledoc """
  Resolves citation keys from a local BibTeX (.bib) file.

  Configured via the `BIBTEX_FILE` environment variable, or passed directly
  as `opts.bibtex_file`. The file is parsed once at init time and the
  resulting entry map is reused for all lookups.

  Supports extracting: author last names, year, title, DOI, URL.

  BibTeX entry format parsed:

      @type{citationkey,
        author = {Last, First and Last2, First2},
        year = {2021},
        title = {Some Title},
        doi = {10.xxxx/yyyy},
        url = {https://example.com},
      }

  Returns `{:ok, %{label: "Author, Year", url: "..."}}` or `:error`.
  """

  require Logger

  # Matches the entry header: @type{key,
  # Nested braces in field values are handled by @field_regex below,
  # not by this header pattern.
  @entry_header ~r/@\w+\s*\{\s*([^,\s]+)\s*,/

  # Matches key = {value} or key = "value" pairs inside an entry block.
  # Allows a single level of nested braces inside a braced value.
  @field_regex ~r/(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")/

  # ------------------------------------------------------------------
  # Public API
  # ------------------------------------------------------------------

  @doc """
  Parse a .bib file and return a map of `%{citation_key => entry_map}`.
  Returns `{:ok, entries}` or `{:error, reason}`.
  """
  @spec load(String.t()) :: {:ok, map()} | {:error, term()}
  def load(path) do
    # File.read/1 errors ({:error, reason}) fall through unchanged.
    with {:ok, raw} <- File.read(path) do
      entries = parse_entries(raw)
      Logger.info("BibTeX: loaded #{map_size(entries)} entries from #{path}")
      {:ok, entries}
    end
  end

  @doc """
  Resolve a citation key from pre-loaded BibTeX entries.
  Returns `{:ok, %{label: ..., url: ...}}` or `:error` for unknown keys.
  """
  @spec resolve(String.t(), map()) :: {:ok, map()} | :error
  def resolve(key, entries) do
    # Map.fetch/2 yields :error for a missing key, which is our result too.
    with {:ok, entry} <- Map.fetch(entries, key) do
      {:ok, %{label: build_label(entry), url: build_url(entry)}}
    end
  end

  # ------------------------------------------------------------------
  # Parsing
  # ------------------------------------------------------------------

  # Split the file at each "@type{" boundary and parse every chunk whose
  # header matches; chunks without a parsable header are dropped.
  defp parse_entries(raw) do
    for chunk <- String.split(raw, ~r/(?=@\w+\s*\{)/, trim: true),
        [_, key] <- [Regex.run(@entry_header, chunk)],
        into: %{} do
      {String.trim(key), parse_fields(chunk)}
    end
  end

  # Extract all field pairs from one entry block into a map with
  # lower-cased field names and trimmed values.
  defp parse_fields(chunk) do
    @field_regex
    |> Regex.scan(chunk)
    |> Map.new(fn match ->
      name = match |> Enum.at(1) |> String.downcase()

      # The value sits in capture group 2 (braced) or group 3 (quoted).
      value =
        case Enum.at(match, 2, "") do
          "" -> Enum.at(match, 3, "")
          braced -> braced
        end

      {name, String.trim(value)}
    end)
  end

  # ------------------------------------------------------------------
  # Label & URL building
  # ------------------------------------------------------------------

  # "Author" or "Author & Other" plus ", YYYY" when a year is known.
  defp build_label(entry) do
    authors =
      entry
      |> Map.get("author", "")
      |> parse_authors()
      |> format_authors()

    year =
      entry
      |> Map.get("year", Map.get(entry, "date", ""))
      |> extract_year()

    if year && authors != "", do: "#{authors}, #{year}", else: authors
  end

  defp parse_authors(""), do: []

  defp parse_authors(field) do
    field
    |> String.split(" and ", trim: true)
    |> Enum.map(&extract_last_name/1)
    |> Enum.reject(&(&1 == ""))
  end

  # Handles both "Last, First" and "First Last" name orders.
  defp extract_last_name(raw) do
    name = String.trim(raw)

    cond do
      String.contains?(name, ",") ->
        name |> String.split(",") |> hd() |> String.trim()

      String.contains?(name, " ") ->
        name |> String.split(" ") |> List.last() |> String.trim()

      true ->
        name
    end
  end

  defp format_authors([]), do: "Unknown"
  defp format_authors([only]), do: only
  defp format_authors([lead | more]), do: "#{lead} & #{List.last(more)}"

  defp extract_year(""), do: nil

  # First standalone 4-digit run wins; nil when none present.
  defp extract_year(text) do
    case Regex.run(~r/\b(\d{4})\b/, text) do
      [_, year] -> year
      _ -> nil
    end
  end

  # DOI beats a plain URL; nil when neither field is non-empty.
  defp build_url(entry) do
    case {non_empty(Map.get(entry, "doi", "")), non_empty(Map.get(entry, "url", ""))} do
      {doi, _} when is_binary(doi) -> "https://doi.org/#{doi}"
      {_, url} when is_binary(url) -> url
      _ -> nil
    end
  end

  defp non_empty(""), do: nil
  defp non_empty(value), do: value
end
|
||||
18
scripts/pipeline/lib/pipeline/resolvers/doi.ex
Normal file
18
scripts/pipeline/lib/pipeline/resolvers/doi.ex
Normal file
@@ -0,0 +1,18 @@
|
||||
defmodule Pipeline.Resolvers.DOI do
  @moduledoc """
  Terminal citation resolver — always succeeds.

  Keys that look like DOIs (prefix "10.") resolve to a `https://doi.org/...`
  link; any other key becomes a plain label with no URL.
  """

  @spec resolve(String.t()) :: {:ok, map()}
  def resolve(key) do
    # Binary prefix match replaces String.starts_with?/2; same semantics.
    url =
      case key do
        "10." <> _rest -> "https://doi.org/#{key}"
        _ -> nil
      end

    {:ok, %{label: key, url: url}}
  end
end
|
||||
182
scripts/pipeline/lib/pipeline/resolvers/zotero.ex
Normal file
182
scripts/pipeline/lib/pipeline/resolvers/zotero.ex
Normal file
@@ -0,0 +1,182 @@
|
||||
defmodule Pipeline.Resolvers.Zotero do
  @moduledoc """
  Resolves citation keys via Zotero Better BibTeX's JSON-RPC API.

  Requires Zotero to be running with the Better BibTeX plugin installed.
  Default endpoint: http://localhost:23119/better-bibtex/json-rpc

  Resolution strategy:

  1. Search by citation key via `item.search`
  2. If found, try to get a PDF attachment link (zotero://open-pdf/...)
  3. Fall back to zotero://select/items/@key

  Returns `{:ok, %{label: "Author, Year", url: "zotero://..."}}` or `:error`.
  """

  require Logger

  @rpc_path "/better-bibtex/json-rpc"

  @doc """
  Attempt to resolve `key` against a running Zotero instance.
  `base_url` defaults to `http://localhost:23119`.

  Never raises: connection failures, non-200 responses, and malformed
  bodies are logged at debug level and collapse to `:error`.
  """
  @spec resolve(String.t(), String.t()) :: {:ok, map()} | :error
  def resolve(key, base_url \\ "http://localhost:23119") do
    url = base_url <> @rpc_path

    payload =
      Jason.encode!(%{
        jsonrpc: "2.0",
        method: "item.search",
        params: [
          [["citationKey", "is", key]]
        ],
        id: 1
      })

    case Req.post(url,
           body: payload,
           headers: [{"content-type", "application/json"}],
           receive_timeout: 5_000,
           finch: Pipeline.Finch
         ) do
      {:ok, %{status: 200, body: body}} ->
        parse_response(body, key, base_url)

      {:ok, %{status: status}} ->
        Logger.debug("Zotero: unexpected HTTP #{status} for key #{key}")
        :error

      {:error, reason} ->
        Logger.debug("Zotero: connection failed for key #{key}: #{inspect(reason)}")
        :error

      other ->
        Logger.debug("Zotero: unexpected result for key #{key}: #{inspect(other)}")
        :error
    end
  rescue
    e ->
      Logger.debug("Zotero: exception resolving key #{key}: #{inspect(e)}")
      :error
  end

  # ------------------------------------------------------------------
  # Private helpers
  # ------------------------------------------------------------------

  # Successful search: the first matching item wins.
  defp parse_response(%{"result" => [item | _]}, key, base_url) do
    label = build_label(item)
    url = resolve_url(item, key, base_url)
    {:ok, %{label: label, url: url}}
  end

  defp parse_response(%{"result" => []}, key, _base_url) do
    Logger.debug("Zotero: no item found for key #{key}")
    :error
  end

  defp parse_response(%{"error" => err}, key, _base_url) do
    Logger.debug("Zotero: RPC error for key #{key}: #{inspect(err)}")
    :error
  end

  defp parse_response(body, key, _base_url) do
    Logger.debug("Zotero: unexpected response shape for key #{key}: #{inspect(body)}")
    :error
  end

  # Ask Better BibTeX for the item's attachments; return the "open"
  # link (zotero://open-pdf/...) of the first PDF attachment, or nil
  # when no usable PDF link exists or the call fails.
  defp fetch_pdf_url(key, base_url) do
    payload =
      Jason.encode!(%{
        jsonrpc: "2.0",
        method: "item.attachments",
        params: [key],
        id: 2
      })

    case Req.post(base_url <> @rpc_path,
           body: payload,
           headers: [{"content-type", "application/json"}],
           receive_timeout: 5_000,
           finch: Pipeline.Finch
         ) do
      {:ok, %{status: 200, body: %{"result" => attachments}}} when is_list(attachments) ->
        attachments
        |> Enum.find_value(fn att ->
          open = Map.get(att, "open", "")
          path = Map.get(att, "path", "")

          # BUG FIX: "" is truthy in Elixir, so previously a PDF
          # attachment with an empty "open" value was returned from
          # find_value and became an empty-string URL downstream.
          # Only accept a non-empty link for a .pdf attachment.
          if String.ends_with?(path, ".pdf") and open != "", do: open, else: nil
        end)

      _ ->
        nil
    end
  rescue
    _ -> nil
  end

  # CSL-JSON format: authors are under "author" with "family"/"given" keys.
  # Year is under "issued" -> "date-parts" -> [[year, month, day]].
  defp build_label(item) do
    authors = Map.get(item, "author", [])
    year = extract_year(item)

    author_part =
      case authors do
        [] ->
          "Unknown"

        [single] ->
          Map.get(single, "family", Map.get(single, "literal", "Unknown"))

        [first | rest] ->
          first_name = Map.get(first, "family", Map.get(first, "literal", "Unknown"))

          last_name =
            rest
            |> List.last()
            |> then(&Map.get(&1, "family", Map.get(&1, "literal", "Unknown")))

          "#{first_name} & #{last_name}"
      end

    if year, do: "#{author_part}, #{year}", else: author_part
  end

  # "issued": {"date-parts": [["2021", 2, 3]]}
  defp extract_year(item) do
    case get_in(item, ["issued", "date-parts"]) do
      [[year | _] | _] -> to_string(year)
      _ -> nil
    end
  end

  # Prefer zotero://open-pdf/... for items with a PDF attachment.
  # Fall back to zotero://select/library/items/KEY to open the item in Zotero.
  # The "id" field is a URI like "http://zotero.org/users/123/items/ABCD1234".
  defp resolve_url(item, key, base_url) do
    pdf_url = fetch_pdf_url(key, base_url)

    if pdf_url do
      pdf_url
    else
      item_key =
        item
        |> Map.get("id", "")
        |> String.split("/")
        |> List.last()
        |> non_empty()

      if item_key do
        "zotero://select/library/items/#{item_key}"
      else
        "zotero://select/items/@#{key}"
      end
    end
  end

  defp non_empty(nil), do: nil
  defp non_empty(""), do: nil
  defp non_empty(v), do: v
end
|
||||
48
scripts/pipeline/lib/pipeline/transform.ex
Normal file
48
scripts/pipeline/lib/pipeline/transform.ex
Normal file
@@ -0,0 +1,48 @@
|
||||
defmodule Pipeline.Transform do
  @moduledoc """
  Behaviour that all markdown transform modules must implement.

  ## Callbacks

  - `init/1` — called once before processing; returns transform-specific state.
    Default implementation returns the opts map unchanged.
  - `apply/3` — called per .md file; returns the (possibly modified) content.
  - `teardown/1` — optional cleanup after all files are processed.

  ## Example

      defmodule MyTransform do
        @behaviour Pipeline.Transform

        @impl true
        def init(opts), do: %{some_state: opts[:value]}

        @impl true
        def apply(content, state, _opts) do
          String.replace(content, "foo", state.some_state)
        end
      end
  """

  @doc "One-time initialisation. Returns opaque state passed to apply/3."
  @callback init(opts :: map()) :: term()

  @doc "Transform file content. Returns the (possibly modified) content string."
  @callback apply(content :: String.t(), state :: term(), opts :: map()) :: String.t()

  @doc "Optional cleanup after all files are processed."
  @callback teardown(state :: term()) :: :ok

  # teardown/1 may be omitted; the pipeline runner only calls it after
  # checking function_exported?(mod, :teardown, 1).
  @optional_callbacks teardown: 1

  # `use Pipeline.Transform` injects the behaviour declaration plus a
  # default pass-through init/1, so simple transforms need only define
  # apply/3. The default is overridable for transforms that need setup.
  defmacro __using__(_) do
    quote do
      @behaviour Pipeline.Transform

      @impl Pipeline.Transform
      def init(opts), do: opts

      defoverridable init: 1
    end
  end
end
|
||||
231
scripts/pipeline/lib/pipeline/transforms/citations.ex
Normal file
231
scripts/pipeline/lib/pipeline/transforms/citations.ex
Normal file
@@ -0,0 +1,231 @@
|
||||
defmodule Pipeline.Transforms.Citations do
  @moduledoc """
  Markdown transform: resolves org-citar citation keys to hyperlinks.

  ## Recognised citation syntax (as output by ox-hugo from org-citar)

      [cite:@key]         → org-cite / citar standard (most common)
      [cite:@key1;@key2]  → multiple citations
      cite:key            → older roam-style bare cite syntax

  ## Resolution chain (in order)

  1. Zotero (live instance via Better BibTeX JSON-RPC) — preferred
  2. BibTeX file (BIBTEX_FILE env var) — fallback
  3. DOI / bare key — always succeeds

  ## Modes (opts.citation_mode)

  - `:silent` — silently use DOI/bare-key fallback when Zotero+BibTeX fail
  - `:warn`   — (default) emit a Logger.warning for unresolved keys
  - `:strict` — raise on unresolved keys (aborts pipeline)

  ## Format

  Resolved citations are rendered as:

      [Label](url)  when a URL is available
      [Label]       when no URL could be determined (bare key fallback)

  Multiple semicolon-separated keys become space-separated links:

      [cite:@a;@b] → [Author A, 2020](url_a) [Author B, 2019](url_b)

  ## init/1 callback

  Loads the BibTeX file (if configured) once before processing begins,
  and probes Zotero availability, emitting warnings as appropriate.
  """

  @behaviour Pipeline.Transform

  require Logger

  alias Pipeline.Resolvers.Zotero
  alias Pipeline.Resolvers.BibTeX
  alias Pipeline.Resolvers.DOI

  # Match [cite:@key] and [cite:@key1;@key2;...] (org-cite / citar style)
  @cite_bracket_regex ~r/\[cite:(@[^\]]+)\]/

  # Match bare cite:key (older roam style, no brackets, no @ prefix).
  # The lookbehind skips occurrences already preceded by "(" or "[".
  @cite_bare_regex ~r/(?<![(\[])cite:([a-zA-Z0-9_:-]+)/

  # ------------------------------------------------------------------
  # Pipeline callbacks
  # ------------------------------------------------------------------

  @doc """
  Called once before processing any files. Loads BibTeX, probes Zotero.
  Returns a state map passed to every `apply/3` call.
  """
  @impl Pipeline.Transform
  def init(opts) do
    bibtex_entries = load_bibtex(opts)
    zotero_available = probe_zotero(opts)

    if not zotero_available and bibtex_entries == %{} do
      Logger.warning(
        "Citations: neither Zotero nor a BibTeX file is available. " <>
          "All citations will fall back to bare-key rendering. " <>
          "Set BIBTEX_FILE env var or start Zotero with Better BibTeX to resolve citations."
      )
    end

    %{
      bibtex_entries: bibtex_entries,
      zotero_available: zotero_available,
      zotero_url: Map.get(opts, :zotero_url, "http://localhost:23119"),
      citation_mode: Map.get(opts, :citation_mode, :warn)
    }
  end

  @doc """
  Apply citation resolution to a single markdown file's content.
  Bracket citations are replaced first so the bare-cite pass cannot
  re-match text inside them.
  """
  @impl Pipeline.Transform
  def apply(content, state, _opts) do
    content
    |> resolve_bracket_citations(state)
    |> resolve_bare_citations(state)
  end

  # ------------------------------------------------------------------
  # Resolution passes
  # ------------------------------------------------------------------

  defp resolve_bracket_citations(content, state) do
    Regex.replace(@cite_bracket_regex, content, fn _full, keys_str ->
      keys_str
      |> String.split(";")
      |> Enum.map(&String.trim/1)
      # BUG FIX: the previous `fn "@" <> key -> key end` raised
      # FunctionClauseError for any group entry that did not start
      # with "@" (e.g. "[cite:@a;b]" or a trailing ";"), aborting the
      # whole pipeline. Strip the prefix tolerantly and drop empties.
      |> Enum.map(&String.trim_leading(&1, "@"))
      |> Enum.reject(&(&1 == ""))
      |> Enum.map(&resolve_key(&1, state))
      |> Enum.join(" ")
    end)
  end

  defp resolve_bare_citations(content, state) do
    Regex.replace(@cite_bare_regex, content, fn _full, key ->
      resolve_key(key, state)
    end)
  end

  # ------------------------------------------------------------------
  # Single-key resolution chain
  # ------------------------------------------------------------------

  # Zotero → BibTeX → mode-dependent fallback, then markdown rendering.
  defp resolve_key(key, state) do
    info =
      with :error <- try_zotero(key, state),
           :error <- try_bibtex(key, state) do
        handle_unresolved(key, state)
      else
        {:ok, citation_info} -> citation_info
      end

    format_result(info)
  end

  defp try_zotero(_key, %{zotero_available: false}), do: :error

  defp try_zotero(key, %{zotero_url: url}) do
    Zotero.resolve(key, url)
  end

  defp try_bibtex(_key, %{bibtex_entries: entries}) when map_size(entries) == 0, do: :error

  defp try_bibtex(key, %{bibtex_entries: entries}) do
    BibTeX.resolve(key, entries)
  end

  # Unresolved keys: raise, warn+fallback, or silently fall back,
  # depending on the configured citation_mode.
  defp handle_unresolved(key, %{citation_mode: mode}) do
    case mode do
      :strict ->
        raise "Citations: could not resolve citation key '#{key}' and mode is :strict"

      :warn ->
        Logger.warning("Citations: unresolved citation key '#{key}' — using bare-key fallback")
        {:ok, result} = DOI.resolve(key)
        result

      :silent ->
        {:ok, result} = DOI.resolve(key)
        result
    end
  end

  defp format_result(%{label: label, url: nil}), do: "[#{label}]"
  defp format_result(%{label: label, url: url}), do: "[#{label}](#{url})"

  # ------------------------------------------------------------------
  # Init helpers
  # ------------------------------------------------------------------

  # Returns the parsed entry map, or %{} when no usable .bib file is
  # configured (disabling the BibTeX resolver via try_bibtex's guard).
  defp load_bibtex(opts) do
    path = Map.get(opts, :bibtex_file) || System.get_env("BIBTEX_FILE")

    cond do
      is_nil(path) ->
        Logger.debug("Citations: BIBTEX_FILE not set — BibTeX resolver disabled")
        %{}

      not File.exists?(path) ->
        Logger.warning("Citations: BIBTEX_FILE=#{path} does not exist — BibTeX resolver disabled")
        %{}

      true ->
        case BibTeX.load(path) do
          {:ok, entries} -> entries
          {:error, reason} ->
            Logger.warning("Citations: failed to load BibTeX file #{path}: #{inspect(reason)}")
            %{}
        end
    end
  end

  # Probe Zotero availability once at init; returns a boolean that
  # gates every later try_zotero/2 call.
  defp probe_zotero(opts) do
    url = Map.get(opts, :zotero_url, "http://localhost:23119")

    # Use a no-op JSON-RPC call to probe availability.
    # /better-bibtex/cayw is intentionally avoided — it blocks waiting for
    # user interaction and never returns without a pick.
    payload =
      Jason.encode!(%{
        jsonrpc: "2.0",
        method: "item.search",
        params: [[[]]],
        id: 0
      })

    result =
      try do
        Req.post(url <> "/better-bibtex/json-rpc",
          body: payload,
          headers: [{"content-type", "application/json"}],
          receive_timeout: 3_000,
          finch: Pipeline.Finch
        )
      rescue
        e -> {:error, e}
      end

    case result do
      {:ok, %{status: 200}} ->
        Logger.info("Citations: Zotero Better BibTeX is available at #{url}")
        true

      {:ok, %{status: status}} ->
        Logger.warning(
          "Citations: Zotero responded HTTP #{status} at #{url} — " <>
            "is Better BibTeX installed?"
        )
        false

      _ ->
        Logger.warning(
          "Citations: Zotero not reachable at #{url} — " <>
            "start Zotero with Better BibTeX or set BIBTEX_FILE as fallback"
        )
        false
    end
  end
end
|
||||
27
scripts/pipeline/mix.exs
Normal file
27
scripts/pipeline/mix.exs
Normal file
@@ -0,0 +1,27 @@
|
||||
defmodule Pipeline.MixProject do
  use Mix.Project

  # Build configuration for the markdown post-processing pipeline.
  def project do
    [
      app: :pipeline,
      version: "0.1.0",
      elixir: "~> 1.15",
      start_permanent: Mix.env() == :prod,
      deps: deps()
    ]
  end

  # Pipeline.Application supervises the Finch pool used by the resolvers.
  def application do
    [
      extra_applications: [:logger, :inets, :ssl],
      mod: {Pipeline.Application, []}
    ]
  end

  defp deps do
    [
      {:req, "~> 0.5"},
      {:jason, "~> 1.4"},
      # Finch is referenced directly (the Pipeline.Finch child spec and
      # the `finch:` option passed to Req) — declare it explicitly rather
      # than relying on it arriving only transitively through :req.
      {:finch, "~> 0.18"}
    ]
  end
end
|
||||
11
scripts/pipeline/mix.lock
Normal file
11
scripts/pipeline/mix.lock
Normal file
@@ -0,0 +1,11 @@
|
||||
%{
|
||||
"finch": {:hex, :finch, "0.21.0", "b1c3b2d48af02d0c66d2a9ebfb5622be5c5ecd62937cf79a88a7f98d48a8290c", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "87dc6e169794cb2570f75841a19da99cfde834249568f2a5b121b809588a4377"},
|
||||
"hpax": {:hex, :hpax, "1.0.3", "ed67ef51ad4df91e75cc6a1494f851850c0bd98ebc0be6e81b026e765ee535aa", [:mix], [], "hexpm", "8eab6e1cfa8d5918c2ce4ba43588e894af35dbd8e91e6e55c817bca5847df34a"},
|
||||
"jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
|
||||
"mime": {:hex, :mime, "2.0.7", "b8d739037be7cd402aee1ba0306edfdef982687ee7e9859bee6198c1e7e2f128", [:mix], [], "hexpm", "6171188e399ee16023ffc5b76ce445eb6d9672e2e241d2df6050f3c771e80ccd"},
|
||||
"mint": {:hex, :mint, "1.7.1", "113fdb2b2f3b59e47c7955971854641c61f378549d73e829e1768de90fc1abf1", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "fceba0a4d0f24301ddee3024ae116df1c3f4bb7a563a731f45fdfeb9d39a231b"},
|
||||
"nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"},
|
||||
"nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"},
|
||||
"req": {:hex, :req, "0.5.17", "0096ddd5b0ed6f576a03dde4b158a0c727215b15d2795e59e0916c6971066ede", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0b8bc6ffdfebbc07968e59d3ff96d52f2202d0536f10fef4dc11dc02a2a43e39"},
|
||||
"telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"},
|
||||
}
|
||||
Reference in New Issue
Block a user