feat(pipeline): refactor into its own project

This commit is contained in:
Ignacio Ballesteros
2026-02-20 17:54:12 +01:00
parent 0ea5808cd2
commit dc348185a7
17 changed files with 490 additions and 358 deletions

61
pipeline/flake.lock generated Normal file
View File

@@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1731533236,
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1771369470,
"narHash": "sha256-0NBlEBKkN3lufyvFegY4TYv5mCNHbi5OmBDrzihbBMQ=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "0182a361324364ae3f436a63005877674cf45efb",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

69
pipeline/flake.nix Normal file
View File

@@ -0,0 +1,69 @@
{
description = "Org-roam export pipeline Elixir escript";
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
flake-utils.url = "github:numtide/flake-utils";
};
outputs = { self, nixpkgs, flake-utils }:
flake-utils.lib.eachDefaultSystem (system:
let
pkgs = import nixpkgs { inherit system; };
fs = pkgs.lib.fileset;
# Emacs with ox-hugo — needed at runtime by the pipeline escript
# (export_org_files calls `emacs --batch` with ox-hugo).
emacsWithOxHugo = (pkgs.emacsPackagesFor pkgs.emacs-nox).emacsWithPackages
(epkgs: [ epkgs.ox-hugo ]);
# Pre-fetched Hex/Mix dependencies.
# src is filtered to mix.exs + mix.lock so source edits don't
# invalidate this derivation.
mixDeps = pkgs.beamPackages.fetchMixDeps {
pname = "pipeline-mix-deps";
version = "0.1.0";
src = fs.toSource {
root = ./.;
fileset = fs.unions [
./mix.exs
./mix.lock
];
};
sha256 = "sha256-E79X+nUy86G1Jrwv3T7dXekoGv8Hd14ZgJSKWjvlmAw=";
};
# Compiled pipeline escript (without runtime wrappers).
pipelineEscript = pkgs.beamPackages.mixRelease {
pname = "pipeline";
version = "0.1.0";
src = ./.;
escriptBinName = "pipeline";
mixFodDeps = mixDeps;
stripDebug = true;
};
# Wrapped pipeline that puts emacs (with ox-hugo) on PATH so
# the escript's System.cmd("emacs", ...) calls succeed.
pipelineApp = pkgs.writeShellApplication {
name = "pipeline";
runtimeInputs = [ emacsWithOxHugo ];
text = ''
exec ${pipelineEscript}/bin/pipeline "$@"
'';
};
in
{
packages.default = pipelineApp;
packages.escript = pipelineEscript;
devShells.default = pkgs.mkShell {
buildInputs = [
pkgs.elixir
emacsWithOxHugo
];
};
});
}

83
pipeline/lib/pipeline.ex Normal file
View File

@@ -0,0 +1,83 @@
defmodule Pipeline do
@moduledoc """
Post-export markdown transformation pipeline.
Applies a list of transform modules sequentially over every .md file
in a content directory. Each transform module must implement:
apply(content :: String.t(), opts :: map()) :: String.t()
Transforms are applied in the order given. A file is rewritten only
when at least one transform mutates its content (checked via equality).
## Usage
opts = %{
zotero_url: "http://localhost:23119",
bibtex_file: System.get_env("BIBTEX_FILE"),
citation_mode: :warn # :silent | :warn | :strict
}
Pipeline.run(content_dir, [Pipeline.Transforms.Citations], opts)
"""
require Logger
@type transform :: module()
@type opts :: map()
@doc """
Run all transforms over every .md file under `content_dir`.
Returns `{:ok, stats}` where stats maps each transform to a count of files it changed.
"""
@spec run(String.t(), [transform()], opts()) :: {:ok, map()}
def run(content_dir, transforms, opts \\ %{}) do
md_files =
content_dir
|> Path.join("**/*.md")
|> Path.wildcard()
if md_files == [] do
Logger.warning("Pipeline: no .md files found in #{content_dir}")
{:ok, %{}}
else
Logger.info("Pipeline: processing #{length(md_files)} markdown files with #{length(transforms)} transform(s)")
# Initialise transforms (allows them to perform setup such as loading a .bib file).
# Each transform module must implement the Pipeline.Transform behaviour.
initialized =
Enum.map(transforms, fn mod ->
state = mod.init(opts)
{mod, state}
end)
stats =
Enum.reduce(md_files, %{}, fn path, acc ->
original = File.read!(path)
{transformed, file_stats} =
Enum.reduce(initialized, {original, %{}}, fn {mod, state}, {content, fstats} ->
result = mod.apply(content, state, opts)
changed = result != content
{result, Map.update(fstats, mod, (if changed, do: 1, else: 0), &(&1 + (if changed, do: 1, else: 0)))}
end)
if transformed != original do
File.write!(path, transformed)
Logger.debug("Pipeline: updated #{Path.relative_to_cwd(path)}")
end
Map.merge(acc, file_stats, fn _k, a, b -> a + b end)
end)
Enum.each(initialized, fn {mod, state} ->
# teardown/1 is optional in the behaviour
if function_exported?(mod, :teardown, 1) do
mod.teardown(state)
end
end)
{:ok, stats}
end
end
end

View File

@@ -0,0 +1,14 @@
defmodule Pipeline.Application do
@moduledoc false
use Application
@impl true
def start(_type, _args) do
children = [
{Finch, name: Pipeline.Finch}
]
opts = [strategy: :one_for_one, name: Pipeline.Supervisor]
Supervisor.start_link(children, opts)
end
end

View File

@@ -0,0 +1,261 @@
defmodule Pipeline.CLI do
@moduledoc """
Escript entry point for the org-roam export pipeline.
Runs four phases in sequence:
1. Wipe `content/` (preserving `.gitkeep`)
2. Export each `.org` file via `emacs --batch` + ox-hugo → `content/**/*.md`
3. Run Elixir transform modules over every `.md` file
4. Generate a fallback `content/index.md` if none was exported
## Usage
pipeline <notes-dir> [--output <path>]
Arguments:
notes-dir Path to the directory containing `.org` notes (required).
Also accepts the `NOTES_DIR` env var.
Options:
--output <path> Output root directory (used as ox-hugo base dir).
Defaults to the `OUTPUT_DIR` env var, or the current
working directory.
--content-dir <p> Output directory for exported Markdown. Defaults to
`<output>/content`.
Optional env vars:
BIBTEX_FILE Path to a `.bib` file used as citation fallback.
ZOTERO_URL Zotero Better BibTeX base URL (default: http://localhost:23119).
CITATION_MODE silent | warn (default) | strict.
"""
require Logger
def main(argv) do
Application.ensure_all_started(:pipeline)
{notes_dir, output_dir, content_dir} = parse_args(argv)
wipe(content_dir)
export_org_files(notes_dir, output_dir)
run_pipeline(content_dir)
generate_index(content_dir)
md_count =
content_dir
|> Path.join("**/*.md")
|> Path.wildcard()
|> length()
IO.puts("==> Done. #{md_count} markdown files in #{content_dir}")
end
# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
defp parse_args(argv) do
{opts, positional, _invalid} =
OptionParser.parse(argv,
strict: [output: :string, content_dir: :string]
)
notes_dir =
case positional do
[dir | _] ->
dir
[] ->
System.get_env("NOTES_DIR") ||
abort("Usage: pipeline <notes-dir> [--output <path>]")
end
notes_dir = Path.expand(notes_dir)
unless File.dir?(notes_dir) do
abort("Error: notes directory does not exist: #{notes_dir}")
end
output_dir =
(opts[:output] || System.get_env("OUTPUT_DIR") || File.cwd!())
|> Path.expand()
content_dir =
(opts[:content_dir] || Path.join(output_dir, "content"))
|> Path.expand()
{notes_dir, output_dir, content_dir}
end
# ---------------------------------------------------------------------------
# Phase 1: Wipe content/
# ---------------------------------------------------------------------------
defp wipe(content_dir) do
IO.puts("==> Wiping #{content_dir}")
File.mkdir_p!(content_dir)
content_dir
|> File.ls!()
|> Enum.reject(&(&1 == ".gitkeep"))
|> Enum.each(fn entry ->
Path.join(content_dir, entry) |> File.rm_rf!()
end)
end
# ---------------------------------------------------------------------------
# Phase 2: Export org files via Emacs + ox-hugo
# ---------------------------------------------------------------------------
defp export_org_files(notes_dir, output_dir) do
IO.puts("==> Exporting org files from #{notes_dir}")
# ox-hugo requires static/ to exist for image asset copying
File.mkdir_p!(Path.join(output_dir, "static"))
org_files =
Path.join(notes_dir, "**/*.org")
|> Path.wildcard()
if org_files == [] do
IO.puts("No .org files found in #{notes_dir}")
System.halt(0)
end
results =
Enum.map(org_files, fn orgfile ->
IO.puts(" exporting: #{orgfile}")
section =
orgfile
|> Path.dirname()
|> Path.relative_to(notes_dir)
{output, exit_code} =
System.cmd(
"emacs",
[
"--batch",
"--eval", "(require 'ox-hugo)",
"--eval", """
(org-cite-register-processor 'passthrough
:export-citation
(lambda (citation _style _backend _info)
(let ((keys (mapcar (lambda (ref)
(concat "@" (org-element-property :key ref)))
(org-cite-get-references citation))))
(format "[cite:%s]" (string-join keys ";")))))
""",
"--eval", "(setq org-cite-export-processors '((t passthrough)))",
"--eval", ~s[(setq org-hugo-base-dir "#{output_dir}")],
"--eval", ~s[(setq org-hugo-default-section-directory "#{section}")],
"--visit", orgfile,
"--funcall", "org-hugo-export-to-md"
],
stderr_to_stdout: true
)
filtered =
output
|> String.split("\n")
|> Enum.reject(&String.match?(&1, ~r/^Loading|^ad-handle|^For information/))
|> Enum.join("\n")
if filtered != "", do: IO.puts(filtered)
{orgfile, exit_code}
end)
failures = Enum.filter(results, fn {_, code} -> code != 0 end)
if failures != [] do
IO.puts(:stderr, "\nFailed to export #{length(failures)} file(s):")
Enum.each(failures, fn {f, code} -> IO.puts(:stderr, " [exit #{code}] #{f}") end)
System.halt(1)
end
end
# ---------------------------------------------------------------------------
# Phase 3: Markdown transformation pipeline
# ---------------------------------------------------------------------------
defp run_pipeline(content_dir) do
IO.puts("==> Running markdown pipeline")
pipeline_opts = %{
zotero_url: System.get_env("ZOTERO_URL", "http://localhost:23119"),
bibtex_file: System.get_env("BIBTEX_FILE"),
citation_mode:
case System.get_env("CITATION_MODE", "warn") do
"silent" -> :silent
"strict" -> :strict
_ -> :warn
end
}
transforms = [Pipeline.Transforms.Citations]
case Pipeline.run(content_dir, transforms, pipeline_opts) do
{:ok, stats} ->
Enum.each(stats, fn {mod, count} ->
IO.puts(" #{inspect(mod)}: #{count} file(s) modified")
end)
{:error, reason} ->
IO.puts(:stderr, "Pipeline error: #{inspect(reason)}")
System.halt(1)
end
end
# ---------------------------------------------------------------------------
# Phase 4: Generate default index.md if none was exported
# ---------------------------------------------------------------------------
defp generate_index(content_dir) do
index_path = Path.join(content_dir, "index.md")
unless File.exists?(index_path) do
IO.puts("==> Generating default index.md")
pages =
Path.join(content_dir, "**/*.md")
|> Path.wildcard()
|> Enum.map(fn path ->
slug = Path.relative_to(path, content_dir) |> Path.rootname()
title =
path
|> File.read!()
|> then(fn content ->
case Regex.run(~r/^title\s*=\s*"(.+)"/m, content) do
[_, t] -> t
_ -> slug
end
end)
{slug, title}
end)
|> Enum.sort_by(fn {_, title} -> title end)
|> Enum.map(fn {slug, title} -> "- [#{title}](#{slug})" end)
|> Enum.join("\n")
File.write!(index_path, """
---
title: Index
---
#{pages}
""")
end
end
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
defp abort(message) do
IO.puts(:stderr, message)
System.halt(1)
end
end

View File

@@ -0,0 +1,178 @@
defmodule Pipeline.Resolvers.BibTeX do
@moduledoc """
Resolves citation keys from a local BibTeX (.bib) file.
Configured via the `BIBTEX_FILE` environment variable, or passed directly
as `opts.bibtex_file`. The file is parsed once at init time and the
resulting entry map is reused for all lookups.
Supports extracting: author last names, year, title, DOI, URL.
BibTeX entry format parsed:
@type{citationkey,
author = {Last, First and Last2, First2},
year = {2021},
title = {Some Title},
doi = {10.xxxx/yyyy},
url = {https://example.com},
}
Returns `{:ok, %{label: "Author, Year", url: "..."}}` or `:error`.
"""
require Logger
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
@doc """
Parse a .bib file and return a map of `%{citation_key => entry_map}`.
Returns `{:ok, entries}` or `{:error, reason}`.
"""
@spec load(String.t()) :: {:ok, map()} | {:error, term()}
def load(path) do
case File.read(path) do
{:ok, content} ->
entries = parse_entries(content)
Logger.info("BibTeX: loaded #{map_size(entries)} entries from #{path}")
{:ok, entries}
{:error, reason} ->
{:error, reason}
end
end
@doc """
Resolve a citation key from pre-loaded BibTeX entries.
"""
@spec resolve(String.t(), map()) :: {:ok, map()} | :error
def resolve(key, entries) do
case Map.fetch(entries, key) do
{:ok, entry} ->
label = build_label(entry)
url = build_url(entry)
{:ok, %{label: label, url: url}}
:error ->
:error
end
end
# ------------------------------------------------------------------
# Parsing
# ------------------------------------------------------------------
# Match @type{key, ...fields...}
# We handle nested braces by scanning character by character after
# finding the opening, rather than relying on a single regex.
@entry_header ~r/@\w+\s*\{\s*([^,\s]+)\s*,/
defp parse_entries(content) do
# Split on "@" boundaries, then parse each chunk
content
|> String.split(~r/(?=@\w+\s*\{)/, trim: true)
|> Enum.reduce(%{}, fn chunk, acc ->
case Regex.run(@entry_header, chunk) do
[_, key] ->
fields = parse_fields(chunk)
Map.put(acc, String.trim(key), fields)
_ ->
acc
end
end)
end
# Extract key = {value} or key = "value" pairs from an entry block.
# Handles simple single-depth braces; good enough for common fields.
@field_regex ~r/(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")/
defp parse_fields(chunk) do
@field_regex
|> Regex.scan(chunk)
|> Enum.reduce(%{}, fn match, acc ->
field_name = Enum.at(match, 1) |> String.downcase()
# Value is in capture group 2 (braces) or 3 (quotes)
value =
case {Enum.at(match, 2, ""), Enum.at(match, 3, "")} do
{"", q} -> q
{b, _} -> b
end
Map.put(acc, field_name, String.trim(value))
end)
end
# ------------------------------------------------------------------
# Label & URL building
# ------------------------------------------------------------------
defp build_label(entry) do
author_part =
entry
|> Map.get("author", "")
|> parse_authors()
|> format_authors()
year = Map.get(entry, "year", Map.get(entry, "date", ""))
year = extract_year(year)
if year && author_part != "", do: "#{author_part}, #{year}", else: author_part
end
defp parse_authors(""), do: []
defp parse_authors(author_str) do
author_str
|> String.split(" and ", trim: true)
|> Enum.map(&extract_last_name/1)
|> Enum.reject(&(&1 == ""))
end
# Handles "Last, First" and "First Last" formats
defp extract_last_name(name) do
name = String.trim(name)
cond do
String.contains?(name, ",") ->
name |> String.split(",") |> List.first() |> String.trim()
String.contains?(name, " ") ->
name |> String.split(" ") |> List.last() |> String.trim()
true ->
name
end
end
defp format_authors([]), do: "Unknown"
defp format_authors([single]), do: single
defp format_authors([first | rest]), do: "#{first} & #{List.last(rest)}"
defp extract_year(""), do: nil
defp extract_year(str) do
case Regex.run(~r/\b(\d{4})\b/, str) do
[_, year] -> year
_ -> nil
end
end
defp build_url(entry) do
cond do
doi = Map.get(entry, "doi", "") |> non_empty() ->
"https://doi.org/#{doi}"
url = Map.get(entry, "url", "") |> non_empty() ->
url
true ->
nil
end
end
defp non_empty(""), do: nil
defp non_empty(v), do: v
end

View File

@@ -0,0 +1,18 @@
defmodule Pipeline.Resolvers.DOI do
@moduledoc """
Last-resort citation resolver — always succeeds.
If the citation key looks like a DOI (starts with "10."), returns a
`https://doi.org/...` link. Otherwise returns the key itself as a
plain label with no URL.
"""
@spec resolve(String.t()) :: {:ok, map()}
def resolve(key) do
if String.starts_with?(key, "10.") do
{:ok, %{label: key, url: "https://doi.org/#{key}"}}
else
{:ok, %{label: key, url: nil}}
end
end
end

View File

@@ -0,0 +1,182 @@
defmodule Pipeline.Resolvers.Zotero do
@moduledoc """
Resolves citation keys via Zotero Better BibTeX's JSON-RPC API.
Requires Zotero to be running with the Better BibTeX plugin installed.
Default endpoint: http://localhost:23119/better-bibtex/json-rpc
Resolution strategy:
1. Search by citation key via `item.search`
2. If found, try to get a PDF attachment link (zotero://open-pdf/...)
3. Fall back to zotero://select/items/@key
Returns `{:ok, %{label: "Author, Year", url: "zotero://..."}}` or `:error`.
"""
require Logger
@rpc_path "/better-bibtex/json-rpc"
@doc """
Attempt to resolve `key` against a running Zotero instance.
`base_url` defaults to `http://localhost:23119`.
"""
@spec resolve(String.t(), String.t()) :: {:ok, map()} | :error
def resolve(key, base_url \\ "http://localhost:23119") do
url = base_url <> @rpc_path
payload =
Jason.encode!(%{
jsonrpc: "2.0",
method: "item.search",
params: [
[["citationKey", "is", key]]
],
id: 1
})
case Req.post(url,
body: payload,
headers: [{"content-type", "application/json"}],
receive_timeout: 5_000,
finch: Pipeline.Finch
) do
{:ok, %{status: 200, body: body}} ->
parse_response(body, key, base_url)
{:ok, %{status: status}} ->
Logger.debug("Zotero: unexpected HTTP #{status} for key #{key}")
:error
{:error, reason} ->
Logger.debug("Zotero: connection failed for key #{key}: #{inspect(reason)}")
:error
other ->
Logger.debug("Zotero: unexpected result for key #{key}: #{inspect(other)}")
:error
end
rescue
e ->
Logger.debug("Zotero: exception resolving key #{key}: #{inspect(e)}")
:error
end
# ------------------------------------------------------------------
# Private helpers
# ------------------------------------------------------------------
defp parse_response(%{"result" => [item | _]}, key, base_url) do
label = build_label(item)
url = resolve_url(item, key, base_url)
{:ok, %{label: label, url: url}}
end
defp parse_response(%{"result" => []}, key, _base_url) do
Logger.debug("Zotero: no item found for key #{key}")
:error
end
defp parse_response(%{"error" => err}, key, _base_url) do
Logger.debug("Zotero: RPC error for key #{key}: #{inspect(err)}")
:error
end
defp parse_response(body, key, _base_url) do
Logger.debug("Zotero: unexpected response shape for key #{key}: #{inspect(body)}")
:error
end
defp fetch_pdf_url(key, base_url) do
payload =
Jason.encode!(%{
jsonrpc: "2.0",
method: "item.attachments",
params: [key],
id: 2
})
case Req.post(base_url <> @rpc_path,
body: payload,
headers: [{"content-type", "application/json"}],
receive_timeout: 5_000,
finch: Pipeline.Finch
) do
{:ok, %{status: 200, body: %{"result" => attachments}}} when is_list(attachments) ->
attachments
|> Enum.find_value(fn att ->
open = Map.get(att, "open", "")
path = Map.get(att, "path", "")
if String.ends_with?(path, ".pdf"), do: open, else: nil
end)
_ ->
nil
end
rescue
_ -> nil
end
# CSL-JSON format: authors are under "author" with "family"/"given" keys.
# Year is under "issued" -> "date-parts" -> [[year, month, day]].
defp build_label(item) do
authors = Map.get(item, "author", [])
year = extract_year(item)
author_part =
case authors do
[] ->
"Unknown"
[single] ->
Map.get(single, "family", Map.get(single, "literal", "Unknown"))
[first | rest] ->
first_name = Map.get(first, "family", Map.get(first, "literal", "Unknown"))
last_name =
rest
|> List.last()
|> then(&Map.get(&1, "family", Map.get(&1, "literal", "Unknown")))
"#{first_name} & #{last_name}"
end
if year, do: "#{author_part}, #{year}", else: author_part
end
# "issued": {"date-parts": [["2021", 2, 3]]}
defp extract_year(item) do
case get_in(item, ["issued", "date-parts"]) do
[[year | _] | _] -> to_string(year)
_ -> nil
end
end
defp resolve_url(item, key, base_url) do
# Prefer zotero://open-pdf/... for items with a PDF attachment.
# Fall back to zotero://select/library/items/KEY to open the item in Zotero.
# The "id" field is a URI like "http://zotero.org/users/123/items/ABCD1234".
pdf_url = fetch_pdf_url(key, base_url)
if pdf_url do
pdf_url
else
item_key =
item
|> Map.get("id", "")
|> String.split("/")
|> List.last()
|> non_empty()
if item_key do
"zotero://select/library/items/#{item_key}"
else
"zotero://select/items/@#{key}"
end
end
end
defp non_empty(nil), do: nil
defp non_empty(""), do: nil
defp non_empty(v), do: v
end

View File

@@ -0,0 +1,48 @@
defmodule Pipeline.Transform do
@moduledoc """
Behaviour that all markdown transform modules must implement.
## Callbacks
- `init/1` — called once before processing; returns transform-specific state.
Default implementation returns the opts map unchanged.
- `apply/3` — called per .md file; returns the (possibly modified) content.
- `teardown/1` — optional cleanup after all files are processed.
## Example
defmodule MyTransform do
@behaviour Pipeline.Transform
@impl true
def init(opts), do: %{some_state: opts[:value]}
@impl true
def apply(content, state, _opts) do
String.replace(content, "foo", state.some_state)
end
end
"""
@doc "One-time initialisation. Returns opaque state passed to apply/3."
@callback init(opts :: map()) :: term()
@doc "Transform file content. Returns the (possibly modified) content string."
@callback apply(content :: String.t(), state :: term(), opts :: map()) :: String.t()
@doc "Optional cleanup after all files are processed."
@callback teardown(state :: term()) :: :ok
@optional_callbacks teardown: 1
defmacro __using__(_) do
quote do
@behaviour Pipeline.Transform
@impl Pipeline.Transform
def init(opts), do: opts
defoverridable init: 1
end
end
end

View File

@@ -0,0 +1,231 @@
defmodule Pipeline.Transforms.Citations do
@moduledoc """
Markdown transform: resolves org-citar citation keys to hyperlinks.
## Recognised citation syntax (as output by ox-hugo from org-citar)
[cite:@key] → org-cite / citar standard (most common)
[cite:@key1;@key2] → multiple citations
cite:key → older roam-style bare cite syntax
## Resolution chain (in order)
1. Zotero (live instance via Better BibTeX JSON-RPC) — preferred
2. BibTeX file (BIBTEX_FILE env var) — fallback
3. DOI / bare key — always succeeds
## Modes (opts.citation_mode)
:silent — silently use DOI/bare-key fallback when Zotero+BibTeX fail
:warn — (default) emit a Logger.warning for unresolved keys
:strict — raise on unresolved keys (aborts pipeline)
## Format
Resolved citations are rendered as:
[Label](url) when a URL is available
[Label] when no URL could be determined (bare key fallback)
Multiple semicolon-separated keys become space-separated links:
[cite:@a;@b] → [Author A, 2020](url_a) [Author B, 2019](url_b)
## init/1 callback
Loads the BibTeX file (if configured) once before processing begins,
and probes Zotero availability, emitting warnings as appropriate.
"""
@behaviour Pipeline.Transform
require Logger
alias Pipeline.Resolvers.Zotero
alias Pipeline.Resolvers.BibTeX
alias Pipeline.Resolvers.DOI
# Match [cite:@key] and [cite:@key1;@key2;...] (org-cite / citar style)
@cite_bracket_regex ~r/\[cite:(@[^\]]+)\]/
# Match bare cite:key or cite:@key (older roam style, no brackets, optional @ prefix)
@cite_bare_regex ~r/(?<![(\[])cite:@?([a-zA-Z0-9_:-]+)/
# ------------------------------------------------------------------
# Pipeline callbacks
# ------------------------------------------------------------------
@doc """
Called once before processing any files. Loads BibTeX, probes Zotero.
Returns a state map passed to every `apply/3` call.
"""
def init(opts) do
bibtex_entries = load_bibtex(opts)
zotero_available = probe_zotero(opts)
if not zotero_available and bibtex_entries == %{} do
Logger.warning(
"Citations: neither Zotero nor a BibTeX file is available. " <>
"All citations will fall back to bare-key rendering. " <>
"Set BIBTEX_FILE env var or start Zotero with Better BibTeX to resolve citations."
)
end
%{
bibtex_entries: bibtex_entries,
zotero_available: zotero_available,
zotero_url: Map.get(opts, :zotero_url, "http://localhost:23119"),
citation_mode: Map.get(opts, :citation_mode, :warn)
}
end
@doc """
Apply citation resolution to a single markdown file's content.
"""
def apply(content, state, _opts) do
content
|> resolve_bracket_citations(state)
|> resolve_bare_citations(state)
end
# ------------------------------------------------------------------
# Resolution passes
# ------------------------------------------------------------------
defp resolve_bracket_citations(content, state) do
Regex.replace(@cite_bracket_regex, content, fn _full, keys_str ->
keys_str
|> String.split(";")
|> Enum.map(&String.trim/1)
|> Enum.map(fn "@" <> key -> key end)
|> Enum.map(&resolve_key(&1, state))
|> Enum.join(" ")
end)
end
defp resolve_bare_citations(content, state) do
Regex.replace(@cite_bare_regex, content, fn _full, key ->
resolve_key(key, state)
end)
end
# ------------------------------------------------------------------
# Single-key resolution chain
# ------------------------------------------------------------------
defp resolve_key(key, state) do
info =
with :error <- try_zotero(key, state),
:error <- try_bibtex(key, state) do
handle_unresolved(key, state)
else
{:ok, citation_info} -> citation_info
end
format_result(info)
end
defp try_zotero(_key, %{zotero_available: false}), do: :error
defp try_zotero(key, %{zotero_url: url}) do
Zotero.resolve(key, url)
end
defp try_bibtex(_key, %{bibtex_entries: entries}) when map_size(entries) == 0, do: :error
defp try_bibtex(key, %{bibtex_entries: entries}) do
BibTeX.resolve(key, entries)
end
defp handle_unresolved(key, %{citation_mode: mode}) do
case mode do
:strict ->
raise "Citations: could not resolve citation key '#{key}' and mode is :strict"
:warn ->
Logger.warning("Citations: unresolved citation key '#{key}' — using bare-key fallback")
{:ok, result} = DOI.resolve(key)
result
:silent ->
{:ok, result} = DOI.resolve(key)
result
end
end
defp format_result(%{label: label, url: nil}), do: "[#{label}]"
defp format_result(%{label: label, url: url}), do: "[#{label}](#{url})"
# ------------------------------------------------------------------
# Init helpers
# ------------------------------------------------------------------
defp load_bibtex(opts) do
path = Map.get(opts, :bibtex_file) || System.get_env("BIBTEX_FILE")
cond do
is_nil(path) ->
Logger.debug("Citations: BIBTEX_FILE not set — BibTeX resolver disabled")
%{}
not File.exists?(path) ->
Logger.warning("Citations: BIBTEX_FILE=#{path} does not exist — BibTeX resolver disabled")
%{}
true ->
case BibTeX.load(path) do
{:ok, entries} -> entries
{:error, reason} ->
Logger.warning("Citations: failed to load BibTeX file #{path}: #{inspect(reason)}")
%{}
end
end
end
defp probe_zotero(opts) do
url = Map.get(opts, :zotero_url, "http://localhost:23119")
# Use a no-op JSON-RPC call to probe availability.
# /better-bibtex/cayw is intentionally avoided — it blocks waiting for
# user interaction and never returns without a pick.
payload =
Jason.encode!(%{
jsonrpc: "2.0",
method: "item.search",
params: [[[]]],
id: 0
})
result =
try do
Req.post(url <> "/better-bibtex/json-rpc",
body: payload,
headers: [{"content-type", "application/json"}],
receive_timeout: 3_000,
finch: Pipeline.Finch
)
rescue
e -> {:error, e}
end
case result do
{:ok, %{status: 200}} ->
Logger.info("Citations: Zotero Better BibTeX is available at #{url}")
true
{:ok, %{status: status}} ->
Logger.warning(
"Citations: Zotero responded HTTP #{status} at #{url}" <>
"is Better BibTeX installed?"
)
false
_ ->
Logger.warning(
"Citations: Zotero not reachable at #{url}" <>
"start Zotero with Better BibTeX or set BIBTEX_FILE as fallback"
)
false
end
end
end

33
pipeline/mix.exs Normal file
View File

@@ -0,0 +1,33 @@
defmodule Pipeline.MixProject do
use Mix.Project
def project do
[
app: :pipeline,
version: "0.1.0",
elixir: "~> 1.17",
start_permanent: Mix.env() == :prod,
deps: deps(),
escript: escript()
]
end
def application do
[
extra_applications: [:logger],
mod: {Pipeline.Application, []}
]
end
defp escript do
[main_module: Pipeline.CLI]
end
defp deps do
[
{:finch, "~> 0.19"},
{:req, "~> 0.5"},
{:jason, "~> 1.4"}
]
end
end

11
pipeline/mix.lock Normal file
View File

@@ -0,0 +1,11 @@
%{
"finch": {:hex, :finch, "0.21.0", "b1c3b2d48af02d0c66d2a9ebfb5622be5c5ecd62937cf79a88a7f98d48a8290c", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "87dc6e169794cb2570f75841a19da99cfde834249568f2a5b121b809588a4377"},
"hpax": {:hex, :hpax, "1.0.3", "ed67ef51ad4df91e75cc6a1494f851850c0bd98ebc0be6e81b026e765ee535aa", [:mix], [], "hexpm", "8eab6e1cfa8d5918c2ce4ba43588e894af35dbd8e91e6e55c817bca5847df34a"},
"jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
"mime": {:hex, :mime, "2.0.7", "b8d739037be7cd402aee1ba0306edfdef982687ee7e9859bee6198c1e7e2f128", [:mix], [], "hexpm", "6171188e399ee16023ffc5b76ce445eb6d9672e2e241d2df6050f3c771e80ccd"},
"mint": {:hex, :mint, "1.7.1", "113fdb2b2f3b59e47c7955971854641c61f378549d73e829e1768de90fc1abf1", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "fceba0a4d0f24301ddee3024ae116df1c3f4bb7a563a731f45fdfeb9d39a231b"},
"nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"},
"nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"},
"req": {:hex, :req, "0.5.17", "0096ddd5b0ed6f576a03dde4b158a0c727215b15d2795e59e0916c6971066ede", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0b8bc6ffdfebbc07968e59d3ff96d52f2202d0536f10fef4dc11dc02a2a43e39"},
"telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"},
}