From 511b003da818eddcc6be1337c07df6ef594b5d20 Mon Sep 17 00:00:00 2001 From: Ignacio Ballesteros Date: Fri, 20 Feb 2026 10:00:11 +0100 Subject: [PATCH] Add Elixir markdown pipeline with org-citar citation resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces scripts/pipeline/, a Mix project that runs as a post-export transformation pass over content/*.md before Quartz builds the site. Pipeline (scripts/export.exs phase 3): - Compiles and loads the Mix project at export time (cached after first run) - Applies a list of Transform modules sequentially over all .md files - Only rewrites files that were actually changed Citations transform (Pipeline.Transforms.Citations): - Resolves [cite:@key] and bare cite:key syntax produced by ox-hugo/citar - Resolution chain: Zotero BBT JSON-RPC → BibTeX file → DOI/bare-key fallback - Zotero probe uses a no-op JSON-RPC call (cayw endpoint blocks indefinitely) - Zotero resolver fetches PDF attachments via item.attachments, producing zotero://open-pdf/... links; falls back to zotero://select/library/items/... - BibTeX resolver parses .bib files with a simple regex parser (no deps) - DOI resolver is the always-succeeding last resort Configuration via env vars: BIBTEX_FILE — path to .bib file for fallback resolution ZOTERO_URL — Zotero base URL (default: http://localhost:23119) CITATION_MODE — silent | warn (default) | strict Adding future transforms requires only implementing the Pipeline.Transform behaviour and appending the module to the transforms list in export.exs. 
--- .gitignore | 7 + notes/example-citation.org | 13 + scripts/export.exs | 99 +++++++- scripts/pipeline/lib/pipeline.ex | 83 +++++++ scripts/pipeline/lib/pipeline/application.ex | 14 ++ .../pipeline/lib/pipeline/resolvers/bibtex.ex | 178 ++++++++++++++ .../pipeline/lib/pipeline/resolvers/doi.ex | 18 ++ .../pipeline/lib/pipeline/resolvers/zotero.ex | 182 ++++++++++++++ scripts/pipeline/lib/pipeline/transform.ex | 48 ++++ .../lib/pipeline/transforms/citations.ex | 231 ++++++++++++++++++ scripts/pipeline/mix.exs | 27 ++ scripts/pipeline/mix.lock | 11 + 12 files changed, 902 insertions(+), 9 deletions(-) create mode 100644 notes/example-citation.org create mode 100644 scripts/pipeline/lib/pipeline.ex create mode 100644 scripts/pipeline/lib/pipeline/application.ex create mode 100644 scripts/pipeline/lib/pipeline/resolvers/bibtex.ex create mode 100644 scripts/pipeline/lib/pipeline/resolvers/doi.ex create mode 100644 scripts/pipeline/lib/pipeline/resolvers/zotero.ex create mode 100644 scripts/pipeline/lib/pipeline/transform.ex create mode 100644 scripts/pipeline/lib/pipeline/transforms/citations.ex create mode 100644 scripts/pipeline/mix.exs create mode 100644 scripts/pipeline/mix.lock diff --git a/.gitignore b/.gitignore index 1e184a385..732964a5a 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,10 @@ erl_crash.dump # content/ is generated by the export script; only keep the placeholder content/* !content/.gitkeep +# Elixir/Mix build artifacts for the pipeline project +scripts/pipeline/_build/ +scripts/pipeline/deps/ +scripts/pipeline/erl_crash.dump +# Test helpers (not needed in production) +scripts/test.bib +scripts/test_pipeline.exs diff --git a/notes/example-citation.org b/notes/example-citation.org new file mode 100644 index 000000000..bca1480f9 --- /dev/null +++ b/notes/example-citation.org @@ -0,0 +1,13 @@ +#+title: Example: Citation Reference + +This file demonstrates how org-citar citations pass through ox-hugo into +markdown, where the pipeline transform 
resolves them. + +The methodology described in [cite:@podlovics2021journalArticle] provides a +useful framework for analysis. + +Multiple citations can appear together: +[cite:@podlovics2021journalArticle] + +Older bare-cite style (org-roam v1 / older citar) also works: +cite:podlovics2021journalArticle diff --git a/scripts/export.exs b/scripts/export.exs index bf9be3255..2a3b3ae0a 100644 --- a/scripts/export.exs +++ b/scripts/export.exs @@ -1,22 +1,71 @@ #!/usr/bin/env elixir -# Export org-roam notes (per-file) to content/ via ox-hugo. +# Export org-roam notes (per-file) to content/ via ox-hugo, +# then run the markdown transformation pipeline (citations, etc.). # # Usage: # NOTES_DIR=~/notes elixir scripts/export.exs # elixir scripts/export.exs /path/to/notes # +# Optional env vars: +# BIBTEX_FILE — path to a .bib file used as citation fallback +# ZOTERO_URL — Zotero Better BibTeX base URL (default: http://localhost:23119) +# CITATION_MODE — silent | warn (default) | strict +# # The positional argument takes precedence over the NOTES_DIR env var. +# --------------------------------------------------------------------------- +# Load the pipeline Mix project so its modules are available in this script. +# --------------------------------------------------------------------------- +repo_root = __DIR__ |> Path.join("..") |> Path.expand() +pipeline_dir = Path.join(repo_root, "scripts/pipeline") + +# Compile and load the pipeline project's modules into this runtime. +# Mix.install is NOT used here because we have a local Mix project — instead +# we compile it and push its beam files onto the code path. +# +# This runs `mix deps.get` + `mix compile` the first time; subsequent runs +# use the compiled artifacts from _build/ (fast, same as Mix caching). 
+{_, 0} = + System.cmd("mix", ["deps.get", "--quiet"], + cd: pipeline_dir, + env: [{"MIX_ENV", "prod"}], + into: IO.stream() + ) + +{_, 0} = + System.cmd("mix", ["compile", "--quiet"], + cd: pipeline_dir, + env: [{"MIX_ENV", "prod"}], + into: IO.stream() + ) + +# Add compiled beam files to the load path so we can call pipeline modules. +pipeline_build = Path.join(pipeline_dir, "_build/prod/lib") + +pipeline_build +|> File.ls!() +|> Enum.each(fn app -> + ebin = Path.join([pipeline_build, app, "ebin"]) + if File.dir?(ebin), do: Code.prepend_path(ebin) +end) + +# Start the pipeline OTP application (which starts Finch for HTTP). +Application.ensure_all_started(:pipeline) + +# --------------------------------------------------------------------------- +# Argument / env resolution +# --------------------------------------------------------------------------- + notes_dir = case System.argv() do [dir | _] -> dir [] -> System.get_env("NOTES_DIR") || - (IO.puts(:stderr, "Usage: NOTES_DIR=/path/to/notes elixir scripts/export.exs"); System.halt(1)) + (IO.puts(:stderr, "Usage: NOTES_DIR=/path/to/notes elixir scripts/export.exs") + System.halt(1)) end notes_dir = Path.expand(notes_dir) -repo_root = __DIR__ |> Path.join("..") |> Path.expand() content_dir = Path.join(repo_root, "content") unless File.dir?(notes_dir) do @@ -24,7 +73,9 @@ unless File.dir?(notes_dir) do System.halt(1) end -# Wipe content/, preserving .gitkeep +# --------------------------------------------------------------------------- +# Phase 1: Wipe content/ +# --------------------------------------------------------------------------- IO.puts("==> Wiping #{content_dir}") content_dir @@ -34,7 +85,9 @@ content_dir Path.join(content_dir, entry) |> File.rm_rf!() end) -# Collect all .org files +# --------------------------------------------------------------------------- +# Phase 2: Export org files via Emacs + ox-hugo +# --------------------------------------------------------------------------- IO.puts("==> 
Exporting org files from #{notes_dir}") org_files = @@ -46,12 +99,10 @@ if org_files == [] do System.halt(0) end -# Export each file via emacs --batch results = Enum.map(org_files, fn orgfile -> IO.puts(" exporting: #{orgfile}") - # Mirror the notes subdirectory structure under content/ section = orgfile |> Path.dirname() @@ -71,7 +122,6 @@ results = stderr_to_stdout: true ) - # Filter noisy emacs startup lines, same as the shell script filtered = output |> String.split("\n") @@ -91,12 +141,43 @@ if failures != [] do System.halt(1) end +# --------------------------------------------------------------------------- +# Phase 3: Markdown transformation pipeline +# --------------------------------------------------------------------------- +IO.puts("==> Running markdown pipeline") + +pipeline_opts = %{ + zotero_url: System.get_env("ZOTERO_URL", "http://localhost:23119"), + bibtex_file: System.get_env("BIBTEX_FILE"), + citation_mode: + case System.get_env("CITATION_MODE", "warn") do + "silent" -> :silent + "strict" -> :strict + _ -> :warn + end +} + +transforms = [Pipeline.Transforms.Citations] + +case Pipeline.run(content_dir, transforms, pipeline_opts) do + {:ok, stats} -> + Enum.each(stats, fn {mod, count} -> + IO.puts(" #{inspect(mod)}: #{count} file(s) modified") + end) + + {:error, reason} -> + IO.puts(:stderr, "Pipeline error: #{inspect(reason)}") + System.halt(1) +end + +# --------------------------------------------------------------------------- +# Phase 4: Generate default index.md if none was exported +# --------------------------------------------------------------------------- md_count = Path.join(content_dir, "**/*.md") |> Path.wildcard() |> length() -# Generate a default index.md if none was exported index_path = Path.join(content_dir, "index.md") unless File.exists?(index_path) do diff --git a/scripts/pipeline/lib/pipeline.ex b/scripts/pipeline/lib/pipeline.ex new file mode 100644 index 000000000..089540a6f --- /dev/null +++ 
b/scripts/pipeline/lib/pipeline.ex @@ -0,0 +1,83 @@ +defmodule Pipeline do + @moduledoc """ + Post-export markdown transformation pipeline. + + Applies a list of transform modules sequentially over every .md file + in a content directory. Each transform module must implement: + + apply(content :: String.t(), opts :: map()) :: String.t() + + Transforms are applied in the order given. A file is rewritten only + when at least one transform mutates its content (checked via equality). + + ## Usage + + opts = %{ + zotero_url: "http://localhost:23119", + bibtex_file: System.get_env("BIBTEX_FILE"), + citation_mode: :warn # :silent | :warn | :strict + } + + Pipeline.run(content_dir, [Pipeline.Transforms.Citations], opts) + """ + + require Logger + + @type transform :: module() + @type opts :: map() + + @doc """ + Run all transforms over every .md file under `content_dir`. + Returns `{:ok, stats}` where stats maps each transform to a count of files it changed. + """ + @spec run(String.t(), [transform()], opts()) :: {:ok, map()} + def run(content_dir, transforms, opts \\ %{}) do + md_files = + content_dir + |> Path.join("**/*.md") + |> Path.wildcard() + + if md_files == [] do + Logger.warning("Pipeline: no .md files found in #{content_dir}") + {:ok, %{}} + else + Logger.info("Pipeline: processing #{length(md_files)} markdown files with #{length(transforms)} transform(s)") + + # Initialise transforms (allows them to perform setup such as loading a .bib file). + # Each transform module must implement the Pipeline.Transform behaviour. 
+ initialized = + Enum.map(transforms, fn mod -> + state = mod.init(opts) + {mod, state} + end) + + stats = + Enum.reduce(md_files, %{}, fn path, acc -> + original = File.read!(path) + + {transformed, file_stats} = + Enum.reduce(initialized, {original, %{}}, fn {mod, state}, {content, fstats} -> + result = mod.apply(content, state, opts) + changed = result != content + {result, Map.update(fstats, mod, (if changed, do: 1, else: 0), &(&1 + (if changed, do: 1, else: 0)))} + end) + + if transformed != original do + File.write!(path, transformed) + Logger.debug("Pipeline: updated #{Path.relative_to_cwd(path)}") + end + + Map.merge(acc, file_stats, fn _k, a, b -> a + b end) + end) + + Enum.each(initialized, fn {mod, state} -> + # teardown/1 is optional in the behaviour + if function_exported?(mod, :teardown, 1) do + mod.teardown(state) + end + end) + + {:ok, stats} + end + end +end diff --git a/scripts/pipeline/lib/pipeline/application.ex b/scripts/pipeline/lib/pipeline/application.ex new file mode 100644 index 000000000..ae6017a17 --- /dev/null +++ b/scripts/pipeline/lib/pipeline/application.ex @@ -0,0 +1,14 @@ +defmodule Pipeline.Application do + @moduledoc false + use Application + + @impl true + def start(_type, _args) do + children = [ + {Finch, name: Pipeline.Finch} + ] + + opts = [strategy: :one_for_one, name: Pipeline.Supervisor] + Supervisor.start_link(children, opts) + end +end diff --git a/scripts/pipeline/lib/pipeline/resolvers/bibtex.ex b/scripts/pipeline/lib/pipeline/resolvers/bibtex.ex new file mode 100644 index 000000000..8210822ef --- /dev/null +++ b/scripts/pipeline/lib/pipeline/resolvers/bibtex.ex @@ -0,0 +1,178 @@ +defmodule Pipeline.Resolvers.BibTeX do + @moduledoc """ + Resolves citation keys from a local BibTeX (.bib) file. + + Configured via the `BIBTEX_FILE` environment variable, or passed directly + as `opts.bibtex_file`. The file is parsed once at init time and the + resulting entry map is reused for all lookups. 
+ + Supports extracting: author last names, year, title, DOI, URL. + + BibTeX entry format parsed: + + @type{citationkey, + author = {Last, First and Last2, First2}, + year = {2021}, + title = {Some Title}, + doi = {10.xxxx/yyyy}, + url = {https://example.com}, + } + + Returns `{:ok, %{label: "Author, Year", url: "..."}}` or `:error`. + """ + + require Logger + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + @doc """ + Parse a .bib file and return a map of `%{citation_key => entry_map}`. + Returns `{:ok, entries}` or `{:error, reason}`. + """ + @spec load(String.t()) :: {:ok, map()} | {:error, term()} + def load(path) do + case File.read(path) do + {:ok, content} -> + entries = parse_entries(content) + Logger.info("BibTeX: loaded #{map_size(entries)} entries from #{path}") + {:ok, entries} + + {:error, reason} -> + {:error, reason} + end + end + + @doc """ + Resolve a citation key from pre-loaded BibTeX entries. + """ + @spec resolve(String.t(), map()) :: {:ok, map()} | :error + def resolve(key, entries) do + case Map.fetch(entries, key) do + {:ok, entry} -> + label = build_label(entry) + url = build_url(entry) + {:ok, %{label: label, url: url}} + + :error -> + :error + end + end + + # ------------------------------------------------------------------ + # Parsing + # ------------------------------------------------------------------ + + # Match @type{key, ...fields...} + # We handle nested braces by scanning character by character after + # finding the opening, rather than relying on a single regex. 
+ @entry_header ~r/@\w+\s*\{\s*([^,\s]+)\s*,/ + + defp parse_entries(content) do + # Split on "@" boundaries, then parse each chunk + content + |> String.split(~r/(?=@\w+\s*\{)/, trim: true) + |> Enum.reduce(%{}, fn chunk, acc -> + case Regex.run(@entry_header, chunk) do + [_, key] -> + fields = parse_fields(chunk) + Map.put(acc, String.trim(key), fields) + + _ -> + acc + end + end) + end + + # Extract key = {value} or key = "value" pairs from an entry block. + # Handles simple single-depth braces; good enough for common fields. + @field_regex ~r/(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")/ + + defp parse_fields(chunk) do + @field_regex + |> Regex.scan(chunk) + |> Enum.reduce(%{}, fn match, acc -> + field_name = Enum.at(match, 1) |> String.downcase() + # Value is in capture group 2 (braces) or 3 (quotes) + value = + case {Enum.at(match, 2, ""), Enum.at(match, 3, "")} do + {"", q} -> q + {b, _} -> b + end + + Map.put(acc, field_name, String.trim(value)) + end) + end + + # ------------------------------------------------------------------ + # Label & URL building + # ------------------------------------------------------------------ + + defp build_label(entry) do + author_part = + entry + |> Map.get("author", "") + |> parse_authors() + |> format_authors() + + year = Map.get(entry, "year", Map.get(entry, "date", "")) + year = extract_year(year) + + if year && author_part != "", do: "#{author_part}, #{year}", else: author_part + end + + defp parse_authors(""), do: [] + + defp parse_authors(author_str) do + author_str + |> String.split(" and ", trim: true) + |> Enum.map(&extract_last_name/1) + |> Enum.reject(&(&1 == "")) + end + + # Handles "Last, First" and "First Last" formats + defp extract_last_name(name) do + name = String.trim(name) + + cond do + String.contains?(name, ",") -> + name |> String.split(",") |> List.first() |> String.trim() + + String.contains?(name, " ") -> + name |> String.split(" ") |> List.last() |> String.trim() + + true -> + name 
+ end + end + + defp format_authors([]), do: "Unknown" + defp format_authors([single]), do: single + defp format_authors([first | rest]), do: "#{first} & #{List.last(rest)}" + + defp extract_year(""), do: nil + + defp extract_year(str) do + case Regex.run(~r/\b(\d{4})\b/, str) do + [_, year] -> year + _ -> nil + end + end + + defp build_url(entry) do + cond do + doi = Map.get(entry, "doi", "") |> non_empty() -> + "https://doi.org/#{doi}" + + url = Map.get(entry, "url", "") |> non_empty() -> + url + + true -> + nil + end + end + + defp non_empty(""), do: nil + defp non_empty(v), do: v +end diff --git a/scripts/pipeline/lib/pipeline/resolvers/doi.ex b/scripts/pipeline/lib/pipeline/resolvers/doi.ex new file mode 100644 index 000000000..d64d3155d --- /dev/null +++ b/scripts/pipeline/lib/pipeline/resolvers/doi.ex @@ -0,0 +1,18 @@ +defmodule Pipeline.Resolvers.DOI do + @moduledoc """ + Last-resort citation resolver — always succeeds. + + If the citation key looks like a DOI (starts with "10."), returns a + `https://doi.org/...` link. Otherwise returns the key itself as a + plain label with no URL. + """ + + @spec resolve(String.t()) :: {:ok, map()} + def resolve(key) do + if String.starts_with?(key, "10.") do + {:ok, %{label: key, url: "https://doi.org/#{key}"}} + else + {:ok, %{label: key, url: nil}} + end + end +end diff --git a/scripts/pipeline/lib/pipeline/resolvers/zotero.ex b/scripts/pipeline/lib/pipeline/resolvers/zotero.ex new file mode 100644 index 000000000..c4cb6a746 --- /dev/null +++ b/scripts/pipeline/lib/pipeline/resolvers/zotero.ex @@ -0,0 +1,182 @@ +defmodule Pipeline.Resolvers.Zotero do + @moduledoc """ + Resolves citation keys via Zotero Better BibTeX's JSON-RPC API. + + Requires Zotero to be running with the Better BibTeX plugin installed. + Default endpoint: http://localhost:23119/better-bibtex/json-rpc + + Resolution strategy: + 1. Search by citation key via `item.search` + 2. If found, try to get a PDF attachment link (zotero://open-pdf/...) + 3. 
Fall back to zotero://select/items/@key + + Returns `{:ok, %{label: "Author, Year", url: "zotero://..."}}` or `:error`. + """ + + require Logger + + @rpc_path "/better-bibtex/json-rpc" + + @doc """ + Attempt to resolve `key` against a running Zotero instance. + `base_url` defaults to `http://localhost:23119`. + """ + @spec resolve(String.t(), String.t()) :: {:ok, map()} | :error + def resolve(key, base_url \\ "http://localhost:23119") do + url = base_url <> @rpc_path + + payload = + Jason.encode!(%{ + jsonrpc: "2.0", + method: "item.search", + params: [ + [["citationKey", "is", key]] + ], + id: 1 + }) + + case Req.post(url, + body: payload, + headers: [{"content-type", "application/json"}], + receive_timeout: 5_000, + finch: Pipeline.Finch + ) do + {:ok, %{status: 200, body: body}} -> + parse_response(body, key, base_url) + + {:ok, %{status: status}} -> + Logger.debug("Zotero: unexpected HTTP #{status} for key #{key}") + :error + + {:error, reason} -> + Logger.debug("Zotero: connection failed for key #{key}: #{inspect(reason)}") + :error + + other -> + Logger.debug("Zotero: unexpected result for key #{key}: #{inspect(other)}") + :error + end + rescue + e -> + Logger.debug("Zotero: exception resolving key #{key}: #{inspect(e)}") + :error + end + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + defp parse_response(%{"result" => [item | _]}, key, base_url) do + label = build_label(item) + url = resolve_url(item, key, base_url) + {:ok, %{label: label, url: url}} + end + + defp parse_response(%{"result" => []}, key, _base_url) do + Logger.debug("Zotero: no item found for key #{key}") + :error + end + + defp parse_response(%{"error" => err}, key, _base_url) do + Logger.debug("Zotero: RPC error for key #{key}: #{inspect(err)}") + :error + end + + defp parse_response(body, key, _base_url) do + Logger.debug("Zotero: unexpected response shape for key #{key}: 
#{inspect(body)}") + :error + end + + defp fetch_pdf_url(key, base_url) do + payload = + Jason.encode!(%{ + jsonrpc: "2.0", + method: "item.attachments", + params: [key], + id: 2 + }) + + case Req.post(base_url <> @rpc_path, + body: payload, + headers: [{"content-type", "application/json"}], + receive_timeout: 5_000, + finch: Pipeline.Finch + ) do + {:ok, %{status: 200, body: %{"result" => attachments}}} when is_list(attachments) -> + attachments + |> Enum.find_value(fn att -> + open = Map.get(att, "open", "") + path = Map.get(att, "path", "") + if String.ends_with?(path, ".pdf"), do: open, else: nil + end) + + _ -> + nil + end + rescue + _ -> nil + end + + # CSL-JSON format: authors are under "author" with "family"/"given" keys. + # Year is under "issued" -> "date-parts" -> [[year, month, day]]. + defp build_label(item) do + authors = Map.get(item, "author", []) + year = extract_year(item) + + author_part = + case authors do + [] -> + "Unknown" + + [single] -> + Map.get(single, "family", Map.get(single, "literal", "Unknown")) + + [first | rest] -> + first_name = Map.get(first, "family", Map.get(first, "literal", "Unknown")) + last_name = + rest + |> List.last() + |> then(&Map.get(&1, "family", Map.get(&1, "literal", "Unknown"))) + + "#{first_name} & #{last_name}" + end + + if year, do: "#{author_part}, #{year}", else: author_part + end + + # "issued": {"date-parts": [["2021", 2, 3]]} + defp extract_year(item) do + case get_in(item, ["issued", "date-parts"]) do + [[year | _] | _] -> to_string(year) + _ -> nil + end + end + + defp resolve_url(item, key, base_url) do + # Prefer zotero://open-pdf/... for items with a PDF attachment. + # Fall back to zotero://select/library/items/KEY to open the item in Zotero. + # The "id" field is a URI like "http://zotero.org/users/123/items/ABCD1234". 
+ pdf_url = fetch_pdf_url(key, base_url) + + if pdf_url do + pdf_url + else + item_key = + item + |> Map.get("id", "") + |> String.split("/") + |> List.last() + |> non_empty() + + if item_key do + "zotero://select/library/items/#{item_key}" + else + "zotero://select/items/@#{key}" + end + end + end + + defp non_empty(nil), do: nil + defp non_empty(""), do: nil + defp non_empty(v), do: v +end diff --git a/scripts/pipeline/lib/pipeline/transform.ex b/scripts/pipeline/lib/pipeline/transform.ex new file mode 100644 index 000000000..06b573444 --- /dev/null +++ b/scripts/pipeline/lib/pipeline/transform.ex @@ -0,0 +1,48 @@ +defmodule Pipeline.Transform do + @moduledoc """ + Behaviour that all markdown transform modules must implement. + + ## Callbacks + + - `init/1` — called once before processing; returns transform-specific state. + Default implementation returns the opts map unchanged. + - `apply/3` — called per .md file; returns the (possibly modified) content. + - `teardown/1` — optional cleanup after all files are processed. + + ## Example + + defmodule MyTransform do + @behaviour Pipeline.Transform + + @impl true + def init(opts), do: %{some_state: opts[:value]} + + @impl true + def apply(content, state, _opts) do + String.replace(content, "foo", state.some_state) + end + end + """ + + @doc "One-time initialisation. Returns opaque state passed to apply/3." + @callback init(opts :: map()) :: term() + + @doc "Transform file content. Returns the (possibly modified) content string." + @callback apply(content :: String.t(), state :: term(), opts :: map()) :: String.t() + + @doc "Optional cleanup after all files are processed." 
+ @callback teardown(state :: term()) :: :ok + + @optional_callbacks teardown: 1 + + defmacro __using__(_) do + quote do + @behaviour Pipeline.Transform + + @impl Pipeline.Transform + def init(opts), do: opts + + defoverridable init: 1 + end + end +end diff --git a/scripts/pipeline/lib/pipeline/transforms/citations.ex b/scripts/pipeline/lib/pipeline/transforms/citations.ex new file mode 100644 index 000000000..67316db64 --- /dev/null +++ b/scripts/pipeline/lib/pipeline/transforms/citations.ex @@ -0,0 +1,231 @@ +defmodule Pipeline.Transforms.Citations do + @moduledoc """ + Markdown transform: resolves org-citar citation keys to hyperlinks. + + ## Recognised citation syntax (as output by ox-hugo from org-citar) + + [cite:@key] → org-cite / citar standard (most common) + [cite:@key1;@key2] → multiple citations + cite:key → older roam-style bare cite syntax + + ## Resolution chain (in order) + + 1. Zotero (live instance via Better BibTeX JSON-RPC) — preferred + 2. BibTeX file (BIBTEX_FILE env var) — fallback + 3. DOI / bare key — always succeeds + + ## Modes (opts.citation_mode) + + :silent — silently use DOI/bare-key fallback when Zotero+BibTeX fail + :warn — (default) emit a Logger.warning for unresolved keys + :strict — raise on unresolved keys (aborts pipeline) + + ## Format + + Resolved citations are rendered as: + + [Label](url) when a URL is available + [Label] when no URL could be determined (bare key fallback) + + Multiple semicolon-separated keys become space-separated links: + + [cite:@a;@b] → [Author A, 2020](url_a) [Author B, 2019](url_b) + + ## init/1 callback + + Loads the BibTeX file (if configured) once before processing begins, + and probes Zotero availability, emitting warnings as appropriate. + """ + + @behaviour Pipeline.Transform + + require Logger + + alias Pipeline.Resolvers.Zotero + alias Pipeline.Resolvers.BibTeX + alias Pipeline.Resolvers.DOI + + # Match [cite:@key] and [cite:@key1;@key2;...] 
(org-cite / citar style) + @cite_bracket_regex ~r/\[cite:(@[^\]]+)\]/ + + # Match bare cite:key (older roam style, no brackets, no @ prefix) + @cite_bare_regex ~r/(? + "All citations will fall back to bare-key rendering. " <> + "Set BIBTEX_FILE env var or start Zotero with Better BibTeX to resolve citations." + ) + end + + %{ + bibtex_entries: bibtex_entries, + zotero_available: zotero_available, + zotero_url: Map.get(opts, :zotero_url, "http://localhost:23119"), + citation_mode: Map.get(opts, :citation_mode, :warn) + } + end + + @doc """ + Apply citation resolution to a single markdown file's content. + """ + def apply(content, state, _opts) do + content + |> resolve_bracket_citations(state) + |> resolve_bare_citations(state) + end + + # ------------------------------------------------------------------ + # Resolution passes + # ------------------------------------------------------------------ + + defp resolve_bracket_citations(content, state) do + Regex.replace(@cite_bracket_regex, content, fn _full, keys_str -> + keys_str + |> String.split(";") + |> Enum.map(&String.trim/1) + |> Enum.map(fn "@" <> key -> key end) + |> Enum.map(&resolve_key(&1, state)) + |> Enum.join(" ") + end) + end + + defp resolve_bare_citations(content, state) do + Regex.replace(@cite_bare_regex, content, fn _full, key -> + resolve_key(key, state) + end) + end + + # ------------------------------------------------------------------ + # Single-key resolution chain + # ------------------------------------------------------------------ + + defp resolve_key(key, state) do + info = + with :error <- try_zotero(key, state), + :error <- try_bibtex(key, state) do + handle_unresolved(key, state) + else + {:ok, citation_info} -> citation_info + end + + format_result(info) + end + + defp try_zotero(_key, %{zotero_available: false}), do: :error + + defp try_zotero(key, %{zotero_url: url}) do + Zotero.resolve(key, url) + end + + defp try_bibtex(_key, %{bibtex_entries: entries}) when map_size(entries) == 
0, do: :error + + defp try_bibtex(key, %{bibtex_entries: entries}) do + BibTeX.resolve(key, entries) + end + + defp handle_unresolved(key, %{citation_mode: mode}) do + case mode do + :strict -> + raise "Citations: could not resolve citation key '#{key}' and mode is :strict" + + :warn -> + Logger.warning("Citations: unresolved citation key '#{key}' — using bare-key fallback") + {:ok, result} = DOI.resolve(key) + result + + :silent -> + {:ok, result} = DOI.resolve(key) + result + end + end + + defp format_result(%{label: label, url: nil}), do: "[#{label}]" + defp format_result(%{label: label, url: url}), do: "[#{label}](#{url})" + + # ------------------------------------------------------------------ + # Init helpers + # ------------------------------------------------------------------ + + defp load_bibtex(opts) do + path = Map.get(opts, :bibtex_file) || System.get_env("BIBTEX_FILE") + + cond do + is_nil(path) -> + Logger.debug("Citations: BIBTEX_FILE not set — BibTeX resolver disabled") + %{} + + not File.exists?(path) -> + Logger.warning("Citations: BIBTEX_FILE=#{path} does not exist — BibTeX resolver disabled") + %{} + + true -> + case BibTeX.load(path) do + {:ok, entries} -> entries + {:error, reason} -> + Logger.warning("Citations: failed to load BibTeX file #{path}: #{inspect(reason)}") + %{} + end + end + end + + defp probe_zotero(opts) do + url = Map.get(opts, :zotero_url, "http://localhost:23119") + + # Use a no-op JSON-RPC call to probe availability. + # /better-bibtex/cayw is intentionally avoided — it blocks waiting for + # user interaction and never returns without a pick. 
+ payload = + Jason.encode!(%{ + jsonrpc: "2.0", + method: "item.search", + params: [[[]]], + id: 0 + }) + + result = + try do + Req.post(url <> "/better-bibtex/json-rpc", + body: payload, + headers: [{"content-type", "application/json"}], + receive_timeout: 3_000, + finch: Pipeline.Finch + ) + rescue + e -> {:error, e} + end + + case result do + {:ok, %{status: 200}} -> + Logger.info("Citations: Zotero Better BibTeX is available at #{url}") + true + + {:ok, %{status: status}} -> + Logger.warning( + "Citations: Zotero responded HTTP #{status} at #{url} — " <> + "is Better BibTeX installed?" + ) + false + + _ -> + Logger.warning( + "Citations: Zotero not reachable at #{url} — " <> + "start Zotero with Better BibTeX or set BIBTEX_FILE as fallback" + ) + false + end + end +end diff --git a/scripts/pipeline/mix.exs b/scripts/pipeline/mix.exs new file mode 100644 index 000000000..33760f782 --- /dev/null +++ b/scripts/pipeline/mix.exs @@ -0,0 +1,27 @@ +defmodule Pipeline.MixProject do + use Mix.Project + + def project do + [ + app: :pipeline, + version: "0.1.0", + elixir: "~> 1.15", + start_permanent: Mix.env() == :prod, + deps: deps() + ] + end + + def application do + [ + extra_applications: [:logger, :inets, :ssl], + mod: {Pipeline.Application, []} + ] + end + + defp deps do + [ + {:req, "~> 0.5"}, + {:jason, "~> 1.4"} + ] + end +end diff --git a/scripts/pipeline/mix.lock b/scripts/pipeline/mix.lock new file mode 100644 index 000000000..862aa1b7b --- /dev/null +++ b/scripts/pipeline/mix.lock @@ -0,0 +1,11 @@ +%{ + "finch": {:hex, :finch, "0.21.0", "b1c3b2d48af02d0c66d2a9ebfb5622be5c5ecd62937cf79a88a7f98d48a8290c", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or 
~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "87dc6e169794cb2570f75841a19da99cfde834249568f2a5b121b809588a4377"}, + "hpax": {:hex, :hpax, "1.0.3", "ed67ef51ad4df91e75cc6a1494f851850c0bd98ebc0be6e81b026e765ee535aa", [:mix], [], "hexpm", "8eab6e1cfa8d5918c2ce4ba43588e894af35dbd8e91e6e55c817bca5847df34a"}, + "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, + "mime": {:hex, :mime, "2.0.7", "b8d739037be7cd402aee1ba0306edfdef982687ee7e9859bee6198c1e7e2f128", [:mix], [], "hexpm", "6171188e399ee16023ffc5b76ce445eb6d9672e2e241d2df6050f3c771e80ccd"}, + "mint": {:hex, :mint, "1.7.1", "113fdb2b2f3b59e47c7955971854641c61f378549d73e829e1768de90fc1abf1", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "fceba0a4d0f24301ddee3024ae116df1c3f4bb7a563a731f45fdfeb9d39a231b"}, + "nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"}, + "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, + "req": {:hex, :req, "0.5.17", "0096ddd5b0ed6f576a03dde4b158a0c727215b15d2795e59e0916c6971066ede", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: 
"hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0b8bc6ffdfebbc07968e59d3ff96d52f2202d0536f10fef4dc11dc02a2a43e39"}, + "telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"}, +}