Files
quartz-org-roam/scripts/pipeline/lib/pipeline/resolvers/bibtex.ex
Ignacio Ballesteros 511b003da8 Add Elixir markdown pipeline with org-citar citation resolution
Introduces scripts/pipeline/, a Mix project that runs as a post-export
transformation pass over content/*.md before Quartz builds the site.

Pipeline (scripts/export.exs phase 3):
- Compiles and loads the Mix project at export time (cached after first run)
- Applies a list of Transform modules sequentially over all .md files
- Only rewrites files that were actually changed

Citations transform (Pipeline.Transforms.Citations):
- Resolves [cite:@key] and bare cite:key syntax produced by ox-hugo/citar
- Resolution chain: Zotero BBT JSON-RPC → BibTeX file → DOI/bare-key fallback
- Zotero probe uses a no-op JSON-RPC call (cayw endpoint blocks indefinitely)
- Zotero resolver fetches PDF attachments via item.attachments, producing
  zotero://open-pdf/... links; falls back to zotero://select/library/items/...
- BibTeX resolver parses .bib files with a simple regex parser (no deps)
- DOI resolver is the always-succeeding last resort

Configuration via env vars:
  BIBTEX_FILE   — path to .bib file for fallback resolution
  ZOTERO_URL    — Zotero base URL (default: http://localhost:23119)
  CITATION_MODE — silent | warn (default) | strict

Adding future transforms requires only implementing Pipeline.Transform
behaviour and appending the module to the transforms list in export.exs.
2026-02-20 10:00:11 +01:00

179 lines
4.7 KiB
Elixir

defmodule Pipeline.Resolvers.BibTeX do
  @moduledoc """
  Resolves citation keys from a local BibTeX (.bib) file.

  Configured via the `BIBTEX_FILE` environment variable, or passed directly
  as `opts.bibtex_file`. The file is parsed once with `load/1` and the
  resulting entry map is reused for all `resolve/2` lookups.

  Supports extracting: author last names, year, DOI, URL.

  BibTeX entry format parsed:

      @type{citationkey,
        author = {Last, First and Last2, First2},
        year   = {2021},
        title  = {Some Title},
        doi    = {10.xxxx/yyyy},
        url    = {https://example.com},
      }

  Field values may be brace-delimited (with at most one level of nested
  braces), double-quoted, or a bare number such as `year = 2021`.

  `resolve/2` returns `{:ok, %{label: "Author, Year", url: "..."}}` or `:error`.
  """

  require Logger

  @typedoc "Fields of a single BibTeX entry, keyed by lower-cased field name."
  @type entry :: %{optional(String.t()) => String.t()}

  @typedoc "Parsed entries, keyed by citation key."
  @type entries :: %{optional(String.t()) => entry()}

  # ------------------------------------------------------------------
  # Public API
  # ------------------------------------------------------------------

  @doc """
  Parses the .bib file at `path` into a map of `%{citation_key => entry_map}`.

  Logs the number of loaded entries at `:info` level.

  Returns `{:ok, entries}`, or `{:error, reason}` with the reason reported by
  `File.read/1` when the file cannot be read.
  """
  @spec load(String.t()) :: {:ok, entries()} | {:error, term()}
  def load(path) do
    # A failed read falls through `with` unchanged as `{:error, reason}`.
    with {:ok, content} <- File.read(path) do
      entries = parse_entries(content)
      Logger.info("BibTeX: loaded #{map_size(entries)} entries from #{path}")
      {:ok, entries}
    end
  end

  @doc """
  Resolves a citation key from pre-loaded BibTeX entries.

  Returns `{:ok, %{label: label, url: url}}` when `key` is present in
  `entries`, where `url` is `nil` if the entry has neither a DOI nor a URL.
  Returns `:error` when the key is unknown.
  """
  @spec resolve(String.t(), entries()) ::
          {:ok, %{label: String.t(), url: String.t() | nil}} | :error
  def resolve(key, entries) do
    # `Map.fetch/2` already returns `:error` on a miss, which is passed through.
    with {:ok, entry} <- Map.fetch(entries, key) do
      {:ok, %{label: build_label(entry), url: build_url(entry)}}
    end
  end

  # ------------------------------------------------------------------
  # Parsing
  # ------------------------------------------------------------------
  # The parser is purely regex based: the file is cut into chunks in front of
  # every "@type{" and each chunk is then scanned for `name = value` pairs.

  # Zero-width boundary in front of every "@type{".
  defp entry_boundary, do: ~r/(?=@\w+\s*\{)/

  # Matches "@type{key," and captures the citation key.
  defp entry_header, do: ~r/@\w+\s*\{\s*([^,\s]+)\s*,/

  # Matches `name = {value}`, `name = "value"` or `name = 123`.
  # Capture 1 is the field name; captures 2-4 are the value for the braced,
  # quoted and bare-number form respectively. The braced form tolerates a
  # single level of nested braces.
  defp field_regex do
    ~r/(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)"|(\d+))/
  end

  # Separator between author names: "and" surrounded by any whitespace
  # (including line breaks). The negative lookahead skips an "and" that is
  # followed by a closing brace before any opening one, i.e. one that sits
  # inside a brace-protected name such as "{Barnes and Noble}".
  defp author_separator, do: ~r/\s+and\s+(?![^{}]*\})/

  # Builds the `%{key => fields}` map. Chunks without an entry header (text
  # before the first entry, "@string{...}" definitions without a key) are
  # dropped by the non-matching pattern in the comprehension. When a key is
  # defined twice, the later entry wins.
  defp parse_entries(content) do
    for chunk <- String.split(content, entry_boundary(), trim: true),
        [_, key] <- [Regex.run(entry_header(), chunk)],
        into: %{} do
      {String.trim(key), parse_fields(chunk)}
    end
  end

  # Extracts the fields of one entry chunk as `%{"lowercase_name" => value}`.
  # When a field occurs twice, the later occurrence wins.
  defp parse_fields(chunk) do
    field_regex()
    |> Regex.scan(chunk)
    |> Map.new(fn [_whole, name | captures] ->
      # Exactly one value capture can be non-empty; unmatched ones come back
      # as "" or are missing from the list altogether.
      value = Enum.find(captures, "", &(&1 != ""))
      {String.downcase(name), String.trim(value)}
    end)
  end

  # ------------------------------------------------------------------
  # Label & URL building
  # ------------------------------------------------------------------

  # "Authors, Year" when a year can be found, otherwise just the authors.
  defp build_label(entry) do
    authors =
      entry
      |> Map.get("author", "")
      |> parse_authors()
      |> format_authors()

    case entry_year(entry) do
      nil -> authors
      year -> "#{authors}, #{year}"
    end
  end

  # First four-digit year found in the "year" field, then in "date".
  defp entry_year(entry) do
    Enum.find_value(["year", "date"], fn field ->
      entry |> Map.get(field, "") |> extract_year()
    end)
  end

  # Splits a BibTeX author list into last names, dropping empty results.
  defp parse_authors(""), do: []

  defp parse_authors(author_str) do
    author_str
    |> String.split(author_separator(), trim: true)
    |> Enum.map(&extract_last_name/1)
    |> Enum.reject(&(&1 == ""))
  end

  # Handles "Last, First" and "First Last" formats, plus names that are
  # wrapped entirely in braces (e.g. "{World Health Organization}"), which are
  # kept as a whole instead of being cut at a comma or space.
  defp extract_last_name(name) do
    name = String.trim(name)

    case Regex.run(~r/^\{([^{}]*)\}$/, name) do
      [_, protected] -> String.trim(protected)
      nil -> name |> personal_last_name() |> strip_braces()
    end
  end

  defp personal_last_name(name) do
    cond do
      String.contains?(name, ",") ->
        name |> String.split(",") |> hd() |> String.trim()

      name =~ ~r/\s/ ->
        # `name` is trimmed and contains whitespace, so there are >= 2 words.
        name |> String.split() |> List.last()

      true ->
        name
    end
  end

  # Removes leftover protective braces, e.g. "{van der Berg}" -> "van der Berg".
  defp strip_braces(name) do
    name |> String.replace(["{", "}"], "") |> String.trim()
  end

  # No authors -> "Unknown"; one -> its last name; two or more -> the first
  # and the last author joined by "&" (authors in between are not shown).
  defp format_authors([]), do: "Unknown"
  defp format_authors([single]), do: single
  defp format_authors([first | rest]), do: "#{first} & #{List.last(rest)}"

  # Returns the first standalone four-digit number in `str`, or nil.
  defp extract_year(""), do: nil

  defp extract_year(str) do
    case Regex.run(~r/\b(\d{4})\b/, str) do
      [_, year] -> year
      _ -> nil
    end
  end

  # Prefers a DOI link, falls back to the "url" field, else nil.
  defp build_url(entry) do
    doi = entry |> Map.get("doi", "") |> normalize_doi()
    url = Map.get(entry, "url", "")

    cond do
      doi != "" -> "https://doi.org/#{doi}"
      url != "" -> url
      true -> nil
    end
  end

  # Reduces a DOI field to the bare identifier so that values already stored
  # as "https://doi.org/10.x/y", "http://dx.doi.org/10.x/y" or "doi:10.x/y"
  # do not end up with a doubled resolver prefix.
  defp normalize_doi(doi) do
    doi
    |> String.trim()
    |> String.replace(~r{^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)}i, "")
  end
end