From d1631ddffe80fbfad779743ca56e5d413e112ec0 Mon Sep 17 00:00:00 2001
From: Kayos <kayos@sulkta.com>
Date: Thu, 14 May 2026 09:40:01 -0700
Subject: [PATCH] engines: import f5-tts + kokoro + tortoise sidecars into the
 tree

The python FastAPI sidecars have lived ad-hoc at /mnt/cache/appdata/
<engine>/build/ on Lucy without version control. Bringing them into
the skald repo so the engine code travels with the cross-engine
routing it depends on.

This commit lands the VANILLA version of each engine on main:

  engines/f5-tts/    SWivid F5-TTS (CC-BY-NC weights flagged)
  engines/kokoro/    hexgrad Kokoro-82M (Apache 2.0 top to bottom)
  engines/tortoise/  neonbjb Tortoise-TTS (Apache 2.0 top to bottom)

Engine-specific kludges (question doubling, GPU coordination,
pause-duration tuning) get layered on engine/* branches per the
README. Main stays the safe-to-read baseline.
---
 engines/README.md            |  58 +++++++
 engines/f5-tts/Dockerfile    |  41 +++++
 engines/f5-tts/compose.yml   |  43 +++++
 engines/f5-tts/server.py     | 184 ++++++++++++++++++++
 engines/kokoro/Dockerfile    |  35 ++++
 engines/kokoro/compose.yml   |  37 ++++
 engines/kokoro/server.py     | 324 +++++++++++++++++++++++++++++++++++
 engines/tortoise/Dockerfile  |  45 +++++
 engines/tortoise/compose.yml |  43 +++++
 engines/tortoise/server.py   | 305 +++++++++++++++++++++++++++++++++
 10 files changed, 1115 insertions(+)
 create mode 100644 engines/README.md
 create mode 100644 engines/f5-tts/Dockerfile
 create mode 100644 engines/f5-tts/compose.yml
 create mode 100644 engines/f5-tts/server.py
 create mode 100644 engines/kokoro/Dockerfile
 create mode 100644 engines/kokoro/compose.yml
 create mode 100644 engines/kokoro/server.py
 create mode 100644 engines/tortoise/Dockerfile
 create mode 100644 engines/tortoise/compose.yml
 create mode 100644 engines/tortoise/server.py
diff --git a/engines/README.md b/engines/README.md
new file mode 100644
index 0000000..d00c0dc
--- /dev/null
+++ b/engines/README.md
@@ -0,0 +1,58 @@
+# Skald TTS engines
+
+This subtree holds the per-engine sidecars that skald's narrate path
+talks to over HTTP. Each engine has the same contract:
+
+- `POST /synthesize` — same JSON shape across engines so skald's
+  one Rust client (`skald-core::narrate::Narrator`) deserializes
+  all of them. See `engines/<name>/server.py` for the per-engine
+  implementation.
+- `GET /healthz` — boot probe + model-loaded flag.
+
+Skald routes per-request by `voices.source`: a `kokoro_*` source
+goes to `$KOKORO_URL`, a `tortoise_*` source goes to `$TORTOISE_URL`,
+anything else (`lj_speech`, generic) goes to `$F5_TTS_URL`.
+
+## Engines
+
+| Dir | Engine | License (code/weights) | VRAM | Speed | Voices |
+|---|---|---|---|---|---|
+| `f5-tts/` | SWivid F5-TTS v1 | MIT / **CC-BY-NC** | ~5GB | fast (~2x real-time on 2070S) | voice cloning (LJ Speech reference shipped) |
+| `kokoro/` | hexgrad Kokoro-82M | Apache 2.0 / Apache 2.0 | ~1GB | very fast (~50x real-time) | 50+ named presets (af_*, am_*, bf_*, bm_*) |
+| `tortoise/` | neonbjb Tortoise-TTS | Apache 2.0 / Apache 2.0 | ~5GB | **slow** (~0.014x real-time, ~74s/s of audio on 2070S, standard preset) | 26 named built-ins (lj, freeman, daniel, weaver, jlaw, etc.) |
+
+## Branch model
+
+`main` carries the **vanilla** version of each engine — what you'd
+get from a clean `pip install <engine>` plus the FastAPI sidecar
+and control-tag splitter. No engine-specific kludges. Safe to look
+at without context.
+
+`engine/<name>` branches hold engine-tuned tweaks that don't
+generalise. Examples:
+
+- `engine/kokoro` — doubled-`??` prosody hack for the 82M's weak
+  question intonation, paragraph/scene/breath gap durations tuned
+  for af_heart's pacing, notes on how respellings need to be all-
+  lowercase to avoid letter-by-letter spell-out by misaki.
+- `engine/tortoise` — GPU exclusivity coordinator (stops F5 +
+  Kokoro before a Tortoise run since the 2070 Super can't host
+  all three at once), preset choice ergonomics, character→tortoise-
+  voice seed assignments.
+
+When deploying an engine to Lucy, the build dir at
+`/mnt/cache/appdata/<engine>/build/` tracks the engine's branch:
+
+```bash
+cd /mnt/cache/appdata/kokoro/build
+git fetch && git checkout engine/kokoro
+docker compose -p <name> up -d --build
+```
+
+## GPU coordination (2070 Super)
+
+The 8GB card is the bottleneck. F5 + Kokoro can co-reside (~5GB +
+~1GB). Tortoise pushes the budget over and needs the GPU largely
+to itself — the `engine/tortoise` branch will carry the script
+that stops kokoro + f5 before a tortoise run and restarts them
+after. Replace with proper coordination once we have more VRAM.
diff --git a/engines/f5-tts/Dockerfile b/engines/f5-tts/Dockerfile
new file mode 100644
index 0000000..be4cd91
--- /dev/null
+++ b/engines/f5-tts/Dockerfile
@@ -0,0 +1,41 @@
+# Sulkta build of F5-TTS — upstream ghcr.io/swivid/f5-tts:main was
+# shipped with torch 2.11/torchaudio 2.4 ABI mismatch on 2026-05-13,
+# breaking import torchaudio at boot. We rebuild on a known-good
+# pytorch base + pip install f5-tts.
+#
+# Image tag in lucy-registry: lucy-registry:5000/f5-tts:<ver>
+#
+# License: MIT (code) / CC-BY-NC (Emilia-trained weights).
+# Personal use OK; redistribution gray-area — flagged.
+
+FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    HF_HOME=/cache/hf \
+    HF_HUB_DISABLE_TELEMETRY=1
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        ffmpeg \
+        git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Base torch 2.6.0 + torchaudio 2.6.0; f5-tts pulls a recent
+# transformers (5.x) which needs torch >=2.5's modern
+# torch.library.custom_op type signatures.
+RUN pip install --no-cache-dir 'f5-tts>=1.0.0'
+
+# Mount points for the HF cache, rendered audio, and reference voices.
+RUN mkdir -p /cache/hf /audio /voices
+
+COPY server.py /app/f5_server.py
+WORKDIR /app
+
+EXPOSE 7860
+
+# Skald talks to our FastAPI sidecar (server.py in-tree, copied in as
+# f5_server), not Gradio. Models load at startup so the first request
+# doesn't pay the cold-start cost. uvicorn on :7860 keeps the port stable.
+CMD ["uvicorn", "f5_server:app", "--host", "0.0.0.0", "--port", "7860"]
diff --git a/engines/f5-tts/compose.yml b/engines/f5-tts/compose.yml
new file mode 100644
index 0000000..5f11b13
--- /dev/null
+++ b/engines/f5-tts/compose.yml
@@ -0,0 +1,43 @@
+# F5-TTS standalone stack on Lucy.
+#
+# License posture (acknowledged 2026-05-13): code is MIT, but
+# the pretrained model weights are CC-BY-NC (Emilia training data).
+# Personal listen is fine; public sharing is a flagged gray area.
+# Cobb's call: ship anyway.
+#
+# Runtime: 8GB GPU is plenty (F5 inference ~4-6GB peak).
+#
+# First-run cost: ~2GB model download from HuggingFace into hf-cache,
+# happens at first container startup. Subsequent runs are warm.
+name: f5-tts
+
+services:
+  f5-tts:
+    image: lucy-registry:5000/f5-tts:0.3
+    container_name: f5-tts
+    restart: unless-stopped
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    ports:
+      - "192.168.0.5:7792:7860"
+      - "127.0.0.1:7792:7860"
+    volumes:
+      # HF model weights cache — persists ~2GB after first download.
+      - /mnt/cache/appdata/f5-tts/hf-cache:/cache/hf
+      # Reference voice clips (lj_speech.wav, etc).
+      - /mnt/cache/appdata/f5-tts/voices:/voices:ro
+      # Rendered audio output — skald writes story narrations here.
+      - /mnt/cache/appdata/f5-tts/audio:/audio
+    environment:
+      HF_HOME: /cache/hf
+      HF_HUB_DISABLE_TELEMETRY: "1"
+    labels:
+      org.sulkta.domain: "sulkta"
+      org.sulkta.owner: "cobb"
+      org.sulkta.managed-by: "compose"
+      org.sulkta.role: "f5-tts"
diff --git a/engines/f5-tts/server.py b/engines/f5-tts/server.py
new file mode 100644
index 0000000..8e0d8ea
--- /dev/null
+++ b/engines/f5-tts/server.py
@@ -0,0 +1,184 @@
+"""Thin FastAPI server inside the F5-TTS container.
+
+Loads model + vocoder ONCE at startup (heavy: ~5s, ~5GB VRAM).
+POST /synthesize runs inference and writes the WAV to a shared
+volume; the response is JSON with the output path and metadata —
+not the WAV bytes, since chapter-length renders are 20-30MB and
+both skald and the f5 container share /audio anyway.
+
+Why not Gradio's API: Gradio's /gradio_api/call/* shape is event-
+stream + polling; this is a single POST + immediate response.
+Right for skald's "render one chapter, then move on" loop.
+"""
+import logging
+import time
+import uuid
+from pathlib import Path
+
+import soundfile as sf
+import torch
+from fastapi import FastAPI, HTTPException
+from omegaconf import OmegaConf
+from pydantic import BaseModel, Field
+from cached_path import cached_path
+from importlib.resources import files
+from hydra.utils import get_class
+
+from f5_tts.infer.utils_infer import (
+    infer_process,
+    load_model,
+    load_vocoder,
+    preprocess_ref_audio_text,
+)
+
+
+log = logging.getLogger("f5-server")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")
+
+
+# ─── model state ─────────────────────────────────────────────────
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_NAME = "F5TTS_v1_Base"
+VOCODER_NAME = "vocos"
+AUDIO_ROOT = Path("/audio")
+VOICES_ROOT = Path("/voices")
+
+_model = None
+_vocoder = None
+
+
+def _load_models() -> None:
+    """One-time model + vocoder load. ~5-8s wall-clock on first call."""
+    global _model, _vocoder
+    if _model is not None:
+        return
+    log.info("loading vocoder=%s device=%s", VOCODER_NAME, DEVICE)
+    _vocoder = load_vocoder(vocoder_name=VOCODER_NAME, device=DEVICE)
+
+    cfg_path = str(files("f5_tts").joinpath(f"configs/{MODEL_NAME}.yaml"))
+    model_cfg = OmegaConf.load(cfg_path)
+    model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
+    model_arc = model_cfg.model.arch
+
+    # F5TTS_v1_Base ships as a HuggingFace artifact; cached_path
+    # handles the resolution + downloads to HF_HOME.
+    ckpt_file = str(
+        cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors")
+    )
+    vocab_file = str(files("f5_tts").joinpath("infer/examples/vocab.txt"))
+
+    log.info("loading model=%s ckpt=%s", MODEL_NAME, ckpt_file)
+    _model = load_model(
+        model_cls, model_arc, ckpt_file,
+        mel_spec_type=VOCODER_NAME, vocab_file=vocab_file, device=DEVICE,
+    )
+    log.info("model + vocoder loaded; ready")
+
+
+# ─── FastAPI app ─────────────────────────────────────────────────
+
+
+class SynthesizeRequest(BaseModel):
+    # The text we want to synthesize. Long-form OK — F5-TTS chunks
+    # internally via infer_process.
+    gen_text: str = Field(min_length=1)
+    # Reference audio path (inside the f5-tts container). Defaults
+    # to the staged lj_speech clip.
+    ref_audio_path: str = "/voices/lj_speech.wav"
+    # Reference transcript. None = sidecar .txt beside the clip, else "".
+    ref_text: str | None = None
+    # Output filename, relative to /audio (the shared output dir).
+    # If omitted, a UUID-based name is assigned.
+    output_filename: str | None = None
+    # Speech speed (0.3-2.0). Default 1.0 = natural pace.
+    speed: float = Field(default=1.0, ge=0.3, le=2.0)
+    # Cross-fade between chunks; F5 default is 0.15s. Bigger smooths
+    # chunk boundaries on long-form prose at the cost of pacing.
+    cross_fade_duration: float = Field(default=0.15, ge=0.0, le=1.0)
+
+
+class SynthesizeResponse(BaseModel):
+    ok: bool
+    output_path: str
+    sample_rate_hz: int
+    duration_seconds: float
+    elapsed_ms: int
+    chars_in: int
+
+
+app = FastAPI(title="f5-tts-server", version="0.1.0")
+
+
+@app.on_event("startup")
+def _startup() -> None:
+    _load_models()
+
+
+@app.get("/healthz")
+def healthz() -> dict:
+    return {
+        "ok": True,
+        "device": DEVICE,
+        "model": MODEL_NAME,
+        "vocoder": VOCODER_NAME,
+        "loaded": _model is not None,
+    }
+
+
+@app.post("/synthesize", response_model=SynthesizeResponse)
+def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
+    if _model is None:
+        raise HTTPException(503, "model not loaded yet — retry shortly")
+
+    ref_audio_path = Path(req.ref_audio_path)
+    if not ref_audio_path.is_file():
+        raise HTTPException(400, f"ref_audio_path not found: {ref_audio_path}")
+
+    # If no explicit ref_text, try sidecar .txt then fall back to ""
+    # (which triggers F5's auto-ASR).
+    ref_text = req.ref_text
+    if ref_text is None:
+        sidecar = ref_audio_path.with_suffix(".txt")
+        if sidecar.is_file():
+            ref_text = sidecar.read_text().strip()
+        else:
+            ref_text = ""
+
+    output_filename = req.output_filename or f"{uuid.uuid4().hex}.wav"
+    if "/" in output_filename or ".." in output_filename:
+        raise HTTPException(400, "output_filename must be a bare name, no path parts")
+    output_path = AUDIO_ROOT / output_filename
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    started = time.monotonic()
+    ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
+        str(ref_audio_path), ref_text
+    )
+    audio_segment, final_sample_rate, _ = infer_process(
+        ref_audio_processed,
+        ref_text_processed,
+        req.gen_text,
+        _model,
+        _vocoder,
+        mel_spec_type=VOCODER_NAME,
+        speed=req.speed,
+        cross_fade_duration=req.cross_fade_duration,
+        device=DEVICE,
+    )
+    elapsed_ms = int((time.monotonic() - started) * 1000)
+
+    sf.write(str(output_path), audio_segment, final_sample_rate, subtype="PCM_16")
+    duration_s = float(len(audio_segment)) / float(final_sample_rate)
+    log.info(
+        "synthesized chars=%d -> %s (sr=%d, dur=%.2fs, elapsed=%dms)",
+        len(req.gen_text), output_path, final_sample_rate, duration_s, elapsed_ms,
+    )
+    return SynthesizeResponse(
+        ok=True,
+        output_path=str(output_path),
+        sample_rate_hz=final_sample_rate,
+        duration_seconds=duration_s,
+        elapsed_ms=elapsed_ms,
+        chars_in=len(req.gen_text),
+    )
diff --git a/engines/kokoro/Dockerfile b/engines/kokoro/Dockerfile
new file mode 100644
index 0000000..5c7b13e
--- /dev/null
+++ b/engines/kokoro/Dockerfile
@@ -0,0 +1,35 @@
+# Sulkta build of Kokoro-82M TTS.
+#
+# License: Apache 2.0 (code AND model weights). Clean stack — no
+# CC-BY-NC asterisk like F5-TTS's Emilia weights. This is the
+# narrator engine for sleep-quality audiobook reads; F5-TTS stays
+# around for voice-cloning cases.
+#
+# Kokoro is small enough to run on CPU but we use the cuda base
+# anyway to stay consistent with f5-tts and so it'll pick up the
+# GPU when no other tenant has it.
+FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    HF_HOME=/cache/hf \
+    HF_HUB_DISABLE_TELEMETRY=1
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        espeak-ng \
+        ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# kokoro pulls phonemizer + soundfile + espeakng transitively.
+RUN pip install --no-cache-dir 'kokoro>=0.9.0' 'fastapi>=0.115.0' 'uvicorn>=0.32.0' 'soundfile>=0.13.0'
+
+RUN mkdir -p /cache/hf /audio
+# In-tree the sidecar is server.py; installed as kokoro_server for CMD.
+COPY server.py /app/kokoro_server.py
+WORKDIR /app
+
+EXPOSE 7860
+
+CMD ["uvicorn", "kokoro_server:app", "--host", "0.0.0.0", "--port", "7860"]
diff --git a/engines/kokoro/compose.yml b/engines/kokoro/compose.yml
new file mode 100644
index 0000000..dc26741
--- /dev/null
+++ b/engines/kokoro/compose.yml
@@ -0,0 +1,37 @@
+# Kokoro-82M TTS stack on Lucy.
+#
+# Audiobook-quality narrator engine (Apache 2.0 code + weights —
+# clean stack vs F5-TTS's CC-BY-NC asterisk). Sibling to f5-tts;
+# both share /mnt/cache/appdata/f5-tts/audio so skald's audio
+# route serves outputs from either engine through the same path.
+#
+# License: Apache 2.0 top to bottom. Right for share/publish.
+name: kokoro
+
+services:
+  kokoro:
+    image: lucy-registry:5000/kokoro:0.5
+    container_name: kokoro
+    restart: unless-stopped
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    ports:
+      - "192.168.0.5:7794:7860"
+      - "127.0.0.1:7794:7860"
+    volumes:
+      - /mnt/cache/appdata/kokoro/hf-cache:/cache/hf
+      # Shared with f5-tts so skald's /audio route covers both.
+      - /mnt/cache/appdata/f5-tts/audio:/audio
+    environment:
+      HF_HOME: /cache/hf
+      HF_HUB_DISABLE_TELEMETRY: "1"
+    labels:
+      org.sulkta.domain: "sulkta"
+      org.sulkta.owner: "cobb"
+      org.sulkta.managed-by: "compose"
+      org.sulkta.role: "kokoro"
diff --git a/engines/kokoro/server.py b/engines/kokoro/server.py
new file mode 100644
index 0000000..169cbbd
--- /dev/null
+++ b/engines/kokoro/server.py
@@ -0,0 +1,324 @@
+"""Kokoro-82M FastAPI server, sibling to f5_server.
+
+Same /synthesize contract as F5 so skald can route between engines
+just by which URL it points at. The semantic difference: Kokoro
+voices are NAMED (af_heart, af_bella, am_michael, etc.) — there's no
+reference audio. We repurpose the `ref_audio_path` field to carry
+the voice name; if it starts with '/' we treat as F5-style path and
+error.
+
+Render-and-stitch:
+  The naive "feed the whole chapter to Kokoro" path produces audio
+  that runs paragraphs together — no breath between scenes, no beat
+  on a hard line break. So this server splits the input on paragraph
+  and scene boundaries, renders each chunk, and concatenates with
+  explicit silence inserts between chunks.
+
+  Control tags the splitter recognizes (case-insensitive):
+    [pause:1.5s]   — silence of N seconds at this point
+    [pause:500ms]  — silence of N milliseconds at this point
+    [breath]       — short breath beat (~400ms)
+    [scene]        — major scene break (~1500ms)
+
+  Implicit breaks the splitter inserts:
+    Blank line between paragraphs       → 700ms
+    A line of just `---`                → 1500ms (scene break)
+
+  Sentence-internal pacing (commas, periods, em-dashes, ellipses)
+  is left to Kokoro's own phonemizer — it handles that well.
+
+License: Apache 2.0 (code + model weights). Clean stack for the
+sleep-quality narrator use case.
+"""
+import logging
+import re
+import time
+import uuid
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import torch
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, Field
+
+from kokoro import KPipeline
+
+
+log = logging.getLogger("kokoro-server")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")
+
+
+# ─── pipeline state ──────────────────────────────────────────────
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DEFAULT_LANG = "a"
+DEFAULT_VOICE = "af_heart"
+AUDIO_ROOT = Path("/audio")
+SAMPLE_RATE = 24000
+
+# Default silence durations for implicit breaks. Tags override.
+PARAGRAPH_GAP_S = 0.7
+SCENE_GAP_S = 1.5
+BREATH_GAP_S = 0.4
+
+_pipelines: dict[str, KPipeline] = {}
+
+
+def _get_pipeline(lang_code: str) -> KPipeline:
+    if lang_code not in _pipelines:
+        log.info("loading kokoro pipeline lang=%s device=%s", lang_code, DEVICE)
+        _pipelines[lang_code] = KPipeline(lang_code=lang_code, device=DEVICE)
+        log.info("kokoro pipeline loaded lang=%s", lang_code)
+    return _pipelines[lang_code]
+
+
+# ─── split + render pipeline ─────────────────────────────────────
+
+# A "node" is one of three kinds; the renderer walks the list,
+# calls Kokoro on each text node with its (possibly per-segment)
+# voice, and emits zeros for each silence node.
+class Node:
+    __slots__ = ("kind", "value", "voice")
+
+    def __init__(self, kind: str, value, voice: str | None = None):
+        # kind ∈ {"text", "silence"}; value is str for text and
+        # float seconds for silence. voice override is only used
+        # on text nodes from [voice:X]...[/voice] blocks; outside
+        # those blocks the request's default voice is used.
+        self.kind = kind
+        self.value = value
+        self.voice = voice
+
+
+# Voice-block delimiters are parsed at a higher level than other
+# tags so dialogue can contain its own [breath]/[pause] beats.
+_VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]")
+_VOICE_CLOSE = "[/voice]"
+
+_TAG_RE = re.compile(
+    r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]",
+    re.IGNORECASE,
+)
+
+
+def _parse_tag(match: re.Match) -> float:
+    body = match.group(0).lower().strip("[]")
+    if body == "breath":
+        return BREATH_GAP_S
+    if body == "scene":
+        return SCENE_GAP_S
+    dur = float(match.group("dur"))
+    unit = (match.group("unit") or "s").lower()
+    return dur / 1000.0 if unit == "ms" else dur
+
+
+def _expand_inline(text: str, voice: str | None) -> list[Node]:
+    """Expand inline [breath]/[pause]/[scene] tags inside a chunk
+    of text that already has a single voice attribution. Voice
+    blocks themselves are handled one level up in split_to_nodes."""
+    out: list[Node] = []
+    text = text.strip()
+    if not text:
+        return out
+    cursor = 0
+    for m in _TAG_RE.finditer(text):
+        pre = text[cursor : m.start()].strip()
+        if pre:
+            out.append(Node("text", pre, voice))
+        out.append(Node("silence", _parse_tag(m)))
+        cursor = m.end()
+    tail = text[cursor:].strip()
+    if tail:
+        out.append(Node("text", tail, voice))
+    return out
+
+
+def split_to_nodes(text: str) -> list[Node]:
+    """Walk the source text and split it into text+silence nodes.
+
+    Order of operations:
+      1. Split on `---` lines (scene breaks).
+      2. Within each scene, split on blank lines (paragraph breaks).
+      3. Within each paragraph, split on [voice:X]...[/voice] blocks
+         so each dialogue line carries its own voice attribution.
+      4. Within each (paragraph, voice-region) chunk, expand inline
+         [breath]/[pause:Xs]/[scene] tags.
+    """
+    nodes: list[Node] = []
+    scenes = re.split(r"(?m)^\s*---\s*$", text)
+    for s_idx, scene in enumerate(scenes):
+        if s_idx > 0:
+            nodes.append(Node("silence", SCENE_GAP_S))
+        paragraphs = re.split(r"\n\s*\n", scene)
+        first_para = True
+        for para in paragraphs:
+            para = para.strip()
+            if not para:
+                continue
+            if not first_para:
+                nodes.append(Node("silence", PARAGRAPH_GAP_S))
+            first_para = False
+            nodes.extend(_split_paragraph_voices(para))
+    return nodes
+
+
+def _split_paragraph_voices(para: str) -> list[Node]:
+    """Split a single paragraph on [voice:X]...[/voice] blocks.
+    Outside those blocks the voice is None (request default), and any
+    orphan [/voice] marker is dropped so Kokoro never reads it aloud.
+    """
+    out: list[Node] = []
+    cursor = 0
+    while cursor < len(para):
+        m = _VOICE_OPEN_RE.search(para, cursor)
+        if not m:
+            out.extend(_expand_inline(para[cursor:].replace(_VOICE_CLOSE, " "), None))
+            break
+        # Text BEFORE the voice block uses default voice (orphan closers cut).
+        out.extend(_expand_inline(para[cursor : m.start()].replace(_VOICE_CLOSE, " "), None))
+        voice = m.group(1)
+        body_start = m.end()
+        close_idx = para.find(_VOICE_CLOSE, body_start)
+        if close_idx < 0:
+            # Unclosed voice block; treat rest of paragraph as that
+            # voice. Defensive — should be rare.
+            out.extend(_expand_inline(para[body_start:], voice))
+            break
+        out.extend(_expand_inline(para[body_start:close_idx], voice))
+        cursor = close_idx + len(_VOICE_CLOSE)
+    return out
+
+
+def _silence_samples(seconds: float) -> np.ndarray:
+    n = int(round(seconds * SAMPLE_RATE))
+    return np.zeros(n, dtype=np.float32)
+
+
+# ─── FastAPI app ─────────────────────────────────────────────────
+
+
+class SynthesizeRequest(BaseModel):
+    gen_text: str = Field(min_length=1)
+    ref_audio_path: str = DEFAULT_VOICE
+    ref_text: str | None = None
+    output_filename: str | None = None
+    speed: float = Field(default=1.0, ge=0.3, le=2.0)
+    lang_code: str = DEFAULT_LANG
+
+
+class SynthesizeResponse(BaseModel):
+    ok: bool
+    output_path: str
+    sample_rate_hz: int
+    duration_seconds: float
+    elapsed_ms: int
+    chars_in: int
+    engine: str
+    voice: str
+    text_nodes: int
+    silence_nodes: int
+    # Every distinct Kokoro voice id that actually got rendered.
+    # Single-element when no [voice:X] tags were in the input;
+    # multiple when multi-voice dialogue was attributed.
+    voices_used: list[str]
+
+
+app = FastAPI(title="kokoro-server", version="0.2.0")
+
+
+@app.on_event("startup")
+def _startup() -> None:
+    _get_pipeline(DEFAULT_LANG)
+
+
+@app.get("/healthz")
+def healthz() -> dict:
+    # Shape matches f5_server's so the same Rust HealthResponse
+    # struct deserializes both: model/vocoder/loaded fields are
+    # required by skald-core::narrate::HealthResponse.
+    return {
+        "ok": True,
+        "device": DEVICE,
+        "model": "kokoro-82m",
+        "vocoder": "kokoro-internal",
+        "loaded": bool(_pipelines),
+        "engine": "kokoro-82m",
+        "default_voice": DEFAULT_VOICE,
+        "default_lang": DEFAULT_LANG,
+        "loaded_langs": list(_pipelines.keys()),
+        "version": "0.2.0",
+    }
+
+
+@app.post("/synthesize", response_model=SynthesizeResponse)
+def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
+    if req.ref_audio_path.startswith("/"):
+        raise HTTPException(
+            400,
+            "ref_audio_path looks like a filesystem path; Kokoro takes a voice "
+            "name like 'af_heart' or 'am_michael'. Did you mean to route to the "
+            "f5-tts engine?",
+        )
+    voice = req.ref_audio_path
+
+    output_filename = req.output_filename or f"{uuid.uuid4().hex}.wav"
+    if "/" in output_filename or ".." in output_filename:
+        raise HTTPException(400, "output_filename must be a bare name, no path parts")
+    output_path = AUDIO_ROOT / output_filename
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    pipeline = _get_pipeline(req.lang_code)
+
+    # Split the text into a node list. Empty nodes get filtered.
+    nodes = [n for n in split_to_nodes(req.gen_text) if n.kind == "silence" or n.value]
+    text_count = sum(1 for n in nodes if n.kind == "text")
+    silence_count = sum(1 for n in nodes if n.kind == "silence")
+    if text_count == 0:
+        raise HTTPException(400, "gen_text expanded to zero text nodes")
+
+    started = time.monotonic()
+    pieces: list[np.ndarray] = []
+    voices_used: set[str] = set()
+    for node in nodes:
+        if node.kind == "silence":
+            pieces.append(_silence_samples(node.value))
+            continue
+        # text: hand to Kokoro. The node's voice override (set by
+        # [voice:X]...[/voice] blocks) wins; otherwise the request's
+        # default narrator voice.
+        seg_voice = node.voice or voice
+        voices_used.add(seg_voice)
+        chunk_audio: list[np.ndarray] = []
+        for _, _, audio in pipeline(node.value, voice=seg_voice, speed=req.speed):
+            arr = audio.cpu().numpy() if hasattr(audio, "cpu") else np.asarray(audio)
+            chunk_audio.append(arr.astype(np.float32))
+        if chunk_audio:
+            pieces.append(np.concatenate(chunk_audio))
+    elapsed_ms = int((time.monotonic() - started) * 1000)
+
+    if not pieces:
+        raise HTTPException(500, "kokoro returned no audio")
+    full_audio = np.concatenate(pieces)
+    sf.write(str(output_path), full_audio, SAMPLE_RATE, subtype="PCM_16")
+    duration_s = float(len(full_audio)) / float(SAMPLE_RATE)
+
+    log.info(
+        "synthesized chars=%d voice=%s text_nodes=%d silence_nodes=%d "
+        "voices_used=%s -> %s (dur=%.2fs, elapsed=%dms)",
+        len(req.gen_text), voice, text_count, silence_count,
+        sorted(voices_used), output_path, duration_s, elapsed_ms,
+    )
+    return SynthesizeResponse(
+        ok=True,
+        output_path=str(output_path),
+        sample_rate_hz=SAMPLE_RATE,
+        duration_seconds=duration_s,
+        elapsed_ms=elapsed_ms,
+        chars_in=len(req.gen_text),
+        engine="kokoro-82m",
+        voice=voice,
+        text_nodes=text_count,
+        silence_nodes=silence_count,
+        voices_used=sorted(voices_used),
+    )
diff --git a/engines/tortoise/Dockerfile b/engines/tortoise/Dockerfile
new file mode 100644
index 0000000..d2f104a
--- /dev/null
+++ b/engines/tortoise/Dockerfile
@@ -0,0 +1,45 @@
+# Sulkta build of Tortoise-TTS.
+#
+# Voice roster (built-in, no cloning needed): angie, daniel, deniro,
+# emma, freeman, geralt, halle, jlaw, lj, mol, myself, pat, pat2,
+# rainbow, snakes, tim_reynolds, tom, train_atkins, train_dotrice,
+# train_dreams, train_grace, train_kennard, train_lescault,
+# train_mouse, weaver, william. ~26 voices baked in.
+#
+# License: Apache 2.0 (code) + Apache 2.0 (model weights). Clean
+# stack for share/publish.
+#
+# Speed: slow. Trade for quality. Standard preset is ~10x slower
+# than Kokoro; high_quality is ~30x slower. Worth it for the
+# audiobook-quality bar.
+
+FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    HF_HOME=/cache/hf \
+    HF_HUB_DISABLE_TELEMETRY=1 \
+    TORTOISE_MODELS_DIR=/cache/tortoise-models
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        ffmpeg \
+        git \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir \
+        'tortoise-tts>=3.0.0' \
+        'fastapi>=0.115.0' \
+        'uvicorn>=0.32.0' \
+        'soundfile>=0.13.0' \
+        'numpy<2'
+
+RUN mkdir -p /cache/hf /cache/tortoise-models /audio
+# In-tree the sidecar is server.py; installed as tortoise_server for CMD.
+COPY server.py /app/tortoise_server.py
+WORKDIR /app
+
+EXPOSE 7860
+
+CMD ["uvicorn", "tortoise_server:app", "--host", "0.0.0.0", "--port", "7860"]
diff --git a/engines/tortoise/compose.yml b/engines/tortoise/compose.yml
new file mode 100644
index 0000000..ec4d386
--- /dev/null
+++ b/engines/tortoise/compose.yml
@@ -0,0 +1,43 @@
+# Tortoise-TTS stack on Lucy. Audiobook-quality engine with 25+
+# named voices (no cloning). Apache 2.0 top to bottom.
+#
+# Slow: ~10x kokoro wall clock at 'standard' preset. Worth it for
+# the quality bar. Cobb's call 2026-05-14: "use higgs (now tortoise)
+# and we will only let it use the full gpu for runs" — translated:
+# runs are batched, slow is acceptable.
+#
+# Co-resides with kokoro on the 2070 Super since tortoise is ~5GB
+# and kokoro is ~1GB (~6GB of the card's 8GB). If OOM hits during a
+# render, we'll add a coordination layer to pause kokoro first.
+name: tortoise
+
+services:
+  tortoise:
+    image: lucy-registry:5000/tortoise:0.1
+    container_name: tortoise
+    restart: unless-stopped
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    ports:
+      - "192.168.0.5:7795:7860"
+      - "127.0.0.1:7795:7860"
+    volumes:
+      - /mnt/cache/appdata/tortoise/hf-cache:/cache/hf
+      - /mnt/cache/appdata/tortoise/models:/cache/tortoise-models
+      # Shared audio dir with f5/kokoro so skald serves all engines'
+      # outputs through the same /audio route.
+      - /mnt/cache/appdata/f5-tts/audio:/audio
+    environment:
+      HF_HOME: /cache/hf
+      HF_HUB_DISABLE_TELEMETRY: "1"
+      TORTOISE_MODELS_DIR: /cache/tortoise-models
+    labels:
+      org.sulkta.domain: "sulkta"
+      org.sulkta.owner: "cobb"
+      org.sulkta.managed-by: "compose"
+      org.sulkta.role: "tortoise-tts"
diff --git a/engines/tortoise/server.py b/engines/tortoise/server.py
new file mode 100644
index 0000000..c39eafe
--- /dev/null
+++ b/engines/tortoise/server.py
@@ -0,0 +1,305 @@
+"""Tortoise-TTS FastAPI server. Sibling to kokoro_server.
+
+Same /synthesize contract as the kokoro server so skald only has to
+route by voice.source. Differences:
+  - Tortoise voices are NAMED PRESETS shipped with the library
+    (angie, daniel, freeman, jlaw, lj, weaver, etc.). No cloning.
+  - Tortoise is slow. Standard preset is ~10x kokoro's wall clock.
+    Caller should expect minutes per chunk, not seconds.
+  - Text is split into nodes (scene breaks, paragraphs, inline
+    [pause:N]/[breath]/[scene] tags) that render sequentially and
+    are stitched with silence. Untagged text uses the request's
+    voice throughout.
+  - The [voice:X]...[/voice] tags ARE parsed: each block renders
+    with its named voice. This is the audiobook win.
+
+Quality presets: ultra_fast / fast / standard / high_quality. The
+trade-off is real — high_quality on a 2070 Super is ~30x slower
+than kokoro. Default to 'standard' for the bar.
+"""
+import logging
+import re
+import time
+import uuid
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import torch
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, Field
+
+from tortoise.api import TextToSpeech
+from tortoise.utils.audio import load_voice
+
+
+log = logging.getLogger("tortoise-server")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")
+
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DEFAULT_VOICE = "lj"
+DEFAULT_PRESET = "standard"
+AUDIO_ROOT = Path("/audio")
+SAMPLE_RATE = 24000  # Tortoise outputs 24kHz
+
+# Silence durations for between-chunk stitching (matches kokoro
+# server's conventions so audio from both engines feels similar).
+PARAGRAPH_GAP_S = 0.7
+SCENE_GAP_S = 1.5
+BREATH_GAP_S = 0.4
+
+_tts: TextToSpeech | None = None
+_voice_cache: dict[str, tuple] = {}
+
+
+def _get_tts() -> TextToSpeech:
+    global _tts  # module-level singleton: built on the first call, reused afterwards
+    if _tts is None:
+        log.info("loading tortoise device=%s", DEVICE)
+        _tts = TextToSpeech(use_deepspeed=False, kv_cache=True, half=(DEVICE == "cuda"))  # half only on cuda
+        log.info("tortoise loaded")
+    return _tts
+
+
+def _get_voice(name: str) -> tuple:
+    """Cache voice latents to avoid re-loading reference clips on
+    every synthesis call. Tortoise's load_voice returns
+    (voice_samples, conditioning_latents)."""
+    if name not in _voice_cache:
+        _voice_cache[name] = load_voice(name)
+    return _voice_cache[name]
+
+
+# ─── tag splitter (lifted from kokoro_server) ───────────────────
+
+
+class Node:
+    __slots__ = ("kind", "value", "voice")
+
+    def __init__(self, kind: str, value, voice: str | None = None):
+        self.kind = kind
+        self.value = value
+        self.voice = voice
+
+
+_VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]")
+_VOICE_CLOSE = "[/voice]"
+_TAG_RE = re.compile(
+    r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]",
+    re.IGNORECASE,
+)
+
+
+def _parse_tag(match: re.Match) -> float:
+    body = match.group(0).lower().strip("[]")
+    if body == "breath":
+        return BREATH_GAP_S
+    if body == "scene":
+        return SCENE_GAP_S
+    dur = float(match.group("dur"))
+    unit = (match.group("unit") or "s").lower()
+    return dur / 1000.0 if unit == "ms" else dur
+
+
+def _expand_inline(text: str, voice: str | None) -> list[Node]:
+    out: list[Node] = []
+    text = text.strip()
+    if not text:
+        return out
+    cursor = 0
+    for m in _TAG_RE.finditer(text):
+        pre = text[cursor : m.start()].strip()
+        if pre:
+            out.append(Node("text", pre, voice))
+        out.append(Node("silence", _parse_tag(m)))
+        cursor = m.end()
+    tail = text[cursor:].strip()
+    if tail:
+        out.append(Node("text", tail, voice))
+    return out
+
+
+def _split_paragraph_voices(para: str) -> list[Node]:
+    out: list[Node] = []
+    cursor = 0
+    while cursor < len(para):
+        m = _VOICE_OPEN_RE.search(para, cursor)
+        if not m:
+            out.extend(_expand_inline(para[cursor:], None))
+            break
+        out.extend(_expand_inline(para[cursor : m.start()], None))
+        voice = m.group(1)
+        body_start = m.end()
+        close_idx = para.find(_VOICE_CLOSE, body_start)
+        if close_idx < 0:
+            out.extend(_expand_inline(para[body_start:], voice))
+            break
+        out.extend(_expand_inline(para[body_start:close_idx], voice))
+        cursor = close_idx + len(_VOICE_CLOSE)
+    return out
+
+
+def split_to_nodes(text: str) -> list[Node]:
+    nodes: list[Node] = []
+    scenes = re.split(r"(?m)^\s*---\s*$", text)
+    for s_idx, scene in enumerate(scenes):
+        if s_idx > 0:
+            nodes.append(Node("silence", SCENE_GAP_S))
+        paragraphs = re.split(r"\n\s*\n", scene)
+        first_para = True
+        for para in paragraphs:
+            para = para.strip()
+            if not para:
+                continue
+            if not first_para:
+                nodes.append(Node("silence", PARAGRAPH_GAP_S))
+            first_para = False
+            nodes.extend(_split_paragraph_voices(para))
+    return nodes
+
+
+def _silence_samples(seconds: float) -> np.ndarray:
+    n = int(round(seconds * SAMPLE_RATE))
+    return np.zeros(n, dtype=np.float32)
+
+
+# ─── FastAPI ─────────────────────────────────────────────────────
+
+
+class SynthesizeRequest(BaseModel):
+    gen_text: str = Field(min_length=1)  # may carry [voice:X], [pause:N], [breath], [scene] tags
+    # Tortoise voice name (lj, freeman, daniel, etc.). API-compat
+    # field carries the voice id as a "path" — same shape as kokoro.
+    ref_audio_path: str = DEFAULT_VOICE
+    ref_text: str | None = None  # accepted but never read by /synthesize
+    output_filename: str | None = None  # bare name under AUDIO_ROOT; random <uuid>.wav when omitted
+    speed: float = Field(default=1.0, ge=0.3, le=2.0)  # range-checked but never read by /synthesize
+    # Tortoise-specific: quality preset. Slower = better.
+    preset: str = Field(default=DEFAULT_PRESET)
+
+
+class SynthesizeResponse(BaseModel):
+    ok: bool
+    output_path: str  # path of the written WAV (AUDIO_ROOT / output_filename)
+    sample_rate_hz: int
+    duration_seconds: float  # total samples / sample rate, silences included
+    elapsed_ms: int  # wall-clock render time, measured before the file write
+    chars_in: int  # len(gen_text), tags included
+    engine: str
+    voice: str  # request-level voice; per-block voices are listed in voices_used
+    text_nodes: int
+    silence_nodes: int
+    voices_used: list[str]  # sorted, de-duplicated
+
+
+app = FastAPI(title="tortoise-server", version="0.1.0")
+
+
+@app.on_event("startup")
+def _startup() -> None:
+    _get_tts()
+    # Pre-load the default voice so the first synth doesn't pay
+    # the latent-extraction cost.
+    try:
+        _get_voice(DEFAULT_VOICE)
+    except Exception as e:
+        log.warning("could not preload default voice %s: %s", DEFAULT_VOICE, e)
+
+
+@app.get("/healthz")
+def healthz() -> dict:
+    # Shape matches f5_server/kokoro_server so skald's HealthResponse
+    # struct deserializes all three.
+    return {
+        "ok": True,
+        "device": DEVICE,  # "cuda" or "cpu", fixed at import time
+        "model": "tortoise-tts",
+        "vocoder": "tortoise-builtin",
+        "loaded": _tts is not None,  # False only until _get_tts() has run
+        "engine": "tortoise-tts",
+        "default_voice": DEFAULT_VOICE,
+        "default_preset": DEFAULT_PRESET,
+        "cached_voices": list(_voice_cache.keys()),  # voices loaded so far in this process
+        "version": "0.1.0",
+    }
+
+
+@app.post("/synthesize", response_model=SynthesizeResponse)
+def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
+    if req.ref_audio_path.startswith("/"):
+        raise HTTPException(
+            400,
+            "ref_audio_path looks like a filesystem path; tortoise takes a voice "
+            "name like 'lj' or 'freeman'.",
+        )
+    voice = req.ref_audio_path
+    preset = req.preset
+
+    output_filename = req.output_filename or f"{uuid.uuid4().hex}.wav"
+    if "/" in output_filename or ".." in output_filename:
+        raise HTTPException(400, "output_filename must be a bare name, no path parts")
+    output_path = AUDIO_ROOT / output_filename
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    tts = _get_tts()
+
+    nodes = [n for n in split_to_nodes(req.gen_text) if n.kind == "silence" or n.value]
+    text_count = sum(1 for n in nodes if n.kind == "text")
+    silence_count = sum(1 for n in nodes if n.kind == "silence")
+    if text_count == 0:
+        raise HTTPException(400, "gen_text expanded to zero text nodes")
+
+    started = time.monotonic()
+    pieces: list[np.ndarray] = []
+    voices_used: set[str] = set()
+    for node in nodes:
+        if node.kind == "silence":
+            pieces.append(_silence_samples(node.value))
+            continue
+        seg_voice = node.voice or voice
+        voices_used.add(seg_voice)
+        try:
+            samples, latents = _get_voice(seg_voice)
+        except Exception as e:
+            log.warning("voice %s failed to load (%s); falling back to default", seg_voice, e)
+            samples, latents = _get_voice(voice)
+        # Tortoise's tts_with_preset returns a torch.Tensor on the
+        # configured device.
+        audio_tensor = tts.tts_with_preset(
+            text=node.value,
+            voice_samples=samples,
+            conditioning_latents=latents,
+            preset=preset,
+        )
+        if isinstance(audio_tensor, list):
+            audio_tensor = audio_tensor[0]
+        arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32)
+        pieces.append(arr)
+    elapsed_ms = int((time.monotonic() - started) * 1000)
+
+    if not pieces:
+        raise HTTPException(500, "tortoise returned no audio")
+    full_audio = np.concatenate(pieces)
+    sf.write(str(output_path), full_audio, SAMPLE_RATE, subtype="PCM_16")
+    duration_s = float(len(full_audio)) / float(SAMPLE_RATE)
+
+    log.info(
+        "synthesized chars=%d voice=%s preset=%s text_nodes=%d silence_nodes=%d "
+        "voices_used=%s -> %s (dur=%.2fs, elapsed=%dms)",
+        len(req.gen_text), voice, preset, text_count, silence_count,
+        sorted(voices_used), output_path, duration_s, elapsed_ms,
+    )
+    return SynthesizeResponse(
+        ok=True,
+        output_path=str(output_path),
+        sample_rate_hz=SAMPLE_RATE,
+        duration_seconds=duration_s,
+        elapsed_ms=elapsed_ms,
+        chars_in=len(req.gen_text),
+        engine="tortoise-tts",
+        voice=voice,
+        text_nodes=text_count,
+        silence_nodes=silence_count,
+        voices_used=sorted(voices_used),
+    )