engines: import f5-tts + kokoro + tortoise sidecars into the tree

The Python FastAPI sidecars have lived ad hoc at /mnt/cache/appdata/<engine>/build/ on Lucy without version control. This commit brings them into the skald repo so the engine code travels with the cross-engine routing it depends on. It lands the VANILLA version of each engine on main: engines/f5-tts/ — SWivid F5-TTS (CC-BY-NC weights flagged); engines/kokoro/ — hexgrad Kokoro-82M (Apache 2.0 top to bottom); engines/tortoise/ — neonbjb Tortoise-TTS (Apache 2.0 top to bottom). Engine-specific kludges (question doubling, GPU coordination, pause-duration tuning) get layered on engine/* branches per the README. Main stays the safe-to-read baseline.
2026-05-14 09:40:01 -07:00 · 2026-05-14 09:40:01 -07:00 · d1631ddffe
commit d1631ddffe
parent 1c3fc11484
10 changed files with 1115 additions and 0 deletions
--- a/engines/tortoise/server.py
+++ b/engines/tortoise/server.py
@ -0,0 +1,305 @@
+"""Tortoise-TTS FastAPI server. Sibling to kokoro_server.
+
+Same /synthesize contract as the kokoro server so skald only has to
+route by voice.source. Differences:
+  - Tortoise voices are NAMED PRESETS shipped with the library
+    (angie, daniel, freeman, jlaw, lj, weaver, etc.). No cloning.
+  - Tortoise is slow. Standard preset is ~10x kokoro's wall clock.
+    Caller should expect minutes per chunk, not seconds.
+  - Render-and-stitch is kept deliberately simple for v0.1: the tag
+    splitter lifted from kokoro_server cuts the text into chunks,
+    each chunk is rendered sequentially, and silences are stitched
+    in between. Tortoise's quality is the win here.
+  - The [voice:X]...[/voice] tags ARE parsed: each block renders
+    with its named voice, and text outside a block uses the
+    request's default voice. This is the audiobook win.
+
+Quality presets: ultra_fast / fast / standard / high_quality. The
+trade-off is real — high_quality on a 2070 Super is ~30x slower
+than kokoro. Default to 'standard' for the bar.
+"""
+import logging
+import re
+import time
+import uuid
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import torch
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, Field
+
+from tortoise.api import TextToSpeech
+from tortoise.utils.audio import load_voice
+
+
+log = logging.getLogger("tortoise-server")  # named after the service, not __name__
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")  # runs at import
+
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # decided once, at import
+DEFAULT_VOICE = "lj"  # used when the request omits ref_audio_path
+DEFAULT_PRESET = "standard"  # used when the request omits preset
+AUDIO_ROOT = Path("/audio")  # directory every rendered WAV is written into
+SAMPLE_RATE = 24000  # Hz; Tortoise outputs 24kHz
+
+# Silence durations, in seconds, for between-chunk stitching (matches
+# kokoro server's conventions so audio from both engines feels similar).
+PARAGRAPH_GAP_S = 0.7  # between consecutive paragraphs of one scene
+SCENE_GAP_S = 1.5  # at a `---` scene break and for a [scene] tag
+BREATH_GAP_S = 0.4  # for a [breath] tag
+
+_tts: TextToSpeech | None = None  # lazily built singleton; see _get_tts
+_voice_cache: dict[str, tuple] = {}  # voice name -> load_voice() result; see _get_voice
+
+
+def _get_tts() -> TextToSpeech:
+    global _tts
+    if _tts is None:
+        log.info("loading tortoise device=%s", DEVICE)
+        _tts = TextToSpeech(use_deepspeed=False, kv_cache=True, half=(DEVICE == "cuda"))
+        log.info("tortoise loaded")
+    return _tts
+
+
+def _get_voice(name: str) -> tuple:
+    """Cache voice latents to avoid re-loading reference clips on
+    every synthesis call. Tortoise's load_voice returns
+    (voice_samples, conditioning_latents)."""
+    if name not in _voice_cache:
+        _voice_cache[name] = load_voice(name)
+    return _voice_cache[name]
+
+
+# ─── tag splitter (lifted from kokoro_server) ───────────────────
+
+
+class Node:
+    __slots__ = ("kind", "value", "voice")
+
+    def __init__(self, kind: str, value, voice: str | None = None):
+        self.kind = kind
+        self.value = value
+        self.voice = voice
+
+
+_VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]")  # group 1 is the voice name
+_VOICE_CLOSE = "[/voice]"  # literal closer, located with str.find (case-sensitive)
+_TAG_RE = re.compile(  # inline silence tags: [pause:N], [pause:Ns], [pause:Nms], [breath], [scene]
+    r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]",
+    re.IGNORECASE,
+)
+
+
+def _parse_tag(match: re.Match) -> float:
+    body = match.group(0).lower().strip("[]")
+    if body == "breath":
+        return BREATH_GAP_S
+    if body == "scene":
+        return SCENE_GAP_S
+    dur = float(match.group("dur"))
+    unit = (match.group("unit") or "s").lower()
+    return dur / 1000.0 if unit == "ms" else dur
+
+
+def _expand_inline(text: str, voice: str | None) -> list[Node]:
+    out: list[Node] = []
+    text = text.strip()
+    if not text:
+        return out
+    cursor = 0
+    for m in _TAG_RE.finditer(text):
+        pre = text[cursor : m.start()].strip()
+        if pre:
+            out.append(Node("text", pre, voice))
+        out.append(Node("silence", _parse_tag(m)))
+        cursor = m.end()
+    tail = text[cursor:].strip()
+    if tail:
+        out.append(Node("text", tail, voice))
+    return out
+
+
+def _split_paragraph_voices(para: str) -> list[Node]:
+    out: list[Node] = []
+    cursor = 0
+    while cursor < len(para):
+        m = _VOICE_OPEN_RE.search(para, cursor)
+        if not m:
+            out.extend(_expand_inline(para[cursor:], None))
+            break
+        out.extend(_expand_inline(para[cursor : m.start()], None))
+        voice = m.group(1)
+        body_start = m.end()
+        close_idx = para.find(_VOICE_CLOSE, body_start)
+        if close_idx < 0:
+            out.extend(_expand_inline(para[body_start:], voice))
+            break
+        out.extend(_expand_inline(para[body_start:close_idx], voice))
+        cursor = close_idx + len(_VOICE_CLOSE)
+    return out
+
+
+def split_to_nodes(text: str) -> list[Node]:
+    nodes: list[Node] = []
+    scenes = re.split(r"(?m)^\s*---\s*$", text)
+    for s_idx, scene in enumerate(scenes):
+        if s_idx > 0:
+            nodes.append(Node("silence", SCENE_GAP_S))
+        paragraphs = re.split(r"\n\s*\n", scene)
+        first_para = True
+        for para in paragraphs:
+            para = para.strip()
+            if not para:
+                continue
+            if not first_para:
+                nodes.append(Node("silence", PARAGRAPH_GAP_S))
+            first_para = False
+            nodes.extend(_split_paragraph_voices(para))
+    return nodes
+
+
+def _silence_samples(seconds: float) -> np.ndarray:
+    n = int(round(seconds * SAMPLE_RATE))
+    return np.zeros(n, dtype=np.float32)
+
+
+# ─── FastAPI ─────────────────────────────────────────────────────
+
+
+class SynthesizeRequest(BaseModel):
+    gen_text: str = Field(min_length=1)
+    # Tortoise voice name (lj, freeman, daniel, etc.). API-compat
+    # field carries the voice id as a "path" — same shape as kokoro.
+    ref_audio_path: str = DEFAULT_VOICE
+    ref_text: str | None = None
+    output_filename: str | None = None
+    speed: float = Field(default=1.0, ge=0.3, le=2.0)
+    # Tortoise-specific: quality preset. Slower = better.
+    preset: str = Field(default=DEFAULT_PRESET)
+
+
+class SynthesizeResponse(BaseModel):
+    ok: bool  # True on the success path of /synthesize
+    output_path: str  # AUDIO_ROOT / output file name, as a string
+    sample_rate_hz: int  # SAMPLE_RATE
+    duration_seconds: float  # stitched sample count / SAMPLE_RATE
+    elapsed_ms: int  # time spent rendering; excludes the file write
+    chars_in: int  # len(gen_text), tags included
+    engine: str  # "tortoise-tts"
+    voice: str  # request-level voice (ref_audio_path)
+    text_nodes: int  # number of text chunks rendered
+    silence_nodes: int  # number of silences inserted
+    voices_used: list[str]  # sorted, de-duplicated voice names recorded while rendering
+
+
+app = FastAPI(title="tortoise-server", version="0.1.0")  # routes are attached below via decorators
+
+
+@app.on_event("startup")
+def _startup() -> None:
+    _get_tts()
+    # Pre-load the default voice so the first synth doesn't pay
+    # the latent-extraction cost.
+    try:
+        _get_voice(DEFAULT_VOICE)
+    except Exception as e:
+        log.warning("could not preload default voice %s: %s", DEFAULT_VOICE, e)
+
+
+@app.get("/healthz")
+def healthz() -> dict:
+    # Shape matches f5_server/kokoro_server so skald's HealthResponse
+    # struct deserializes all three.
+    return {
+        "ok": True,
+        "device": DEVICE,
+        "model": "tortoise-tts",
+        "vocoder": "tortoise-builtin",
+        "loaded": _tts is not None,
+        "engine": "tortoise-tts",
+        "default_voice": DEFAULT_VOICE,
+        "default_preset": DEFAULT_PRESET,
+        "cached_voices": list(_voice_cache.keys()),
+        "version": "0.1.0",
+    }
+
+
+@app.post("/synthesize", response_model=SynthesizeResponse)
+def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
+    if req.ref_audio_path.startswith("/"):
+        raise HTTPException(
+            400,
+            "ref_audio_path looks like a filesystem path; tortoise takes a voice "
+            "name like 'lj' or 'freeman'.",
+        )
+    voice = req.ref_audio_path
+    preset = req.preset
+
+    output_filename = req.output_filename or f"{uuid.uuid4().hex}.wav"
+    if "/" in output_filename or ".." in output_filename:
+        raise HTTPException(400, "output_filename must be a bare name, no path parts")
+    output_path = AUDIO_ROOT / output_filename
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    tts = _get_tts()
+
+    nodes = [n for n in split_to_nodes(req.gen_text) if n.kind == "silence" or n.value]
+    text_count = sum(1 for n in nodes if n.kind == "text")
+    silence_count = sum(1 for n in nodes if n.kind == "silence")
+    if text_count == 0:
+        raise HTTPException(400, "gen_text expanded to zero text nodes")
+
+    started = time.monotonic()
+    pieces: list[np.ndarray] = []
+    voices_used: set[str] = set()
+    for node in nodes:
+        if node.kind == "silence":
+            pieces.append(_silence_samples(node.value))
+            continue
+        seg_voice = node.voice or voice
+        voices_used.add(seg_voice)
+        try:
+            samples, latents = _get_voice(seg_voice)
+        except Exception as e:
+            log.warning("voice %s failed to load (%s); falling back to default", seg_voice, e)
+            samples, latents = _get_voice(voice)
+        # Tortoise's tts_with_preset returns a torch.Tensor on the
+        # configured device.
+        audio_tensor = tts.tts_with_preset(
+            text=node.value,
+            voice_samples=samples,
+            conditioning_latents=latents,
+            preset=preset,
+        )
+        if isinstance(audio_tensor, list):
+            audio_tensor = audio_tensor[0]
+        arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32)
+        pieces.append(arr)
+    elapsed_ms = int((time.monotonic() - started) * 1000)
+
+    if not pieces:
+        raise HTTPException(500, "tortoise returned no audio")
+    full_audio = np.concatenate(pieces)
+    sf.write(str(output_path), full_audio, SAMPLE_RATE, subtype="PCM_16")
+    duration_s = float(len(full_audio)) / float(SAMPLE_RATE)
+
+    log.info(
+        "synthesized chars=%d voice=%s preset=%s text_nodes=%d silence_nodes=%d "
+        "voices_used=%s -> %s (dur=%.2fs, elapsed=%dms)",
+        len(req.gen_text), voice, preset, text_count, silence_count,
+        sorted(voices_used), output_path, duration_s, elapsed_ms,
+    )
+    return SynthesizeResponse(
+        ok=True,
+        output_path=str(output_path),
+        sample_rate_hz=SAMPLE_RATE,
+        duration_seconds=duration_s,
+        elapsed_ms=elapsed_ms,
+        chars_in=len(req.gen_text),
+        engine="tortoise-tts",
+        voice=voice,
+        text_nodes=text_count,
+        silence_nodes=silence_count,
+        voices_used=sorted(voices_used),
+    )