From d1631ddffe80fbfad779743ca56e5d413e112ec0 Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 09:40:01 -0700 Subject: [PATCH] engines: import f5-tts + kokoro + tortoise sidecars into the tree The python FastAPI sidecars have lived ad-hoc at /mnt/cache/appdata/ /build/ on Lucy without version control. Bringing them into the skald repo so the engine code travels with the cross-engine routing it depends on. This commit lands the VANILLA version of each engine on main: engines/f5-tts/ SWivid F5-TTS (CC-BY-NC weights flagged) engines/kokoro/ hexgrad Kokoro-82M (Apache 2.0 top to bottom) engines/tortoise/ neonbjb Tortoise-TTS (Apache 2.0 top to bottom) Engine-specific kludges (question doubling, GPU coordination, pause-duration tuning) get layered on engine/* branches per the README. Main stays the safe-to-read baseline. --- engines/README.md | 58 +++++++ engines/f5-tts/Dockerfile | 41 +++++ engines/f5-tts/compose.yml | 43 +++++ engines/f5-tts/server.py | 184 ++++++++++++++++++++ engines/kokoro/Dockerfile | 35 ++++ engines/kokoro/compose.yml | 37 ++++ engines/kokoro/server.py | 324 +++++++++++++++++++++++++++++++++++ engines/tortoise/Dockerfile | 45 +++++ engines/tortoise/compose.yml | 43 +++++ engines/tortoise/server.py | 305 +++++++++++++++++++++++++++++++++ 10 files changed, 1115 insertions(+) create mode 100644 engines/README.md create mode 100644 engines/f5-tts/Dockerfile create mode 100644 engines/f5-tts/compose.yml create mode 100644 engines/f5-tts/server.py create mode 100644 engines/kokoro/Dockerfile create mode 100644 engines/kokoro/compose.yml create mode 100644 engines/kokoro/server.py create mode 100644 engines/tortoise/Dockerfile create mode 100644 engines/tortoise/compose.yml create mode 100644 engines/tortoise/server.py diff --git a/engines/README.md b/engines/README.md new file mode 100644 index 0000000..d00c0dc --- /dev/null +++ b/engines/README.md @@ -0,0 +1,58 @@ +# Skald TTS engines + +This subtree holds the per-engine sidecars that 
skald's narrate path +talks to over HTTP. Each engine has the same contract: + +- `POST /synthesize` — same JSON shape across engines so skald's + one Rust client (`skald-core::narrate::Narrator`) deserializes + all of them. See `engines/<engine>/server.py` for the per-engine + implementation. +- `GET /healthz` — boot probe + model-loaded flag. + +Skald routes per-request by `voices.source`: a `kokoro_*` source +goes to `$KOKORO_URL`, a `tortoise_*` source goes to `$TORTOISE_URL`, +anything else (`lj_speech`, generic) goes to `$F5_TTS_URL`. + +## Engines + +| Dir | Engine | License (code/weights) | VRAM | Speed | Voices | +|---|---|---|---|---|---| +| `f5-tts/` | SWivid F5-TTS v1 | MIT / **CC-BY-NC** | ~5GB | fast (~2x real-time on 2070S) | voice cloning (LJ Speech reference shipped) | +| `kokoro/` | hexgrad Kokoro-82M | Apache 2.0 / Apache 2.0 | ~1GB | very fast (~50x real-time) | 50+ named presets (af_*, am_*, bf_*, bm_*) | +| `tortoise/` | neonbjb Tortoise-TTS | Apache 2.0 / Apache 2.0 | ~5GB | **slow** (~0.014x real-time, ~74s/s of audio on 2070S, standard preset) | 26 named built-ins (lj, freeman, daniel, weaver, jlaw, etc.) | + +## Branch model + +`main` carries the **vanilla** version of each engine — what you'd +get from a clean `pip install <package>` plus the FastAPI sidecar ++ control-tag splitter. No engine-specific kludges. Safe to look +at without context. + +`engine/<name>` branches hold engine-tuned tweaks that don't +generalise. Examples: + +- `engine/kokoro` — doubled-`??` prosody hack for the 82M's weak + question intonation, paragraph/scene/breath gap durations tuned + for af_heart's pacing, notes on how respellings need to be all- + lowercase to avoid letter-by-letter spell-out by misaki. +- `engine/tortoise` — GPU exclusivity coordinator (stops F5 + + Kokoro before a Tortoise run since the 2070 Super can't host + all three at once), preset choice ergonomics, character→tortoise- + voice seed assignments. 
+ +When deploying an engine to Lucy, the build dir at +`/mnt/cache/appdata/<engine>/build/` tracks the engine's branch: + +```bash +cd /mnt/cache/appdata/kokoro/build +git fetch && git checkout engine/kokoro +docker compose -p <engine> up -d --build +``` + +## GPU coordination (2070 Super) + +The 8GB card is the bottleneck. F5 + Kokoro can co-reside (~5GB + +~1GB). Tortoise pushes the budget over and needs the GPU largely +to itself — the `engine/tortoise` branch will carry the script +that stops kokoro + f5 before a tortoise run and restarts them +after. Replace with proper coordination once we have more VRAM. diff --git a/engines/f5-tts/Dockerfile b/engines/f5-tts/Dockerfile new file mode 100644 index 0000000..be4cd91 --- /dev/null +++ b/engines/f5-tts/Dockerfile @@ -0,0 +1,41 @@ +# Sulkta build of F5-TTS — upstream ghcr.io/swivid/f5-tts:main was +# shipped with torch 2.11/torchaudio 2.4 ABI mismatch on 2026-05-13, +# breaking import torchaudio at boot. We rebuild on a known-good +# pytorch base + pip install f5-tts. +# +# Image tag in lucy-registry: lucy-registry:5000/f5-tts:<tag> +# +# License: MIT (code) / CC-BY-NC (Emilia-trained weights). +# Personal use OK; redistribution gray-area — flagged. + +FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONUNBUFFERED=1 \ + HF_HOME=/cache/hf \ + HF_HUB_DISABLE_TELEMETRY=1 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg \ + git \ + ca-certificates \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Base torch 2.6.0 + torchaudio 2.6.0; f5-tts pulls a recent +# transformers (5.x) which needs torch >=2.5's modern +# torch.library.custom_op type signatures. +RUN pip install --no-cache-dir 'f5-tts>=1.0.0' + +# Pre-warm the HF cache directory. +RUN mkdir -p /cache/hf /audio /voices + +COPY f5_server.py /app/f5_server.py +WORKDIR /app + +EXPOSE 7860 + +# Skald talks to our purpose-built FastAPI server, not Gradio. 
+# Models load at startup (first request would otherwise pay the +# cold-start cost). uvicorn on :7860 to keep the port stable. +CMD ["uvicorn", "f5_server:app", "--host", "0.0.0.0", "--port", "7860"] diff --git a/engines/f5-tts/compose.yml b/engines/f5-tts/compose.yml new file mode 100644 index 0000000..5f11b13 --- /dev/null +++ b/engines/f5-tts/compose.yml @@ -0,0 +1,43 @@ +# F5-TTS standalone stack on Lucy. +# +# License posture (acknowledged 2026-05-13): code is MIT, but +# the pretrained model weights are CC-BY-NC (Emilia training data). +# Personal listen is fine; public sharing is a flagged gray area. +# Cobb's call: ship anyway. +# +# Runtime: 8GB GPU is plenty (F5 inference ~4-6GB peak). +# +# First-run cost: ~2GB model download from HuggingFace into hf-cache, +# happens on first container start (model loads at startup). Subsequent runs are warm. +name: f5-tts + +services: + f5-tts: + image: lucy-registry:5000/f5-tts:0.3 + container_name: f5-tts + restart: unless-stopped + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + ports: + - "192.168.0.5:7792:7860" + - "127.0.0.1:7792:7860" + volumes: + # HF model weights cache — persists ~2GB after first download. + - /mnt/cache/appdata/f5-tts/hf-cache:/cache/hf + # Reference voice clips (lj_speech.wav, etc). + - /mnt/cache/appdata/f5-tts/voices:/voices:ro + # Rendered audio output — skald writes story narrations here. + - /mnt/cache/appdata/f5-tts/audio:/audio + environment: + HF_HOME: /cache/hf + HF_HUB_DISABLE_TELEMETRY: "1" + labels: + org.sulkta.domain: "sulkta" + org.sulkta.owner: "cobb" + org.sulkta.managed-by: "compose" + org.sulkta.role: "f5-tts" diff --git a/engines/f5-tts/server.py b/engines/f5-tts/server.py new file mode 100644 index 0000000..8e0d8ea --- /dev/null +++ b/engines/f5-tts/server.py @@ -0,0 +1,184 @@ +"""Thin FastAPI server inside the F5-TTS container. + +Loads model + vocoder ONCE at startup (heavy: ~5s, ~5GB VRAM). 
+POST /synthesize runs inference and writes the WAV to a shared +volume; the response is JSON with the output path and metadata — +not the WAV bytes, since chapter-length renders are 20-30MB and +both skald and the f5 container share /audio anyway. + +Why not Gradio's API: Gradio's /gradio_api/call/* shape is event- +stream + polling; this is a single POST + immediate response. +Right for skald's "render one chapter, then move on" loop. +""" +import logging +import time +import uuid +from pathlib import Path + +import soundfile as sf +import torch +from fastapi import FastAPI, HTTPException +from omegaconf import OmegaConf +from pydantic import BaseModel, Field +from cached_path import cached_path +from importlib.resources import files +from hydra.utils import get_class + +from f5_tts.infer.utils_infer import ( + infer_process, + load_model, + load_vocoder, + preprocess_ref_audio_text, +) + + +log = logging.getLogger("f5-server") +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s") + + +# ─── model state ───────────────────────────────────────────────── + +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +MODEL_NAME = "F5TTS_v1_Base" +VOCODER_NAME = "vocos" +AUDIO_ROOT = Path("/audio") +VOICES_ROOT = Path("/voices") + +_model = None +_vocoder = None + + +def _load_models() -> None: + """One-time model + vocoder load. ~5-8s wall-clock on first call.""" + global _model, _vocoder + if _model is not None: + return + log.info("loading vocoder=%s device=%s", VOCODER_NAME, DEVICE) + _vocoder = load_vocoder(vocoder_name=VOCODER_NAME, device=DEVICE) + + cfg_path = str(files("f5_tts").joinpath(f"configs/{MODEL_NAME}.yaml")) + model_cfg = OmegaConf.load(cfg_path) + model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}") + model_arc = model_cfg.model.arch + + # F5TTS_v1_Base ships as a HuggingFace artifact; cached_path + # handles the resolution + downloads to HF_HOME. 
+ ckpt_file = str( + cached_path("hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors") + ) + vocab_file = str(files("f5_tts").joinpath("infer/examples/vocab.txt")) + + log.info("loading model=%s ckpt=%s", MODEL_NAME, ckpt_file) + _model = load_model( + model_cls, model_arc, ckpt_file, + mel_spec_type=VOCODER_NAME, vocab_file=vocab_file, device=DEVICE, + ) + log.info("model + vocoder loaded; ready") + + +# ─── FastAPI app ───────────────────────────────────────────────── + + +class SynthesizeRequest(BaseModel): + # The text we want to synthesize. Long-form OK — F5-TTS chunks + # internally via infer_process. + gen_text: str = Field(min_length=1) + # Reference audio path (inside the f5-tts container). Defaults + # to the staged lj_speech clip. + ref_audio_path: str = "/voices/lj_speech.wav" + # Reference transcript. Defaults to the .txt sidecar next to ref_audio_path (else auto-ASR). + ref_text: str | None = None + # Output filename, relative to /audio (the shared output dir). + # If omitted, a UUID-based name is assigned. + output_filename: str | None = None + # Speech speed (0.3-2.0). Default 1.0 = natural pace. + speed: float = Field(default=1.0, ge=0.3, le=2.0) + # Cross-fade between chunks; F5 default is 0.15s. Bigger smooths + # chunk boundaries on long-form prose at the cost of pacing. 
+ cross_fade_duration: float = Field(default=0.15, ge=0.0, le=1.0) + + +class SynthesizeResponse(BaseModel): + ok: bool + output_path: str + sample_rate_hz: int + duration_seconds: float + elapsed_ms: int + chars_in: int + + +app = FastAPI(title="f5-tts-server", version="0.1.0") + + +@app.on_event("startup") +def _startup() -> None: + _load_models() + + +@app.get("/healthz") +def healthz() -> dict: + return { + "ok": True, + "device": DEVICE, + "model": MODEL_NAME, + "vocoder": VOCODER_NAME, + "loaded": _model is not None, + } + + +@app.post("/synthesize", response_model=SynthesizeResponse) +def synthesize(req: SynthesizeRequest) -> SynthesizeResponse: + if _model is None: + raise HTTPException(503, "model not loaded yet — retry shortly") + + ref_audio_path = Path(req.ref_audio_path) + if not ref_audio_path.is_file(): + raise HTTPException(400, f"ref_audio_path not found: {ref_audio_path}") + + # If no explicit ref_text, try sidecar .txt then fall back to "" + # (which triggers F5's auto-ASR). + ref_text = req.ref_text + if ref_text is None: + sidecar = ref_audio_path.with_suffix(".txt") + if sidecar.is_file(): + ref_text = sidecar.read_text().strip() + else: + ref_text = "" + + output_filename = req.output_filename or f"{uuid.uuid4().hex}.wav" + if "/" in output_filename or ".." 
in output_filename: + raise HTTPException(400, "output_filename must be a bare name, no path parts") + output_path = AUDIO_ROOT / output_filename + output_path.parent.mkdir(parents=True, exist_ok=True) + + started = time.monotonic() + ref_audio_processed, ref_text_processed = preprocess_ref_audio_text( + str(ref_audio_path), ref_text + ) + audio_segment, final_sample_rate, _ = infer_process( + ref_audio_processed, + ref_text_processed, + req.gen_text, + _model, + _vocoder, + mel_spec_type=VOCODER_NAME, + speed=req.speed, + cross_fade_duration=req.cross_fade_duration, + device=DEVICE, + ) + elapsed_ms = int((time.monotonic() - started) * 1000) + + sf.write(str(output_path), audio_segment, final_sample_rate, subtype="PCM_16") + duration_s = float(len(audio_segment)) / float(final_sample_rate) + log.info( + "synthesized chars=%d -> %s (sr=%d, dur=%.2fs, elapsed=%dms)", + len(req.gen_text), output_path, final_sample_rate, duration_s, elapsed_ms, + ) + return SynthesizeResponse( + ok=True, + output_path=str(output_path), + sample_rate_hz=final_sample_rate, + duration_seconds=duration_s, + elapsed_ms=elapsed_ms, + chars_in=len(req.gen_text), + ) diff --git a/engines/kokoro/Dockerfile b/engines/kokoro/Dockerfile new file mode 100644 index 0000000..5c7b13e --- /dev/null +++ b/engines/kokoro/Dockerfile @@ -0,0 +1,35 @@ +# Sulkta build of Kokoro-82M TTS. +# +# License: Apache 2.0 (code AND model weights). Clean stack — no +# CC-BY-NC asterisk like F5-TTS's Emilia weights. This is the +# narrator engine for sleep-quality audiobook reads; F5-TTS stays +# around for voice-cloning cases. +# +# Kokoro is small enough to run on CPU but we use the cuda base +# anyway to stay consistent with f5-tts and so it'll pick up the +# GPU when no other tenant has it. 
+FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONUNBUFFERED=1 \ + HF_HOME=/cache/hf \ + HF_HUB_DISABLE_TELEMETRY=1 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg \ + espeak-ng \ + ca-certificates \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# kokoro pulls phonemizer + soundfile + espeakng transitively. +RUN pip install --no-cache-dir 'kokoro>=0.9.0' 'fastapi>=0.115.0' 'uvicorn>=0.32.0' 'soundfile>=0.13.0' + +RUN mkdir -p /cache/hf /audio + +COPY kokoro_server.py /app/kokoro_server.py +WORKDIR /app + +EXPOSE 7860 + +CMD ["uvicorn", "kokoro_server:app", "--host", "0.0.0.0", "--port", "7860"] diff --git a/engines/kokoro/compose.yml b/engines/kokoro/compose.yml new file mode 100644 index 0000000..dc26741 --- /dev/null +++ b/engines/kokoro/compose.yml @@ -0,0 +1,37 @@ +# Kokoro-82M TTS stack on Lucy. +# +# Audiobook-quality narrator engine (Apache 2.0 code + weights — +# clean stack vs F5-TTS's CC-BY-NC asterisk). Sibling to f5-tts; +# both share /mnt/cache/appdata/f5-tts/audio so skald's audio +# route serves outputs from either engine through the same path. +# +# License: Apache 2.0 top to bottom. Right for share/publish. +name: kokoro + +services: + kokoro: + image: lucy-registry:5000/kokoro:0.5 + container_name: kokoro + restart: unless-stopped + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + ports: + - "192.168.0.5:7794:7860" + - "127.0.0.1:7794:7860" + volumes: + - /mnt/cache/appdata/kokoro/hf-cache:/cache/hf + # Shared with f5-tts so skald's /audio route covers both. 
+ - /mnt/cache/appdata/f5-tts/audio:/audio + environment: + HF_HOME: /cache/hf + HF_HUB_DISABLE_TELEMETRY: "1" + labels: + org.sulkta.domain: "sulkta" + org.sulkta.owner: "cobb" + org.sulkta.managed-by: "compose" + org.sulkta.role: "kokoro" diff --git a/engines/kokoro/server.py b/engines/kokoro/server.py new file mode 100644 index 0000000..169cbbd --- /dev/null +++ b/engines/kokoro/server.py @@ -0,0 +1,324 @@ +"""Kokoro-82M FastAPI server, sibling to f5_server. + +Same /synthesize contract as F5 so skald can route between engines +just by which URL it points at. The semantic difference: Kokoro +voices are NAMED (af_heart, af_bella, am_michael, etc.) — there's no +reference audio. We repurpose the `ref_audio_path` field to carry +the voice name; if it starts with '/' we treat as F5-style path and +error. + +Render-and-stitch: + The naive "feed the whole chapter to Kokoro" path produces audio + that runs paragraphs together — no breath between scenes, no beat + on a hard line break. So this server splits the input on paragraph + and scene boundaries, renders each chunk, and concatenates with + explicit silence inserts between chunks. + + Control tags the splitter recognizes (case-insensitive): + [pause:1.5s] — silence of N seconds at this point + [pause:500ms] — silence of N milliseconds at this point + [breath] — short breath beat (~400ms) + [scene] — major scene break (~1500ms) + + Implicit breaks the splitter inserts: + Blank line between paragraphs → 700ms + A line of just `---` → 1500ms (scene break) + + Sentence-internal pacing (commas, periods, em-dashes, ellipses) + is left to Kokoro's own phonemizer — it handles that well. + +License: Apache 2.0 (code + model weights). Clean stack for the +sleep-quality narrator use case. 
+""" +import logging +import re +import time +import uuid +from pathlib import Path + +import numpy as np +import soundfile as sf +import torch +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field + +from kokoro import KPipeline + + +log = logging.getLogger("kokoro-server") +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s") + + +# ─── pipeline state ────────────────────────────────────────────── + +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +DEFAULT_LANG = "a" +DEFAULT_VOICE = "af_heart" +AUDIO_ROOT = Path("/audio") +SAMPLE_RATE = 24000 + +# Default silence durations for implicit breaks. Tags override. +PARAGRAPH_GAP_S = 0.7 +SCENE_GAP_S = 1.5 +BREATH_GAP_S = 0.4 + +_pipelines: dict[str, KPipeline] = {} + + +def _get_pipeline(lang_code: str) -> KPipeline: + if lang_code not in _pipelines: + log.info("loading kokoro pipeline lang=%s device=%s", lang_code, DEVICE) + _pipelines[lang_code] = KPipeline(lang_code=lang_code, device=DEVICE) + log.info("kokoro pipeline loaded lang=%s", lang_code) + return _pipelines[lang_code] + + +# ─── split + render pipeline ───────────────────────────────────── + +# A "node" is one of three kinds; the renderer walks the list, +# calls Kokoro on each text node with its (possibly per-segment) +# voice, and emits zeros for each silence node. +class Node: + __slots__ = ("kind", "value", "voice") + + def __init__(self, kind: str, value, voice: str | None = None): + # kind ∈ {"text", "silence"}; value is str for text and + # float seconds for silence. voice override is only used + # on text nodes from [voice:X]...[/voice] blocks; outside + # those blocks the request's default voice is used. + self.kind = kind + self.value = value + self.voice = voice + + +# Voice-block delimiters are parsed at a higher level than other +# tags so dialogue can contain its own [breath]/[pause] beats. 
+_VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]") +_VOICE_CLOSE = "[/voice]" + +_TAG_RE = re.compile( + r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]", + re.IGNORECASE, +) + + +def _parse_tag(match: re.Match) -> float: + body = match.group(0).lower().strip("[]") + if body == "breath": + return BREATH_GAP_S + if body == "scene": + return SCENE_GAP_S + dur = float(match.group("dur")) + unit = (match.group("unit") or "s").lower() + return dur / 1000.0 if unit == "ms" else dur + + +def _expand_inline(text: str, voice: str | None) -> list[Node]: + """Expand inline [breath]/[pause]/[scene] tags inside a chunk + of text that already has a single voice attribution. Voice + blocks themselves are handled one level up in split_to_nodes.""" + out: list[Node] = [] + text = text.strip() + if not text: + return out + cursor = 0 + for m in _TAG_RE.finditer(text): + pre = text[cursor : m.start()].strip() + if pre: + out.append(Node("text", pre, voice)) + out.append(Node("silence", _parse_tag(m))) + cursor = m.end() + tail = text[cursor:].strip() + if tail: + out.append(Node("text", tail, voice)) + return out + + +def split_to_nodes(text: str) -> list[Node]: + """Walk the source text and split it into text+silence nodes. + + Order of operations: + 1. Split on `---` lines (scene breaks). + 2. Within each scene, split on blank lines (paragraph breaks). + 3. Within each paragraph, split on [voice:X]...[/voice] blocks + so each dialogue line carries its own voice attribution. + 4. Within each (paragraph, voice-region) chunk, expand inline + [breath]/[pause:Xs]/[scene] tags. 
+ """ + nodes: list[Node] = [] + scenes = re.split(r"(?m)^\s*---\s*$", text) + for s_idx, scene in enumerate(scenes): + if s_idx > 0: + nodes.append(Node("silence", SCENE_GAP_S)) + paragraphs = re.split(r"\n\s*\n", scene) + first_para = True + for para in paragraphs: + para = para.strip() + if not para: + continue + if not first_para: + nodes.append(Node("silence", PARAGRAPH_GAP_S)) + first_para = False + nodes.extend(_split_paragraph_voices(para)) + return nodes + + +def _split_paragraph_voices(para: str) -> list[Node]: + """Split a single paragraph on [voice:X]...[/voice] blocks. + Outside those blocks the voice is None (request default). + Unmatched/orphan [/voice] markers are silently stripped. + """ + out: list[Node] = [] + cursor = 0 + while cursor < len(para): + m = _VOICE_OPEN_RE.search(para, cursor) + if not m: + out.extend(_expand_inline(para[cursor:], None)) + break + # Text BEFORE the voice block uses default voice. + out.extend(_expand_inline(para[cursor : m.start()], None)) + voice = m.group(1) + body_start = m.end() + close_idx = para.find(_VOICE_CLOSE, body_start) + if close_idx < 0: + # Unclosed voice block; treat rest of paragraph as that + # voice. Defensive — should be rare. 
+ out.extend(_expand_inline(para[body_start:], voice)) + break + out.extend(_expand_inline(para[body_start:close_idx], voice)) + cursor = close_idx + len(_VOICE_CLOSE) + return out + + +def _silence_samples(seconds: float) -> np.ndarray: + n = int(round(seconds * SAMPLE_RATE)) + return np.zeros(n, dtype=np.float32) + + +# ─── FastAPI app ───────────────────────────────────────────────── + + +class SynthesizeRequest(BaseModel): + gen_text: str = Field(min_length=1) + ref_audio_path: str = DEFAULT_VOICE + ref_text: str | None = None + output_filename: str | None = None + speed: float = Field(default=1.0, ge=0.3, le=2.0) + lang_code: str = DEFAULT_LANG + + +class SynthesizeResponse(BaseModel): + ok: bool + output_path: str + sample_rate_hz: int + duration_seconds: float + elapsed_ms: int + chars_in: int + engine: str + voice: str + text_nodes: int + silence_nodes: int + # Every distinct Kokoro voice id that actually got rendered. + # Single-element when no [voice:X] tags were in the input; + # multiple when multi-voice dialogue was attributed. + voices_used: list[str] + + +app = FastAPI(title="kokoro-server", version="0.2.0") + + +@app.on_event("startup") +def _startup() -> None: + _get_pipeline(DEFAULT_LANG) + + +@app.get("/healthz") +def healthz() -> dict: + # Shape matches f5_server's so the same Rust HealthResponse + # struct deserializes both: model/vocoder/loaded fields are + # required by skald-core::narrate::HealthResponse. 
+ return { + "ok": True, + "device": DEVICE, + "model": "kokoro-82m", + "vocoder": "kokoro-internal", + "loaded": bool(_pipelines), + "engine": "kokoro-82m", + "default_voice": DEFAULT_VOICE, + "default_lang": DEFAULT_LANG, + "loaded_langs": list(_pipelines.keys()), + "version": "0.2.0", + } + + +@app.post("/synthesize", response_model=SynthesizeResponse) +def synthesize(req: SynthesizeRequest) -> SynthesizeResponse: + if req.ref_audio_path.startswith("/"): + raise HTTPException( + 400, + "ref_audio_path looks like a filesystem path; Kokoro takes a voice " + "name like 'af_heart' or 'am_michael'. Did you mean to route to the " + "f5-tts engine?", + ) + voice = req.ref_audio_path + + output_filename = req.output_filename or f"{uuid.uuid4().hex}.wav" + if "/" in output_filename or ".." in output_filename: + raise HTTPException(400, "output_filename must be a bare name, no path parts") + output_path = AUDIO_ROOT / output_filename + output_path.parent.mkdir(parents=True, exist_ok=True) + + pipeline = _get_pipeline(req.lang_code) + + # Split the text into a node list. Empty nodes get filtered. + nodes = [n for n in split_to_nodes(req.gen_text) if n.kind == "silence" or n.value] + text_count = sum(1 for n in nodes if n.kind == "text") + silence_count = sum(1 for n in nodes if n.kind == "silence") + if text_count == 0: + raise HTTPException(400, "gen_text expanded to zero text nodes") + + started = time.monotonic() + pieces: list[np.ndarray] = [] + voices_used: set[str] = set() + for node in nodes: + if node.kind == "silence": + pieces.append(_silence_samples(node.value)) + continue + # text: hand to Kokoro. The node's voice override (set by + # [voice:X]...[/voice] blocks) wins; otherwise the request's + # default narrator voice. 
+ seg_voice = node.voice or voice + voices_used.add(seg_voice) + chunk_audio: list[np.ndarray] = [] + for _, _, audio in pipeline(node.value, voice=seg_voice, speed=req.speed): + arr = audio.cpu().numpy() if hasattr(audio, "cpu") else np.asarray(audio) + chunk_audio.append(arr.astype(np.float32)) + if chunk_audio: + pieces.append(np.concatenate(chunk_audio)) + elapsed_ms = int((time.monotonic() - started) * 1000) + + if not pieces: + raise HTTPException(500, "kokoro returned no audio") + full_audio = np.concatenate(pieces) + sf.write(str(output_path), full_audio, SAMPLE_RATE, subtype="PCM_16") + duration_s = float(len(full_audio)) / float(SAMPLE_RATE) + + log.info( + "synthesized chars=%d voice=%s text_nodes=%d silence_nodes=%d " + "voices_used=%s -> %s (dur=%.2fs, elapsed=%dms)", + len(req.gen_text), voice, text_count, silence_count, + sorted(voices_used), output_path, duration_s, elapsed_ms, + ) + return SynthesizeResponse( + ok=True, + output_path=str(output_path), + sample_rate_hz=SAMPLE_RATE, + duration_seconds=duration_s, + elapsed_ms=elapsed_ms, + chars_in=len(req.gen_text), + engine="kokoro-82m", + voice=voice, + text_nodes=text_count, + silence_nodes=silence_count, + voices_used=sorted(voices_used), + ) diff --git a/engines/tortoise/Dockerfile b/engines/tortoise/Dockerfile new file mode 100644 index 0000000..d2f104a --- /dev/null +++ b/engines/tortoise/Dockerfile @@ -0,0 +1,45 @@ +# Sulkta build of Tortoise-TTS. +# +# Voice roster (built-in, no cloning needed): angie, daniel, deniro, +# emma, freeman, geralt, halle, jlaw, lj, mol, myself, pat, pat2, +# rainbow, snakes, tim_reynolds, tom, train_atkins, train_dotrice, +# train_dreams, train_grace, train_kennard, train_lescault, +# train_mouse, weaver, william. ~26 voices baked in. +# +# License: Apache 2.0 (code) + Apache 2.0 (model weights). Clean +# stack for share/publish. +# +# Speed: slow. Trade for quality. Standard preset is ~10x slower +# than Kokoro; high_quality is ~30x slower. 
Worth it for the +# audiobook-quality bar. + +FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-runtime + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONUNBUFFERED=1 \ + HF_HOME=/cache/hf \ + HF_HUB_DISABLE_TELEMETRY=1 \ + TORTOISE_MODELS_DIR=/cache/tortoise-models + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg \ + git \ + ca-certificates \ + curl \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + 'tortoise-tts>=3.0.0' \ + 'fastapi>=0.115.0' \ + 'uvicorn>=0.32.0' \ + 'soundfile>=0.13.0' \ + 'numpy<2' + +RUN mkdir -p /cache/hf /cache/tortoise-models /audio + +COPY tortoise_server.py /app/tortoise_server.py +WORKDIR /app + +EXPOSE 7860 + +CMD ["uvicorn", "tortoise_server:app", "--host", "0.0.0.0", "--port", "7860"] diff --git a/engines/tortoise/compose.yml b/engines/tortoise/compose.yml new file mode 100644 index 0000000..ec4d386 --- /dev/null +++ b/engines/tortoise/compose.yml @@ -0,0 +1,43 @@ +# Tortoise-TTS stack on Lucy. Audiobook-quality engine with 25+ +# named voices (no cloning). Apache 2.0 top to bottom. +# +# Slow: ~10x kokoro wall clock at 'standard' preset. Worth it for +# the quality bar. Cobb's call 2026-05-14: "use higgs (now tortoise) +# and we will only let it use the full gpu for runs" — translated: +# runs are batched, slow is acceptable. +# +# Co-resides with kokoro on the 2070 Super since tortoise is ~5GB +# and kokoro is ~1GB (8GB total). If OOM hits during a render, +# we'll add a coordination layer to pause kokoro first. 
+name: tortoise + +services: + tortoise: + image: lucy-registry:5000/tortoise:0.1 + container_name: tortoise + restart: unless-stopped + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + ports: + - "192.168.0.5:7795:7860" + - "127.0.0.1:7795:7860" + volumes: + - /mnt/cache/appdata/tortoise/hf-cache:/cache/hf + - /mnt/cache/appdata/tortoise/models:/cache/tortoise-models + # Shared audio dir with f5/kokoro so skald serves all engines' + # outputs through the same /audio route. + - /mnt/cache/appdata/f5-tts/audio:/audio + environment: + HF_HOME: /cache/hf + HF_HUB_DISABLE_TELEMETRY: "1" + TORTOISE_MODELS_DIR: /cache/tortoise-models + labels: + org.sulkta.domain: "sulkta" + org.sulkta.owner: "cobb" + org.sulkta.managed-by: "compose" + org.sulkta.role: "tortoise-tts" diff --git a/engines/tortoise/server.py b/engines/tortoise/server.py new file mode 100644 index 0000000..c39eafe --- /dev/null +++ b/engines/tortoise/server.py @@ -0,0 +1,305 @@ +"""Tortoise-TTS FastAPI server. Sibling to kokoro_server. + +Same /synthesize contract as the kokoro server so skald only has to +route by voice.source. Differences: + - Tortoise voices are NAMED PRESETS shipped with the library + (angie, daniel, freeman, jlaw, lj, weaver, etc.). No cloning. + - Tortoise is slow. Standard preset is ~10x kokoro's wall clock. + Caller should expect minutes per chunk, not seconds. + - We DON'T re-implement render-and-stitch + multi-voice tag parsing + here for v0.1 — tortoise's quality is the win, not multi-voice. + Long-form sequential renders use the request's default voice + throughout. + - The [voice:X]...[/voice] tags ARE parsed though: each block + renders with its named voice. This is the audiobook win. + +Quality presets: ultra_fast / fast / standard / high_quality. The +trade-off is real — high_quality on a 2070 Super is ~30x slower +than kokoro. Default to 'standard' for the bar. 
import logging
import re
import time
import uuid
from pathlib import Path

import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice


log = logging.getLogger("tortoise-server")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEFAULT_VOICE = "lj"
DEFAULT_PRESET = "standard"
# Quality presets accepted by /synthesize. Checked up front so a typo
# comes back as a 400 instead of an unhandled error in the middle of a
# multi-minute render.
VALID_PRESETS = ("ultra_fast", "fast", "standard", "high_quality")
AUDIO_ROOT = Path("/audio")
SAMPLE_RATE = 24000  # Tortoise outputs 24kHz

# Silence durations for between-chunk stitching (matches kokoro
# server's conventions so audio from both engines feels similar).
PARAGRAPH_GAP_S = 0.7
SCENE_GAP_S = 1.5
BREATH_GAP_S = 0.4
# Ceiling for a single [pause:N] tag. The tag value comes straight from
# the request body and is turned into a zero-filled buffer, so an
# unbounded value is a cheap way to exhaust memory.
MAX_PAUSE_S = 60.0

_tts: TextToSpeech | None = None
_voice_cache: dict[str, tuple] = {}


def _get_tts() -> TextToSpeech:
    """Return the process-wide TextToSpeech instance, building it on first use.

    Half precision is requested only when DEVICE is "cuda".
    """
    global _tts
    if _tts is None:
        log.info("loading tortoise device=%s", DEVICE)
        _tts = TextToSpeech(use_deepspeed=False, kv_cache=True, half=(DEVICE == "cuda"))
        log.info("tortoise loaded")
    return _tts


def _get_voice(name: str) -> tuple:
    """Return ``load_voice(name)``, memoised per voice name.

    The cached value is whatever ``load_voice`` hands back -- a
    ``(voice_samples, conditioning_latents)`` pair -- so the reference
    clips are read once per voice rather than on every synthesis call.
    A failed load raises and is NOT cached, so a voice that is added
    later can still be picked up.
    """
    if name not in _voice_cache:
        _voice_cache[name] = load_voice(name)
    return _voice_cache[name]


# ─── tag splitter (lifted from kokoro_server) ───────────────────


class Node:
    """One unit of the render plan.

    kind  -- "text" (value is the string to speak) or "silence"
             (value is a duration in seconds).
    voice -- per-node voice override for text nodes; None means "use
             the request's voice".
    """

    __slots__ = ("kind", "value", "voice")

    def __init__(self, kind: str, value, voice: str | None = None):
        self.kind = kind
        self.value = value
        self.voice = voice

    def __repr__(self) -> str:
        return f"Node({self.kind!r}, {self.value!r}, {self.voice!r})"


_VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]")
_VOICE_CLOSE = "[/voice]"
# [pause:1.5], [pause:1.5s], [pause:300ms], [breath], [scene].
# The named groups "dur" and "unit" are read by _parse_tag.
_TAG_RE = re.compile(
    r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]",
    re.IGNORECASE,
)


def _parse_tag(match: re.Match) -> float:
    """Convert a _TAG_RE match into a silence duration in seconds.

    [breath] and [scene] map to their fixed gap constants. [pause:N]
    defaults to seconds when no unit is given and is capped at
    MAX_PAUSE_S.
    """
    body = match.group(0).lower().strip("[]")
    if body == "breath":
        return BREATH_GAP_S
    if body == "scene":
        return SCENE_GAP_S
    dur = float(match.group("dur"))
    unit = (match.group("unit") or "s").lower()
    seconds = dur / 1000.0 if unit == "ms" else dur
    return min(seconds, MAX_PAUSE_S)


def _expand_inline(text: str, voice: str | None) -> list[Node]:
    """Split one run of single-voice text on pause/breath/scene tags.

    Returns alternating text and silence nodes; text nodes carry
    ``voice``. Whitespace-only fragments are dropped.
    """
    # Voice markup that reaches this point is unbalanced (an orphan
    # [/voice], or a [voice:X] nested inside another block). Drop it so
    # control markup is never handed to the synthesizer as text.
    text = _VOICE_OPEN_RE.sub(" ", text.replace(_VOICE_CLOSE, " ")).strip()
    out: list[Node] = []
    if not text:
        return out
    cursor = 0
    for m in _TAG_RE.finditer(text):
        pre = text[cursor : m.start()].strip()
        if pre:
            out.append(Node("text", pre, voice))
        out.append(Node("silence", _parse_tag(m)))
        cursor = m.end()
    tail = text[cursor:].strip()
    if tail:
        out.append(Node("text", tail, voice))
    return out


def _split_paragraph_voices(para: str) -> list[Node]:
    """Split one paragraph into nodes, honouring [voice:X]...[/voice].

    Text outside a voice block gets voice=None. An opening tag with no
    matching close applies to the rest of the paragraph.
    """
    out: list[Node] = []
    cursor = 0
    while cursor < len(para):
        m = _VOICE_OPEN_RE.search(para, cursor)
        if not m:
            out.extend(_expand_inline(para[cursor:], None))
            break
        out.extend(_expand_inline(para[cursor : m.start()], None))
        voice = m.group(1)
        body_start = m.end()
        close_idx = para.find(_VOICE_CLOSE, body_start)
        if close_idx < 0:
            out.extend(_expand_inline(para[body_start:], voice))
            break
        out.extend(_expand_inline(para[body_start:close_idx], voice))
        cursor = close_idx + len(_VOICE_CLOSE)
    return out


def split_to_nodes(text: str) -> list[Node]:
    """Turn request text into the flat render plan.

    A line holding only ``---`` separates scenes (SCENE_GAP_S of
    silence between them); a blank line separates paragraphs
    (PARAGRAPH_GAP_S between non-empty paragraphs of the same scene).
    Each paragraph is then split on voice blocks and inline tags.
    """
    nodes: list[Node] = []
    scenes = re.split(r"(?m)^\s*---\s*$", text)
    for s_idx, scene in enumerate(scenes):
        if s_idx > 0:
            nodes.append(Node("silence", SCENE_GAP_S))
        first_para = True
        for para in re.split(r"\n\s*\n", scene):
            para = para.strip()
            if not para:
                continue
            if not first_para:
                nodes.append(Node("silence", PARAGRAPH_GAP_S))
            first_para = False
            nodes.extend(_split_paragraph_voices(para))
    return nodes


def _silence_samples(seconds: float) -> np.ndarray:
    """Return ``seconds`` of float32 silence at SAMPLE_RATE (never negative length)."""
    n = max(0, int(round(seconds * SAMPLE_RATE)))
    return np.zeros(n, dtype=np.float32)


# ─── FastAPI ─────────────────────────────────────────────────────


class SynthesizeRequest(BaseModel):
    gen_text: str = Field(min_length=1)
    # Tortoise voice name (lj, freeman, daniel, etc.). API-compat
    # field carries the voice id as a "path" — same shape as kokoro.
    ref_audio_path: str = DEFAULT_VOICE
    # Accepted for request-shape compatibility; not read by this server.
    ref_text: str | None = None
    output_filename: str | None = None
    # Range-checked but otherwise not read by this server.
    speed: float = Field(default=1.0, ge=0.3, le=2.0)
    # Tortoise-specific: quality preset. Slower = better. Must be one
    # of VALID_PRESETS; checked in the handler.
    preset: str = Field(default=DEFAULT_PRESET)


class SynthesizeResponse(BaseModel):
    ok: bool
    output_path: str
    sample_rate_hz: int
    duration_seconds: float
    elapsed_ms: int
    chars_in: int
    engine: str
    # The request-level voice (ref_audio_path).
    voice: str
    text_nodes: int
    silence_nodes: int
    # Voices that audio was actually rendered with, sorted.
    voices_used: list[str]


app = FastAPI(title="tortoise-server", version="0.1.0")


@app.on_event("startup")
def _startup() -> None:
    """Load the model and warm the default voice before serving."""
    _get_tts()
    # Pre-load the default voice so the first synth doesn't pay
    # the voice-loading cost. Best effort: a missing default voice
    # must not stop the server from booting.
    try:
        _get_voice(DEFAULT_VOICE)
    except Exception as e:
        log.warning("could not preload default voice %s: %s", DEFAULT_VOICE, e)


@app.get("/healthz")
def healthz() -> dict:
    """Boot probe: reports device, model-loaded flag and cached voices."""
    # Shape matches f5_server/kokoro_server so skald's HealthResponse
    # struct deserializes all three.
    return {
        "ok": True,
        "device": DEVICE,
        "model": "tortoise-tts",
        "vocoder": "tortoise-builtin",
        "loaded": _tts is not None,
        "engine": "tortoise-tts",
        "default_voice": DEFAULT_VOICE,
        "default_preset": DEFAULT_PRESET,
        "cached_voices": list(_voice_cache.keys()),
        "version": "0.1.0",
    }


def _resolve_output_path(output_filename: str | None) -> Path:
    """Validate the caller-supplied file name and place it under AUDIO_ROOT."""
    name = output_filename or f"{uuid.uuid4().hex}.wav"
    if "/" in name or "\\" in name or ".." in name or name == ".":
        raise HTTPException(400, "output_filename must be a bare name, no path parts")
    path = AUDIO_ROOT / name
    path.parent.mkdir(parents=True, exist_ok=True)
    return path


def _to_samples(audio) -> np.ndarray:
    """Flatten one tts_with_preset result into a 1-D float32 numpy array."""
    if isinstance(audio, (list, tuple)):
        audio = audio[0]
    # float32 on the CPU regardless of the model's precision/device;
    # reshape(-1) keeps even a single-sample result concatenable.
    return audio.detach().to(device="cpu", dtype=torch.float32).numpy().reshape(-1)


@app.post("/synthesize", response_model=SynthesizeResponse)
def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
    """Render ``gen_text`` to a 16-bit PCM WAV under AUDIO_ROOT.

    The text is split into text/silence nodes; each text node is
    rendered with its [voice:X] override or, failing that, the request
    voice, and everything is concatenated in order.

    Raises HTTPException:
      400 -- ref_audio_path is a filesystem path, unknown preset, bad
             output_filename, no speakable text, or the request voice
             cannot be loaded.
      500 -- the render produced no audio.

    NOTE(review): this is a sync handler, which FastAPI runs in its
    threadpool; overlapping requests would share the one model
    instance -- confirm callers serialize requests.
    """
    if req.ref_audio_path.startswith("/"):
        raise HTTPException(
            400,
            "ref_audio_path looks like a filesystem path; tortoise takes a voice "
            "name like 'lj' or 'freeman'.",
        )
    voice = req.ref_audio_path
    preset = req.preset
    if preset not in VALID_PRESETS:
        raise HTTPException(
            400,
            f"unknown preset {preset!r}; expected one of {', '.join(VALID_PRESETS)}",
        )

    output_path = _resolve_output_path(req.output_filename)

    nodes = [n for n in split_to_nodes(req.gen_text) if n.kind == "silence" or n.value]
    text_count = sum(1 for n in nodes if n.kind == "text")
    silence_count = sum(1 for n in nodes if n.kind == "silence")
    if text_count == 0:
        raise HTTPException(400, "gen_text expanded to zero text nodes")

    tts = _get_tts()

    # Load the request voice once, before any rendering. It doubles as
    # the fallback for inline voices, so it has to be known-good; a bad
    # name is the caller's error, not a server fault.
    try:
        request_voice = _get_voice(voice)
    except Exception as e:
        log.warning("request voice %s failed to load: %s", voice, e)
        raise HTTPException(400, f"could not load tortoise voice {voice!r}: {e}") from e

    started = time.monotonic()
    pieces: list[np.ndarray] = []
    voices_used: set[str] = set()
    for node in nodes:
        if node.kind == "silence":
            pieces.append(_silence_samples(node.value))
            continue
        seg_voice = node.voice or voice
        if seg_voice == voice:
            samples, latents = request_voice
        else:
            try:
                samples, latents = _get_voice(seg_voice)
            except Exception as e:
                log.warning(
                    "voice %s failed to load (%s); falling back to request voice %s",
                    seg_voice, e, voice,
                )
                seg_voice = voice
                samples, latents = request_voice
        # Recorded after the fallback decision so the response lists
        # the voices that were actually rendered.
        voices_used.add(seg_voice)
        audio_tensor = tts.tts_with_preset(
            text=node.value,
            voice_samples=samples,
            conditioning_latents=latents,
            preset=preset,
        )
        pieces.append(_to_samples(audio_tensor))
    elapsed_ms = int((time.monotonic() - started) * 1000)

    full_audio = np.concatenate(pieces)
    if full_audio.size == 0:
        raise HTTPException(500, "tortoise returned no audio")
    sf.write(str(output_path), full_audio, SAMPLE_RATE, subtype="PCM_16")
    duration_s = float(len(full_audio)) / float(SAMPLE_RATE)

    log.info(
        "synthesized chars=%d voice=%s preset=%s text_nodes=%d silence_nodes=%d "
        "voices_used=%s -> %s (dur=%.2fs, elapsed=%dms)",
        len(req.gen_text), voice, preset, text_count, silence_count,
        sorted(voices_used), output_path, duration_s, elapsed_ms,
    )
    return SynthesizeResponse(
        ok=True,
        output_path=str(output_path),
        sample_rate_hz=SAMPLE_RATE,
        duration_seconds=duration_s,
        elapsed_ms=elapsed_ms,
        chars_in=len(req.gen_text),
        engine="tortoise-tts",
        voice=voice,
        text_nodes=text_count,
        silence_nodes=silence_count,
        voices_used=sorted(voices_used),
    )