engines: import f5-tts + kokoro + tortoise sidecars into the tree

The python FastAPI sidecars have lived ad-hoc at /mnt/cache/appdata/
<engine>/build/ on Lucy without version control. Bringing them into
the skald repo so the engine code travels with the cross-engine
routing it depends on.

This commit lands the VANILLA version of each engine on main:

  engines/f5-tts/    SWivid F5-TTS (CC-BY-NC weights flagged)
  engines/kokoro/    hexgrad Kokoro-82M (Apache 2.0 top to bottom)
  engines/tortoise/  neonbjb Tortoise-TTS (Apache 2.0 top to bottom)

Engine-specific kludges (question doubling, GPU coordination,
pause-duration tuning) get layered on engine/* branches per the
README. Main stays the safe-to-read baseline.
This commit is contained in:
Kayos 2026-05-14 09:40:01 -07:00
parent 1c3fc11484
commit d1631ddffe
10 changed files with 1115 additions and 0 deletions

305
engines/tortoise/server.py Normal file
View file

@ -0,0 +1,305 @@
"""Tortoise-TTS FastAPI server. Sibling to kokoro_server.
Same /synthesize contract as the kokoro server so skald only has to
route by voice.source. Differences:
- Tortoise voices are NAMED PRESETS shipped with the library
(angie, daniel, freeman, jlaw, lj, weaver, etc.). No cloning.
- Tortoise is slow. Standard preset is ~10x kokoro's wall clock.
Caller should expect minutes per chunk, not seconds.
- We DON'T re-implement render-and-stitch + multi-voice tag parsing
here for v0.1 — tortoise's quality is the win, not multi-voice.
Long-form sequential renders use the request's default voice
throughout.
- The [voice:X]...[/voice] tags ARE parsed though: each block
renders with its named voice. This is the audiobook win.
Quality presets: ultra_fast / fast / standard / high_quality. The
trade-off is real — high_quality on a 2070 Super is ~30x slower
than kokoro. Default to 'standard' for the bar.
"""
import logging
import re
import time
import uuid
from pathlib import Path
import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
# Module logger; basicConfig below installs the root handler/format for the process.
log = logging.getLogger("tortoise-server")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")
# Chosen once at import time: "cuda" when torch reports a GPU, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Voice used when a request does not name one; also preloaded at startup.
DEFAULT_VOICE = "lj"
# Preset used when a request does not name one.
DEFAULT_PRESET = "standard"
# Directory that /synthesize writes its WAV output into.
AUDIO_ROOT = Path("/audio")
SAMPLE_RATE = 24000 # Tortoise outputs 24kHz
# Silence durations for between-chunk stitching (matches kokoro
# server's conventions so audio from both engines feels similar).
# All three are in seconds: paragraph break, scene break ("---" or
# [scene]), and the [breath] tag respectively.
PARAGRAPH_GAP_S = 0.7
SCENE_GAP_S = 1.5
BREATH_GAP_S = 0.4
# Lazily-created engine singleton; populated by _get_tts().
_tts: TextToSpeech | None = None
# voice name -> load_voice() result, filled by _get_voice(). Nothing in
# this file evicts entries, so it grows with the number of distinct voices.
_voice_cache: dict[str, tuple] = {}
def _get_tts() -> TextToSpeech:
    """Return the process-wide Tortoise engine, building it on first use.

    The instance is kept in the module-level ``_tts`` so every later call
    reuses it. Half precision is requested only when DEVICE is "cuda".
    """
    global _tts
    if _tts is not None:
        return _tts
    log.info("loading tortoise device=%s", DEVICE)
    want_half = DEVICE == "cuda"
    _tts = TextToSpeech(use_deepspeed=False, kv_cache=True, half=want_half)
    log.info("tortoise loaded")
    return _tts
def _get_voice(name: str) -> tuple:
    """Return the ``load_voice`` result for *name*, memoised in ``_voice_cache``.

    Tortoise's load_voice returns (voice_samples, conditioning_latents);
    caching it avoids re-loading reference clips on every synthesis call.
    Any exception raised by ``load_voice`` propagates and nothing is cached.
    """
    try:
        return _voice_cache[name]
    except KeyError:
        pass
    # Load outside the except block so a loader failure is not reported as
    # "during handling of KeyError".
    loaded = load_voice(name)
    _voice_cache[name] = loaded
    return loaded
# ─── tag splitter (lifted from kokoro_server) ───────────────────
class Node:
__slots__ = ("kind", "value", "voice")
def __init__(self, kind: str, value, voice: str | None = None):
self.kind = kind
self.value = value
self.voice = voice
# Opening voice tag, e.g. "[voice:freeman]". Group 1 is the voice name
# (letters, digits, "_" and "-"). No IGNORECASE flag, so the literal
# "voice:" prefix must be lower-case.
_VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]")
# Literal closing tag searched for with str.find (exact, case-sensitive).
_VOICE_CLOSE = "[/voice]"
# Inline silence tags, matched case-insensitively:
#   [pause:N], [pause:Ns], [pause:Nms]  -> named groups "dur" and "unit"
#   [breath], [scene]                   -> "dur"/"unit" are None
_TAG_RE = re.compile(
    r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]",
    re.IGNORECASE,
)
def _parse_tag(match: re.Match) -> float:
    """Convert one ``_TAG_RE`` match into a silence length in seconds.

    ``[breath]`` and ``[scene]`` map to their fixed gap constants; a
    ``[pause:N]`` tag yields N seconds, or N/1000 when the unit is ``ms``.
    A missing unit is treated as seconds.
    """
    keyword = match.group(0).lower().strip("[]")
    fixed_gaps = {"breath": BREATH_GAP_S, "scene": SCENE_GAP_S}
    if keyword in fixed_gaps:
        return fixed_gaps[keyword]
    amount = float(match.group("dur"))
    unit = (match.group("unit") or "s").lower()
    if unit == "ms":
        return amount / 1000.0
    return amount
def _expand_inline(text: str, voice: str | None) -> list[Node]:
    """Split *text* on inline silence tags into text and silence nodes.

    Text between tags is stripped and emitted as a "text" node carrying
    *voice*; whitespace-only stretches are dropped. Every tag becomes a
    "silence" node whose value comes from ``_parse_tag``. Blank input
    yields an empty list.
    """
    nodes: list[Node] = []
    body = text.strip()
    if not body:
        return nodes

    def emit_text(fragment: str) -> None:
        # Only non-blank fragments are worth sending to the engine.
        fragment = fragment.strip()
        if fragment:
            nodes.append(Node("text", fragment, voice))

    pos = 0
    for tag in _TAG_RE.finditer(body):
        emit_text(body[pos:tag.start()])
        nodes.append(Node("silence", _parse_tag(tag)))
        pos = tag.end()
    emit_text(body[pos:])
    return nodes
def _split_paragraph_voices(para: str) -> list[Node]:
    """Break one paragraph into nodes, honouring [voice:X]...[/voice] blocks.

    Text outside a voice block is expanded with voice ``None``; text inside
    a block is expanded with the block's voice name. An opening tag with no
    matching ``[/voice]`` claims the rest of the paragraph.
    """
    nodes: list[Node] = []
    pos = 0
    end_of_para = len(para)
    while pos < end_of_para:
        opener = _VOICE_OPEN_RE.search(para, pos)
        if opener is None:
            # No further voice blocks: the remainder is untagged text.
            nodes += _expand_inline(para[pos:], None)
            return nodes
        nodes += _expand_inline(para[pos:opener.start()], None)
        tagged_voice = opener.group(1)
        block_start = opener.end()
        block_end = para.find(_VOICE_CLOSE, block_start)
        if block_end == -1:
            # Unterminated block: everything after the opener uses its voice.
            nodes += _expand_inline(para[block_start:], tagged_voice)
            return nodes
        nodes += _expand_inline(para[block_start:block_end], tagged_voice)
        pos = block_end + len(_VOICE_CLOSE)
    return nodes
def split_to_nodes(text: str) -> list[Node]:
    """Turn raw request text into an ordered list of text/silence nodes.

    Lines consisting only of ``---`` separate scenes; blank lines separate
    paragraphs. A SCENE_GAP_S silence precedes every scene after the first,
    and a PARAGRAPH_GAP_S silence precedes every non-empty paragraph after
    the first within a scene. Paragraph contents are delegated to
    ``_split_paragraph_voices``.
    """
    nodes: list[Node] = []
    for scene_no, scene in enumerate(re.split(r"(?m)^\s*---\s*$", text)):
        if scene_no:
            nodes.append(Node("silence", SCENE_GAP_S))
        stripped = (chunk.strip() for chunk in re.split(r"\n\s*\n", scene))
        # Empty paragraphs are skipped before numbering so they never earn a gap.
        for para_no, para in enumerate(p for p in stripped if p):
            if para_no:
                nodes.append(Node("silence", PARAGRAPH_GAP_S))
            nodes.extend(_split_paragraph_voices(para))
    return nodes
def _silence_samples(seconds: float) -> np.ndarray:
    """Return a float32 array of zeros lasting *seconds* at SAMPLE_RATE."""
    sample_count = int(round(seconds * SAMPLE_RATE))
    return np.zeros(shape=sample_count, dtype=np.float32)
# ─── FastAPI ─────────────────────────────────────────────────────
class SynthesizeRequest(BaseModel):
    # Request body for POST /synthesize.
    #
    # Text to render; must be non-empty. May contain "---" scene breaks,
    # blank-line paragraph breaks, [voice:X]...[/voice] blocks and
    # [pause:N] / [breath] / [scene] tags, all handled by split_to_nodes.
    gen_text: str = Field(min_length=1)
    # Tortoise voice name (lj, freeman, daniel, etc.). API-compat
    # field carries the voice id as a "path" — same shape as kokoro.
    # The handler rejects values that start with "/".
    ref_audio_path: str = DEFAULT_VOICE
    # Accepted for API compatibility; the /synthesize handler in this
    # file does not read it.
    ref_text: str | None = None
    # Bare file name for the output WAV under AUDIO_ROOT; the handler
    # generates a random "<uuid>.wav" when this is missing or empty.
    output_filename: str | None = None
    # Range-validated (0.3-2.0) but not read by the /synthesize handler
    # in this file.
    speed: float = Field(default=1.0, ge=0.3, le=2.0)
    # Tortoise-specific: quality preset. Slower = better.
    preset: str = Field(default=DEFAULT_PRESET)
class SynthesizeResponse(BaseModel):
    # Response body for POST /synthesize.
    #
    # The handler always sets this to True on the success path.
    ok: bool
    # String form of the WAV path the handler wrote (AUDIO_ROOT / filename).
    output_path: str
    # Sample rate of the written file (the module's SAMPLE_RATE).
    sample_rate_hz: int
    # Length of the stitched audio: sample count / SAMPLE_RATE.
    duration_seconds: float
    # Wall-clock time of the render loop in milliseconds; taken before
    # the file is written, so disk I/O is not included.
    elapsed_ms: int
    # len() of the request's gen_text, tags included.
    chars_in: int
    # Engine identifier; the handler sends "tortoise-tts".
    engine: str
    # The request-level voice (ref_audio_path), not per-segment voices.
    voice: str
    # Number of text / silence nodes in the render plan.
    text_nodes: int
    silence_nodes: int
    # Sorted voice names recorded while rendering text segments.
    voices_used: list[str]
# Application object the route/startup decorators in this module attach to.
# The version string is repeated in the /healthz payload — keep both in sync.
app = FastAPI(title="tortoise-server", version="0.1.0")
@app.on_event("startup")
def _startup() -> None:
    """Warm the engine and the default voice when the app starts.

    Engine construction is not guarded, so a failure there aborts startup.
    Voice preloading is best-effort: a failure is logged and ignored so the
    first synth simply pays the latent-extraction cost instead.
    """
    _get_tts()
    try:
        _get_voice(DEFAULT_VOICE)
    except Exception as exc:
        log.warning("could not preload default voice %s: %s", DEFAULT_VOICE, exc)
@app.get("/healthz")
def healthz() -> dict:
    """Report engine status.

    Shape matches f5_server/kokoro_server so skald's HealthResponse
    struct deserializes all three. ``loaded`` reflects whether the engine
    singleton exists; ``cached_voices`` lists the voice cache's keys.
    """
    engine_name = "tortoise-tts"
    engine_ready = _tts is not None
    voices_in_cache = list(_voice_cache)
    payload = {
        "ok": True,
        "device": DEVICE,
        "model": engine_name,
        "vocoder": "tortoise-builtin",
        "loaded": engine_ready,
        "engine": engine_name,
        "default_voice": DEFAULT_VOICE,
        "default_preset": DEFAULT_PRESET,
        "cached_voices": voices_in_cache,
        "version": "0.1.0",
    }
    return payload
# Quality presets this endpoint accepts (the four named in the module
# docstring). Checked up front so a typo is a 400, not a crash mid-render.
_VALID_PRESETS = ("ultra_fast", "fast", "standard", "high_quality")


def _voice_load_failure(name: str, exc: Exception) -> HTTPException:
    """Build the HTTP error for a voice that could not be loaded."""
    # NOTE(review): assumes tortoise's load_voice signals an unknown name with
    # KeyError (dict lookup of known voices) — confirm against the installed
    # version. Anything else is reported as a server-side failure.
    status = 400 if isinstance(exc, KeyError) else 500
    return HTTPException(status, f"voice {name!r} could not be loaded: {exc}")


def _resolve_voice(name: str, fallback: str) -> tuple[str, tuple]:
    """Load voice *name*, degrading to *fallback* when it cannot be loaded.

    Returns ``(voice_actually_used, load_voice_result)`` so the caller can
    report the voice that was really rendered.

    Raises:
        HTTPException: when *fallback* itself cannot be loaded.
    """
    try:
        return name, _get_voice(name)
    except Exception as e:
        if name == fallback:
            # Nothing left to fall back to; retrying the same voice is pointless.
            raise _voice_load_failure(name, e) from e
        log.warning("voice %s failed to load (%s); falling back to default", name, e)
    try:
        return fallback, _get_voice(fallback)
    except Exception as e:
        raise _voice_load_failure(fallback, e) from e


@app.post("/synthesize", response_model=SynthesizeResponse)
def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
    """Render ``req.gen_text`` to a 16-bit PCM WAV under AUDIO_ROOT.

    The text is split into text/silence nodes; each text node is rendered
    with its [voice:X] voice (or the request voice) and the pieces are
    concatenated in order. A tagged voice that fails to load falls back to
    the request voice with a warning.

    Raises:
        HTTPException(400): ref_audio_path looks like a filesystem path,
            preset is not one of the known presets, output_filename contains
            path parts, gen_text contains no speakable text, or the request
            voice is unknown.
        HTTPException(500): the request voice failed to load for another
            reason, or no audio was produced.
    """
    if req.ref_audio_path.startswith("/"):
        raise HTTPException(
            400,
            "ref_audio_path looks like a filesystem path; tortoise takes a voice "
            "name like 'lj' or 'freeman'.",
        )
    voice = req.ref_audio_path
    preset = req.preset
    if preset not in _VALID_PRESETS:
        raise HTTPException(
            400,
            f"unknown preset {preset!r}; expected one of: {', '.join(_VALID_PRESETS)}",
        )
    output_filename = req.output_filename or f"{uuid.uuid4().hex}.wav"
    if "/" in output_filename or ".." in output_filename:
        raise HTTPException(400, "output_filename must be a bare name, no path parts")
    output_path = AUDIO_ROOT / output_filename

    nodes = [n for n in split_to_nodes(req.gen_text) if n.kind == "silence" or n.value]
    text_count = sum(1 for n in nodes if n.kind == "text")
    silence_count = sum(1 for n in nodes if n.kind == "silence")
    if text_count == 0:
        raise HTTPException(400, "gen_text expanded to zero text nodes")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    tts = _get_tts()

    started = time.monotonic()
    # Resolve every distinct voice before rendering anything: tortoise takes
    # minutes per chunk, so a bad voice must fail now rather than after
    # earlier nodes have already been rendered.
    resolved: dict[str, tuple[str, tuple]] = {}
    for node in nodes:
        if node.kind != "text":
            continue
        seg_voice = node.voice or voice
        if seg_voice not in resolved:
            resolved[seg_voice] = _resolve_voice(seg_voice, voice)
    # Report the voices actually rendered, i.e. after any fallback.
    voices_used: set[str] = {used for used, _ in resolved.values()}

    pieces: list[np.ndarray] = []
    for node in nodes:
        if node.kind == "silence":
            pieces.append(_silence_samples(node.value))
            continue
        _, (samples, latents) = resolved[node.voice or voice]
        # Tortoise's tts_with_preset returns a torch.Tensor on the
        # configured device.
        audio_tensor = tts.tts_with_preset(
            text=node.value,
            voice_samples=samples,
            conditioning_latents=latents,
            preset=preset,
        )
        if isinstance(audio_tensor, list):
            audio_tensor = audio_tensor[0]
        arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32)
        pieces.append(arr)
    elapsed_ms = int((time.monotonic() - started) * 1000)

    # Defensive: text_count > 0 guarantees at least one piece above.
    if not pieces:
        raise HTTPException(500, "tortoise returned no audio")
    full_audio = np.concatenate(pieces)
    sf.write(str(output_path), full_audio, SAMPLE_RATE, subtype="PCM_16")
    duration_s = float(len(full_audio)) / float(SAMPLE_RATE)
    log.info(
        "synthesized chars=%d voice=%s preset=%s text_nodes=%d silence_nodes=%d "
        "voices_used=%s -> %s (dur=%.2fs, elapsed=%dms)",
        len(req.gen_text), voice, preset, text_count, silence_count,
        sorted(voices_used), output_path, duration_s, elapsed_ms,
    )
    return SynthesizeResponse(
        ok=True,
        output_path=str(output_path),
        sample_rate_hz=SAMPLE_RATE,
        duration_seconds=duration_s,
        elapsed_ms=elapsed_ms,
        chars_in=len(req.gen_text),
        engine="tortoise-tts",
        voice=voice,
        text_nodes=text_count,
        silence_nodes=silence_count,
        voices_used=sorted(voices_used),
    )