"""Tortoise-TTS FastAPI server. Sibling to kokoro_server. Same /synthesize contract as the kokoro server so skald only has to route by voice.source. Differences: - Tortoise voices are NAMED PRESETS shipped with the library (angie, daniel, freeman, jlaw, lj, weaver, etc.). No cloning. - Tortoise is slow. Standard preset is ~10x kokoro's wall clock. Caller should expect minutes per chunk, not seconds. - We DON'T re-implement render-and-stitch + multi-voice tag parsing here for v0.1 — tortoise's quality is the win, not multi-voice. Long-form sequential renders use the request's default voice throughout. - The [voice:X]...[/voice] tags ARE parsed though: each block renders with its named voice. This is the audiobook win. Quality presets: ultra_fast / fast / standard / high_quality. The trade-off is real — high_quality on a 2070 Super is ~30x slower than kokoro. Default to 'standard' for the bar. """ import logging import re import time import uuid from pathlib import Path import numpy as np import soundfile as sf import torch from fastapi import FastAPI, HTTPException from pydantic import BaseModel, Field from tortoise.api import TextToSpeech from tortoise.utils.audio import load_voice log = logging.getLogger("tortoise-server") logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s") DEVICE = "cuda" if torch.cuda.is_available() else "cpu" DEFAULT_VOICE = "lj" DEFAULT_PRESET = "standard" AUDIO_ROOT = Path("/audio") SAMPLE_RATE = 24000 # Tortoise outputs 24kHz # Silence durations for between-chunk stitching (matches kokoro # server's conventions so audio from both engines feels similar). PARAGRAPH_GAP_S = 0.7 SCENE_GAP_S = 1.5 BREATH_GAP_S = 0.4 _tts: TextToSpeech | None = None _voice_cache: dict[str, tuple] = {} def _get_tts() -> TextToSpeech: global _tts if _tts is None: log.info("loading tortoise device=%s", DEVICE) _tts = TextToSpeech(use_deepspeed=False, kv_cache=True, half=(DEVICE == "cuda")) log.info("tortoise loaded") return _tts def _get_voice(name: str) -> tuple: """Cache voice latents to avoid re-loading reference clips on every synthesis call. Tortoise's load_voice returns (voice_samples, conditioning_latents).""" if name not in _voice_cache: _voice_cache[name] = load_voice(name) return _voice_cache[name] # ─── tag splitter (lifted from kokoro_server) ─────────────────── class Node: __slots__ = ("kind", "value", "voice") def __init__(self, kind: str, value, voice: str | None = None): self.kind = kind self.value = value self.voice = voice _VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]") _VOICE_CLOSE = "[/voice]" _TAG_RE = re.compile( r"\[(pause:(?P[0-9]+(?:\.[0-9]+)?)(?Ps|ms)?|breath|scene)\]", re.IGNORECASE, ) def _parse_tag(match: re.Match) -> float: body = match.group(0).lower().strip("[]") if body == "breath": return BREATH_GAP_S if body == "scene": return SCENE_GAP_S dur = float(match.group("dur")) unit = (match.group("unit") or "s").lower() return dur / 1000.0 if unit == "ms" else dur def _expand_inline(text: str, voice: str | None) -> list[Node]: out: list[Node] = [] text = text.strip() if not text: return out cursor = 0 for m in _TAG_RE.finditer(text): pre = text[cursor : m.start()].strip() if pre: out.append(Node("text", pre, voice)) out.append(Node("silence", _parse_tag(m))) cursor = m.end() tail = text[cursor:].strip() if tail: out.append(Node("text", tail, voice)) return out def _split_paragraph_voices(para: str) -> list[Node]: out: list[Node] = [] cursor = 0 while cursor < len(para): m = _VOICE_OPEN_RE.search(para, cursor) if not m: out.extend(_expand_inline(para[cursor:], None)) break out.extend(_expand_inline(para[cursor : m.start()], None)) voice = m.group(1) body_start = m.end() close_idx = para.find(_VOICE_CLOSE, body_start) if close_idx < 0: out.extend(_expand_inline(para[body_start:], voice)) break out.extend(_expand_inline(para[body_start:close_idx], voice)) cursor = close_idx + len(_VOICE_CLOSE) return out def split_to_nodes(text: str) -> list[Node]: nodes: list[Node] = [] scenes = re.split(r"(?m)^\s*---\s*$", text) for s_idx, scene in enumerate(scenes): if s_idx > 0: nodes.append(Node("silence", SCENE_GAP_S)) paragraphs = re.split(r"\n\s*\n", scene) first_para = True for para in paragraphs: para = para.strip() if not para: continue if not first_para: nodes.append(Node("silence", PARAGRAPH_GAP_S)) first_para = False nodes.extend(_split_paragraph_voices(para)) return nodes def _silence_samples(seconds: float) -> np.ndarray: n = int(round(seconds * SAMPLE_RATE)) return np.zeros(n, dtype=np.float32) # ─── FastAPI ───────────────────────────────────────────────────── class SynthesizeRequest(BaseModel): gen_text: str = Field(min_length=1) # Tortoise voice name (lj, freeman, daniel, etc.). API-compat # field carries the voice id as a "path" — same shape as kokoro. ref_audio_path: str = DEFAULT_VOICE ref_text: str | None = None output_filename: str | None = None speed: float = Field(default=1.0, ge=0.3, le=2.0) # Tortoise-specific: quality preset. Slower = better. preset: str = Field(default=DEFAULT_PRESET) class SynthesizeResponse(BaseModel): ok: bool output_path: str sample_rate_hz: int duration_seconds: float elapsed_ms: int chars_in: int engine: str voice: str text_nodes: int silence_nodes: int voices_used: list[str] app = FastAPI(title="tortoise-server", version="0.1.0") @app.on_event("startup") def _startup() -> None: _get_tts() # Pre-load the default voice so the first synth doesn't pay # the latent-extraction cost. try: _get_voice(DEFAULT_VOICE) except Exception as e: log.warning("could not preload default voice %s: %s", DEFAULT_VOICE, e) @app.get("/healthz") def healthz() -> dict: # Shape matches f5_server/kokoro_server so skald's HealthResponse # struct deserializes all three. return { "ok": True, "device": DEVICE, "model": "tortoise-tts", "vocoder": "tortoise-builtin", "loaded": _tts is not None, "engine": "tortoise-tts", "default_voice": DEFAULT_VOICE, "default_preset": DEFAULT_PRESET, "cached_voices": list(_voice_cache.keys()), "version": "0.1.0", } @app.post("/synthesize", response_model=SynthesizeResponse) def synthesize(req: SynthesizeRequest) -> SynthesizeResponse: if req.ref_audio_path.startswith("/"): raise HTTPException( 400, "ref_audio_path looks like a filesystem path; tortoise takes a voice " "name like 'lj' or 'freeman'.", ) voice = req.ref_audio_path preset = req.preset output_filename = req.output_filename or f"{uuid.uuid4().hex}.wav" if "/" in output_filename or ".." in output_filename: raise HTTPException(400, "output_filename must be a bare name, no path parts") output_path = AUDIO_ROOT / output_filename output_path.parent.mkdir(parents=True, exist_ok=True) tts = _get_tts() nodes = [n for n in split_to_nodes(req.gen_text) if n.kind == "silence" or n.value] text_count = sum(1 for n in nodes if n.kind == "text") silence_count = sum(1 for n in nodes if n.kind == "silence") if text_count == 0: raise HTTPException(400, "gen_text expanded to zero text nodes") started = time.monotonic() pieces: list[np.ndarray] = [] voices_used: set[str] = set() for node in nodes: if node.kind == "silence": pieces.append(_silence_samples(node.value)) continue seg_voice = node.voice or voice voices_used.add(seg_voice) try: samples, latents = _get_voice(seg_voice) except Exception as e: log.warning("voice %s failed to load (%s); falling back to default", seg_voice, e) samples, latents = _get_voice(voice) # Tortoise's tts_with_preset returns a torch.Tensor on the # configured device. audio_tensor = tts.tts_with_preset( text=node.value, voice_samples=samples, conditioning_latents=latents, preset=preset, ) if isinstance(audio_tensor, list): audio_tensor = audio_tensor[0] arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32) pieces.append(arr) elapsed_ms = int((time.monotonic() - started) * 1000) if not pieces: raise HTTPException(500, "tortoise returned no audio") full_audio = np.concatenate(pieces) sf.write(str(output_path), full_audio, SAMPLE_RATE, subtype="PCM_16") duration_s = float(len(full_audio)) / float(SAMPLE_RATE) log.info( "synthesized chars=%d voice=%s preset=%s text_nodes=%d silence_nodes=%d " "voices_used=%s -> %s (dur=%.2fs, elapsed=%dms)", len(req.gen_text), voice, preset, text_count, silence_count, sorted(voices_used), output_path, duration_s, elapsed_ms, ) return SynthesizeResponse( ok=True, output_path=str(output_path), sample_rate_hz=SAMPLE_RATE, duration_seconds=duration_s, elapsed_ms=elapsed_ms, chars_in=len(req.gen_text), engine="tortoise-tts", voice=voice, text_nodes=text_count, silence_nodes=silence_count, voices_used=sorted(voices_used), )