skald/engines/tortoise/server.py

"""Tortoise-TTS FastAPI server. Sibling to kokoro_server.

Same /synthesize contract as the kokoro server so skald only has to
route by voice.source. Differences:
  - Tortoise voices are NAMED PRESETS shipped with the library
    (angie, daniel, freeman, jlaw, lj, weaver, etc.). No cloning.
  - Tortoise is slow. Standard preset is ~10x kokoro's wall clock.
    Caller should expect minutes per chunk, not seconds.
  - Tag parsing and stitching are lifted from kokoro_server rather
    than redesigned for v0.1 — tortoise's quality is the win, not
    multi-voice. Text outside any voice tag renders sequentially
    with the request's default voice throughout.
  - The [voice:X]...[/voice] tags ARE parsed though: each block
    renders with its named voice. This is the audiobook win.

Quality presets: ultra_fast / fast / standard / high_quality. The
trade-off is real — high_quality on a 2070 Super is ~30x slower
than kokoro. Default to 'standard' for the bar.
"""
import logging
import re
import time
import uuid
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice


# Module logger; basicConfig installs the root handler/format at import time.
log = logging.getLogger("tortoise-server")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")


# Torch device string: "cuda" when torch reports a usable GPU, else "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Voice used when a request does not name one; also preloaded at startup.
DEFAULT_VOICE = "lj"
# Quality preset used when a request does not name one.
DEFAULT_PRESET = "standard"
# Directory rendered audio files are written into.
AUDIO_ROOT = Path("/audio")
SAMPLE_RATE = 24000  # Tortoise outputs 24kHz

# Silence durations for between-chunk stitching (matches kokoro
# server's conventions so audio from both engines feels similar).
# PARAGRAPH_GAP_S separates blank-line-delimited paragraphs,
# SCENE_GAP_S separates `---` scenes and backs the [scene] tag,
# BREATH_GAP_S backs the [breath] tag.
PARAGRAPH_GAP_S = 0.7
SCENE_GAP_S = 1.5
BREATH_GAP_S = 0.4

# Lazily-created engine singleton; populated by _get_tts().
_tts: TextToSpeech | None = None
# voice name -> (voice_samples, conditioning_latents), filled by _get_voice().
_voice_cache: dict[str, tuple] = {}


def _get_tts() -> TextToSpeech:
    """Return the shared TextToSpeech engine, constructing it on first use.

    The instance is stored in the module-level ``_tts`` so later calls
    are a cheap lookup. Half precision is requested only when DEVICE
    is "cuda".
    """
    global _tts
    if _tts is not None:
        return _tts
    log.info("loading tortoise device=%s", DEVICE)
    want_half = DEVICE == "cuda"
    _tts = TextToSpeech(use_deepspeed=False, kv_cache=True, half=want_half)
    log.info("tortoise loaded")
    return _tts


def _move_to_device(obj):
    """Return a copy of ``obj`` with every nested tensor moved to DEVICE.

    Lists and tuples are rebuilt (as a plain list / plain tuple) with
    each element processed recursively; tensors are sent through
    ``.to(DEVICE)``. Every other value — None, ints, strings, ... —
    is returned untouched.
    """
    if isinstance(obj, torch.Tensor):
        return obj.to(DEVICE)
    if isinstance(obj, (list, tuple)):
        moved = [_move_to_device(item) for item in obj]
        # Preserve the container flavour: list stays list, tuple stays tuple.
        return moved if isinstance(obj, list) else tuple(moved)
    return obj


def _get_voice(name: str) -> tuple:
    """Return ``(voice_samples, conditioning_latents)`` for ``name``, memoised.

    The first request for a voice calls tortoise's ``load_voice`` and
    pushes both halves of its result through ``_move_to_device`` —
    they are created on CPU, and the autoregressive model (on CUDA)
    would otherwise fail with a cpu/cuda tensor-device mismatch. The
    moved pair is stored in ``_voice_cache`` so reference clips are
    not re-loaded on every synthesis call. Errors from ``load_voice``
    propagate and nothing is cached.
    """
    try:
        return _voice_cache[name]
    except KeyError:
        pass
    voice_samples, cond_latents = load_voice(name)
    entry = (_move_to_device(voice_samples), _move_to_device(cond_latents))
    _voice_cache[name] = entry
    return entry


# ─── tag splitter (lifted from kokoro_server) ───────────────────


class Node:
    __slots__ = ("kind", "value", "voice", "pitch", "rate")

    def __init__(
        self,
        kind: str,
        value,
        voice: str | None = None,
        pitch: float = 0.0,
        rate: float = 1.0,
    ):
        # kind ∈ {"text", "silence"}; value is str for text, float
        # seconds for silence. voice/pitch/rate are character-voicing
        # modifiers from [voice:NAME pitch=N rate=R] tags. Default:
        # request voice, 0 semitones, 1x rate.
        self.kind = kind
        self.value = value
        self.voice = voice
        self.pitch = pitch
        self.rate = rate


# Voice open tag — name + optional pitch (semitones) + optional rate:
#   [voice:dyatlov]               → voice swap only
#   [voice:lj pitch=-3]           → same voice, 3 semitones lower
#   [voice:lj pitch=2 rate=1.1]   → higher + slightly faster (fairy)
#   [voice:lj pitch=-4 rate=0.9]  → lower + slower (troll)
# Capture groups: 1 = voice name, 2 = pitch, 3 = rate. The attributes
# are positional: `pitch=` must come before `rate=`. A tag written as
# [voice:lj rate=1.1 pitch=2] does not match at all and is left in the
# text as literal characters. pitch may be negative; rate may not.
_VOICE_OPEN_RE = re.compile(
    r"\[voice:([A-Za-z0-9_-]+)"
    r"(?:\s+pitch=(-?[0-9]+(?:\.[0-9]+)?))?"
    r"(?:\s+rate=([0-9]+(?:\.[0-9]+)?))?"
    r"\]"
)
# Literal closing tag; matched with str.find, so it is case-sensitive.
_VOICE_CLOSE = "[/voice]"
# Inline silence tags, case-insensitive: [pause:N], [pause:Ns],
# [pause:Nms], [breath], [scene]. The named groups `dur` / `unit` are
# only populated for the pause form.
_TAG_RE = re.compile(
    r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]",
    re.IGNORECASE,
)


def _parse_tag(match: re.Match) -> float:
    """Convert a matched silence tag into a duration in seconds.

    [breath] and [scene] map to their fixed gap constants. A
    [pause:N] tag yields N, read as seconds unless the unit is "ms",
    in which case it is divided by 1000.
    """
    keyword = match.group(0).lower().strip("[]")
    if keyword in ("breath", "scene"):
        return BREATH_GAP_S if keyword == "breath" else SCENE_GAP_S
    seconds = float(match.group("dur"))
    # A missing unit means seconds.
    if (match.group("unit") or "s").lower() == "ms":
        seconds = seconds / 1000.0
    return seconds


# Tortoise's autoregressive head loses coherence past ~20s of generated
# audio per inference call. lj's pace is roughly 14 chars/s, so anything
# past ~280 chars per call risks gibberish at the end. Each text node is
# therefore run through _chunk_for_tortoise before synthesis so that
# every tts_with_preset call stays inside the model's reliable horizon.
# The limit below is set under that ~280 char estimate.
TORTOISE_MAX_CHUNK_CHARS = 220

# Sentence boundary regex — splits after `.`/`?`/`!` when followed by
# whitespace and then a capital letter, `"` or `(`, OR right after any
# newline. Dotted tokens with no inner whitespace ("U.S.") stay whole.
# An abbreviation followed by a capitalised word ("Mr. Smith") IS split
# here; _chunk_for_tortoise re-joins neighbouring pieces with a space
# whenever they fit in one chunk, so this only shows at chunk edges.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[\.!?])\s+(?=[A-Z\"\(])|(?<=\n)\s*")


def _split_long_sentence(sentence: str, max_chars: int) -> list[str]:
    """Break one over-long sentence into pieces of at most max_chars.

    Comma-separated clauses are packed greedily (re-joined with ", ").
    A single clause that is still too long is cut at the last
    whitespace inside the window; only when the window has no
    whitespace at all is it cut mid-token. The comma between two
    emitted pieces is not reproduced.
    """
    pieces: list[str] = []
    pending = ""
    for clause in (c.strip() for c in sentence.split(",")):
        if not clause:
            continue
        joined = f"{pending}, {clause}" if pending else clause
        if len(joined) <= max_chars:
            pending = joined
            continue
        if pending:
            pieces.append(pending)
        while len(clause) > max_chars:
            # Look for the right-most whitespace at index 1..max_chars so
            # the head fits the limit and is never empty (clause[0] is
            # non-space because the clause is stripped).
            cut = max_chars
            for idx in range(max_chars, 0, -1):
                if clause[idx].isspace():
                    cut = idx
                    break
            pieces.append(clause[:cut].rstrip())
            clause = clause[cut:].lstrip()
        pending = clause
    if pending:
        pieces.append(pending)
    return pieces


def _chunk_for_tortoise(text: str, max_chars: int = TORTOISE_MAX_CHUNK_CHARS) -> list[str]:
    """Split text into chunks of at most max_chars characters.

    Text is first cut at sentence boundaries (_SENTENCE_BOUNDARY) and
    consecutive sentences are packed greedily, joined by one space,
    while they fit. A single sentence longer than max_chars (rare for
    prose) is emitted on its own, sub-split at commas, then at
    whitespace, and hard-cut only as a last resort.

    Args:
        text: Plain text to split; surrounding whitespace is ignored.
        max_chars: Upper bound on the length of every returned chunk.

    Returns:
        Non-empty, stripped chunks in reading order; an empty list
        when text holds nothing but whitespace.

    Raises:
        ValueError: If max_chars is smaller than 1 (a non-positive
            limit can never be satisfied and previously made the
            hard-cut loop spin forever).
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")

    sentences = [s.strip() for s in _SENTENCE_BOUNDARY.split(text) if s and s.strip()]
    chunks: list[str] = []
    current = ""
    for sent in sentences:
        if len(sent) > max_chars:
            # Flush what has accumulated so ordering is preserved, then
            # emit the oversized sentence as its own run of pieces.
            if current:
                chunks.append(current)
                current = ""
            chunks.extend(_split_long_sentence(sent, max_chars))
            continue
        candidate = f"{current} {sent}" if current else sent
        if len(candidate) <= max_chars:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = sent
    if current:
        chunks.append(current)
    return chunks


def _expand_inline(
    text: str,
    voice: str | None,
    pitch: float = 0.0,
    rate: float = 1.0,
) -> list[Node]:
    """Turn a run of text with inline silence tags into an ordered node list.

    Every [pause:..] / [breath] / [scene] tag becomes a silence node;
    the non-blank text before, between and after the tags becomes
    text nodes carrying the given voice, pitch and rate. Blank input
    yields an empty list.
    """
    nodes: list[Node] = []

    def add_text(fragment: str) -> None:
        # Whitespace-only fragments (e.g. between two adjacent tags) are dropped.
        fragment = fragment.strip()
        if fragment:
            nodes.append(Node("text", fragment, voice, pitch, rate))

    body = text.strip()
    if not body:
        return nodes

    pos = 0
    for tag in _TAG_RE.finditer(body):
        add_text(body[pos : tag.start()])
        nodes.append(Node("silence", _parse_tag(tag)))
        pos = tag.end()
    add_text(body[pos:])
    return nodes


def _split_paragraph_voices(para: str) -> list[Node]:
    """Split one paragraph into nodes, honouring [voice:...]...[/voice] blocks.

    Text outside a voice block is expanded with voice=None (the
    caller substitutes the request voice). Text inside a block is
    expanded with that block's voice, pitch and rate. A block with no
    closing tag runs to the end of the paragraph. Blocks are scanned
    left to right; the first [/voice] after an opener closes it.
    """
    nodes: list[Node] = []
    pos = 0
    end = len(para)
    while pos < end:
        opener = _VOICE_OPEN_RE.search(para, pos)
        if opener is None:
            # No further voice blocks: the remainder is plain text.
            nodes += _expand_inline(para[pos:], None)
            return nodes

        nodes += _expand_inline(para[pos : opener.start()], None)
        name, raw_pitch, raw_rate = opener.groups()
        semitones = float(raw_pitch) if raw_pitch else 0.0
        tempo = float(raw_rate) if raw_rate else 1.0

        body_from = opener.end()
        body_to = para.find(_VOICE_CLOSE, body_from)
        unterminated = body_to < 0
        body = para[body_from:] if unterminated else para[body_from:body_to]
        nodes += _expand_inline(body, name, semitones, tempo)
        if unterminated:
            return nodes
        pos = body_to + len(_VOICE_CLOSE)
    return nodes


def split_to_nodes(text: str) -> list[Node]:
    """Parse a whole script into the flat text/silence node sequence.

    Lines consisting of `---` separate scenes; a SCENE_GAP_S silence
    is inserted before every scene after the first. Within a scene,
    blank lines separate paragraphs; a PARAGRAPH_GAP_S silence is
    inserted before every non-empty paragraph after the first. Each
    paragraph is then expanded by _split_paragraph_voices.
    """
    result: list[Node] = []
    for scene_no, scene in enumerate(re.split(r"(?m)^\s*---\s*$", text)):
        if scene_no:
            result.append(Node("silence", SCENE_GAP_S))
        stripped = (chunk.strip() for chunk in re.split(r"\n\s*\n", scene))
        # Number only the non-empty paragraphs so the gap logic ignores blanks.
        for para_no, body in enumerate(p for p in stripped if p):
            if para_no:
                result.append(Node("silence", PARAGRAPH_GAP_S))
            result.extend(_split_paragraph_voices(body))
    return result


def _silence_samples(seconds: float) -> np.ndarray:
    """Return a float32 array of zeros lasting ``seconds`` at SAMPLE_RATE.

    The sample count is seconds * SAMPLE_RATE rounded to the nearest
    integer.
    """
    return np.zeros(int(round(SAMPLE_RATE * seconds)), dtype=np.float32)


# ─── FastAPI ─────────────────────────────────────────────────────


# Request body for POST /synthesize. Field names follow the kokoro
# server's contract; documented with comments rather than a docstring
# so the generated API schema is unchanged.
class SynthesizeRequest(BaseModel):
    # Text to render; must be non-empty. May contain `---` scene breaks,
    # blank-line paragraph breaks, [voice:...] blocks and silence tags.
    gen_text: str = Field(min_length=1)
    # Tortoise voice name (lj, freeman, daniel, etc.). API-compat
    # field carries the voice id as a "path" — same shape as kokoro.
    # Values starting with "/" are rejected by the handler.
    ref_audio_path: str = DEFAULT_VOICE
    # Accepted for contract compatibility; not read by this server's handler.
    ref_text: str | None = None
    # Bare file name for the output; a random name is generated when omitted.
    output_filename: str | None = None
    # Range-validated here, but NOTE(review): the handler in this file
    # never reads it, so it currently has no effect on the audio.
    speed: float = Field(default=1.0, ge=0.3, le=2.0)
    # Tortoise-specific: quality preset. Slower = better.
    preset: str = Field(default=DEFAULT_PRESET)


# Response body for POST /synthesize, as filled in by the handler.
class SynthesizeResponse(BaseModel):
    ok: bool
    # Path of the written audio file, as a string.
    output_path: str
    sample_rate_hz: int
    # Length of the stitched audio (samples / sample rate).
    duration_seconds: float
    # Wall-clock time spent in the render loop, in milliseconds.
    elapsed_ms: int
    # Character count of the request's gen_text.
    chars_in: int
    engine: str
    # The request-level voice name.
    voice: str
    # Number of text / silence nodes the input expanded to.
    text_nodes: int
    silence_nodes: int
    # Sorted names of the voices recorded while rendering.
    voices_used: list[str]


# ASGI application object; the decorators below register their handlers on it.
app = FastAPI(title="tortoise-server", version="0.1.0")


@app.on_event("startup")
def _startup() -> None:
    """Warm the engine when the app starts.

    Loads the model, then tries to cache the default voice so the
    first synthesis request doesn't pay the latent-extraction cost.
    The voice preload is best-effort: a failure is logged as a
    warning and startup carries on.
    """
    _get_tts()
    try:
        _get_voice(DEFAULT_VOICE)
    except Exception as exc:
        log.warning("could not preload default voice %s: %s", DEFAULT_VOICE, exc)


@app.get("/healthz")
def healthz() -> dict:
    # Liveness/status probe. The payload shape matches f5_server and
    # kokoro_server so skald's HealthResponse struct deserializes all
    # three. "loaded" reports whether the engine singleton exists yet;
    # "cached_voices" lists the voice names currently held in memory.
    engine_name = "tortoise-tts"
    release = "0.1.0"
    payload = {
        "ok": True,
        "device": DEVICE,
        "model": engine_name,
        "vocoder": "tortoise-builtin",
        "loaded": _tts is not None,
        "engine": engine_name,
        "default_voice": DEFAULT_VOICE,
        "default_preset": DEFAULT_PRESET,
        "cached_voices": list(_voice_cache),
        "version": release,
    }
    return payload


def _resolve_voice(requested: str, fallback: str) -> tuple:
    """Load ``requested``, or ``fallback`` if that fails; return (name_used, samples, latents).

    A failing tag-level voice is a best-effort situation: it is logged
    and the request-level voice is used instead. If the request-level
    voice itself is unknown there is nothing left to fall back to, so
    the client gets a 400. Any other load error propagates unchanged.
    """
    if requested != fallback:
        try:
            samples, latents = _get_voice(requested)
        except Exception as exc:
            log.warning(
                "voice %s failed to load (%s); falling back to request voice %s",
                requested, exc, fallback,
            )
        else:
            return requested, samples, latents
    try:
        samples, latents = _get_voice(fallback)
    except KeyError as exc:
        # NOTE(review): assumes tortoise's load_voice reports an unknown
        # name as KeyError — confirm against the installed version.
        raise HTTPException(400, f"unknown voice {fallback!r}") from exc
    return fallback, samples, latents


@app.post("/synthesize", response_model=SynthesizeResponse)
def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
    """Render gen_text to one audio file and report what was produced.

    The text is expanded into text/silence nodes, each text node is
    chunked to a safe length and rendered with its voice, optional
    pitch/rate modifiers are applied per chunk, and all pieces are
    concatenated and written under the audio root as 16-bit PCM.

    Responds 400 when ref_audio_path looks like a filesystem path,
    output_filename contains path parts, the text holds nothing to
    speak, a voice tag asks for a non-positive rate, or the request
    voice is unknown.
    """
    if req.ref_audio_path.startswith("/"):
        raise HTTPException(
            400,
            "ref_audio_path looks like a filesystem path; tortoise takes a voice "
            "name like 'lj' or 'freeman'.",
        )
    voice = req.ref_audio_path
    preset = req.preset
    # NOTE(review): req.speed is validated by the model but not applied here.

    output_filename = req.output_filename or f"{uuid.uuid4().hex}.wav"
    if "/" in output_filename or ".." in output_filename:
        raise HTTPException(400, "output_filename must be a bare name, no path parts")
    output_path = AUDIO_ROOT / output_filename
    output_path.parent.mkdir(parents=True, exist_ok=True)

    tts = _get_tts()

    nodes = [n for n in split_to_nodes(req.gen_text) if n.kind == "silence" or n.value]
    text_count = sum(1 for n in nodes if n.kind == "text")
    silence_count = sum(1 for n in nodes if n.kind == "silence")
    if text_count == 0:
        raise HTTPException(400, "gen_text expanded to zero text nodes")

    # Reject impossible stretch factors before any (slow) rendering starts;
    # the tag grammar admits rate=0, which the time-stretch step cannot honour.
    for node in nodes:
        if node.kind == "text" and node.rate <= 0:
            raise HTTPException(
                400,
                f"voice tag rate must be > 0 (got {node.rate} for voice {node.voice!r})",
            )

    started = time.monotonic()
    pieces: list[np.ndarray] = []
    voices_used: set[str] = set()
    total_samples = 0  # running length, so progress logging stays O(1) per chunk
    for node in nodes:
        if node.kind == "silence":
            gap = _silence_samples(node.value)
            pieces.append(gap)
            total_samples += len(gap)
            continue
        # Record the voice that actually renders, not merely the one asked for.
        seg_voice, samples, latents = _resolve_voice(node.voice or voice, voice)
        voices_used.add(seg_voice)
        # Each text node may exceed Tortoise's reliable ~20s horizon —
        # split at sentence boundaries before feeding the model.
        sub_chunks = _chunk_for_tortoise(node.value)
        for sub_idx, sub in enumerate(sub_chunks, start=1):
            audio_tensor = tts.tts_with_preset(
                text=sub,
                voice_samples=samples,
                conditioning_latents=latents,
                preset=preset,
            )
            if isinstance(audio_tensor, list):
                audio_tensor = audio_tensor[0]
            arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32)
            # Per-character voice modulation via librosa. Apply
            # pitch first (preserves duration), then rate (preserves
            # pitch). Default pitch=0, rate=1.0 = no-op fast path.
            if abs(node.pitch) > 1e-3:
                arr = librosa.effects.pitch_shift(
                    arr, sr=SAMPLE_RATE, n_steps=node.pitch
                )
            if abs(node.rate - 1.0) > 1e-3:
                arr = librosa.effects.time_stretch(arr, rate=node.rate)
            arr = arr.astype(np.float32)
            pieces.append(arr)
            total_samples += len(arr)
            log.info(
                "chunk %d/%d done (%d chars, pitch=%+.1f rate=%.2f, %.1fs audio so far)",
                sub_idx, len(sub_chunks), len(sub),
                node.pitch, node.rate,
                total_samples / SAMPLE_RATE,
            )
    elapsed_ms = int((time.monotonic() - started) * 1000)

    if not pieces:
        raise HTTPException(500, "tortoise returned no audio")
    full_audio = np.concatenate(pieces)
    sf.write(str(output_path), full_audio, SAMPLE_RATE, subtype="PCM_16")
    duration_s = float(len(full_audio)) / float(SAMPLE_RATE)

    log.info(
        "synthesized chars=%d voice=%s preset=%s text_nodes=%d silence_nodes=%d "
        "voices_used=%s -> %s (dur=%.2fs, elapsed=%dms)",
        len(req.gen_text), voice, preset, text_count, silence_count,
        sorted(voices_used), output_path, duration_s, elapsed_ms,
    )
    return SynthesizeResponse(
        ok=True,
        output_path=str(output_path),
        sample_rate_hz=SAMPLE_RATE,
        duration_seconds=duration_s,
        elapsed_ms=elapsed_ms,
        chars_in=len(req.gen_text),
        engine="tortoise-tts",
        voice=voice,
        text_nodes=text_count,
        silence_nodes=silence_count,
        voices_used=sorted(voices_used),
    )