Catches up engines/tortoise/server.py with what's been deployed on
Lucy through tonight's smoke iterations:
0.2 — _chunk_for_tortoise splits text nodes at sentence boundaries
(max 220 chars) before each tts_with_preset call. Fixes the
end-of-prompt gibberish past tortoise's ~20s reliable horizon.
0.3 — _get_voice now .to(DEVICE) cached samples + latents. Without
this, non-lj voices crash with 'Expected all tensors to be on
the same device, but found cpu and cuda:0'.
0.4 — [voice:NAME pitch=N rate=R][/voice] tag syntax. librosa
pitch_shift + time_stretch applied per-chunk for single-voice
multi-character renders. The strategy survived the design
table — but the librosa phase-vocoder artifacts at ±5 semitones
ate the quality on the 2070 Super. Parked here for the GPU
rebuild; modulation works architecturally, it just needs a better
stretching algorithm (rubberband) and more headroom.
Production stayed Kokoro. Coast-Down preferred_voice_id reverted
to kokoro_af_heart in the live DB after this experiment.
433 lines
15 KiB
Python
433 lines
15 KiB
Python
"""Tortoise-TTS FastAPI server. Sibling to kokoro_server.
|
|
|
|
Same /synthesize contract as the kokoro server so skald only has to
|
|
route by voice.source. Differences:
|
|
- Tortoise voices are NAMED PRESETS shipped with the library
|
|
(angie, daniel, freeman, jlaw, lj, weaver, etc.). No cloning.
|
|
- Tortoise is slow. Standard preset is ~10x kokoro's wall clock.
|
|
Caller should expect minutes per chunk, not seconds.
|
|
- Unlike the original v0.1 plan, render-and-stitch and voice-tag
  parsing ARE implemented here (see split_to_nodes / synthesize).
  Tortoise's quality is still the main win. Text outside any
  voice tag renders with the request's default voice throughout.
- [voice:X]...[/voice] tags are parsed: each block renders with
  its named voice. This is the audiobook win.
|
|
|
|
Quality presets: ultra_fast / fast / standard / high_quality. The
|
|
trade-off is real — high_quality on a 2070 Super is ~30x slower
|
|
than kokoro. Default to 'standard' for the bar.
|
|
"""
|
|
import logging
|
|
import re
|
|
import time
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
import librosa
|
|
import numpy as np
|
|
import soundfile as sf
|
|
import torch
|
|
from fastapi import FastAPI, HTTPException
|
|
from pydantic import BaseModel, Field
|
|
|
|
from tortoise.api import TextToSpeech
|
|
from tortoise.utils.audio import load_voice
|
|
|
|
|
|
# Root logging setup plus the module's own named logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")
log = logging.getLogger("tortoise-server")


# Run on the GPU whenever torch reports one, otherwise stay on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEFAULT_VOICE = "lj"
DEFAULT_PRESET = "standard"
AUDIO_ROOT = Path("/audio")
SAMPLE_RATE = 24000  # Tortoise outputs 24kHz

# Silence durations (seconds) for between-chunk stitching. They match
# the kokoro server's conventions so audio from both engines feels
# similar.
PARAGRAPH_GAP_S = 0.7
SCENE_GAP_S = 1.5
BREATH_GAP_S = 0.4
|
|
|
|
# Process-wide TextToSpeech instance; stays None until _get_tts() builds it.
_tts: TextToSpeech | None = None
# Voice name -> (voice_samples, conditioning_latents), filled by _get_voice().
_voice_cache: dict[str, tuple] = {}
|
|
|
|
|
|
def _get_tts() -> TextToSpeech:
    """Return the shared TextToSpeech instance, constructing it on first use.

    The model is built with deepspeed off and the kv cache on; half
    precision is requested only when DEVICE is "cuda".
    """
    global _tts
    if _tts is not None:
        return _tts
    log.info("loading tortoise device=%s", DEVICE)
    use_half = DEVICE == "cuda"
    _tts = TextToSpeech(use_deepspeed=False, kv_cache=True, half=use_half)
    log.info("tortoise loaded")
    return _tts
|
|
|
|
|
|
def _move_to_device(obj):
|
|
"""Recursively .to(DEVICE) tensors inside the structure tortoise
|
|
returns from load_voice. voice_samples is a list of tensors;
|
|
conditioning_latents is a tuple of tensors. Anything else
|
|
passes through unchanged (e.g. None, ints)."""
|
|
if obj is None:
|
|
return obj
|
|
if isinstance(obj, torch.Tensor):
|
|
return obj.to(DEVICE)
|
|
if isinstance(obj, list):
|
|
return [_move_to_device(x) for x in obj]
|
|
if isinstance(obj, tuple):
|
|
return tuple(_move_to_device(x) for x in obj)
|
|
return obj
|
|
|
|
|
|
def _get_voice(name: str) -> tuple:
    """Return ``(voice_samples, conditioning_latents)`` for ``name``.

    The first request for a voice calls tortoise's load_voice, moves
    both results to DEVICE (avoiding a cpu/cuda tensor-device
    mismatch at synthesis time) and stores them in _voice_cache.
    Later requests are served from the cache. Any error raised by
    load_voice propagates to the caller and nothing is cached.
    """
    try:
        return _voice_cache[name]
    except KeyError:
        pass
    raw_samples, raw_latents = load_voice(name)
    entry = (_move_to_device(raw_samples), _move_to_device(raw_latents))
    _voice_cache[name] = entry
    return entry
|
|
|
|
|
|
# ─── tag splitter (lifted from kokoro_server) ───────────────────
|
|
|
|
|
|
class Node:
|
|
__slots__ = ("kind", "value", "voice", "pitch", "rate")
|
|
|
|
def __init__(
|
|
self,
|
|
kind: str,
|
|
value,
|
|
voice: str | None = None,
|
|
pitch: float = 0.0,
|
|
rate: float = 1.0,
|
|
):
|
|
# kind ∈ {"text", "silence"}; value is str for text, float
|
|
# seconds for silence. voice/pitch/rate are character-voicing
|
|
# modifiers from [voice:NAME pitch=N rate=R] tags. Default:
|
|
# request voice, 0 semitones, 1x rate.
|
|
self.kind = kind
|
|
self.value = value
|
|
self.voice = voice
|
|
self.pitch = pitch
|
|
self.rate = rate
|
|
|
|
|
|
# Voice open tag — name + optional pitch (semitones) + optional rate:
|
|
# [voice:dyatlov] → voice swap only
|
|
# [voice:lj pitch=-3] → same voice, 3 semitones lower
|
|
# [voice:lj pitch=2 rate=1.1] → higher + slightly faster (fairy)
|
|
# [voice:lj pitch=-4 rate=0.9] → lower + slower (troll)
|
|
_VOICE_OPEN_RE = re.compile(
|
|
r"\[voice:([A-Za-z0-9_-]+)"
|
|
r"(?:\s+pitch=(-?[0-9]+(?:\.[0-9]+)?))?"
|
|
r"(?:\s+rate=([0-9]+(?:\.[0-9]+)?))?"
|
|
r"\]"
|
|
)
|
|
_VOICE_CLOSE = "[/voice]"
|
|
_TAG_RE = re.compile(
|
|
r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]",
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
|
|
def _parse_tag(match: re.Match) -> float:
|
|
body = match.group(0).lower().strip("[]")
|
|
if body == "breath":
|
|
return BREATH_GAP_S
|
|
if body == "scene":
|
|
return SCENE_GAP_S
|
|
dur = float(match.group("dur"))
|
|
unit = (match.group("unit") or "s").lower()
|
|
return dur / 1000.0 if unit == "ms" else dur
|
|
|
|
|
|
# Tortoise's autoregressive head loses coherence past ~20s of generated
|
|
# audio per inference call. lj's pace is roughly 14 chars/s, so anything
|
|
# past ~280 chars per call risks gibberish at the end. We split inside
|
|
# _expand_inline at sentence boundaries to keep each tts_with_preset
|
|
# call inside the model's reliable horizon.
|
|
TORTOISE_MAX_CHUNK_CHARS = 220
|
|
|
|
# Sentence boundary regex — splits on `.`/`?`/`!` followed by whitespace
|
|
# and a capital letter (keeps "Mr. Smith" / "U.S." together) OR at any
|
|
# newline.
|
|
_SENTENCE_BOUNDARY = re.compile(r"(?<=[\.!?])\s+(?=[A-Z\"\(])|(?<=\n)\s*")
|
|
|
|
|
|
def _chunk_for_tortoise(text: str, max_chars: int = TORTOISE_MAX_CHUNK_CHARS) -> list[str]:
|
|
"""Split text into chunks <= max_chars at sentence boundaries.
|
|
If a single sentence exceeds max_chars (rare for prose), fall
|
|
back to splitting that sentence at commas or just hard-cutting.
|
|
"""
|
|
sentences = [s.strip() for s in _SENTENCE_BOUNDARY.split(text) if s and s.strip()]
|
|
chunks: list[str] = []
|
|
current = ""
|
|
for sent in sentences:
|
|
# Long sentence: emit alone, but try sub-splitting at commas.
|
|
if len(sent) > max_chars:
|
|
if current:
|
|
chunks.append(current.strip())
|
|
current = ""
|
|
# Split on commas
|
|
parts = [p.strip() for p in sent.split(",") if p.strip()]
|
|
sub = ""
|
|
for p in parts:
|
|
add = (sub + ", " if sub else "") + p
|
|
if len(add) <= max_chars:
|
|
sub = add
|
|
else:
|
|
if sub:
|
|
chunks.append(sub)
|
|
# If even the part alone exceeds, hard-cut at max_chars
|
|
while len(p) > max_chars:
|
|
chunks.append(p[:max_chars])
|
|
p = p[max_chars:]
|
|
sub = p
|
|
if sub:
|
|
chunks.append(sub)
|
|
continue
|
|
# Sentence fits — accumulate.
|
|
candidate = (current + " " if current else "") + sent
|
|
if len(candidate) <= max_chars:
|
|
current = candidate
|
|
else:
|
|
if current:
|
|
chunks.append(current.strip())
|
|
current = sent
|
|
if current:
|
|
chunks.append(current.strip())
|
|
return chunks
|
|
|
|
|
|
def _expand_inline(
    text: str,
    voice: str | None,
    pitch: float = 0.0,
    rate: float = 1.0,
) -> list[Node]:
    """Turn a run of text into text nodes separated by silence nodes.

    Every [pause:...], [breath] or [scene] tag found by _TAG_RE
    becomes a silence node whose duration comes from _parse_tag. The
    non-empty, stripped text before, between and after the tags
    becomes text nodes carrying ``voice``, ``pitch`` and ``rate``.
    Blank input produces an empty list.
    """
    stripped = text.strip()
    if not stripped:
        return []

    def spoken(fragment: str) -> list[Node]:
        # Zero or one text node for a slice of the input.
        fragment = fragment.strip()
        return [Node("text", fragment, voice, pitch, rate)] if fragment else []

    nodes: list[Node] = []
    resume_at = 0
    for tag in _TAG_RE.finditer(stripped):
        nodes += spoken(stripped[resume_at : tag.start()])
        nodes.append(Node("silence", _parse_tag(tag)))
        resume_at = tag.end()
    nodes += spoken(stripped[resume_at:])
    return nodes
|
|
|
|
|
|
def _split_paragraph_voices(para: str) -> list[Node]:
    """Split one paragraph into nodes, honouring [voice:...]...[/voice] blocks.

    Text outside any voice block is expanded with voice=None (the
    request's default voice is filled in later by the caller). Text
    inside a block is expanded with the tag's voice name plus its
    pitch/rate, defaulting to 0.0 and 1.0 when the tag omits them. A
    block with no closing tag runs to the end of the paragraph.
    """
    nodes: list[Node] = []
    pos = 0
    end = len(para)
    while pos < end:
        opened = _VOICE_OPEN_RE.search(para, pos)
        if opened is None:
            # No further voice tags: the remainder is default-voice text.
            nodes += _expand_inline(para[pos:], None)
            break
        nodes += _expand_inline(para[pos : opened.start()], None)
        name, raw_pitch, raw_rate = opened.groups()
        semitones = float(raw_pitch) if raw_pitch else 0.0
        tempo = float(raw_rate) if raw_rate else 1.0
        body_from = opened.end()
        closed_at = para.find(_VOICE_CLOSE, body_from)
        unterminated = closed_at < 0
        body = para[body_from:] if unterminated else para[body_from:closed_at]
        nodes += _expand_inline(body, name, semitones, tempo)
        if unterminated:
            break
        pos = closed_at + len(_VOICE_CLOSE)
    return nodes
|
|
|
|
|
|
def split_to_nodes(text: str) -> list[Node]:
    """Parse the full request text into an ordered list of render nodes.

    The text is divided into scenes at lines containing only "---",
    and each scene into paragraphs at blank lines. A SCENE_GAP_S
    silence goes before every scene except the first; a
    PARAGRAPH_GAP_S silence goes between non-empty paragraphs of the
    same scene. Each paragraph is expanded by _split_paragraph_voices.
    """
    nodes: list[Node] = []
    for scene_no, scene in enumerate(re.split(r"(?m)^\s*---\s*$", text)):
        if scene_no:
            nodes.append(Node("silence", SCENE_GAP_S))
        paragraphs = (chunk.strip() for chunk in re.split(r"\n\s*\n", scene))
        emitted_any = False
        for para in filter(None, paragraphs):
            if emitted_any:
                nodes.append(Node("silence", PARAGRAPH_GAP_S))
            emitted_any = True
            nodes.extend(_split_paragraph_voices(para))
    return nodes
|
|
|
|
|
|
def _silence_samples(seconds: float) -> np.ndarray:
    """Return ``seconds`` of silence as float32 zeros at SAMPLE_RATE."""
    sample_count = int(round(seconds * SAMPLE_RATE))
    return np.zeros(sample_count, dtype=np.float32)
|
|
|
|
|
|
# ─── FastAPI ─────────────────────────────────────────────────────
|
|
|
|
|
|
class SynthesizeRequest(BaseModel):
    # Request body for POST /synthesize.

    # Text to render; must be non-empty. May contain the voice/pause
    # tags understood by split_to_nodes.
    gen_text: str = Field(min_length=1)
    # Tortoise voice name (lj, freeman, daniel, etc.). API-compat
    # field carries the voice id as a "path" — same shape as kokoro.
    ref_audio_path: str = Field(default=DEFAULT_VOICE)
    # NOTE(review): ref_text is not read by synthesize() in this file;
    # presumably kept for contract parity with the other engines.
    ref_text: str | None = None
    # Bare file name under AUDIO_ROOT; synthesize() generates one when
    # this is missing.
    output_filename: str | None = None
    # NOTE(review): validated to 0.3..2.0 but not read by synthesize()
    # in this file.
    speed: float = Field(default=1.0, ge=0.3, le=2.0)
    # Tortoise-specific: quality preset. Slower = better.
    preset: str = DEFAULT_PRESET
|
|
|
|
|
|
class SynthesizeResponse(BaseModel):
    # Response body of POST /synthesize, as filled in by synthesize().

    ok: bool
    output_path: str  # full path of the written WAV file
    sample_rate_hz: int
    duration_seconds: float  # length of the stitched audio
    elapsed_ms: int  # wall-clock time spent rendering
    chars_in: int  # len(gen_text) of the request
    engine: str
    voice: str  # the request-level voice name
    text_nodes: int  # number of text nodes after tag parsing
    silence_nodes: int  # number of silence nodes after tag parsing
    voices_used: list[str]  # sorted voice names
|
|
|
|
|
|
app = FastAPI(title="tortoise-server", version="0.1.0")


@app.on_event("startup")
def _startup() -> None:
    """Warm the server: load the model, then try to cache the default voice.

    Preloading the voice means the first synth doesn't pay the
    latent-extraction cost. It is best-effort: a failure is logged
    as a warning and startup continues.
    """
    _get_tts()
    try:
        _get_voice(DEFAULT_VOICE)
    except Exception as exc:
        log.warning("could not preload default voice %s: %s", DEFAULT_VOICE, exc)
|
|
|
|
|
|
@app.get("/healthz")
def healthz() -> dict:
    """Report liveness plus model, device and voice-cache state.

    The shape matches f5_server/kokoro_server so skald's
    HealthResponse struct deserializes all three.
    """
    engine_name = "tortoise-tts"
    model_loaded = _tts is not None
    return dict(
        ok=True,
        device=DEVICE,
        model=engine_name,
        vocoder="tortoise-builtin",
        loaded=model_loaded,
        engine=engine_name,
        default_voice=DEFAULT_VOICE,
        default_preset=DEFAULT_PRESET,
        cached_voices=list(_voice_cache),
        version="0.1.0",
    )
|
|
|
|
|
|
# Presets accepted by tts_with_preset, as listed in this module's
# docstring. NOTE(review): extend this if the installed tortoise build
# defines additional presets.
_VALID_PRESETS = ("ultra_fast", "fast", "standard", "high_quality")


def _load_voice_or_default(seg_voice: str, default_voice: str) -> tuple[str, tuple]:
    """Load ``seg_voice``, falling back to ``default_voice`` if it fails.

    Returns ``(name_actually_loaded, (samples, latents))`` so callers
    can report the voice that was really used. Raises HTTPException
    400 when the default voice itself cannot be loaded.
    """
    if seg_voice != default_voice:
        try:
            return seg_voice, _get_voice(seg_voice)
        except Exception as exc:
            log.warning("voice %s failed to load (%s); falling back to default", seg_voice, exc)
    try:
        return default_voice, _get_voice(default_voice)
    except Exception as exc:
        raise HTTPException(400, f"voice {default_voice!r} could not be loaded: {exc}") from exc


def _modulate(arr: np.ndarray, pitch: float, rate: float) -> np.ndarray:
    """Apply per-character pitch/rate modulation via librosa.

    Pitch is applied first (preserves duration), then rate (preserves
    pitch). pitch=0 and rate=1.0 skip both librosa calls. The result
    is always float32.
    """
    if abs(pitch) > 1e-3:
        arr = librosa.effects.pitch_shift(arr, sr=SAMPLE_RATE, n_steps=pitch)
    if abs(rate - 1.0) > 1e-3:
        arr = librosa.effects.time_stretch(arr, rate=rate)
    return arr.astype(np.float32)


@app.post("/synthesize", response_model=SynthesizeResponse)
def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
    """Render ``req.gen_text`` to a 16-bit PCM WAV under AUDIO_ROOT.

    The text is parsed into text and silence nodes. Each text node is
    split by _chunk_for_tortoise, rendered chunk by chunk with
    tts_with_preset, optionally pitch/rate-modulated, and stitched
    together with the silence nodes. ``req.speed`` and
    ``req.ref_text`` are accepted for API compatibility but are not
    used here.

    Raises:
        HTTPException 400: ref_audio_path looks like a filesystem
            path; preset is not a known preset; output_filename
            contains path parts; gen_text yields no text nodes; a
            voice tag specifies rate=0; or the request's voice cannot
            be loaded.
        HTTPException 500: no audio was produced.
    """
    if req.ref_audio_path.startswith("/"):
        raise HTTPException(
            400,
            "ref_audio_path looks like a filesystem path; tortoise takes a voice "
            "name like 'lj' or 'freeman'.",
        )
    voice = req.ref_audio_path
    preset = req.preset
    # Reject unknown presets up front rather than letting the lookup
    # fail inside tortoise and surface as a 500.
    if preset not in _VALID_PRESETS:
        raise HTTPException(
            400,
            f"unknown preset {preset!r}; expected one of: {', '.join(_VALID_PRESETS)}",
        )

    output_filename = req.output_filename or f"{uuid.uuid4().hex}.wav"
    if "/" in output_filename or ".." in output_filename:
        raise HTTPException(400, "output_filename must be a bare name, no path parts")
    output_path = AUDIO_ROOT / output_filename

    nodes = [n for n in split_to_nodes(req.gen_text) if n.kind == "silence" or n.value]
    text_count = sum(1 for n in nodes if n.kind == "text")
    silence_count = sum(1 for n in nodes if n.kind == "silence")
    if text_count == 0:
        raise HTTPException(400, "gen_text expanded to zero text nodes")
    # The tag regex accepts rate=0, which time_stretch cannot handle.
    # Fail fast, before minutes of rendering are spent.
    if any(n.kind == "text" and n.rate <= 0 for n in nodes):
        raise HTTPException(400, "voice tag rate must be greater than 0")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    tts = _get_tts()

    started = time.monotonic()
    pieces: list[np.ndarray] = []
    voices_used: set[str] = set()
    total_samples = 0  # running length of everything in `pieces`
    for node in nodes:
        if node.kind == "silence":
            gap = _silence_samples(node.value)
            pieces.append(gap)
            total_samples += len(gap)
            continue
        # Record the voice that actually rendered, not the one requested.
        used_voice, (samples, latents) = _load_voice_or_default(node.voice or voice, voice)
        voices_used.add(used_voice)
        # Each text node may exceed Tortoise's reliable ~20s horizon —
        # split at sentence boundaries before feeding the model.
        sub_chunks = _chunk_for_tortoise(node.value)
        for sub_idx, sub in enumerate(sub_chunks, start=1):
            audio_tensor = tts.tts_with_preset(
                text=sub,
                voice_samples=samples,
                conditioning_latents=latents,
                preset=preset,
            )
            if isinstance(audio_tensor, list):
                audio_tensor = audio_tensor[0]
            arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32)
            arr = _modulate(arr, node.pitch, node.rate)
            pieces.append(arr)
            total_samples += len(arr)
            log.info(
                "chunk %d/%d done (%d chars, pitch=%+.1f rate=%.2f, %.1fs audio so far)",
                sub_idx, len(sub_chunks), len(sub),
                node.pitch, node.rate,
                total_samples / SAMPLE_RATE,
            )
    elapsed_ms = int((time.monotonic() - started) * 1000)

    if not pieces:
        raise HTTPException(500, "tortoise returned no audio")
    full_audio = np.concatenate(pieces)
    sf.write(str(output_path), full_audio, SAMPLE_RATE, subtype="PCM_16")
    duration_s = float(len(full_audio)) / float(SAMPLE_RATE)

    log.info(
        "synthesized chars=%d voice=%s preset=%s text_nodes=%d silence_nodes=%d "
        "voices_used=%s -> %s (dur=%.2fs, elapsed=%dms)",
        len(req.gen_text), voice, preset, text_count, silence_count,
        sorted(voices_used), output_path, duration_s, elapsed_ms,
    )
    return SynthesizeResponse(
        ok=True,
        output_path=str(output_path),
        sample_rate_hz=SAMPLE_RATE,
        duration_seconds=duration_s,
        elapsed_ms=elapsed_ms,
        chars_in=len(req.gen_text),
        engine="tortoise-tts",
        voice=voice,
        text_nodes=text_count,
        silence_nodes=silence_count,
        voices_used=sorted(voices_used),
    )
|