engine/tortoise: sentence chunking + device fix + pitch/rate modulation

Catches up engines/tortoise/server.py with what's been deployed on
Lucy through tonight's smoke iterations:

0.2 — _chunk_for_tortoise splits text nodes at sentence boundaries
      (max 220 chars) before each tts_with_preset call. Fixes the
      end-of-prompt gibberish past tortoise's ~20s reliable horizon.

0.3 — _get_voice now .to(DEVICE) cached samples + latents. Without
      this, non-lj voices crash with 'Expected all tensors to be on
      the same device, but found cpu and cuda:0'.

0.4 — [voice:NAME pitch=N rate=R][/voice] tag syntax. librosa
      pitch_shift + time_stretch applied per-chunk for single-voice
      multi-character renders. The strategy survived the design
      table — but the librosa phase-vocoder artifacts at ±5 semitones
      ate the quality on the 2070 Super. Parked here for the GPU
      rebuild; modulation works architecturally, it just needs a better
      stretching algorithm (rubberband) + more headroom.

Production stayed Kokoro. Coast-Down preferred_voice_id reverted
to kokoro_af_heart in the live DB after this experiment.
This commit is contained in:
Kayos 2026-05-14 19:08:43 -07:00
parent 7a96031aa6
commit 9df378f799

View file

@ -23,6 +23,7 @@ import time
import uuid
from pathlib import Path
import librosa
import numpy as np
import soundfile as sf
import torch
@ -62,12 +63,31 @@ def _get_tts() -> TextToSpeech:
return _tts
def _move_to_device(obj):
"""Recursively .to(DEVICE) tensors inside the structure tortoise
returns from load_voice. voice_samples is a list of tensors;
conditioning_latents is a tuple of tensors. Anything else
passes through unchanged (e.g. None, ints)."""
if obj is None:
return obj
if isinstance(obj, torch.Tensor):
return obj.to(DEVICE)
if isinstance(obj, list):
return [_move_to_device(x) for x in obj]
if isinstance(obj, tuple):
return tuple(_move_to_device(x) for x in obj)
return obj
def _get_voice(name: str) -> tuple:
    """Return the cached ``(voice_samples, conditioning_latents)`` for ``name``.

    The first request for a voice calls tortoise's ``load_voice`` and
    stores the result in ``_voice_cache`` so reference clips are not
    re-loaded on every synthesis call. Both halves are passed through
    ``_move_to_device`` before caching: load_voice creates them on CPU,
    and the autoregressive model (on CUDA) otherwise fails with a
    cpu/cuda tensor-device mismatch.
    """
    try:
        return _voice_cache[name]
    except KeyError:
        pass
    samples, latents = load_voice(name)
    entry = (_move_to_device(samples), _move_to_device(latents))
    _voice_cache[name] = entry
    return entry
@ -75,15 +95,38 @@ def _get_voice(name: str) -> tuple:
class Node:
__slots__ = ("kind", "value", "voice")
__slots__ = ("kind", "value", "voice", "pitch", "rate")
def __init__(self, kind: str, value, voice: str | None = None):
def __init__(
self,
kind: str,
value,
voice: str | None = None,
pitch: float = 0.0,
rate: float = 1.0,
):
# kind ∈ {"text", "silence"}; value is str for text, float
# seconds for silence. voice/pitch/rate are character-voicing
# modifiers from [voice:NAME pitch=N rate=R] tags. Default:
# request voice, 0 semitones, 1x rate.
self.kind = kind
self.value = value
self.voice = voice
self.pitch = pitch
self.rate = rate
_VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]")
# Voice open tag — name + optional pitch (semitones) + optional rate:
# [voice:dyatlov] → voice swap only
# [voice:lj pitch=-3] → same voice, 3 semitones lower
# [voice:lj pitch=2 rate=1.1] → higher + slightly faster (fairy)
# [voice:lj pitch=-4 rate=0.9] → lower + slower (troll)
_VOICE_OPEN_RE = re.compile(
r"\[voice:([A-Za-z0-9_-]+)"
r"(?:\s+pitch=(-?[0-9]+(?:\.[0-9]+)?))?"
r"(?:\s+rate=([0-9]+(?:\.[0-9]+)?))?"
r"\]"
)
_VOICE_CLOSE = "[/voice]"
_TAG_RE = re.compile(
r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]",
@ -102,7 +145,70 @@ def _parse_tag(match: re.Match) -> float:
return dur / 1000.0 if unit == "ms" else dur
def _expand_inline(text: str, voice: str | None) -> list[Node]:
# Tortoise's autoregressive head loses coherence past ~20s of generated
# audio per inference call. lj's pace is roughly 14 chars/s, so anything
# past ~280 chars per call risks gibberish at the end. synthesize() runs
# every text node through _chunk_for_tortoise, which splits at sentence
# boundaries, to keep each tts_with_preset call inside the model's
# reliable horizon (the 220-char cap leaves margin under that estimate).
TORTOISE_MAX_CHUNK_CHARS = 220
# Sentence boundary regex — splits on `.`/`?`/`!` followed by whitespace
# and a capital letter (keeps "Mr. Smith" / "U.S." together) OR at any
# newline.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[\.!?])\s+(?=[A-Z\"\(])|(?<=\n)\s*")
def _chunk_for_tortoise(text: str, max_chars: int = TORTOISE_MAX_CHUNK_CHARS) -> list[str]:
"""Split text into chunks <= max_chars at sentence boundaries.
If a single sentence exceeds max_chars (rare for prose), fall
back to splitting that sentence at commas or just hard-cutting.
"""
sentences = [s.strip() for s in _SENTENCE_BOUNDARY.split(text) if s and s.strip()]
chunks: list[str] = []
current = ""
for sent in sentences:
# Long sentence: emit alone, but try sub-splitting at commas.
if len(sent) > max_chars:
if current:
chunks.append(current.strip())
current = ""
# Split on commas
parts = [p.strip() for p in sent.split(",") if p.strip()]
sub = ""
for p in parts:
add = (sub + ", " if sub else "") + p
if len(add) <= max_chars:
sub = add
else:
if sub:
chunks.append(sub)
# If even the part alone exceeds, hard-cut at max_chars
while len(p) > max_chars:
chunks.append(p[:max_chars])
p = p[max_chars:]
sub = p
if sub:
chunks.append(sub)
continue
# Sentence fits — accumulate.
candidate = (current + " " if current else "") + sent
if len(candidate) <= max_chars:
current = candidate
else:
if current:
chunks.append(current.strip())
current = sent
if current:
chunks.append(current.strip())
return chunks
def _expand_inline(
text: str,
voice: str | None,
pitch: float = 0.0,
rate: float = 1.0,
) -> list[Node]:
out: list[Node] = []
text = text.strip()
if not text:
@ -111,12 +217,12 @@ def _expand_inline(text: str, voice: str | None) -> list[Node]:
for m in _TAG_RE.finditer(text):
pre = text[cursor : m.start()].strip()
if pre:
out.append(Node("text", pre, voice))
out.append(Node("text", pre, voice, pitch, rate))
out.append(Node("silence", _parse_tag(m)))
cursor = m.end()
tail = text[cursor:].strip()
if tail:
out.append(Node("text", tail, voice))
out.append(Node("text", tail, voice, pitch, rate))
return out
@ -130,12 +236,14 @@ def _split_paragraph_voices(para: str) -> list[Node]:
break
out.extend(_expand_inline(para[cursor : m.start()], None))
voice = m.group(1)
pitch = float(m.group(2)) if m.group(2) else 0.0
rate = float(m.group(3)) if m.group(3) else 1.0
body_start = m.end()
close_idx = para.find(_VOICE_CLOSE, body_start)
if close_idx < 0:
out.extend(_expand_inline(para[body_start:], voice))
out.extend(_expand_inline(para[body_start:], voice, pitch, rate))
break
out.extend(_expand_inline(para[body_start:close_idx], voice))
out.extend(_expand_inline(para[body_start:close_idx], voice, pitch, rate))
cursor = close_idx + len(_VOICE_CLOSE)
return out
@ -253,6 +361,7 @@ def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
started = time.monotonic()
pieces: list[np.ndarray] = []
voices_used: set[str] = set()
tortoise_chunks_rendered = 0
for node in nodes:
if node.kind == "silence":
pieces.append(_silence_samples(node.value))
@ -264,18 +373,37 @@ def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
except Exception as e:
log.warning("voice %s failed to load (%s); falling back to default", seg_voice, e)
samples, latents = _get_voice(voice)
# Tortoise's tts_with_preset returns a torch.Tensor on the
# configured device.
audio_tensor = tts.tts_with_preset(
text=node.value,
voice_samples=samples,
conditioning_latents=latents,
preset=preset,
)
if isinstance(audio_tensor, list):
audio_tensor = audio_tensor[0]
arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32)
pieces.append(arr)
# Each text node may exceed Tortoise's reliable ~20s horizon —
# split at sentence boundaries before feeding the model.
sub_chunks = _chunk_for_tortoise(node.value)
for sub_idx, sub in enumerate(sub_chunks):
audio_tensor = tts.tts_with_preset(
text=sub,
voice_samples=samples,
conditioning_latents=latents,
preset=preset,
)
if isinstance(audio_tensor, list):
audio_tensor = audio_tensor[0]
arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32)
# Per-character voice modulation via librosa. Apply
# pitch first (preserves duration), then rate (preserves
# pitch). Default pitch=0, rate=1.0 = no-op fast path.
if abs(node.pitch) > 1e-3:
arr = librosa.effects.pitch_shift(
arr, sr=SAMPLE_RATE, n_steps=node.pitch
)
if abs(node.rate - 1.0) > 1e-3:
arr = librosa.effects.time_stretch(arr, rate=node.rate)
arr = arr.astype(np.float32)
pieces.append(arr)
tortoise_chunks_rendered += 1
log.info(
"chunk %d/%d done (%d chars, pitch=%+.1f rate=%.2f, %.1fs audio so far)",
sub_idx + 1, len(sub_chunks), len(sub),
node.pitch, node.rate,
sum(len(p) for p in pieces) / SAMPLE_RATE,
)
elapsed_ms = int((time.monotonic() - started) * 1000)
if not pieces: