engine/tortoise: sentence chunking + device fix + pitch/rate modulation

Catches up engines/tortoise/server.py with what's been deployed on
Lucy through tonight's smoke iterations:

0.2 — _chunk_for_tortoise splits text nodes at sentence boundaries
      (max 220 chars) before each tts_with_preset call. Fixes the
      end-of-prompt gibberish past tortoise's ~20s reliable horizon.

0.3 — _get_voice now .to(DEVICE) cached samples + latents. Without
      this, non-lj voices crash with 'Expected all tensors to be on
      the same device, but found cpu and cuda:0'.

0.4 — [voice:NAME pitch=N rate=R][/voice] tag syntax. librosa
      pitch_shift + time_stretch applied per-chunk for single-voice
      multi-character renders. The strategy survived the design
      table, but the librosa phase-vocoder artifacts at ±5 semitones
      ate the quality on the 2070 Super. Parked here for the GPU
      rebuild; modulation works architecturally, it just needs a better
      stretching algorithm (rubberband) and more headroom.

Production stayed Kokoro. Coast-Down preferred_voice_id reverted
to kokoro_af_heart in the live DB after this experiment.
This commit is contained in:
Kayos 2026-05-14 19:08:43 -07:00
parent 7a96031aa6
commit 9df378f799

View file

@ -23,6 +23,7 @@ import time
import uuid import uuid
from pathlib import Path from pathlib import Path
import librosa
import numpy as np import numpy as np
import soundfile as sf import soundfile as sf
import torch import torch
@ -62,12 +63,31 @@ def _get_tts() -> TextToSpeech:
return _tts return _tts
def _move_to_device(obj):
    """Return a copy of ``obj`` with every tensor moved to DEVICE.

    Walks lists and tuples recursively (the containers tortoise's
    load_voice hands back for voice samples and conditioning latents),
    rebuilding each container with its elements converted. Tensors are
    moved with ``.to(DEVICE)``; any other value, including None, is
    returned untouched.
    """
    if isinstance(obj, torch.Tensor):
        return obj.to(DEVICE)
    if isinstance(obj, (list, tuple)):
        moved = [_move_to_device(item) for item in obj]
        # Lists come back as lists, tuples as plain tuples.
        return moved if isinstance(obj, list) else tuple(moved)
    return obj
def _get_voice(name: str) -> tuple:
    """Return the cached (voice_samples, conditioning_latents) for ``name``.

    On a cache miss the reference clips are loaded once through
    Tortoise's load_voice and stored, so later synthesis calls skip the
    reload. load_voice creates its results on CPU, so both halves are
    moved to DEVICE before caching; otherwise the autoregressive model
    (on CUDA) fails with a cpu/cuda tensor-device mismatch.
    """
    if name in _voice_cache:
        return _voice_cache[name]
    samples, latents = load_voice(name)
    _voice_cache[name] = tuple(_move_to_device(part) for part in (samples, latents))
    return _voice_cache[name]
@ -75,15 +95,38 @@ def _get_voice(name: str) -> tuple:
class Node: class Node:
__slots__ = ("kind", "value", "voice") __slots__ = ("kind", "value", "voice", "pitch", "rate")
def __init__(self, kind: str, value, voice: str | None = None): def __init__(
self,
kind: str,
value,
voice: str | None = None,
pitch: float = 0.0,
rate: float = 1.0,
):
# kind ∈ {"text", "silence"}; value is str for text, float
# seconds for silence. voice/pitch/rate are character-voicing
# modifiers from [voice:NAME pitch=N rate=R] tags. Default:
# request voice, 0 semitones, 1x rate.
self.kind = kind self.kind = kind
self.value = value self.value = value
self.voice = voice self.voice = voice
self.pitch = pitch
self.rate = rate
_VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]") # Voice open tag — name + optional pitch (semitones) + optional rate:
#   [voice:dyatlov]              → voice swap only
#   [voice:lj pitch=-3]          → same voice, 3 semitones lower
#   [voice:lj pitch=2 rate=1.1]  → higher + slightly faster (fairy)
#   [voice:lj pitch=-4 rate=0.9] → lower + slower (troll)
#
# Capture groups: 1 = voice name (letters, digits, `_`, `-`),
# 2 = pitch (optionally negative decimal), 3 = rate (unsigned decimal).
# Groups 2 and 3 are None when the attribute is omitted.
# The pattern fixes the attribute order: when both are given, pitch must
# come before rate, each preceded by whitespace. A tag written as
# `[voice:lj rate=1.1 pitch=2]` does not match this regex.
_VOICE_OPEN_RE = re.compile(
    r"\[voice:([A-Za-z0-9_-]+)"
    r"(?:\s+pitch=(-?[0-9]+(?:\.[0-9]+)?))?"
    r"(?:\s+rate=([0-9]+(?:\.[0-9]+)?))?"
    r"\]"
)
_VOICE_CLOSE = "[/voice]" _VOICE_CLOSE = "[/voice]"
_TAG_RE = re.compile( _TAG_RE = re.compile(
r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]", r"\[(pause:(?P<dur>[0-9]+(?:\.[0-9]+)?)(?P<unit>s|ms)?|breath|scene)\]",
@ -102,7 +145,70 @@ def _parse_tag(match: re.Match) -> float:
return dur / 1000.0 if unit == "ms" else dur return dur / 1000.0 if unit == "ms" else dur
def _expand_inline(text: str, voice: str | None) -> list[Node]: # Tortoise's autoregressive head loses coherence past ~20s of generated
# audio per inference call. lj's pace is roughly 14 chars/s, so anything
# past ~280 chars per call risks gibberish at the end. We split inside
# _expand_inline at sentence boundaries to keep each tts_with_preset
# call inside the model's reliable horizon.
# Default upper bound on the number of characters handed to Tortoise in
# a single synthesis call; longer text is split by _chunk_for_tortoise.
TORTOISE_MAX_CHUNK_CHARS = 220

# Sentence boundary regex — splits on `.`/`?`/`!` followed by whitespace
# and then a capital letter, `"` or `(`, OR right after any newline.
# Dotted abbreviations with no inner whitespace ("U.S.") are not split
# internally, but an abbreviation followed by a capitalised word
# ("Mr. Smith", "U.S. Army") still counts as a boundary. Sentences are
# re-joined into chunks afterwards, so that only shows when a chunk
# boundary happens to land on such a spot.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[\.!?])\s+(?=[A-Z\"\(])|(?<=\n)\s*")


def _hard_wrap(text: str, max_chars: int) -> list[str]:
    """Cut ``text`` into pieces of at most ``max_chars`` characters.

    Each cut is made at the last whitespace inside the window so words
    stay intact; only a run with no whitespace at all is sliced
    mid-word. Requires ``max_chars >= 1``.
    """
    pieces: list[str] = []
    rest = text.strip()
    while len(rest) > max_chars:
        cut = max_chars  # fallback: no whitespace available, slice mid-word
        for idx in range(max_chars, 0, -1):
            if rest[idx].isspace():
                cut = idx
                break
        pieces.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def _chunk_for_tortoise(text: str, max_chars: int = TORTOISE_MAX_CHUNK_CHARS) -> list[str]:
    """Split text into chunks <= max_chars at sentence boundaries.

    Consecutive sentences are packed into one chunk (joined by a single
    space) for as long as they fit. A single sentence longer than
    ``max_chars`` is emitted on its own: it is split at commas, the
    comma-separated parts are re-packed with ", ", and any part that is
    still too long is wrapped at whitespace (or hard-cut when it has
    none).

    Args:
        text: The text of one node to be synthesized.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        The non-empty chunks in reading order; ``[]`` for blank input.

    Raises:
        ValueError: If ``max_chars`` is less than 1 (the hard-cut loop
            could never make progress).
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")

    sentences = [s.strip() for s in _SENTENCE_BOUNDARY.split(text) if s and s.strip()]
    chunks: list[str] = []
    current = ""
    for sent in sentences:
        if len(sent) > max_chars:
            # Oversized sentence: flush what we have, then emit it alone.
            if current:
                chunks.append(current)
                current = ""
            sub = ""
            for part in (p.strip() for p in sent.split(",")):
                if not part:
                    continue
                joined = f"{sub}, {part}" if sub else part
                if len(joined) <= max_chars:
                    sub = joined
                    continue
                if sub:
                    chunks.append(sub)
                # The part may itself be too long; its last piece stays
                # open so following comma parts can still attach to it.
                *full, sub = _hard_wrap(part, max_chars)
                chunks.extend(full)
            if sub:
                chunks.append(sub)
            continue

        # Sentence fits — accumulate.
        candidate = f"{current} {sent}" if current else sent
        if len(candidate) <= max_chars:
            current = candidate
        else:
            if current:
                chunks.append(current)
            current = sent
    if current:
        chunks.append(current)
    return chunks
def _expand_inline(
text: str,
voice: str | None,
pitch: float = 0.0,
rate: float = 1.0,
) -> list[Node]:
out: list[Node] = [] out: list[Node] = []
text = text.strip() text = text.strip()
if not text: if not text:
@ -111,12 +217,12 @@ def _expand_inline(text: str, voice: str | None) -> list[Node]:
for m in _TAG_RE.finditer(text): for m in _TAG_RE.finditer(text):
pre = text[cursor : m.start()].strip() pre = text[cursor : m.start()].strip()
if pre: if pre:
out.append(Node("text", pre, voice)) out.append(Node("text", pre, voice, pitch, rate))
out.append(Node("silence", _parse_tag(m))) out.append(Node("silence", _parse_tag(m)))
cursor = m.end() cursor = m.end()
tail = text[cursor:].strip() tail = text[cursor:].strip()
if tail: if tail:
out.append(Node("text", tail, voice)) out.append(Node("text", tail, voice, pitch, rate))
return out return out
@ -130,12 +236,14 @@ def _split_paragraph_voices(para: str) -> list[Node]:
break break
out.extend(_expand_inline(para[cursor : m.start()], None)) out.extend(_expand_inline(para[cursor : m.start()], None))
voice = m.group(1) voice = m.group(1)
pitch = float(m.group(2)) if m.group(2) else 0.0
rate = float(m.group(3)) if m.group(3) else 1.0
body_start = m.end() body_start = m.end()
close_idx = para.find(_VOICE_CLOSE, body_start) close_idx = para.find(_VOICE_CLOSE, body_start)
if close_idx < 0: if close_idx < 0:
out.extend(_expand_inline(para[body_start:], voice)) out.extend(_expand_inline(para[body_start:], voice, pitch, rate))
break break
out.extend(_expand_inline(para[body_start:close_idx], voice)) out.extend(_expand_inline(para[body_start:close_idx], voice, pitch, rate))
cursor = close_idx + len(_VOICE_CLOSE) cursor = close_idx + len(_VOICE_CLOSE)
return out return out
@ -253,6 +361,7 @@ def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
started = time.monotonic() started = time.monotonic()
pieces: list[np.ndarray] = [] pieces: list[np.ndarray] = []
voices_used: set[str] = set() voices_used: set[str] = set()
tortoise_chunks_rendered = 0
for node in nodes: for node in nodes:
if node.kind == "silence": if node.kind == "silence":
pieces.append(_silence_samples(node.value)) pieces.append(_silence_samples(node.value))
@ -264,18 +373,37 @@ def synthesize(req: SynthesizeRequest) -> SynthesizeResponse:
except Exception as e: except Exception as e:
log.warning("voice %s failed to load (%s); falling back to default", seg_voice, e) log.warning("voice %s failed to load (%s); falling back to default", seg_voice, e)
samples, latents = _get_voice(voice) samples, latents = _get_voice(voice)
# Tortoise's tts_with_preset returns a torch.Tensor on the # Each text node may exceed Tortoise's reliable ~20s horizon —
# configured device. # split at sentence boundaries before feeding the model.
audio_tensor = tts.tts_with_preset( sub_chunks = _chunk_for_tortoise(node.value)
text=node.value, for sub_idx, sub in enumerate(sub_chunks):
voice_samples=samples, audio_tensor = tts.tts_with_preset(
conditioning_latents=latents, text=sub,
preset=preset, voice_samples=samples,
) conditioning_latents=latents,
if isinstance(audio_tensor, list): preset=preset,
audio_tensor = audio_tensor[0] )
arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32) if isinstance(audio_tensor, list):
pieces.append(arr) audio_tensor = audio_tensor[0]
arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32)
# Per-character voice modulation via librosa. Apply
# pitch first (preserves duration), then rate (preserves
# pitch). Default pitch=0, rate=1.0 = no-op fast path.
if abs(node.pitch) > 1e-3:
arr = librosa.effects.pitch_shift(
arr, sr=SAMPLE_RATE, n_steps=node.pitch
)
if abs(node.rate - 1.0) > 1e-3:
arr = librosa.effects.time_stretch(arr, rate=node.rate)
arr = arr.astype(np.float32)
pieces.append(arr)
tortoise_chunks_rendered += 1
log.info(
"chunk %d/%d done (%d chars, pitch=%+.1f rate=%.2f, %.1fs audio so far)",
sub_idx + 1, len(sub_chunks), len(sub),
node.pitch, node.rate,
sum(len(p) for p in pieces) / SAMPLE_RATE,
)
elapsed_ms = int((time.monotonic() - started) * 1000) elapsed_ms = int((time.monotonic() - started) * 1000)
if not pieces: if not pieces: