From 9df378f799f48db7e83635e48dbc3f6bd7cb16ef Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 19:08:43 -0700 Subject: [PATCH] engine/tortoise: sentence chunking + device fix + pitch/rate modulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Catches up engines/tortoise/server.py with what's been deployed on Lucy through tonight's smoke iterations: 0.2 — _chunk_for_tortoise splits text nodes at sentence boundaries (max 220 chars) before each tts_with_preset call. Fixes the end-of-prompt gibberish past tortoise's ~20s reliable horizon. 0.3 — _get_voice now moves the cached samples + latents .to(DEVICE). Without this, non-lj voices crash with 'Expected all tensors to be on the same device, but found cpu and cuda:0'. 0.4 — [voice:NAME pitch=N rate=R]…[/voice] tag syntax. librosa pitch_shift + time_stretch applied per-chunk for single-voice multi-character renders. The strategy survived the design table — but the librosa phase-vocoder artifacts at ±5 semitones ate the quality on the 2070 Super. Parked here for the GPU rebuild; modulation works architecturally, it just needs a better stretching algorithm (rubberband) + more headroom. Production stayed Kokoro. Coast-Down preferred_voice_id reverted to kokoro_af_heart in the live DB after this experiment. --- engines/tortoise/server.py | 172 ++++++++++++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 22 deletions(-) diff --git a/engines/tortoise/server.py b/engines/tortoise/server.py index c39eafe..ef602e5 100644 --- a/engines/tortoise/server.py +++ b/engines/tortoise/server.py @@ -23,6 +23,7 @@ import time import uuid from pathlib import Path +import librosa import numpy as np import soundfile as sf import torch @@ -62,12 +63,31 @@ def _get_tts() -> TextToSpeech: return _tts +def _move_to_device(obj): + """Recursively .to(DEVICE) tensors inside the structure tortoise + returns from load_voice. 
voice_samples is a list of tensors; + conditioning_latents is a tuple of tensors. Anything else + passes through unchanged (e.g. None, ints).""" + if obj is None: + return obj + if isinstance(obj, torch.Tensor): + return obj.to(DEVICE) + if isinstance(obj, list): + return [_move_to_device(x) for x in obj] + if isinstance(obj, tuple): + return tuple(_move_to_device(x) for x in obj) + return obj + + def _get_voice(name: str) -> tuple: """Cache voice latents to avoid re-loading reference clips on every synthesis call. Tortoise's load_voice returns - (voice_samples, conditioning_latents).""" + (voice_samples, conditioning_latents) — but they're created on + CPU; we move them to DEVICE so the autoregressive model (on + CUDA) doesn't fail with cpu/cuda tensor-device mismatch.""" if name not in _voice_cache: - _voice_cache[name] = load_voice(name) + samples, latents = load_voice(name) + _voice_cache[name] = (_move_to_device(samples), _move_to_device(latents)) return _voice_cache[name] @@ -75,15 +95,38 @@ def _get_voice(name: str) -> tuple: class Node: - __slots__ = ("kind", "value", "voice") + __slots__ = ("kind", "value", "voice", "pitch", "rate") - def __init__(self, kind: str, value, voice: str | None = None): + def __init__( + self, + kind: str, + value, + voice: str | None = None, + pitch: float = 0.0, + rate: float = 1.0, + ): + # kind ∈ {"text", "silence"}; value is str for text, float + # seconds for silence. voice/pitch/rate are character-voicing + # modifiers from [voice:NAME pitch=N rate=R] tags. Default: + # request voice, 0 semitones, 1x rate. 
self.kind = kind self.value = value self.voice = voice + self.pitch = pitch + self.rate = rate -_VOICE_OPEN_RE = re.compile(r"\[voice:([A-Za-z0-9_-]+)\]") +# Voice open tag — name + optional pitch (semitones), then optional rate (in that order): +# [voice:dyatlov] → voice swap only +# [voice:lj pitch=-3] → same voice, 3 semitones lower +# [voice:lj pitch=2 rate=1.1] → higher + slightly faster (fairy) +# [voice:lj pitch=-4 rate=0.9] → lower + slower (troll) +_VOICE_OPEN_RE = re.compile( + r"\[voice:([A-Za-z0-9_-]+)" + r"(?:\s+pitch=(-?[0-9]+(?:\.[0-9]+)?))?" + r"(?:\s+rate=([0-9]+(?:\.[0-9]+)?))?" + r"\]" +) _VOICE_CLOSE = "[/voice]" _TAG_RE = re.compile( r"\[(pause:(?P[0-9]+(?:\.[0-9]+)?)(?Ps|ms)?|breath|scene)\]", @@ -102,7 +145,70 @@ def _parse_tag(match: re.Match) -> float: return dur / 1000.0 if unit == "ms" else dur -def _expand_inline(text: str, voice: str | None) -> list[Node]: +# Tortoise's autoregressive head loses coherence past ~20s of generated +# audio per inference call. lj's pace is roughly 14 chars/s, so anything +# past ~280 chars per call risks gibberish at the end. synthesize() splits +# each text node at sentence boundaries to keep each tts_with_preset +# call inside the model's reliable horizon. +TORTOISE_MAX_CHUNK_CHARS = 220 + +# Sentence boundary regex — splits on `.`/`?`/`!` followed by whitespace +# and a capital letter, `"` or `(` (so "3.5 mg" / "e.g. this" stay together, +# but "Mr. Smith" still splits), OR at any newline. +_SENTENCE_BOUNDARY = re.compile(r"(?<=[\.!?])\s+(?=[A-Z\"\(])|(?<=\n)\s*") + + +def _chunk_for_tortoise(text: str, max_chars: int = TORTOISE_MAX_CHUNK_CHARS) -> list[str]: + """Split text into chunks <= max_chars at sentence boundaries. + If a single sentence exceeds max_chars (rare for prose), fall + back to splitting that sentence at commas or just hard-cutting. + """ + sentences = [s.strip() for s in _SENTENCE_BOUNDARY.split(text) if s and s.strip()] + chunks: list[str] = [] + current = "" + for sent in sentences: + # Long sentence: emit alone, but try sub-splitting at commas. 
+ if len(sent) > max_chars: + if current: + chunks.append(current.strip()) + current = "" + # Split on commas + parts = [p.strip() for p in sent.split(",") if p.strip()] + sub = "" + for p in parts: + add = (sub + ", " if sub else "") + p + if len(add) <= max_chars: + sub = add + else: + if sub: + chunks.append(sub) + # If even the part alone exceeds, hard-cut at max_chars + while len(p) > max_chars: + chunks.append(p[:max_chars]) + p = p[max_chars:] + sub = p + if sub: + chunks.append(sub) + continue + # Sentence fits — accumulate. + candidate = (current + " " if current else "") + sent + if len(candidate) <= max_chars: + current = candidate + else: + if current: + chunks.append(current.strip()) + current = sent + if current: + chunks.append(current.strip()) + return chunks + + +def _expand_inline( + text: str, + voice: str | None, + pitch: float = 0.0, + rate: float = 1.0, +) -> list[Node]: out: list[Node] = [] text = text.strip() if not text: @@ -111,12 +217,12 @@ def _expand_inline(text: str, voice: str | None) -> list[Node]: for m in _TAG_RE.finditer(text): pre = text[cursor : m.start()].strip() if pre: - out.append(Node("text", pre, voice)) + out.append(Node("text", pre, voice, pitch, rate)) out.append(Node("silence", _parse_tag(m))) cursor = m.end() tail = text[cursor:].strip() if tail: - out.append(Node("text", tail, voice)) + out.append(Node("text", tail, voice, pitch, rate)) return out @@ -130,12 +236,14 @@ def _split_paragraph_voices(para: str) -> list[Node]: break out.extend(_expand_inline(para[cursor : m.start()], None)) voice = m.group(1) + pitch = float(m.group(2)) if m.group(2) else 0.0 + rate = float(m.group(3)) if m.group(3) else 1.0 body_start = m.end() close_idx = para.find(_VOICE_CLOSE, body_start) if close_idx < 0: - out.extend(_expand_inline(para[body_start:], voice)) + out.extend(_expand_inline(para[body_start:], voice, pitch, rate)) break - out.extend(_expand_inline(para[body_start:close_idx], voice)) + 
out.extend(_expand_inline(para[body_start:close_idx], voice, pitch, rate)) cursor = close_idx + len(_VOICE_CLOSE) return out @@ -253,6 +361,7 @@ def synthesize(req: SynthesizeRequest) -> SynthesizeResponse: started = time.monotonic() pieces: list[np.ndarray] = [] voices_used: set[str] = set() + tortoise_chunks_rendered = 0 for node in nodes: if node.kind == "silence": pieces.append(_silence_samples(node.value)) @@ -264,18 +373,37 @@ def synthesize(req: SynthesizeRequest) -> SynthesizeResponse: except Exception as e: log.warning("voice %s failed to load (%s); falling back to default", seg_voice, e) samples, latents = _get_voice(voice) - # Tortoise's tts_with_preset returns a torch.Tensor on the - # configured device. - audio_tensor = tts.tts_with_preset( - text=node.value, - voice_samples=samples, - conditioning_latents=latents, - preset=preset, - ) - if isinstance(audio_tensor, list): - audio_tensor = audio_tensor[0] - arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32) - pieces.append(arr) + # Each text node may exceed Tortoise's reliable ~20s horizon — + # split at sentence boundaries before feeding the model. + sub_chunks = _chunk_for_tortoise(node.value) + for sub_idx, sub in enumerate(sub_chunks): + audio_tensor = tts.tts_with_preset( + text=sub, + voice_samples=samples, + conditioning_latents=latents, + preset=preset, + ) + if isinstance(audio_tensor, list): + audio_tensor = audio_tensor[0] + arr = audio_tensor.squeeze().cpu().numpy().astype(np.float32) + # Per-character voice modulation via librosa. Apply + # pitch first (preserves duration), then rate (preserves + # pitch). Default pitch=0, rate=1.0 = no-op fast path. 
+ if abs(node.pitch) > 1e-3: + arr = librosa.effects.pitch_shift( + arr, sr=SAMPLE_RATE, n_steps=node.pitch + ) + if abs(node.rate - 1.0) > 1e-3: + arr = librosa.effects.time_stretch(arr, rate=node.rate) + arr = arr.astype(np.float32) + pieces.append(arr) + tortoise_chunks_rendered += 1 + log.info( + "chunk %d/%d done (%d chars, pitch=%+.1f rate=%.2f, %.1fs audio so far)", + sub_idx + 1, len(sub_chunks), len(sub), + node.pitch, node.rate, + sum(len(p) for p in pieces) / SAMPLE_RATE, + ) elapsed_ms = int((time.monotonic() - started) * 1000) if not pieces: