From 36922706a2c33f5d2ff1d5d089d60610eaad752d Mon Sep 17 00:00:00 2001 From: Kayos Date: Thu, 14 May 2026 09:40:59 -0700 Subject: [PATCH] engine/kokoro: question doubling + kludges notes Re-applies the Kokoro-specific hacks that main intentionally omits: - _emphasize_questions doubles '?' to '??' so the 82M's flat interrogative prosody gets a rising-pitch cue - engines/kokoro/hacks.md documents this and the other Kokoro- tuned bits (gap durations, lowercase-only respellings) with the 'remove when we move to a bigger model' marker Deploy from this branch to /mnt/cache/appdata/kokoro/build/ when you want the tuned version. Main's vanilla Kokoro is for reference / future cleanup. --- engines/kokoro/hacks.md | 48 ++++++++++++++++++++++++++++++++++++++++ engines/kokoro/server.py | 13 ++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 engines/kokoro/hacks.md diff --git a/engines/kokoro/hacks.md b/engines/kokoro/hacks.md new file mode 100644 index 0000000..0917827 --- /dev/null +++ b/engines/kokoro/hacks.md @@ -0,0 +1,48 @@ +# Kokoro engine — kludges branched off main + +This branch carries the engine-specific tweaks that don't generalise +to F5 / Tortoise. Each one is a real workaround for a real Kokoro-82M +limitation, not a stylistic choice — when we move to a bigger model +these should disappear. + +## 1. Doubled `??` for question prosody + +**File:** `server.py` — `_emphasize_questions` + `_QUESTION_RE`. + +Kokoro-82M's prosody on single `?` is flat — interrogatives read like +declaratives. The 82M parameter cap shows up here. Doubling the mark +to `??` triggers a noticeably stronger rising-pitch contour. + +Tried + works: `??`. Tried + worse: `?!` (sounds shouty), trailing +spaces (no effect). + +Remove when: upgrading to a bigger model OR a Kokoro version with +better prosody control. + +## 2. Paragraph / scene / breath gap durations + +**File:** `server.py` — `PARAGRAPH_GAP_S=0.7`, `SCENE_GAP_S=1.5`, +`BREATH_GAP_S=0.4`. 
+ +These were eyeballed against af_heart's natural pacing for long-form +prose. Other voices (e.g. am_michael's slower delivery) may want +shorter gaps; a per-voice override map would be more correct but +isn't worth the complexity yet. + +The 2026-05-14 feedback was "some pauses are a tad too long" — the +0.7/1.5/0.4 may want to drop to 0.5/1.2/0.3 if confirmed. + +## 3. Pronunciation respellings as ALL-LOWERCASE + +**File:** *(data, not code — pronunciation_overrides DB table)* + +Kokoro's misaki phonemizer treats consecutive uppercase letters as +initialisms ("PRIP-yat" → "P-R-I-P yat"). The seeded respellings in +`pronunciation_overrides WHERE phoneme_format='respelling'` must +therefore use lowercase syllabification: `prip-yat`, `dyat-loff`, +`bryu-hah-noff`. Stress marking is lost. + +For tortoise this constraint may not hold (different phonemizer); +the respelling format is currently kokoro-tuned. Future: per-engine +phoneme_format buckets, or have skald narrate pass the engine name +when selecting overrides. diff --git a/engines/kokoro/server.py b/engines/kokoro/server.py index 169cbbd..b0a6f9d 100644 --- a/engines/kokoro/server.py +++ b/engines/kokoro/server.py @@ -113,12 +113,23 @@ def _parse_tag(match: re.Match) -> float: return dur / 1000.0 if unit == "ms" else dur +# [HACK — engine/kokoro] Kokoro-82M has weak question prosody on a +# single `?`. Doubling the question mark to `??` reliably triggers a +# more interrogative rising-pitch contour without changing semantics. +# Skip if already doubled or part of an interrobang. See hacks.md. +_QUESTION_RE = re.compile(r"(?<![?!])\?(?![?!])") + + +def _emphasize_questions(text: str) -> str: + return _QUESTION_RE.sub("??", text) + + def _expand_inline(text: str, voice: str | None) -> list[Node]: """Expand inline [breath]/[pause]/[scene] tags inside a chunk of text that already has a single voice attribution. 
Voice blocks themselves are handled one level up in split_to_nodes.""" out: list[Node] = [] - text = text.strip() + text = _emphasize_questions(text.strip()) if not text: return out cursor = 0