From b5de9776a29370f921adff09386f503bb47c105f Mon Sep 17 00:00:00 2001 From: Sulkta Date: Thu, 14 May 2026 09:40:59 -0700 Subject: [PATCH] engine/kokoro: question doubling + kludges notes Re-applies the Kokoro-specific hacks that main intentionally omits: - _emphasize_questions doubles '?' to '??' so the 82M's flat interrogative prosody gets a rising-pitch cue - engines/kokoro/hacks.md documents this and the other Kokoro- tuned bits (gap durations, lowercase-only respellings) with the 'remove when we move to a bigger model' marker Deploy from this branch to /srv/appdata/kokoro/build/ when you want the tuned version. Main's vanilla Kokoro is for reference / future cleanup. --- engines/kokoro/server.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/engines/kokoro/server.py b/engines/kokoro/server.py index 169cbbd..b0a6f9d 100644 --- a/engines/kokoro/server.py +++ b/engines/kokoro/server.py @@ -113,12 +113,23 @@ def _parse_tag(match: re.Match) -> float: return dur / 1000.0 if unit == "ms" else dur +# [HACK — engine/kokoro] Kokoro-82M has weak question prosody on a +# single `?`. Doubling the question mark to `??` reliably triggers a +# more interrogative rising-pitch contour without changing semantics. +# Skip if already doubled or part of an interrobang. See hacks.md. +_QUESTION_RE = re.compile(r"(?<![?!])\?(?![?!])") + + +def _emphasize_questions(text: str) -> str: + return _QUESTION_RE.sub("??", text) + + def _expand_inline(text: str, voice: str | None) -> list[Node]: """Expand inline [breath]/[pause]/[scene] tags inside a chunk of text that already has a single voice attribution. Voice blocks themselves are handled one level up in split_to_nodes.""" out: list[Node] = [] - text = text.strip() + text = _emphasize_questions(text.strip()) if not text: return out cursor = 0