From c2bb12fdd04d0d308b3b21d2f52f7131ab0f7800 Mon Sep 17 00:00:00 2001
From: Kayos
Date: Wed, 13 May 2026 16:45:04 -0700
Subject: [PATCH] narrate: F5-TTS HTTP client + skald narrate CLI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

skald-core::narrate ships a thin reqwest client + voice DB access
(get_by_name, get_default, get_by_id). The boundary is the f5-tts
container's purpose-built FastAPI sidecar (python lives there because
torch + transformers + safetensors do); skald never touches python.

CLI: skald narrate --chapter <uuid> [--voice slug] [--speed 1.0].
Voice resolution: --voice flag → story.preferred_voice_id → system
default. Persists narration_runs row (engine='f5-tts', engine_version
pinned, status: running → succeeded|failed).

Output path stored is the f5-tts container's view
(/audio/<story>-<n>-<run>.wav); web playback wiring deferred.
---
 Cargo.toml                |   1 +
 skald-core/Cargo.toml     |   1 +
 skald-core/src/lib.rs     |   1 +
 skald-core/src/narrate.rs | 188 ++++++++++++++++++++++++++++++++++++++
 skald/src/main.rs         |  19 ++++
 skald/src/narrate.rs      | 185 ++++++++++++++++++++++++++++++++++++
 6 files changed, 395 insertions(+)
 create mode 100644 skald-core/src/narrate.rs
 create mode 100644 skald/src/narrate.rs

diff --git a/Cargo.toml b/Cargo.toml
index 9c1cd05..1201067 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,3 +30,4 @@ uuid = { version = "1", features = ["v4", "serde"] }
 regex = "1"
 async-trait = "0.1"
 maud = "0.27"
+reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
diff --git a/skald-core/Cargo.toml b/skald-core/Cargo.toml
index 295a29c..3200125 100644
--- a/skald-core/Cargo.toml
+++ b/skald-core/Cargo.toml
@@ -18,6 +18,7 @@ tracing = { workspace = true }
 chrono = { workspace = true }
 uuid = { workspace = true }
 regex = { workspace = true }
+reqwest = { workspace = true }
 clawdforge = { path = "../vendor/clawdforge" }
 
 [dev-dependencies]
diff --git a/skald-core/src/lib.rs b/skald-core/src/lib.rs
index 8936a6a..5aafebd 100644
--- a/skald-core/src/lib.rs
+++ b/skald-core/src/lib.rs
@@ -11,6 +11,7 @@ pub mod db;
 pub mod forge;
 pub mod ingest;
 pub mod models;
+pub mod narrate;
 
 /// Embeds the workspace `migrations/` directory at compile time.
 /// Run via `MIGRATOR.run(&pool).await` at boot.
diff --git a/skald-core/src/narrate.rs b/skald-core/src/narrate.rs
new file mode 100644
index 0000000..910cf02
--- /dev/null
+++ b/skald-core/src/narrate.rs
@@ -0,0 +1,188 @@
+//! F5-TTS client + voice DB access for the narrate pipeline.
+//!
+//! The boundary: skald (Rust) speaks HTTP+JSON to the f5-tts
+//! container's purpose-built FastAPI sidecar (the python lives there
+//! because torch + transformers + safetensors do). Skald never
+//! imports python deps; the python service has no business logic.
+//!
+//! v0.1 flow:
+//!   1. Skald loads chapter prose + chosen Voice row from the DB.
+//!   2. POST /synthesize to f5-tts with gen_text + ref_audio_path.
+//!   3. F5 writes the WAV to its /audio bind mount and returns the
+//!      path + duration metadata.
+//!   4. Skald inserts a narration_runs row pointing at that path.
+//!
+//! Path note: output_path stored in narration_runs is the f5-tts
+//! container's view (e.g. /audio/coast-down/8-abc.wav). To serve it
+//! from skald's web UI we'll either mount the same dir on skald or
+//! route audio bytes through f5-tts. Deferred.
+
+use std::time::Duration;
+
+use anyhow::Context;
+use reqwest::Client as HttpClient;
+use serde::{Deserialize, Serialize};
+use sqlx::PgPool;
+use uuid::Uuid;
+
+#[derive(Debug, Clone)]
+pub struct F5Config {
+    /// e.g. http://192.168.0.5:7792
+    pub base_url: String,
+    /// Inference subprocess timeout. Long-form chapters (3000 words)
+    /// take 60-180s on an 8GB GPU; cap at 1800s to match clawdforge.
+    pub timeout: Duration,
+}
+
+#[derive(Debug, Clone)]
+pub struct Narrator {
+    cfg: F5Config,
+    http: HttpClient,
+}
+
+impl Narrator {
+    pub fn new(cfg: F5Config) -> anyhow::Result<Self> {
+        let http = HttpClient::builder()
+            .timeout(cfg.timeout)
+            .user_agent(concat!("skald-narrate/", env!("CARGO_PKG_VERSION")))
+            .build()?;
+        Ok(Self { cfg, http })
+    }
+
+    /// Synthesize one chapter to a WAV via the F5-TTS sidecar.
+    /// `output_filename` is a bare name (no slashes); the file lands
+    /// at `/audio/<output_filename>` in the f5-tts container.
+    pub async fn synthesize(
+        &self,
+        req: &SynthesizeRequest,
+    ) -> anyhow::Result<SynthesizeResponse> {
+        let url = format!("{}/synthesize", self.cfg.base_url.trim_end_matches('/'));
+        let res = self
+            .http
+            .post(&url)
+            .json(req)
+            .send()
+            .await
+            .with_context(|| format!("POST {url} failed"))?;
+
+        if !res.status().is_success() {
+            let status = res.status();
+            let body = res.text().await.unwrap_or_default();
+            anyhow::bail!("f5-tts /synthesize returned {status}: {body}");
+        }
+        Ok(res.json::<SynthesizeResponse>().await?)
+    }
+
+    /// Probe the sidecar's `/healthz` endpoint. A non-2xx status is
+    /// reported as an error rather than as a JSON decode failure.
+    pub async fn healthz(&self) -> anyhow::Result<HealthResponse> {
+        let url = format!("{}/healthz", self.cfg.base_url.trim_end_matches('/'));
+        Ok(self.http.get(&url).send().await?.error_for_status()?.json().await?)
+    }
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct SynthesizeRequest {
+    pub gen_text: String,
+    pub ref_audio_path: String,
+    pub ref_text: Option<String>,
+    pub output_filename: String,
+    pub speed: f32,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct SynthesizeResponse {
+    pub ok: bool,
+    pub output_path: String,
+    pub sample_rate_hz: i32,
+    pub duration_seconds: f32,
+    pub elapsed_ms: u64,
+    pub chars_in: i64,
+}
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct HealthResponse {
+    pub ok: bool,
+    pub device: String,
+    pub model: String,
+    pub vocoder: String,
+    pub loaded: bool,
+}
+
+// ─── voice DB access ─────────────────────────────────────────────
+
+#[derive(Debug, Clone)]
+pub struct Voice {
+    pub id: Uuid,
+    pub name: String,
+    pub display_name: String,
+    pub reference_path: Option<String>,
+    pub reference_text: Option<String>,
+    pub license: String,
+    pub is_default: bool,
+}
+
+pub async fn get_voice_by_name(pool: &PgPool, name: &str) -> anyhow::Result<Option<Voice>> {
+    let row: Option<(Uuid, String, String, Option<String>, Option<String>, String, bool)> =
+        sqlx::query_as(
+            "SELECT id, name, display_name, reference_path, reference_text, license, is_default
+             FROM voices WHERE name = $1",
+        )
+        .bind(name)
+        .fetch_optional(pool)
+        .await?;
+    Ok(row.map(|(id, name, display_name, reference_path, reference_text, license, is_default)| {
+        Voice {
+            id,
+            name,
+            display_name,
+            reference_path,
+            reference_text,
+            license,
+            is_default,
+        }
+    }))
+}
+
+pub async fn get_default_voice(pool: &PgPool) -> anyhow::Result<Option<Voice>> {
+    let row: Option<(Uuid, String, String, Option<String>, Option<String>, String, bool)> =
+        sqlx::query_as(
+            "SELECT id, name, display_name, reference_path, reference_text, license, is_default
+             FROM voices WHERE is_default = true LIMIT 1",
+        )
+        .fetch_optional(pool)
+        .await?;
+    Ok(row.map(|(id, name, display_name, reference_path, reference_text, license, is_default)| {
+        Voice {
+            id,
+            name,
+            display_name,
+            reference_path,
+            reference_text,
+            license,
+            is_default,
+        }
+    }))
+}
+
+pub async fn get_voice_by_id(pool: &PgPool, id: Uuid) -> anyhow::Result<Option<Voice>> {
+    let row: Option<(Uuid, String, String, Option<String>, Option<String>, String, bool)> =
+        sqlx::query_as(
+            "SELECT id, name, display_name, reference_path, reference_text, license, is_default
+             FROM voices WHERE id = $1",
+        )
+        .bind(id)
+        .fetch_optional(pool)
+        .await?;
+    Ok(row.map(|(id, name, display_name, reference_path, reference_text, license, is_default)| {
+        Voice {
+            id,
+            name,
+            display_name,
+            reference_path,
+            reference_text,
+            license,
+            is_default,
+        }
+    }))
+}
diff --git a/skald/src/main.rs b/skald/src/main.rs
index 234e0e5..70ad5cf 100644
--- a/skald/src/main.rs
+++ b/skald/src/main.rs
@@ -7,6 +7,7 @@
 mod authors_seed;
 mod continue_story;
 mod import;
+mod narrate;
 mod serve;
 mod show_context;
 mod summarize;
@@ -126,6 +127,19 @@ enum Cmd {
         #[arg(long, default_value = "1")]
         chapters: usize,
     },
+    /// Render one chapter to audio via F5-TTS. Resolves voice via
+    /// --voice slug → story.preferred_voice_id → system default.
+    Narrate {
+        /// Chapter UUID to narrate.
+        #[arg(long)]
+        chapter: Uuid,
+        /// Optional voice slug override (e.g. "lj_speech").
+        #[arg(long)]
+        voice: Option<String>,
+        /// Speech speed (0.5–2.0). 1.0 = natural pace.
+        #[arg(long, default_value = "1.0")]
+        speed: f32,
+    },
 }
 
 #[tokio::main]
@@ -191,6 +205,11 @@ async fn run() -> anyhow::Result<()> {
             )
             .await
         }
+        Cmd::Narrate {
+            chapter,
+            voice,
+            speed,
+        } => narrate::run(&cli.database_url, chapter, voice.as_deref(), speed).await,
     }
 }
 
diff --git a/skald/src/narrate.rs b/skald/src/narrate.rs
new file mode 100644
index 0000000..bea3c33
--- /dev/null
+++ b/skald/src/narrate.rs
@@ -0,0 +1,185 @@
+//! `skald narrate` — render chapter prose to audio via F5-TTS.
+//!
+//! Resolution order for the voice:
+//!   1. --voice flag (explicit override)
+//!   2. story.preferred_voice_id (per-story pin)
+//!   3. voices.is_default = true (the system default)
+//!
+//! Output filename layout: <story-slug>/<n>-<run-id>.wav.
+//! Story-slug isn't yet on the schema, so v0.1 uses the bare
+//! story-uuid prefix.
+
+use std::time::Instant;
+
+use anyhow::{Context, bail};
+use chrono::Utc;
+use skald_core::db;
+use skald_core::narrate::{F5Config, Narrator, SynthesizeRequest, Voice};
+use sqlx::PgPool;
+use uuid::Uuid;
+
+pub async fn run(
+    database_url: &str,
+    chapter_id: Uuid,
+    voice_slug: Option<&str>,
+    speed: f32,
+) -> anyhow::Result<()> {
+    let cfg = load_f5_config()?;
+    tracing::info!(base_url = %cfg.base_url, "f5-tts configured");
+
+    let pool = db::connect_and_migrate(database_url).await?;
+    let narrator = Narrator::new(cfg)?;
+
+    // Quick health probe before we burn time loading rows.
+    let h = narrator.healthz().await.context("f5-tts healthz failed")?;
+    if !h.loaded {
+        bail!("f5-tts /healthz says model is not loaded yet — retry shortly");
+    }
+    tracing::info!(device = %h.device, model = %h.model, "f5-tts ready");
+
+    let chapter = load_chapter(&pool, chapter_id).await?;
+    let voice = resolve_voice(&pool, &chapter, voice_slug).await?;
+    tracing::info!(
+        voice = %voice.name,
+        voice_license = %voice.license,
+        chapter_n = chapter.n,
+        word_count = chapter.word_count,
+        "narrating",
+    );
+
+    let ref_audio_path = voice
+        .reference_path
+        .as_ref()
+        .ok_or_else(|| anyhow::anyhow!("voice '{}' has no reference_path", voice.name))?
+        .clone();
+
+    let run_id = Uuid::new_v4();
+    let output_filename = format!("{}-{}-{}.wav", chapter.story_id, chapter.n, run_id);
+
+    let run_row_id: Uuid = sqlx::query_scalar(
+        "INSERT INTO narration_runs (id, chapter_id, voice_id, engine, engine_version, status)
+         VALUES ($1, $2, $3, 'f5-tts', '1.1.20', 'running') RETURNING id",
+    )
+    .bind(run_id)
+    .bind(chapter_id)
+    .bind(voice.id)
+    .fetch_one(&pool)
+    .await?;
+
+    let started = Instant::now();
+    let req = SynthesizeRequest {
+        gen_text: chapter.body_md.clone(),
+        ref_audio_path,
+        ref_text: voice.reference_text.clone(),
+        output_filename,
+        speed,
+    };
+
+    let res = match narrator.synthesize(&req).await {
+        Ok(r) => r,
+        Err(e) => {
+            // Best-effort bookkeeping: a failed UPDATE must not mask the
+            // synthesis error, which is the one the operator needs to see.
+            let marked = sqlx::query(
+                "UPDATE narration_runs SET status='failed', error=$1, ended_at=$2 WHERE id=$3",
+            )
+            .bind(format!("{e:#}"))
+            .bind(Utc::now())
+            .bind(run_row_id)
+            .execute(&pool)
+            .await;
+            if let Err(db_err) = marked {
+                tracing::warn!(error = %db_err, "could not mark narration run as failed");
+            }
+            return Err(e);
+        }
+    };
+    let elapsed = started.elapsed();
+
+    sqlx::query(
+        "UPDATE narration_runs
+         SET status='succeeded',
+             output_path=$1,
+             duration_seconds=$2,
+             ended_at=$3
+         WHERE id=$4",
+    )
+    .bind(&res.output_path)
+    .bind(res.duration_seconds)
+    .bind(Utc::now())
+    .bind(run_row_id)
+    .execute(&pool)
+    .await?;
+
+    println!(
+        "narrated chapter {} of story {}: {} ({:.2}s audio, {:.1}s wall clock)",
+        chapter.n,
+        chapter.story_id,
+        res.output_path,
+        res.duration_seconds,
+        elapsed.as_secs_f32(),
+    );
+    Ok(())
+}
+
+#[derive(Debug, Clone)]
+struct ChapterRow {
+    story_id: Uuid,
+    n: i32,
+    body_md: String,
+    word_count: i32,
+}
+
+async fn load_chapter(pool: &PgPool, id: Uuid) -> anyhow::Result<ChapterRow> {
+    let row: Option<(Uuid, i32, String, i32)> = sqlx::query_as(
+        "SELECT story_id, n, body_md, word_count FROM chapters WHERE id = $1",
+    )
+    .bind(id)
+    .fetch_optional(pool)
+    .await?;
+    let (story_id, n, body_md, word_count) =
+        row.with_context(|| format!("chapter {id} not found"))?;
+    Ok(ChapterRow {
+        story_id,
+        n,
+        body_md,
+        word_count,
+    })
+}
+
+async fn resolve_voice(
+    pool: &PgPool,
+    chapter: &ChapterRow,
+    flag_slug: Option<&str>,
+) -> anyhow::Result<Voice> {
+    if let Some(slug) = flag_slug {
+        return skald_core::narrate::get_voice_by_name(pool, slug)
+            .await?
+            .with_context(|| format!("voice '{slug}' not found"));
+    }
+    // Story-pinned voice?
+    let pinned: Option<Uuid> =
+        sqlx::query_scalar("SELECT preferred_voice_id FROM stories WHERE id = $1")
+            .bind(chapter.story_id)
+            .fetch_optional(pool)
+            .await?
+            .flatten();
+    if let Some(vid) = pinned {
+        if let Some(v) = skald_core::narrate::get_voice_by_id(pool, vid).await? {
+            return Ok(v);
+        }
+    }
+    // Else system default.
+    skald_core::narrate::get_default_voice(pool)
+        .await?
+        .ok_or_else(|| anyhow::anyhow!("no default voice set; create one or use --voice <slug>"))
+}
+
+fn load_f5_config() -> anyhow::Result<F5Config> {
+    let base_url = std::env::var("F5_TTS_URL")
+        .unwrap_or_else(|_| "http://192.168.0.5:7792".into());
+    Ok(F5Config {
+        base_url,
+        timeout: std::time::Duration::from_secs(1800),
+    })
+}