narrate: F5-TTS HTTP client + skald narrate CLI

skald-core::narrate ships a thin reqwest client + voice DB access
(get_by_name, get_default, get_by_id). The boundary is the f5-tts
container's purpose-built FastAPI sidecar (python lives there
because torch + transformers + safetensors do); skald never touches
python.

CLI: skald narrate --chapter <uuid> [--voice slug] [--speed 1.0].
Voice resolution: --voice flag → story.preferred_voice_id → system
default. Persists narration_runs row (engine='f5-tts', engine_version
pinned, status: running → succeeded|failed). Output path stored is
the f5-tts container's view (/audio/<story>-<n>-<run>.wav); web
playback wiring deferred.
This commit is contained in:
Kayos 2026-05-13 16:45:04 -07:00
parent 3a749b7643
commit c2bb12fdd0
6 changed files with 388 additions and 0 deletions

View file

@ -30,3 +30,4 @@ uuid = { version = "1", features = ["v4", "serde"] }
regex = "1"
async-trait = "0.1"
maud = "0.27"
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }

View file

@ -18,6 +18,7 @@ tracing = { workspace = true }
chrono = { workspace = true }
uuid = { workspace = true }
regex = { workspace = true }
reqwest = { workspace = true }
clawdforge = { path = "../vendor/clawdforge" }
[dev-dependencies]

View file

@ -11,6 +11,7 @@ pub mod db;
pub mod forge;
pub mod ingest;
pub mod models;
pub mod narrate;
/// Embeds the workspace `migrations/` directory at compile time.
/// Run via `MIGRATOR.run(&pool).await` at boot.

186
skald-core/src/narrate.rs Normal file
View file

@ -0,0 +1,186 @@
//! F5-TTS client + voice DB access for the narrate pipeline.
//!
//! The boundary: skald (Rust) speaks HTTP+JSON to the f5-tts
//! container's purpose-built FastAPI sidecar (the python lives there
//! because torch + transformers + safetensors do). Skald never
//! imports python deps; the python service has no business logic.
//!
//! v0.1 flow:
//! 1. Skald loads chapter prose + chosen Voice row from the DB.
//! 2. POST /synthesize to f5-tts with gen_text + ref_audio_path.
//! 3. F5 writes the WAV to its /audio bind mount and returns the
//! path + duration metadata.
//! 4. Skald inserts a narration_runs row pointing at that path.
//!
//! Path note: output_path stored in narration_runs is the f5-tts
//! container's view (e.g. /audio/<story-uuid>-<n>-<run-uuid>.wav). To
//! serve it from skald's web UI we'll either mount the same dir on
//! skald or route audio bytes through f5-tts. Deferred.
use std::time::Duration;
use anyhow::Context;
use reqwest::Client as HttpClient;
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use uuid::Uuid;
/// Connection settings for the F5-TTS sidecar, consumed by `Narrator::new`.
#[derive(Debug, Clone)]
pub struct F5Config {
/// Base URL of the sidecar, e.g. http://192.168.0.5:7792. Trailing
/// slashes are trimmed before endpoint paths are appended.
pub base_url: String,
/// Request timeout configured on the `Narrator` HTTP client; it has to
/// cover the whole inference call. Long-form chapters (3000 words)
/// take 60-180s on an 8GB GPU; cap at 1800s to match clawdforge.
pub timeout: Duration,
}
/// HTTP client for the F5-TTS sidecar. Construct with `Narrator::new`.
#[derive(Debug, Clone)]
pub struct Narrator {
// Sidecar settings; `base_url` is read on every request to build the endpoint URL.
cfg: F5Config,
// reqwest client built once in `new` with the configured timeout and user agent.
http: HttpClient,
}
impl Narrator {
pub fn new(cfg: F5Config) -> anyhow::Result<Self> {
let http = HttpClient::builder()
.timeout(cfg.timeout)
.user_agent(concat!("skald-narrate/", env!("CARGO_PKG_VERSION")))
.build()?;
Ok(Self { cfg, http })
}
/// Synthesize one chapter to a WAV via the F5-TTS sidecar.
/// `output_filename` is a bare name (no slashes); the file lands
/// at `/audio/<output_filename>` in the f5-tts container.
pub async fn synthesize(
&self,
req: &SynthesizeRequest,
) -> anyhow::Result<SynthesizeResponse> {
let url = format!("{}/synthesize", self.cfg.base_url.trim_end_matches('/'));
let res = self
.http
.post(&url)
.json(req)
.send()
.await
.with_context(|| format!("POST {url} failed"))?;
if !res.status().is_success() {
let status = res.status();
let body = res.text().await.unwrap_or_default();
anyhow::bail!("f5-tts /synthesize returned {status}: {body}");
}
Ok(res.json::<SynthesizeResponse>().await?)
}
pub async fn healthz(&self) -> anyhow::Result<HealthResponse> {
let url = format!("{}/healthz", self.cfg.base_url.trim_end_matches('/'));
Ok(self.http.get(&url).send().await?.json().await?)
}
}
/// JSON body sent to the sidecar's `POST /synthesize`.
#[derive(Debug, Clone, Serialize)]
pub struct SynthesizeRequest {
/// Text to be spoken.
pub gen_text: String,
/// Path of the reference audio clip for the chosen voice.
/// NOTE(review): presumably resolved inside the f5-tts container — confirm.
pub ref_audio_path: String,
/// Optional transcript of the reference clip.
pub ref_text: Option<String>,
/// Bare file name (no slashes) for the generated WAV.
pub output_filename: String,
/// Speech speed multiplier; 1.0 = natural pace.
pub speed: f32,
}
/// JSON body returned by the sidecar's `POST /synthesize` on success.
#[derive(Debug, Clone, Deserialize)]
pub struct SynthesizeResponse {
/// Success flag reported by the sidecar.
/// NOTE(review): not inspected by `Narrator::synthesize`; confirm the
/// sidecar never answers 2xx with `ok = false`.
pub ok: bool,
/// Location of the generated WAV as seen from the f5-tts container.
pub output_path: String,
/// Sample rate of the generated audio, in Hz.
pub sample_rate_hz: i32,
/// Length of the generated audio, in seconds.
pub duration_seconds: f32,
/// Presumably the sidecar-side processing time in milliseconds — TODO confirm.
pub elapsed_ms: u64,
/// Presumably the number of input characters synthesized — TODO confirm.
pub chars_in: i64,
}
/// JSON body returned by the sidecar's `GET /healthz`.
#[derive(Debug, Clone, Deserialize)]
pub struct HealthResponse {
/// Health flag reported by the sidecar.
pub ok: bool,
/// Device identifier reported by the sidecar.
/// NOTE(review): presumably the torch device (e.g. cuda/cpu) — confirm.
pub device: String,
/// Model identifier reported by the sidecar.
pub model: String,
/// Vocoder identifier reported by the sidecar.
pub vocoder: String,
/// Whether the model is loaded; callers should not synthesize until true.
pub loaded: bool,
}
// ─── voice DB access ─────────────────────────────────────────────
/// One row of the `voices` table, as selected by the lookup functions below.
#[derive(Debug, Clone)]
pub struct Voice {
/// Primary key.
pub id: Uuid,
/// Lookup name used by `get_voice_by_name` (the CLI calls it a slug).
pub name: String,
/// Human-readable label.
pub display_name: String,
/// Path to the reference audio clip; nullable in the database.
pub reference_path: Option<String>,
/// Transcript of the reference clip; nullable in the database.
pub reference_text: Option<String>,
/// License string stored with the voice.
pub license: String,
/// Marks the voice returned by `get_default_voice`.
pub is_default: bool,
}
/// Column tuple produced by every voice lookup, in `SELECT` order:
/// id, name, display_name, reference_path, reference_text, license, is_default.
type VoiceRow = (Uuid, String, String, Option<String>, Option<String>, String, bool);

/// Converts a raw column tuple into a [`Voice`]. Kept in one place so the
/// field order cannot drift between the lookups.
fn voice_from_row(row: VoiceRow) -> Voice {
    let (id, name, display_name, reference_path, reference_text, license, is_default) = row;
    Voice {
        id,
        name,
        display_name,
        reference_path,
        reference_text,
        license,
        is_default,
    }
}

/// Looks up a voice by its `name` column.
///
/// Returns `Ok(None)` when no row matches.
///
/// # Errors
/// Propagates any database error.
pub async fn get_voice_by_name(pool: &PgPool, name: &str) -> anyhow::Result<Option<Voice>> {
    let row: Option<VoiceRow> = sqlx::query_as(
        "SELECT id, name, display_name, reference_path, reference_text, license, is_default
         FROM voices WHERE name = $1",
    )
    .bind(name)
    .fetch_optional(pool)
    .await?;
    Ok(row.map(voice_from_row))
}

/// Returns the voice flagged `is_default = true`, if any.
///
/// Should several rows carry the flag, the alphabetically first `name`
/// wins so the choice is stable from run to run.
///
/// # Errors
/// Propagates any database error.
pub async fn get_default_voice(pool: &PgPool) -> anyhow::Result<Option<Voice>> {
    let row: Option<VoiceRow> = sqlx::query_as(
        "SELECT id, name, display_name, reference_path, reference_text, license, is_default
         FROM voices WHERE is_default = true ORDER BY name LIMIT 1",
    )
    .fetch_optional(pool)
    .await?;
    Ok(row.map(voice_from_row))
}

/// Looks up a voice by primary key.
///
/// Returns `Ok(None)` when no row matches.
///
/// # Errors
/// Propagates any database error.
pub async fn get_voice_by_id(pool: &PgPool, id: Uuid) -> anyhow::Result<Option<Voice>> {
    let row: Option<VoiceRow> = sqlx::query_as(
        "SELECT id, name, display_name, reference_path, reference_text, license, is_default
         FROM voices WHERE id = $1",
    )
    .bind(id)
    .fetch_optional(pool)
    .await?;
    Ok(row.map(voice_from_row))
}

View file

@ -7,6 +7,7 @@
mod authors_seed;
mod continue_story;
mod import;
mod narrate;
mod serve;
mod show_context;
mod summarize;
@ -126,6 +127,19 @@ enum Cmd {
#[arg(long, default_value = "1")]
chapters: usize,
},
/// Render one chapter to audio via F5-TTS. Resolves voice via
/// --voice slug → story.preferred_voice_id → system default.
Narrate {
/// Chapter UUID to narrate.
#[arg(long)]
chapter: Uuid,
/// Optional voice slug override (e.g. "lj_speech").
#[arg(long)]
voice: Option<String>,
/// Speech speed (0.5–2.0). 1.0 = natural pace.
#[arg(long, default_value = "1.0")]
speed: f32,
},
}
#[tokio::main]
@ -191,6 +205,11 @@ async fn run() -> anyhow::Result<()> {
)
.await
}
Cmd::Narrate {
chapter,
voice,
speed,
} => narrate::run(&cli.database_url, chapter, voice.as_deref(), speed).await,
}
}

180
skald/src/narrate.rs Normal file
View file

@ -0,0 +1,180 @@
//! `skald narrate` — render chapter prose to audio via F5-TTS.
//!
//! Resolution order for the voice:
//! 1. --voice <slug> flag (explicit override)
//! 2. story.preferred_voice_id (per-story pin)
//! 3. voices.is_default = true (the system default)
//!
//! Output filename layout: <story-uuid>-<n>-<run-uuid>.wav — a single
//! bare file name, because the sidecar accepts no slashes. Story-slug
//! isn't yet on the schema, so v0.1 uses the story-uuid as the prefix.
use std::time::Instant;
use anyhow::{Context, bail};
use chrono::Utc;
use skald_core::db;
use skald_core::narrate::{F5Config, Narrator, SynthesizeRequest, Voice};
use sqlx::PgPool;
use uuid::Uuid;
/// Entry point for `skald narrate`: renders one chapter to audio through
/// the F5-TTS sidecar and records the attempt in `narration_runs`.
///
/// Steps: validate `speed`, connect to the database, probe the sidecar's
/// `/healthz`, load the chapter, resolve the voice, insert a `running`
/// row, call `/synthesize`, then flip the row to `succeeded` or `failed`.
///
/// * `database_url` – connection string handed to `db::connect_and_migrate`.
/// * `chapter_id` – chapter to narrate.
/// * `voice_slug` – optional explicit voice name; see `resolve_voice`.
/// * `speed` – speech speed multiplier forwarded to the sidecar.
///
/// # Errors
/// Fails on an invalid speed, an unreachable or not-yet-loaded sidecar, a
/// missing chapter, an empty chapter body, an unresolvable voice, a voice
/// without `reference_path`, a synthesis failure, or a database error.
pub async fn run(
    database_url: &str,
    chapter_id: Uuid,
    voice_slug: Option<&str>,
    speed: f32,
) -> anyhow::Result<()> {
    // "NaN" and "inf" parse as valid f32 on the command line; reject them
    // (and non-positive values) before any work is done.
    if !speed.is_finite() || speed <= 0.0 {
        bail!("--speed must be a positive, finite number (got {speed})");
    }

    let cfg = load_f5_config()?;
    tracing::info!(base_url = %cfg.base_url, "f5-tts configured");
    let pool = db::connect_and_migrate(database_url).await?;
    let narrator = Narrator::new(cfg)?;

    // Quick health probe before we burn time loading rows.
    let h = narrator.healthz().await.context("f5-tts healthz failed")?;
    if !h.loaded {
        bail!("f5-tts /healthz says model is not loaded yet — retry shortly");
    }
    tracing::info!(device = %h.device, model = %h.model, "f5-tts ready");

    let chapter = load_chapter(&pool, chapter_id).await?;
    // Nothing to speak: stop before a run row is created.
    if chapter.body_md.trim().is_empty() {
        bail!("chapter {chapter_id} has no prose to narrate");
    }
    let voice = resolve_voice(&pool, &chapter, voice_slug).await?;
    tracing::info!(
        voice = %voice.name,
        voice_license = %voice.license,
        chapter_n = chapter.n,
        word_count = chapter.word_count,
        "narrating",
    );

    let ref_audio_path = voice
        .reference_path
        .clone()
        .with_context(|| format!("voice '{}' has no reference_path", voice.name))?;

    let run_id = Uuid::new_v4();
    let output_filename = format!("{}-{}-{}.wav", chapter.story_id, chapter.n, run_id);

    // Record the attempt first so a crash mid-synthesis leaves a trace.
    let run_row_id: Uuid = sqlx::query_scalar(
        "INSERT INTO narration_runs (id, chapter_id, voice_id, engine, engine_version, status)
         VALUES ($1, $2, $3, 'f5-tts', '1.1.20', 'running') RETURNING id",
    )
    .bind(run_id)
    .bind(chapter_id)
    .bind(voice.id)
    .fetch_one(&pool)
    .await
    .context("inserting narration_runs row")?;

    let started = Instant::now();
    // The chapter text and reference transcript are moved, not cloned; only
    // the Copy fields of `chapter` are read afterwards.
    let req = SynthesizeRequest {
        gen_text: chapter.body_md,
        ref_audio_path,
        ref_text: voice.reference_text,
        output_filename,
        speed,
    };

    let res = match narrator.synthesize(&req).await {
        Ok(r) => r,
        Err(e) => {
            // Bookkeeping is best effort here: a failure to record the
            // failure must not replace the synthesis error the user needs.
            let marked = sqlx::query(
                "UPDATE narration_runs SET status='failed', error=$1, ended_at=$2 WHERE id=$3",
            )
            .bind(format!("{e:#}"))
            .bind(Utc::now())
            .bind(run_row_id)
            .execute(&pool)
            .await;
            if let Err(db_err) = marked {
                tracing::warn!(
                    run_id = %run_row_id,
                    error = %db_err,
                    "could not mark narration run as failed",
                );
            }
            return Err(e);
        }
    };
    let elapsed = started.elapsed();

    // NOTE(review): `res.ok` is not inspected; confirm the sidecar never
    // answers 2xx with ok=false.
    sqlx::query(
        "UPDATE narration_runs
         SET status='succeeded',
             output_path=$1,
             duration_seconds=$2,
             ended_at=$3
         WHERE id=$4",
    )
    .bind(&res.output_path)
    .bind(res.duration_seconds)
    .bind(Utc::now())
    .bind(run_row_id)
    .execute(&pool)
    .await
    .with_context(|| {
        // The audio already exists; say where so it is not lost.
        format!(
            "audio written to {} but narration run {run_row_id} could not be marked succeeded",
            res.output_path
        )
    })?;

    println!(
        "narrated chapter {} of story {}: {} ({:.2}s audio, {:.1}s wall clock)",
        chapter.n,
        chapter.story_id,
        res.output_path,
        res.duration_seconds,
        elapsed.as_secs_f32(),
    );
    Ok(())
}
/// The subset of a `chapters` row that narration needs (see `load_chapter`).
#[derive(Debug, Clone)]
struct ChapterRow {
// Owning story; used for the story-pinned voice lookup and the output file name.
story_id: Uuid,
// Value of the `n` column; used in the output file name and log output.
// NOTE(review): presumably the chapter's ordinal within the story — confirm.
n: i32,
// Chapter text, sent to the sidecar as `gen_text`.
body_md: String,
// Stored word count; only logged.
word_count: i32,
}
async fn load_chapter(pool: &PgPool, id: Uuid) -> anyhow::Result<ChapterRow> {
let row: Option<(Uuid, i32, String, i32)> = sqlx::query_as(
"SELECT story_id, n, body_md, word_count FROM chapters WHERE id = $1",
)
.bind(id)
.fetch_optional(pool)
.await?;
let (story_id, n, body_md, word_count) =
row.with_context(|| format!("chapter {id} not found"))?;
Ok(ChapterRow {
story_id,
n,
body_md,
word_count,
})
}
/// Picks the voice for a narration run.
///
/// Precedence: an explicit `flag_slug`, then the story's
/// `preferred_voice_id`, then the system default voice. An explicit slug
/// that matches nothing is an error; a story pin that matches nothing
/// falls through to the default.
///
/// # Errors
/// Propagates database errors, and fails when the explicit slug is unknown
/// or when no default voice exists.
async fn resolve_voice(
    pool: &PgPool,
    chapter: &ChapterRow,
    flag_slug: Option<&str>,
) -> anyhow::Result<Voice> {
    match flag_slug {
        // 1. Explicit override: must exist, never falls back.
        Some(slug) => {
            let named = skald_core::narrate::get_voice_by_name(pool, slug).await?;
            named.with_context(|| format!("voice '{slug}' not found"))
        }
        None => {
            // 2. Per-story pin. The outer Option is "story row found", the
            //    inner one is the nullable column; both collapse to None.
            let pinned_id = sqlx::query_scalar::<_, Option<Uuid>>(
                "SELECT preferred_voice_id FROM stories WHERE id = $1",
            )
            .bind(chapter.story_id)
            .fetch_optional(pool)
            .await?
            .flatten();

            let pinned_voice = match pinned_id {
                Some(voice_id) => skald_core::narrate::get_voice_by_id(pool, voice_id).await?,
                None => None,
            };

            // 3. System default when nothing is pinned or the pin is stale.
            match pinned_voice {
                Some(voice) => Ok(voice),
                None => skald_core::narrate::get_default_voice(pool)
                    .await?
                    .context("no default voice set; create one or use --voice <slug>"),
            }
        }
    }
}
/// Builds the F5-TTS client configuration from the environment.
///
/// `F5_TTS_URL` overrides the sidecar base URL. When it is unset, not valid
/// Unicode, or blank (e.g. an empty `.env` / compose interpolation), the
/// built-in default is used instead of producing an unusable empty URL.
/// The request timeout is fixed at 1800 seconds.
///
/// Currently infallible; the `Result` return type is kept for callers.
fn load_f5_config() -> anyhow::Result<F5Config> {
    const DEFAULT_BASE_URL: &str = "http://192.168.0.5:7792";
    const TIMEOUT_SECS: u64 = 1800;

    let base_url = std::env::var("F5_TTS_URL")
        .ok()
        .map(|raw| raw.trim().to_owned())
        .filter(|url| !url.is_empty())
        .unwrap_or_else(|| DEFAULT_BASE_URL.to_owned());

    Ok(F5Config {
        base_url,
        timeout: std::time::Duration::from_secs(TIMEOUT_SECS),
    })
}