narrate: F5-TTS HTTP client + skald narrate CLI
skald-core::narrate ships a thin reqwest client + voice DB access (get_by_name, get_default, get_by_id). The boundary is the f5-tts container's purpose-built FastAPI sidecar (python lives there because torch + transformers + safetensors do); skald never touches python. CLI: skald narrate --chapter <uuid> [--voice slug] [--speed 1.0]. Voice resolution: --voice flag → story.preferred_voice_id → system default. Persists narration_runs row (engine='f5-tts', engine_version pinned, status: running → succeeded|failed). Output path stored is the f5-tts container's view (/audio/<story>-<n>-<run>.wav); web playback wiring deferred.
This commit is contained in:
parent
3a749b7643
commit
c2bb12fdd0
6 changed files with 388 additions and 0 deletions
|
|
@ -30,3 +30,4 @@ uuid = { version = "1", features = ["v4", "serde"] }
|
|||
regex = "1"
|
||||
async-trait = "0.1"
|
||||
maud = "0.27"
|
||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ tracing = { workspace = true }
|
|||
chrono = { workspace = true }
|
||||
uuid = { workspace = true }
|
||||
regex = { workspace = true }
|
||||
reqwest = { workspace = true }
|
||||
clawdforge = { path = "../vendor/clawdforge" }
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ pub mod db;
|
|||
pub mod forge;
|
||||
pub mod ingest;
|
||||
pub mod models;
|
||||
pub mod narrate;
|
||||
|
||||
/// Embeds the workspace `migrations/` directory at compile time.
|
||||
/// Run via `MIGRATOR.run(&pool).await` at boot.
|
||||
|
|
|
|||
186
skald-core/src/narrate.rs
Normal file
186
skald-core/src/narrate.rs
Normal file
|
|
@ -0,0 +1,186 @@
|
|||
//! F5-TTS client + voice DB access for the narrate pipeline.
|
||||
//!
|
||||
//! The boundary: skald (Rust) speaks HTTP+JSON to the f5-tts
|
||||
//! container's purpose-built FastAPI sidecar (the python lives there
|
||||
//! because torch + transformers + safetensors do). Skald never
|
||||
//! imports python deps; the python service has no business logic.
|
||||
//!
|
||||
//! v0.1 flow:
|
||||
//! 1. Skald loads chapter prose + chosen Voice row from the DB.
|
||||
//! 2. POST /synthesize to f5-tts with gen_text + ref_audio_path.
|
||||
//! 3. F5 writes the WAV to its /audio bind mount and returns the
|
||||
//! path + duration metadata.
|
||||
//! 4. Skald inserts a narration_runs row pointing at that path.
|
||||
//!
|
||||
//! Path note: output_path stored in narration_runs is the f5-tts
|
||||
//! container's view (e.g. /audio/<story-uuid>-8-<run-uuid>.wav). To serve it
|
||||
//! from skald's web UI we'll either mount the same dir on skald or
|
||||
//! route audio bytes through f5-tts. Deferred.
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use reqwest::Client as HttpClient;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Connection settings for the F5-TTS sidecar.
#[derive(Clone, Debug)]
pub struct F5Config {
    /// Root URL of the sidecar, e.g. http://192.168.0.5:7792.
    /// A trailing slash is tolerated; it is trimmed before endpoint
    /// paths are appended.
    pub base_url: String,
    /// Timeout handed to the HTTP client builder. Long-form chapters
    /// (3000 words) take 60-180s on an 8GB GPU; cap at 1800s to match
    /// clawdforge.
    pub timeout: Duration,
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Narrator {
|
||||
cfg: F5Config,
|
||||
http: HttpClient,
|
||||
}
|
||||
|
||||
impl Narrator {
|
||||
pub fn new(cfg: F5Config) -> anyhow::Result<Self> {
|
||||
let http = HttpClient::builder()
|
||||
.timeout(cfg.timeout)
|
||||
.user_agent(concat!("skald-narrate/", env!("CARGO_PKG_VERSION")))
|
||||
.build()?;
|
||||
Ok(Self { cfg, http })
|
||||
}
|
||||
|
||||
/// Synthesize one chapter to a WAV via the F5-TTS sidecar.
|
||||
/// `output_filename` is a bare name (no slashes); the file lands
|
||||
/// at `/audio/<output_filename>` in the f5-tts container.
|
||||
pub async fn synthesize(
|
||||
&self,
|
||||
req: &SynthesizeRequest,
|
||||
) -> anyhow::Result<SynthesizeResponse> {
|
||||
let url = format!("{}/synthesize", self.cfg.base_url.trim_end_matches('/'));
|
||||
let res = self
|
||||
.http
|
||||
.post(&url)
|
||||
.json(req)
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("POST {url} failed"))?;
|
||||
|
||||
if !res.status().is_success() {
|
||||
let status = res.status();
|
||||
let body = res.text().await.unwrap_or_default();
|
||||
anyhow::bail!("f5-tts /synthesize returned {status}: {body}");
|
||||
}
|
||||
Ok(res.json::<SynthesizeResponse>().await?)
|
||||
}
|
||||
|
||||
pub async fn healthz(&self) -> anyhow::Result<HealthResponse> {
|
||||
let url = format!("{}/healthz", self.cfg.base_url.trim_end_matches('/'));
|
||||
Ok(self.http.get(&url).send().await?.json().await?)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct SynthesizeRequest {
|
||||
pub gen_text: String,
|
||||
pub ref_audio_path: String,
|
||||
pub ref_text: Option<String>,
|
||||
pub output_filename: String,
|
||||
pub speed: f32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct SynthesizeResponse {
|
||||
pub ok: bool,
|
||||
pub output_path: String,
|
||||
pub sample_rate_hz: i32,
|
||||
pub duration_seconds: f32,
|
||||
pub elapsed_ms: u64,
|
||||
pub chars_in: i64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct HealthResponse {
|
||||
pub ok: bool,
|
||||
pub device: String,
|
||||
pub model: String,
|
||||
pub vocoder: String,
|
||||
pub loaded: bool,
|
||||
}
|
||||
|
||||
// ─── voice DB access ─────────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Voice {
|
||||
pub id: Uuid,
|
||||
pub name: String,
|
||||
pub display_name: String,
|
||||
pub reference_path: Option<String>,
|
||||
pub reference_text: Option<String>,
|
||||
pub license: String,
|
||||
pub is_default: bool,
|
||||
}
|
||||
|
||||
pub async fn get_voice_by_name(pool: &PgPool, name: &str) -> anyhow::Result<Option<Voice>> {
|
||||
let row: Option<(Uuid, String, String, Option<String>, Option<String>, String, bool)> =
|
||||
sqlx::query_as(
|
||||
"SELECT id, name, display_name, reference_path, reference_text, license, is_default
|
||||
FROM voices WHERE name = $1",
|
||||
)
|
||||
.bind(name)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
Ok(row.map(|(id, name, display_name, reference_path, reference_text, license, is_default)| {
|
||||
Voice {
|
||||
id,
|
||||
name,
|
||||
display_name,
|
||||
reference_path,
|
||||
reference_text,
|
||||
license,
|
||||
is_default,
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
pub async fn get_default_voice(pool: &PgPool) -> anyhow::Result<Option<Voice>> {
|
||||
let row: Option<(Uuid, String, String, Option<String>, Option<String>, String, bool)> =
|
||||
sqlx::query_as(
|
||||
"SELECT id, name, display_name, reference_path, reference_text, license, is_default
|
||||
FROM voices WHERE is_default = true LIMIT 1",
|
||||
)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
Ok(row.map(|(id, name, display_name, reference_path, reference_text, license, is_default)| {
|
||||
Voice {
|
||||
id,
|
||||
name,
|
||||
display_name,
|
||||
reference_path,
|
||||
reference_text,
|
||||
license,
|
||||
is_default,
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
pub async fn get_voice_by_id(pool: &PgPool, id: Uuid) -> anyhow::Result<Option<Voice>> {
|
||||
let row: Option<(Uuid, String, String, Option<String>, Option<String>, String, bool)> =
|
||||
sqlx::query_as(
|
||||
"SELECT id, name, display_name, reference_path, reference_text, license, is_default
|
||||
FROM voices WHERE id = $1",
|
||||
)
|
||||
.bind(id)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
Ok(row.map(|(id, name, display_name, reference_path, reference_text, license, is_default)| {
|
||||
Voice {
|
||||
id,
|
||||
name,
|
||||
display_name,
|
||||
reference_path,
|
||||
reference_text,
|
||||
license,
|
||||
is_default,
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
|
@ -7,6 +7,7 @@
|
|||
mod authors_seed;
|
||||
mod continue_story;
|
||||
mod import;
|
||||
mod narrate;
|
||||
mod serve;
|
||||
mod show_context;
|
||||
mod summarize;
|
||||
|
|
@ -126,6 +127,19 @@ enum Cmd {
|
|||
#[arg(long, default_value = "1")]
|
||||
chapters: usize,
|
||||
},
|
||||
/// Render one chapter to audio via F5-TTS. Resolves voice via
|
||||
/// --voice slug → story.preferred_voice_id → system default.
|
||||
Narrate {
|
||||
/// Chapter UUID to narrate.
|
||||
#[arg(long)]
|
||||
chapter: Uuid,
|
||||
/// Optional voice slug override (e.g. "lj_speech").
|
||||
#[arg(long)]
|
||||
voice: Option<String>,
|
||||
/// Speech speed (0.5–2.0). 1.0 = natural pace.
|
||||
#[arg(long, default_value = "1.0")]
|
||||
speed: f32,
|
||||
},
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
|
|
@ -191,6 +205,11 @@ async fn run() -> anyhow::Result<()> {
|
|||
)
|
||||
.await
|
||||
}
|
||||
Cmd::Narrate {
|
||||
chapter,
|
||||
voice,
|
||||
speed,
|
||||
} => narrate::run(&cli.database_url, chapter, voice.as_deref(), speed).await,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
180
skald/src/narrate.rs
Normal file
180
skald/src/narrate.rs
Normal file
|
|
@ -0,0 +1,180 @@
|
|||
//! `skald narrate` — render chapter prose to audio via F5-TTS.
|
||||
//!
|
||||
//! Resolution order for the voice:
|
||||
//! 1. --voice <slug> flag (explicit override)
|
||||
//! 2. story.preferred_voice_id (per-story pin)
|
||||
//! 3. voices.is_default = true (the system default)
|
||||
//!
|
||||
//! Output filename layout: <story-slug-or-id>-<n>-<run-uuid>.wav (flat, no subdirectory).
|
||||
//! Story-slug isn't yet on the schema, so v0.1 uses the bare
|
||||
//! story-uuid prefix.
|
||||
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
use chrono::Utc;
|
||||
use skald_core::db;
|
||||
use skald_core::narrate::{F5Config, Narrator, SynthesizeRequest, Voice};
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
pub async fn run(
|
||||
database_url: &str,
|
||||
chapter_id: Uuid,
|
||||
voice_slug: Option<&str>,
|
||||
speed: f32,
|
||||
) -> anyhow::Result<()> {
|
||||
let cfg = load_f5_config()?;
|
||||
tracing::info!(base_url = %cfg.base_url, "f5-tts configured");
|
||||
|
||||
let pool = db::connect_and_migrate(database_url).await?;
|
||||
let narrator = Narrator::new(cfg)?;
|
||||
|
||||
// Quick health probe before we burn time loading rows.
|
||||
let h = narrator.healthz().await.context("f5-tts healthz failed")?;
|
||||
if !h.loaded {
|
||||
bail!("f5-tts /healthz says model is not loaded yet — retry shortly");
|
||||
}
|
||||
tracing::info!(device = %h.device, model = %h.model, "f5-tts ready");
|
||||
|
||||
let chapter = load_chapter(&pool, chapter_id).await?;
|
||||
let voice = resolve_voice(&pool, &chapter, voice_slug).await?;
|
||||
tracing::info!(
|
||||
voice = %voice.name,
|
||||
voice_license = %voice.license,
|
||||
chapter_n = chapter.n,
|
||||
word_count = chapter.word_count,
|
||||
"narrating",
|
||||
);
|
||||
|
||||
let ref_audio_path = voice
|
||||
.reference_path
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow::anyhow!("voice '{}' has no reference_path", voice.name))?
|
||||
.clone();
|
||||
|
||||
let run_id = Uuid::new_v4();
|
||||
let output_filename = format!("{}-{}-{}.wav", chapter.story_id, chapter.n, run_id);
|
||||
|
||||
let run_row_id: Uuid = sqlx::query_scalar(
|
||||
"INSERT INTO narration_runs (id, chapter_id, voice_id, engine, engine_version, status)
|
||||
VALUES ($1, $2, $3, 'f5-tts', '1.1.20', 'running') RETURNING id",
|
||||
)
|
||||
.bind(run_id)
|
||||
.bind(chapter_id)
|
||||
.bind(voice.id)
|
||||
.fetch_one(&pool)
|
||||
.await?;
|
||||
|
||||
let started = Instant::now();
|
||||
let req = SynthesizeRequest {
|
||||
gen_text: chapter.body_md.clone(),
|
||||
ref_audio_path,
|
||||
ref_text: voice.reference_text.clone(),
|
||||
output_filename,
|
||||
speed,
|
||||
};
|
||||
|
||||
let res = match narrator.synthesize(&req).await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
sqlx::query(
|
||||
"UPDATE narration_runs SET status='failed', error=$1, ended_at=$2 WHERE id=$3",
|
||||
)
|
||||
.bind(format!("{e:#}"))
|
||||
.bind(Utc::now())
|
||||
.bind(run_row_id)
|
||||
.execute(&pool)
|
||||
.await?;
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
let elapsed = started.elapsed();
|
||||
|
||||
sqlx::query(
|
||||
"UPDATE narration_runs
|
||||
SET status='succeeded',
|
||||
output_path=$1,
|
||||
duration_seconds=$2,
|
||||
ended_at=$3
|
||||
WHERE id=$4",
|
||||
)
|
||||
.bind(&res.output_path)
|
||||
.bind(res.duration_seconds)
|
||||
.bind(Utc::now())
|
||||
.bind(run_row_id)
|
||||
.execute(&pool)
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"narrated chapter {} of story {}: {} ({:.2}s audio, {:.1}s wall clock)",
|
||||
chapter.n,
|
||||
chapter.story_id,
|
||||
res.output_path,
|
||||
res.duration_seconds,
|
||||
elapsed.as_secs_f32(),
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ChapterRow {
|
||||
story_id: Uuid,
|
||||
n: i32,
|
||||
body_md: String,
|
||||
word_count: i32,
|
||||
}
|
||||
|
||||
async fn load_chapter(pool: &PgPool, id: Uuid) -> anyhow::Result<ChapterRow> {
|
||||
let row: Option<(Uuid, i32, String, i32)> = sqlx::query_as(
|
||||
"SELECT story_id, n, body_md, word_count FROM chapters WHERE id = $1",
|
||||
)
|
||||
.bind(id)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
let (story_id, n, body_md, word_count) =
|
||||
row.with_context(|| format!("chapter {id} not found"))?;
|
||||
Ok(ChapterRow {
|
||||
story_id,
|
||||
n,
|
||||
body_md,
|
||||
word_count,
|
||||
})
|
||||
}
|
||||
|
||||
async fn resolve_voice(
|
||||
pool: &PgPool,
|
||||
chapter: &ChapterRow,
|
||||
flag_slug: Option<&str>,
|
||||
) -> anyhow::Result<Voice> {
|
||||
if let Some(slug) = flag_slug {
|
||||
return skald_core::narrate::get_voice_by_name(pool, slug)
|
||||
.await?
|
||||
.with_context(|| format!("voice '{slug}' not found"));
|
||||
}
|
||||
// Story-pinned voice?
|
||||
let pinned: Option<Uuid> =
|
||||
sqlx::query_scalar("SELECT preferred_voice_id FROM stories WHERE id = $1")
|
||||
.bind(chapter.story_id)
|
||||
.fetch_optional(pool)
|
||||
.await?
|
||||
.flatten();
|
||||
if let Some(vid) = pinned {
|
||||
if let Some(v) = skald_core::narrate::get_voice_by_id(pool, vid).await? {
|
||||
return Ok(v);
|
||||
}
|
||||
}
|
||||
// Else system default.
|
||||
skald_core::narrate::get_default_voice(pool)
|
||||
.await?
|
||||
.ok_or_else(|| anyhow::anyhow!("no default voice set; create one or use --voice <slug>"))
|
||||
}
|
||||
|
||||
fn load_f5_config() -> anyhow::Result<F5Config> {
|
||||
let base_url = std::env::var("F5_TTS_URL")
|
||||
.unwrap_or_else(|_| "http://192.168.0.5:7792".into());
|
||||
Ok(F5Config {
|
||||
base_url,
|
||||
timeout: std::time::Duration::from_secs(1800),
|
||||
})
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue