v0.1 wave 1 (steps 2+3+4): SQLite ledger + FastAPI skeleton + async job runner

- db.py: migrations + DAOs for tokens / projects / jobs / findings (SQLite WAL)
- auth.py: SHA-256 bearer hashing + LAN-CIDR allowlist + admin/app token tiers
- models.py: Pydantic shapes (Project, Subproject, Schedule, Notify, Job, CreateJobRequest)
- server.py: FastAPI on port 8810; /healthz, /admin/tokens/*, /projects/*, /jobs, /jobs/{id}, /jobs/{id}/log, /jobs/{id}/findings
- runner.py: bounded asyncio pool, per-job timeout with process-group SIGTERM→SIGKILL escalation, orphaned-job recovery on boot
- workspace.py: bare-clone + worktree materialization, gc
- config.py: env-driven
- 62 tests across db / auth / projects / jobs / runner / e2e — all green

Cross-token project access returns 404 (not 403) — existence-leak guard.
Bearer tokens hashed at rest; admin token bootstrapped on first boot.
Recipe subprocess uses start_new_session=True so killpg targets the
whole process tree on timeout — child processes can't escape SIGKILL.
Pump task guarded with wait_for(2s) + cancel fallback against any
orphan that survives the group kill.

Wave 2 (parsers + findings extraction + MCP + email digest) pending.

Spec: memory/spec-crafting-table.md
This commit is contained in:
Kayos 2026-04-29 08:17:41 -07:00
parent 4e668a79e1
commit 0ec3a04676
20 changed files with 3328 additions and 0 deletions

View file

@ -0,0 +1,7 @@
"""crafting-table — polyglot dev/build/audit container.
Wave 1 (steps 2+3+4): SQLite ledger + FastAPI skeleton + async job runner.
Spec: memory/spec-crafting-table.md
"""
# Package version string for this release (wave 1).
__version__ = "0.1.0"

153
crafting_table/auth.py Normal file
View file

@ -0,0 +1,153 @@
"""Bearer + IP allowlist authentication.
Mirrors clawdforge's pattern:
- Bearer tokens hashed at rest (SHA-256). No plaintext stored.
- Per-token IP allowlist (CIDR list). NULL means "any RFC1918 + loopback"
via the global LAN allowlist.
- Admin tokens are flagged in the tokens table; server-side admin checks
query `is_admin` rather than comparing to a bootstrap string.
- Loopback always allowed (test client uses 127.0.0.1; FastAPI's
`request.client.host` returns 'testclient' under TestClient and we patch
that in tests).
- Bearer tokens NEVER appear in error messages or log lines. Same hygiene
as clawdforge.
"""
from __future__ import annotations
import ipaddress
import logging
import secrets
from dataclasses import dataclass
from pathlib import Path
from fastapi import HTTPException, Request
from .db import DB
# Module logger. Per the module docstring, bearer tokens must never be passed
# to it.
log = logging.getLogger("crafting_table.auth")
# Value of tokens.name used for the bootstrap admin token row.
ADMIN_TOKEN_NAME = "admin"
# Prefix prepended to the random part of a freshly minted admin bearer.
ADMIN_TOKEN_PREFIX = "ct_"
@dataclass
class AppToken:
    """Authenticated caller identity returned by the `Auth` guards."""

    # Token name (tokens.name in the ledger).
    name: str
    # True when the token row carries the admin flag.
    is_admin: bool
    # Per-token CIDR allowlist.
    ip_cidrs: list[str] | None  # None = use global LAN allowlist
def _client_ip(request: Request) -> str:
    """Return the peer address of *request*, or "0.0.0.0" when it has none.

    Tests monkeypatch this function.
    """
    peer = request.client
    if not peer:
        return "0.0.0.0"
    return peer.host
def _ip_in_any(ip_str: str, cidrs: list[str]) -> bool:
    """Return True if *ip_str* is loopback or falls inside any of *cidrs*.

    Args:
        ip_str: Textual client address. Anything that does not parse as an
            IP address (e.g. "testclient") is rejected.
        cidrs: CIDR strings to test against. Entries that do not parse are
            skipped rather than raising.

    Returns:
        True for loopback addresses (always allowed) and for addresses
        contained in at least one valid network; False otherwise.
    """
    try:
        ip = ipaddress.ip_address(ip_str)
    except ValueError:
        return False
    # An IPv4 peer seen through an IPv6 socket is reported in IPv4-mapped
    # form (::ffff:a.b.c.d). Unwrap it so it is matched against IPv4 CIDRs
    # and the loopback rule instead of silently failing every check.
    mapped = getattr(ip, "ipv4_mapped", None)
    if mapped is not None:
        ip = mapped
    if ip.is_loopback:
        return True
    for cidr in cidrs:
        try:
            network = ipaddress.ip_network(cidr, strict=False)
        except ValueError:
            # Best-effort: one malformed entry must not disable the others.
            continue
        if ip in network:
            return True
    return False
def _const_eq(a: str, b: str) -> bool:
    """Compare two strings without leaking where they first differ.

    Delegates to the standard library's timing-safe comparison instead of a
    hand-rolled XOR loop. Both inputs are encoded to UTF-8 first because
    `compare_digest` only accepts `str` arguments when they are pure ASCII.

    Returns:
        True when *a* and *b* are equal, False otherwise (including when
        their lengths differ).
    """
    return secrets.compare_digest(a.encode(), b.encode())
class Auth:
    """Bearer-token and IP-allowlist guard.

    Holds the DB reference plus the global LAN CIDRs. Construct once at
    startup.
    """

    def __init__(self, *, db: DB, lan_cidrs: list[str] | tuple[str, ...]):
        self.db = db
        # Copy so the allowlist is an independent list regardless of what
        # sequence type the caller passed.
        self.lan_cidrs = list(lan_cidrs)

    # ---------- bootstrap ---------------------------------------------------
    def bootstrap_admin(self, admin_bearer_path: Path) -> str:
        """Ensure an admin token exists and return its plaintext bearer.

        - Token row and bearer file both present: return the file contents
          (stripped); nothing is minted.
        - Token row present but file missing: the plaintext cannot be
          recovered (only its hash is stored), so the old row is revoked and
          renamed, and a new token is minted.
        - No token row: mint a new token.

        A minted bearer is written to *admin_bearer_path* with mode 0600 and
        returned.

        Args:
            admin_bearer_path: Where the plaintext bearer lives on disk.
                Parent directories are created if needed.

        Returns:
            The admin bearer string.
        """
        # Local imports: this module does not import os/time at file level.
        import os
        import time

        admin_bearer_path = Path(admin_bearer_path)
        admin_bearer_path.parent.mkdir(parents=True, exist_ok=True)
        existing = self.db.get_token(ADMIN_TOKEN_NAME)
        if existing is not None and admin_bearer_path.exists():
            return admin_bearer_path.read_text(encoding="utf-8").strip()
        if existing is not None:
            # Token row exists but the file is gone — we cannot recover the
            # plaintext (it was hashed at insert). Revoke and re-mint.
            log.warning("admin token row exists but bearer file is missing; rotating")
            self.db.revoke_token(ADMIN_TOKEN_NAME)
            # Renaming frees the "admin" primary key for the new row while
            # keeping the audit trail of past admin tokens.
            new_name = f"{ADMIN_TOKEN_NAME}-rotated-{int(time.time())}"
            # NOTE(review): projects.owner_token references tokens(name) with
            # no ON UPDATE action and foreign keys are enforced, so this
            # rename looks like it will fail with an integrity error once the
            # admin token owns a project — confirm and handle in the DB layer.
            with self.db._conn() as c:
                c.execute("UPDATE tokens SET name=? WHERE name=?", (new_name, ADMIN_TOKEN_NAME))
        bearer = ADMIN_TOKEN_PREFIX + secrets.token_urlsafe(32)
        self.db.insert_token(
            name=ADMIN_TOKEN_NAME,
            bearer=bearer,
            is_admin=True,
            ip_cidrs=None,
        )
        # Create the file with 0600 from the start: writing first and
        # chmod-ing afterwards leaves a window in which the plaintext bearer
        # is readable under the process umask.
        fd = os.open(admin_bearer_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            fh.write(bearer + "\n")
        # The mode passed to os.open only applies on creation; tighten a
        # pre-existing file as well.
        admin_bearer_path.chmod(0o600)
        log.info("admin bearer written to %s (chmod 600)", admin_bearer_path)
        return bearer

    # ---------- guards ------------------------------------------------------
    def require_global_ip(self, request: Request) -> None:
        """Raise HTTP 403 unless the client IP is loopback or in the LAN allowlist."""
        ip = _client_ip(request)
        if not _ip_in_any(ip, self.lan_cidrs):
            raise HTTPException(403, f"ip not in LAN allowlist: {ip}")

    def require_app(self, request: Request, authorization: str | None) -> AppToken:
        """Authenticate a request and return its AppToken.

        We check the global LAN allowlist FIRST (cheap, doesn't touch DB) so
        wide-area scanners don't even cause a token lookup.

        Args:
            request: Incoming request; only the client address is used.
            authorization: Raw `Authorization` header value, or None.

        Raises:
            HTTPException: 401 for a missing or empty bearer; 403 for a
                disallowed IP or an unknown/revoked token.
        """
        self.require_global_ip(request)
        # The auth scheme token is case-insensitive in HTTP, so accept
        # "bearer"/"BEARER" as well as "Bearer".
        if not authorization or authorization[:7].lower() != "bearer ":
            raise HTTPException(401, "missing bearer")
        bearer = authorization[7:].strip()
        if not bearer:
            raise HTTPException(401, "empty bearer")
        rec = self.db.lookup_token_by_bearer(bearer)
        if rec is None:
            # Note: do NOT echo the bearer back. Generic message.
            raise HTTPException(403, "unknown or revoked token")
        # A per-token allowlist is an additional restriction on top of the
        # global LAN check that already passed above.
        if rec["ip_cidrs"]:
            ip = _client_ip(request)
            if not _ip_in_any(ip, rec["ip_cidrs"]):
                raise HTTPException(403, f"ip not in app allowlist: {ip}")
        return AppToken(name=rec["name"], is_admin=rec["is_admin"], ip_cidrs=rec["ip_cidrs"])

    def require_admin(self, request: Request, authorization: str | None) -> AppToken:
        """Like `require_app`, but additionally raise HTTP 403 for non-admin tokens."""
        tok = self.require_app(request, authorization)
        if not tok.is_admin:
            raise HTTPException(403, "admin auth failed")
        return tok

59
crafting_table/config.py Normal file
View file

@ -0,0 +1,59 @@
"""Env-driven configuration.
All settings flow through environment variables so the same image runs in
prod (compose.yml env_file) and tests (monkeypatched envs). No config files.
"""
from __future__ import annotations
import os
from dataclasses import dataclass, field
from pathlib import Path
# Default LAN allowlist mirrors the rules baked into the network: anything
# inside RFC1918 plus loopback. Override with CRAFTING_LAN_CIDRS if a deploy
# wants stricter scoping. `load()` falls back to this tuple when that
# variable is unset or blank.
DEFAULT_LAN_CIDRS = (
    "10.0.0.0/8",  # RFC1918
    "172.16.0.0/12",  # RFC1918
    "192.168.0.0/16",  # RFC1918
    "127.0.0.0/8",  # IPv4 loopback
    "::1/128",  # IPv6 loopback
)
@dataclass(frozen=True)
class Config:
    """Immutable snapshot of the service settings, populated by `load()`."""

    db_path: Path  # CRAFTING_DB
    workspace_root: Path  # CRAFTING_WORKSPACE
    log_dir: Path  # CRAFTING_LOG_DIR
    admin_bearer_path: Path  # CRAFTING_ADMIN_BEARER
    max_concurrent_jobs: int  # CRAFTING_MAX_CONCURRENT
    api_port: int  # CRAFTING_PORT
    api_bind: str  # CRAFTING_BIND
    default_job_timeout_secs: int  # CRAFTING_DEFAULT_JOB_TIMEOUT
    lan_cidrs: tuple[str, ...]  # CRAFTING_LAN_CIDRS (comma-separated)
    workspace_gc_interval_secs: int  # CRAFTING_GC_INTERVAL
    workspace_gc_age_secs: int  # CRAFTING_GC_AGE
def load() -> Config:
    """Build a Config from CRAFTING_* environment variables.

    Every setting has a built-in default. CRAFTING_LAN_CIDRS is a
    comma-separated list; blank entries are dropped, and an unset or blank
    variable selects DEFAULT_LAN_CIDRS.

    Raises:
        ValueError: if an integer-valued variable does not parse as an int.
    """
    env = os.environ.get

    raw_cidrs = env("CRAFTING_LAN_CIDRS", "").strip()
    if not raw_cidrs:
        cidrs = DEFAULT_LAN_CIDRS
    else:
        pieces = (piece.strip() for piece in raw_cidrs.split(","))
        cidrs = tuple(piece for piece in pieces if piece)

    return Config(
        db_path=Path(env("CRAFTING_DB", "/data/crafting.db")),
        workspace_root=Path(env("CRAFTING_WORKSPACE", "/workspace")),
        log_dir=Path(env("CRAFTING_LOG_DIR", "/data/jobs")),
        admin_bearer_path=Path(env("CRAFTING_ADMIN_BEARER", "/data/admin-bearer.txt")),
        max_concurrent_jobs=int(env("CRAFTING_MAX_CONCURRENT", "4")),
        api_port=int(env("CRAFTING_PORT", "8810")),
        api_bind=env("CRAFTING_BIND", "0.0.0.0"),
        default_job_timeout_secs=int(env("CRAFTING_DEFAULT_JOB_TIMEOUT", "1800")),
        lan_cidrs=cidrs,
        workspace_gc_interval_secs=int(env("CRAFTING_GC_INTERVAL", "3600")),
        workspace_gc_age_secs=int(env("CRAFTING_GC_AGE", "86400")),
    )

502
crafting_table/db.py Normal file
View file

@ -0,0 +1,502 @@
"""SQLite ledger + migrations.
Why SQLite (not MariaDB like clawdforge): single-process, single-host service,
no need for cross-host replication. The runner is the only writer; every
HTTP worker reads. SQLite in WAL mode handles single-writer-many-readers
cleanly. Trade-off documented in README.
Why stdlib `sqlite3` + `run_in_executor` (not aiosqlite): one less dependency
and the queries are tiny (fetchone / fetchall). The runner does its own log
streaming via aiofiles-equivalent so we never block the loop on disk.
Migration system:
- Each entry in MIGRATIONS is (version_id, sql_text). Versions are date-tagged
so they sort lexicographically.
- Apply in order, INSERT OR IGNORE into schema_migrations to handle
multi-worker boot races (mirrors cauldron's pattern).
- Migrations are append-only; never edit a landed migration, add a new one.
"""
from __future__ import annotations
import asyncio
import json
import sqlite3
import time
from contextlib import contextmanager
from pathlib import Path
# fmt: off
# Ordered (version, sql) pairs consumed by DB.migrate(). Append-only: never
# edit a landed entry, add a new one instead.
MIGRATIONS: list[tuple[str, str]] = [
# Tracking table: one row per applied migration version.
(
"001_schema_migrations",
"""
CREATE TABLE IF NOT EXISTS schema_migrations (
version TEXT PRIMARY KEY,
applied_at INTEGER NOT NULL
);
""",
),
# Bearer tokens, keyed by name; only the bearer hash is stored.
(
"002_tokens",
"""
CREATE TABLE IF NOT EXISTS tokens (
name TEXT PRIMARY KEY,
bearer_hash TEXT NOT NULL UNIQUE,
is_admin INTEGER NOT NULL DEFAULT 0,
ip_allowlist_json TEXT,
created_at INTEGER NOT NULL,
last_used_at INTEGER,
revoked_at INTEGER
);
""",
),
# Projects; owner_token references tokens(name) with no ON UPDATE/DELETE action.
(
"003_projects",
"""
CREATE TABLE IF NOT EXISTS projects (
name TEXT PRIMARY KEY,
git_url TEXT NOT NULL,
default_branch TEXT NOT NULL DEFAULT 'main',
recipe_json TEXT NOT NULL,
owner_token TEXT NOT NULL,
created_at INTEGER NOT NULL,
updated_at INTEGER NOT NULL,
FOREIGN KEY (owner_token) REFERENCES tokens(name)
);
""",
),
# Jobs; rows are deleted along with their project (ON DELETE CASCADE).
(
"004_jobs",
"""
CREATE TABLE IF NOT EXISTS jobs (
id TEXT PRIMARY KEY,
project_name TEXT NOT NULL,
subproject_path TEXT NOT NULL,
recipe TEXT NOT NULL,
branch TEXT NOT NULL,
status TEXT NOT NULL DEFAULT 'queued',
queued_at INTEGER NOT NULL,
started_at INTEGER,
finished_at INTEGER,
exit_code INTEGER,
log_path TEXT NOT NULL,
findings_count INTEGER NOT NULL DEFAULT 0,
recipe_snapshot_json TEXT NOT NULL,
FOREIGN KEY (project_name) REFERENCES projects(name) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_jobs_project ON jobs(project_name);
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
CREATE INDEX IF NOT EXISTS idx_jobs_queued_at ON jobs(queued_at);
""",
),
# Findings; rows are deleted along with their job (ON DELETE CASCADE).
(
"005_findings",
"""
CREATE TABLE IF NOT EXISTS findings (
id INTEGER PRIMARY KEY AUTOINCREMENT,
job_id TEXT NOT NULL,
kind TEXT NOT NULL,
severity TEXT NOT NULL,
file TEXT,
line INTEGER,
code TEXT,
message TEXT NOT NULL,
suggested_fix TEXT,
raw_json TEXT,
created_at INTEGER NOT NULL,
fingerprint TEXT NOT NULL,
FOREIGN KEY (job_id) REFERENCES jobs(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_findings_job ON findings(job_id);
CREATE INDEX IF NOT EXISTS idx_findings_fingerprint ON findings(fingerprint);
""",
),
]
# fmt: on
def _hash(token: str) -> str:
    """Return the hex SHA-256 digest of *token* encoded as UTF-8."""
    import hashlib

    digest = hashlib.sha256()
    digest.update(token.encode("utf-8"))
    return digest.hexdigest()
class DB:
    """Synchronous SQLite wrapper.

    Async callers go through `arun`, which wraps these methods with
    run_in_executor so the FastAPI loop stays non-blocking.
    """

    def __init__(self, db_path: str | Path):
        """Open (creating if needed) the database at *db_path* and migrate it."""
        self.db_path = str(db_path)
        parent_dir = Path(self.db_path).parent
        parent_dir.mkdir(parents=True, exist_ok=True)
        # WAL = many readers + one writer without lock contention.
        # synchronous=NORMAL is the standard WAL pairing — durability
        # against process crash is preserved; only OS crash can drop
        # the most recent commit.
        startup_pragmas = (
            "PRAGMA journal_mode=WAL",
            "PRAGMA synchronous=NORMAL",
            "PRAGMA foreign_keys=ON",
        )
        with self._conn() as c:
            for pragma in startup_pragmas:
                c.execute(pragma)
        self.migrate()
@contextmanager
def _conn(self):
    """Yield a fresh connection to `self.db_path`, closing it on exit.

    The connection is in autocommit mode (isolation_level=None); multi-statement
    operations that need atomicity must issue BEGIN / COMMIT explicitly.
    Rows come back as sqlite3.Row, and foreign-key enforcement is switched on
    for every connection.
    """
    handle = sqlite3.connect(
        self.db_path,
        isolation_level=None,
        timeout=10.0,
    )
    handle.row_factory = sqlite3.Row
    try:
        handle.execute("PRAGMA foreign_keys=ON")
        yield handle
    finally:
        handle.close()
# ---------- migrations ---------------------------------------------------
def migrate(self) -> list[str]:
    """Apply any pending migrations.

    Versions already recorded in schema_migrations are skipped, so a landed
    migration's SQL runs at most once per database. Scripts therefore do not
    have to be idempotent. The previous behaviour re-executed every script
    on every boot, which would break on the first non-idempotent migration.

    Returns:
        The versions recorded by this call, in order (empty if up-to-date).
        If another process races us and records a version first, that
        version is not reported here (INSERT OR IGNORE makes the second
        insert a no-op).
    """
    applied: list[str] = []
    with self._conn() as c:
        # The tracking table is itself created by migration 001, so it may
        # not exist yet on a brand-new database.
        tracking_exists = c.execute(
            "SELECT 1 FROM sqlite_master WHERE type='table' AND name='schema_migrations'"
        ).fetchone() is not None
        if tracking_exists:
            recorded = {r[0] for r in c.execute("SELECT version FROM schema_migrations").fetchall()}
        else:
            recorded = set()
        for version, sql in MIGRATIONS:
            if version in recorded:
                continue
            c.executescript(sql)
            cur = c.execute(
                "INSERT OR IGNORE INTO schema_migrations (version, applied_at) VALUES (?, ?)",
                (version, int(time.time())),
            )
            if cur.rowcount == 1:
                applied.append(version)
    return applied
def applied_migrations(self) -> list[str]:
    """Return every recorded migration version, sorted ascending."""
    query = "SELECT version FROM schema_migrations ORDER BY version"
    with self._conn() as c:
        records = c.execute(query).fetchall()
    versions = []
    for record in records:
        versions.append(record["version"])
    return versions
# ---------- tokens -------------------------------------------------------
def insert_token(
    self,
    *,
    name: str,
    bearer: str,
    is_admin: bool,
    ip_cidrs: list[str] | None,
) -> None:
    """Insert a token row, storing only the hash of *bearer*.

    An empty or None *ip_cidrs* is stored as NULL; otherwise the list is
    stored as JSON. `created_at` is stamped with the current time.
    """
    sql = """
INSERT INTO tokens (name, bearer_hash, is_admin, ip_allowlist_json, created_at)
VALUES (?, ?, ?, ?, ?)
"""
    allowlist = None
    if ip_cidrs:
        allowlist = json.dumps(ip_cidrs)
    admin_flag = 1 if is_admin else 0
    with self._conn() as c:
        c.execute(sql, (name, _hash(bearer), admin_flag, allowlist, int(time.time())))
def lookup_token_by_bearer(self, bearer: str) -> dict | None:
    """Resolve a plaintext bearer to its token record.

    Returns:
        None when no row matches the bearer's hash or the row is revoked.
        Otherwise a dict with `name`, `is_admin` (bool) and `ip_cidrs`
        (list or None). A successful lookup also stamps `last_used_at`.
    """
    select_sql = """
SELECT name, is_admin, ip_allowlist_json, revoked_at
FROM tokens WHERE bearer_hash=?
"""
    digest = _hash(bearer)
    with self._conn() as c:
        found = c.execute(select_sql, (digest,)).fetchone()
        if not found or found["revoked_at"] is not None:
            return None
        c.execute("UPDATE tokens SET last_used_at=? WHERE bearer_hash=?", (int(time.time()), digest))
        raw_allowlist = found["ip_allowlist_json"]
        if raw_allowlist:
            allowlist = json.loads(raw_allowlist)
        else:
            allowlist = None
        return {
            "name": found["name"],
            "is_admin": bool(found["is_admin"]),
            "ip_cidrs": allowlist,
        }
def get_token(self, name: str) -> dict | None:
    """Fetch token metadata by name (the bearer hash is never returned).

    Returns:
        None if no such token exists. Otherwise a dict with `name`,
        `is_admin` (bool), `ip_cidrs` (list or None), `created_at`,
        `last_used_at` and `revoked_at`.
    """
    with self._conn() as c:
        row = c.execute(
            """
SELECT name, is_admin, ip_allowlist_json, created_at, last_used_at, revoked_at
FROM tokens WHERE name=?
""",
            (name,),
        ).fetchone()
    if not row:
        return None
    d = dict(row)
    d["is_admin"] = bool(d["is_admin"])
    # Always drop the raw column. It used to be popped only when non-empty,
    # so tokens without an allowlist leaked a stray "ip_allowlist_json" key
    # and the dict shape depended on the data.
    raw_allowlist = d.pop("ip_allowlist_json")
    d["ip_cidrs"] = json.loads(raw_allowlist) if raw_allowlist else None
    return d
def list_tokens(self) -> list[dict]:
    """Return metadata for every token, ordered by name.

    Each dict has `name`, `is_admin` (bool), `ip_cidrs` (list or None),
    `created_at`, `last_used_at` and `revoked_at`. Bearer hashes are not
    included.
    """
    with self._conn() as c:
        rows = c.execute(
            """
SELECT name, is_admin, ip_allowlist_json, created_at, last_used_at, revoked_at
FROM tokens ORDER BY name
""",
        ).fetchall()
    out = []
    for r in rows:
        d = dict(r)
        d["is_admin"] = bool(d["is_admin"])
        # Always drop the raw column. It used to survive in the dict
        # whenever the allowlist was NULL, giving rows different shapes.
        raw_allowlist = d.pop("ip_allowlist_json")
        d["ip_cidrs"] = json.loads(raw_allowlist) if raw_allowlist else None
        out.append(d)
    return out
def revoke_token(self, name: str) -> bool:
    """Stamp `revoked_at` on the named token if it is not already revoked.

    Returns:
        True if a row was updated; False if the token does not exist or was
        already revoked.
    """
    sql = "UPDATE tokens SET revoked_at=? WHERE name=? AND revoked_at IS NULL"
    with self._conn() as c:
        touched = c.execute(sql, (int(time.time()), name)).rowcount
    return touched > 0
# ---------- projects -----------------------------------------------------
def upsert_project(
    self,
    *,
    name: str,
    git_url: str,
    default_branch: str,
    recipe_json: str,
    owner_token: str,
) -> dict:
    """Create the project, or update its mutable fields if it already exists.

    On insert, *owner_token* becomes the project's owner. On update only
    `git_url`, `default_branch`, `recipe_json` and `updated_at` change;
    the stored owner and `created_at` are preserved.

    Returns:
        A dict mirroring the stored row. `owner_token` is the owner actually
        stored in the ledger. It used to echo the caller's token even when
        the update did not (and must not) change ownership.
    """
    now = int(time.time())
    with self._conn() as c:
        row = c.execute("SELECT created_at, owner_token FROM projects WHERE name=?", (name,)).fetchone()
        if row is None:
            c.execute(
                """
INSERT INTO projects (name, git_url, default_branch, recipe_json, owner_token, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?)
""",
                (name, git_url, default_branch, recipe_json, owner_token, now, now),
            )
            created_at = now
            stored_owner = owner_token
        else:
            created_at = row["created_at"]
            # Ownership is not transferable through upsert; report the
            # owner that is actually on the row.
            stored_owner = row["owner_token"]
            c.execute(
                """
UPDATE projects
SET git_url=?, default_branch=?, recipe_json=?, updated_at=?
WHERE name=?
""",
                (git_url, default_branch, recipe_json, now, name),
            )
    return {
        "name": name,
        "git_url": git_url,
        "default_branch": default_branch,
        "recipe_json": recipe_json,
        "owner_token": stored_owner,
        "created_at": created_at,
        "updated_at": now,
    }
def get_project(self, name: str) -> dict | None:
    """Return the full project row as a dict, or None if *name* is unknown."""
    query = "SELECT * FROM projects WHERE name=?"
    with self._conn() as c:
        found = c.execute(query, (name,)).fetchone()
    if not found:
        return None
    return dict(found)
def list_projects(self, *, owner_token: str | None = None) -> list[dict]:
    """List projects ordered by name.

    Args:
        owner_token: When given, only projects owned by that token name are
            returned; None returns every project.
    """
    if owner_token is None:
        query, args = "SELECT * FROM projects ORDER BY name", ()
    else:
        query = "SELECT * FROM projects WHERE owner_token=? ORDER BY name"
        args = (owner_token,)
    with self._conn() as c:
        records = c.execute(query, args).fetchall()
    return list(map(dict, records))
def delete_project(self, name: str) -> bool:
    """Delete the named project; return True if a row was removed."""
    with self._conn() as c:
        removed = c.execute("DELETE FROM projects WHERE name=?", (name,)).rowcount
    return removed > 0
# ---------- jobs ---------------------------------------------------------
def insert_job(
    self,
    *,
    job_id: str,
    project_name: str,
    subproject_path: str,
    recipe: str,
    branch: str,
    log_path: str,
    recipe_snapshot_json: str,
) -> dict:
    """Insert a job row in the 'queued' state, stamped with the current time.

    Returns:
        A dict describing the new job: the supplied fields (minus the recipe
        snapshot), status "queued", `queued_at`, and the not-yet-set fields
        (`started_at`, `finished_at`, `exit_code`) as None with
        `findings_count` 0.
    """
    sql = """
INSERT INTO jobs (id, project_name, subproject_path, recipe, branch,
status, queued_at, log_path, recipe_snapshot_json)
VALUES (?, ?, ?, ?, ?, 'queued', ?, ?, ?)
"""
    queued_at = int(time.time())
    values = (
        job_id,
        project_name,
        subproject_path,
        recipe,
        branch,
        queued_at,
        log_path,
        recipe_snapshot_json,
    )
    with self._conn() as c:
        c.execute(sql, values)
    job = dict(
        id=job_id,
        project_name=project_name,
        subproject_path=subproject_path,
        recipe=recipe,
        branch=branch,
        status="queued",
        queued_at=queued_at,
    )
    job.update(started_at=None, finished_at=None, exit_code=None)
    job.update(log_path=log_path, findings_count=0)
    return job
def get_job(self, job_id: str) -> dict | None:
    """Return the full job row as a dict, or None if *job_id* is unknown."""
    with self._conn() as c:
        found = c.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
    if not found:
        return None
    return dict(found)
def list_jobs(
    self,
    *,
    project_name: str | None = None,
    status: str | None = None,
    owner_token: str | None = None,
    limit: int = 50,
) -> list[dict]:
    """List job rows, newest `queued_at` first.

    Args:
        project_name: Restrict to one project when not None.
        status: Restrict to one job status when not None.
        owner_token: Restrict to jobs whose project is owned by this token
            name when not None.
        limit: Maximum number of rows returned.
    """
    base = """
SELECT j.* FROM jobs j
JOIN projects p ON p.name = j.project_name
WHERE 1=1
"""
    # (value, clause) pairs in the order the clauses must appear in the SQL.
    candidates = (
        (project_name, " AND j.project_name=?"),
        (status, " AND j.status=?"),
        (owner_token, " AND p.owner_token=?"),
    )
    active = [(value, clause) for value, clause in candidates if value is not None]
    sql = base + "".join(clause for _, clause in active) + " ORDER BY j.queued_at DESC LIMIT ?"
    params: list = [value for value, _ in active]
    params.append(int(limit))
    with self._conn() as c:
        records = c.execute(sql, params).fetchall()
    return [dict(record) for record in records]
def mark_job_running(self, job_id: str) -> None:
    """Move a job from 'queued' to 'running' and stamp `started_at`.

    Jobs that are not currently 'queued' are left untouched.
    """
    sql = "UPDATE jobs SET status='running', started_at=? WHERE id=? AND status='queued'"
    started = int(time.time())
    with self._conn() as c:
        c.execute(sql, (started, job_id))
def mark_job_finished(
    self,
    *,
    job_id: str,
    status: str,
    exit_code: int | None,
) -> None:
    """Record a terminal state for a job.

    Sets `status` and `exit_code` as given and stamps `finished_at` with the
    current time. The update is unconditional on the job's current status.
    """
    sql = """
UPDATE jobs
SET status=?, finished_at=?, exit_code=?
WHERE id=?
"""
    finished = int(time.time())
    with self._conn() as c:
        c.execute(sql, (status, finished, exit_code, job_id))
def mark_orphaned_jobs_failed(self, *, log_dir: Path | None = None) -> list[str]:
    """Boot-time sweep: fail every job still recorded as 'running'.

    Such jobs were orphaned by a process crash. Each is marked 'failed' with
    exit_code=-1 so callers see a terminal state and can re-queue.

    Args:
        log_dir: Acts only as a switch. When not None, a note is appended to
            each orphaned job's own `log_path` (best-effort; OS errors are
            ignored). The directory itself is not used.

    Returns:
        The ids of the jobs that were marked failed.
    """
    swept: list[str] = []
    stamp = int(time.time())
    note = "\n[crafting-table] runner restart, job orphaned\n"
    with self._conn() as c:
        running = c.execute("SELECT id, log_path FROM jobs WHERE status='running'").fetchall()
        for record in running:
            job_id = record["id"]
            swept.append(job_id)
            c.execute(
                "UPDATE jobs SET status='failed', finished_at=?, exit_code=-1 WHERE id=?",
                (stamp, job_id),
            )
            if log_dir is None:
                continue
            try:
                Path(record["log_path"]).parent.mkdir(parents=True, exist_ok=True)
                with open(record["log_path"], "a", encoding="utf-8") as fh:
                    fh.write(note)
            except OSError:
                # Logging the sweep is best-effort; the DB update is what matters.
                pass
    return swept
def increment_findings_count(self, job_id: str, n: int) -> None:
    """Add *n* to the job's `findings_count` column."""
    sql = "UPDATE jobs SET findings_count = findings_count + ? WHERE id=?"
    with self._conn() as c:
        c.execute(sql, (n, job_id))
# ---------- findings -----------------------------------------------------
def insert_finding(
    self,
    *,
    job_id: str,
    kind: str,
    severity: str,
    message: str,
    fingerprint: str,
    file: str | None = None,
    line: int | None = None,
    code: str | None = None,
    suggested_fix: str | None = None,
    raw_json: str | None = None,
) -> int:
    """Insert one finding for a job, stamped with the current time.

    Returns:
        The autoincrement id of the new findings row.
    """
    sql = """
INSERT INTO findings (job_id, kind, severity, file, line, code, message,
suggested_fix, raw_json, created_at, fingerprint)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
    created = int(time.time())
    # Tuple order must match the column list in the INSERT above.
    values = (
        job_id,
        kind,
        severity,
        file,
        line,
        code,
        message,
        suggested_fix,
        raw_json,
        created,
        fingerprint,
    )
    with self._conn() as c:
        new_id = c.execute(sql, values).lastrowid
    return new_id
def list_findings(self, job_id: str) -> list[dict]:
    """Return every finding recorded for *job_id*, in insertion (id) order."""
    query = "SELECT * FROM findings WHERE job_id=? ORDER BY id"
    with self._conn() as c:
        records = c.execute(query, (job_id,)).fetchall()
    return list(map(dict, records))
# ---------- async wrappers ----------------------------------------------
async def arun(self, fn, *args, **kwargs):
    """Run a sync callable in the default executor and return its result.

    Use from FastAPI request handlers / runner coroutines so we don't
    block the event loop on disk I/O. Most of these queries are <1ms but
    the pattern stays consistent for when we add bigger ones later.

    Args:
        fn: The synchronous callable (typically a bound DB method).
        *args, **kwargs: Forwarded to *fn* unchanged.

    Returns:
        Whatever *fn* returns; exceptions raised by *fn* propagate.
    """
    # get_running_loop() is the preferred call inside a coroutine: it always
    # returns the loop this coroutine is executing on and never creates one.
    loop = asyncio.get_running_loop()
    # run_in_executor does not accept keyword arguments, hence the closure.
    return await loop.run_in_executor(None, lambda: fn(*args, **kwargs))

111
crafting_table/models.py Normal file
View file

@ -0,0 +1,111 @@
"""Pydantic schemas for projects, recipes, jobs, findings.
All wire shapes: what HTTP request bodies look like and what the API returns.
The DB stores Project minus the name (which is the row PK) as recipe_json so
recipe drift is visible per-job (jobs snapshot their recipe at run-time).
"""
from __future__ import annotations
from typing import Literal
from pydantic import BaseModel, Field
# Slug pattern shared between project names and token names — lowercase
# alphanumerics + hyphen + underscore, must start with alphanumeric.
# Applied through Field(pattern=...) on Project.name and TokenCreateRequest.name.
SLUG_PATTERN = r"^[a-z0-9][a-z0-9_-]*$"
class Subproject(BaseModel):
    """One language target inside a repo. A project has one or more."""

    # Location of the subproject; defaults to "." (presumably relative to
    # the repository root — confirm against the runner).
    path: str = "."
    language: str
    # One optional shell command per recipe kind; None means not configured.
    build: str | None = None
    test: str | None = None
    lint: str | None = None
    audit: str | None = None
    # Timeout in seconds, validated to the range 1..86400.
    timeout_secs: int = Field(default=1800, ge=1, le=86400)
class Schedule(BaseModel):
    """Cron-style schedules per recipe kind. 'manual' = caller-driven only.

    Wave 1 doesn't run the scheduler yet — these strings are persisted but the
    sweeper that consumes them lands in a later wave. Stored as-is.
    """

    # One optional schedule string per recipe kind; no validation is applied.
    audit: str | None = None
    test: str | None = None
    build: str | None = None
    lint: str | None = None
class Notify(BaseModel):
    """Notification preferences attached to a project."""

    # Recipient addresses; empty by default.
    email: list[str] = Field(default_factory=list)
    # Event names that trigger a notification. Free-form strings here; the
    # consumer is not part of this module.
    on: list[str] = Field(default_factory=lambda: ["audit_fail", "cve_found", "patch_drafted"])
    # NOTE(review): presumably opts the project into automatic patch
    # drafting — confirm against the spec.
    auto_patch: bool = False
class Project(BaseModel):
    """Full project shape — what the API accepts on POST /projects.

    `created_at` and `updated_at` are server-stamped on insert/update; if the
    caller supplies them we ignore the values and use server time.
    """

    # Slug, 1-64 characters, validated against SLUG_PATTERN.
    name: str = Field(pattern=SLUG_PATTERN, min_length=1, max_length=64)
    # Must be non-empty; no further URL validation happens here.
    git_url: str = Field(min_length=1)
    default_branch: str = "main"
    languages: list[str] = Field(default_factory=list)
    subprojects: list[Subproject] = Field(default_factory=list)
    schedule: Schedule = Field(default_factory=Schedule)
    notify: Notify = Field(default_factory=Notify)
    # Placeholders; see the class docstring.
    created_at: int = 0
    updated_at: int = 0
class CreateJobRequest(BaseModel):
    """Request body for creating a job."""

    # Which recipe kind to run; restricted to these four values.
    recipe: Literal["build", "test", "lint", "audit"]
    # Optional subproject selector; how None is resolved is decided by the
    # handler, not here.
    subproject: str | None = None
    # Optional branch override; how None is resolved is decided by the
    # handler, not here.
    branch: str | None = None
class Job(BaseModel):
    """API view of a job row."""

    id: str
    project_name: str
    subproject_path: str
    recipe: str
    branch: str
    # Lifecycle state; only these six values validate.
    status: Literal["queued", "running", "succeeded", "failed", "timed_out", "cancelled"]
    # Integer timestamps; the latter two stay None until the job reaches
    # that stage.
    queued_at: int
    started_at: int | None = None
    finished_at: int | None = None
    # None until an exit code has been recorded.
    exit_code: int | None = None
    log_path: str
    findings_count: int = 0
class TokenCreateRequest(BaseModel):
    """Request body for creating a token."""

    # Slug, 1-64 characters, validated against SLUG_PATTERN.
    name: str = Field(pattern=SLUG_PATTERN, min_length=1, max_length=64)
    is_admin: bool = False
    # Optional per-token CIDR allowlist; empty by default. Entries are not
    # validated as CIDRs by this model.
    ip_cidrs: list[str] = Field(default_factory=list)
class Finding(BaseModel):
    """One structured finding from a parser. Wave 1 ships the schema; wave 2
    actually populates these from cargo/clippy/ruff/etc. JSON output."""

    id: int
    job_id: str
    # kind and severity are free-form strings at this layer.
    kind: str
    severity: str
    # Optional location and rule identifier of the finding.
    file: str | None = None
    line: int | None = None
    code: str | None = None
    message: str
    suggested_fix: str | None = None
    # NOTE(review): presumably a stable identity used to de-duplicate
    # findings across jobs — confirm against the wave-2 parsers.
    fingerprint: str
    created_at: int

408
crafting_table/runner.py Normal file
View file

@ -0,0 +1,408 @@
"""Async job runner — bounded asyncio pool that materializes workspaces and
runs recipe shell commands.
Lifecycle:
1. server.lifespan calls `runner.start()`:
- mark any 'running' jobs from a previous process as failed (orphaned)
- kick off the dispatcher loop + workspace gc loop
2. POST /projects/<name>/jobs:
- inserts a row in `jobs` (status=queued)
- calls `runner.enqueue(job_id)`, which is fast: it just puts the id on a queue
3. dispatcher pulls ids off the queue, acquires a semaphore slot,
spawns `_run_job`; in-flight runs are bounded by `max_concurrent`.
4. _run_job:
a. mark running
b. materialize workspace
c. exec recipe via /bin/sh -c
d. stream stdout+stderr to log file (live)
e. enforce per-job timeout
f. mark terminal status + exit code
g. emit a `jobs_finished` event hook for wave-2 parsers / wave-8 digest
5. server.lifespan calls stop(), which cancels the background loops and any in-flight job tasks.
Concurrency: asyncio.Semaphore(max_concurrent) caps in-flight subprocess
runs. The queue itself is unbounded; back-pressure is enforced by the
semaphore, and callers can poll job status to know when to enqueue more.
Recipe security: shell strings are run via `create_subprocess_shell` (which
uses /bin/sh -c). Admins set them; this is documented loud in README.
"""
from __future__ import annotations
import asyncio
import json
import logging
import os
import signal
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Awaitable, Callable
from .db import DB
from .workspace import WorkspaceManager, WorkspacePaths
# Module logger for dispatcher, gc and job-level diagnostics.
log = logging.getLogger("crafting_table.runner")
# Hook signature: called after every job reaches a terminal state.
# Wave 2 wires this to the parser pipeline; wave 8 to the email digest queue.
# The single dict argument is the `finished_event` built in Runner._run_job.
JobFinishedHook = Callable[[dict], Awaitable[None]]
@dataclass
class _JobContext:
    """Everything `_run_job` needs about one job, produced by `_load_context`."""

    job_id: str
    # Job row; _run_job reads log_path, recipe, project_name,
    # subproject_path and branch from it.
    job: dict
    # Project row; _run_job reads git_url from it.
    project: dict
    # NOTE(review): not read by _run_job; presumably the parsed recipe
    # snapshot — confirm in _load_context.
    recipe: dict
    # Subproject settings; _run_job reads the recipe command (keyed by recipe
    # kind), `path` and `timeout_secs` from it.
    subproject: dict
class Runner:
    """Bounded asyncio job pool: queues job ids and runs them under a semaphore."""

    def __init__(
        self,
        *,
        db: DB,
        workspace: WorkspaceManager,
        log_dir: Path,
        max_concurrent: int = 4,
        default_timeout_secs: int = 1800,
        gc_interval_secs: int = 3600,
        gc_age_secs: int = 86400,
    ):
        """Store collaborators and limits; creates *log_dir* if missing.

        Nothing runs until `start()` is awaited.
        """
        # Collaborators.
        self.db = db
        self.workspace = workspace

        # Log directory is created eagerly so job logs can be written later.
        log_root = Path(log_dir)
        self.log_dir = log_root
        log_root.mkdir(parents=True, exist_ok=True)

        # Limits and intervals.
        self.max_concurrent = max_concurrent
        self.default_timeout_secs = default_timeout_secs
        self.gc_interval_secs = gc_interval_secs
        self.gc_age_secs = gc_age_secs

        # Scheduling state: unbounded id queue, concurrency gate, live tasks.
        self.queue: asyncio.Queue[str] = asyncio.Queue()
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self._tasks: set[asyncio.Task] = set()
        self._dispatcher_task: asyncio.Task | None = None
        self._gc_task: asyncio.Task | None = None
        self._stopping = False
        self._hooks: list[JobFinishedHook] = []

        # Test introspection — lets test_runner assert on bounded-concurrency.
        self.in_flight = 0
        self.peak_in_flight = 0
# ---------- lifecycle ---------------------------------------------------
def add_hook(self, hook: JobFinishedHook) -> None:
    """Register *hook* to be awaited after each job finishes, in registration order."""
    registry = self._hooks
    registry.append(hook)
async def start(self) -> None:
    """Recover orphaned jobs, then launch the dispatcher and workspace-gc loops."""
    # Jobs still 'running' belong to a previous process — mark them failed
    # with exit_code=-1 and a synthetic log line. We do NOT try to resume a
    # job mid-execution; recipe state could be partial.
    recovered = await self.db.arun(
        self.db.mark_orphaned_jobs_failed,
        log_dir=self.log_dir,
    )
    if recovered:
        log.warning("marked %d orphaned running job(s) failed: %s", len(recovered), recovered)
    self._dispatcher_task = asyncio.create_task(self._dispatch_loop())
    self._gc_task = asyncio.create_task(self._gc_loop())
async def stop(self) -> None:
    """Shut the runner down: cancel both background loops, then every job task.

    Each cancelled task is awaited so shutdown does not return while tasks
    are still unwinding. Errors raised by job tasks during this phase are
    swallowed.
    """
    self._stopping = True
    for background in (self._dispatcher_task, self._gc_task):
        if background is None:
            continue
        background.cancel()
        try:
            await background
        except asyncio.CancelledError:
            pass
    # Cancel any in-flight job tasks.
    # NOTE(review): whether the recipe subprocess is actually signalled on
    # cancellation depends on _exec_recipe's handling — confirm there.
    for job_task in list(self._tasks):
        job_task.cancel()
    # Snapshot again: done-callbacks discard tasks from the set as they end.
    for job_task in list(self._tasks):
        try:
            await job_task
        except (asyncio.CancelledError, Exception):
            pass
# ---------- enqueue -----------------------------------------------------
async def enqueue(self, job_id: str) -> None:
    """Hand *job_id* to the dispatcher by putting it on the internal queue."""
    pending_put = self.queue.put(job_id)
    await pending_put
def stats(self) -> dict:
    """Return a snapshot of the pool counters.

    Keys: `queued` (ids waiting in the queue), `running` (jobs in flight),
    `max` (concurrency limit) and `peak` (highest in-flight count seen).
    """
    snapshot = {}
    snapshot["queued"] = self.queue.qsize()
    snapshot["running"] = self.in_flight
    snapshot["max"] = self.max_concurrent
    snapshot["peak"] = self.peak_in_flight
    return snapshot
# ---------- dispatcher --------------------------------------------------
async def _dispatch_loop(self) -> None:
    """Pull job ids off the queue and spawn one `_wrap_run` task per id.

    Runs until `_stopping` is set or the task is cancelled; cancellation
    propagates to the caller unchanged.
    """
    while not self._stopping:
        next_id = await self.queue.get()
        # Acquire BEFORE spawning the task so we naturally block when
        # the pool is full instead of building up an unbounded set of
        # tasks that all immediately await the semaphore.
        await self.semaphore.acquire()
        if self._stopping:
            # Shutdown began while we were waiting for a slot; give it back.
            self.semaphore.release()
            return
        job_task = asyncio.create_task(self._wrap_run(next_id))
        self._tasks.add(job_task)
        # Drop finished tasks from the live set automatically.
        job_task.add_done_callback(self._tasks.discard)
async def _wrap_run(self, job_id: str) -> None:
    """Run one job with in-flight bookkeeping around it.

    Bumps the in-flight counters, runs `_run_job`, logs (rather than
    propagates) ordinary exceptions, and always decrements the counter and
    releases the semaphore slot the dispatcher acquired for this job.
    """
    self.in_flight += 1
    self.peak_in_flight = max(self.peak_in_flight, self.in_flight)
    try:
        await self._run_job(job_id)
    except Exception as e:
        log.exception("runner: unhandled error for job %s: %s", job_id, e)
    finally:
        self.in_flight -= 1
        self.semaphore.release()
# ---------- gc loop -----------------------------------------------------
async def _gc_loop(self) -> None:
    """Periodically garbage-collect old workspaces until stopped or cancelled.

    Sleeps `gc_interval_secs` between passes. A failing pass is logged and
    does not end the loop; cancellation propagates unchanged.
    """
    while not self._stopping:
        await asyncio.sleep(self.gc_interval_secs)
        try:
            outcome = await self.workspace.gc(age_secs=self.gc_age_secs)
            # Only log passes that actually removed something.
            if outcome["removed"]:
                log.info("workspace gc: %s", outcome)
        except Exception as e:
            log.warning("workspace gc failed: %s", e)
# ---------- core --------------------------------------------------------
async def _run_job(self, job_id: str) -> None:
    """Execute one job end to end and record its terminal state.

    Steps: load context, mark running, write the log header, materialize the
    workspace, run the recipe command, clean up the workspace, mark the job
    finished, then fan out to the registered hooks.

    Terminal states recorded:
        succeeded / failed : recipe exit code 0 / non-zero
        timed_out          : `_exec_recipe` reported a timeout
        failed, exit -2    : the subproject has no command for this recipe
        failed, exit -3    : workspace materialization raised
        cancelled, exit -4 : the task was cancelled during the recipe
        failed, exit -5    : launching/running the recipe raised (e.g. the
                             subproject directory does not exist)

    Previously an exception from `_exec_recipe` other than cancellation
    escaped this method, leaving the job stuck in 'running' and its
    workspace uncleaned; it is now recorded as a failure.

    Returns silently if `_load_context` yields None.

    Raises:
        asyncio.CancelledError: re-raised after the cancelled state is
            recorded, so task tracking sees the cancellation. Hooks are not
            invoked in that case.
    """
    ctx = await self._load_context(job_id)
    if ctx is None:
        return
    await self.db.arun(self.db.mark_job_running, job_id)
    log_path = Path(ctx.job["log_path"])
    log_path.parent.mkdir(parents=True, exist_ok=True)
    recipe_kind = ctx.job["recipe"]
    cmd_str = ctx.subproject.get(recipe_kind)
    # A missing or zero timeout_secs falls back to the runner-wide default.
    timeout = int(ctx.subproject.get("timeout_secs") or self.default_timeout_secs)
    terminal_status = "succeeded"
    exit_code: int | None = None
    with log_path.open("w", encoding="utf-8") as log_fh:
        log_fh.write(f"[crafting-table] job {job_id}\n")
        log_fh.write(f"[crafting-table] project={ctx.job['project_name']} subproject={ctx.job['subproject_path']}\n")
        log_fh.write(f"[crafting-table] recipe={recipe_kind} branch={ctx.job['branch']}\n")
        log_fh.write(f"[crafting-table] cmd={cmd_str!r} timeout={timeout}s\n")
        log_fh.flush()
        if not cmd_str:
            log_fh.write(f"[crafting-table] subproject has no '{recipe_kind}' command\n")
            terminal_status = "failed"
            exit_code = -2
        else:
            try:
                paths = await self.workspace.materialize(
                    project=ctx.job["project_name"],
                    job_id=job_id,
                    git_url=ctx.project["git_url"],
                    branch=ctx.job["branch"],
                    log_fh=log_fh,
                )
            except Exception as e:
                log_fh.write(f"[crafting-table] workspace error: {e}\n")
                terminal_status = "failed"
                exit_code = -3
            else:
                sub_path = ctx.subproject.get("path", ".")
                work_dir = paths.worktree_dir / sub_path
                log_fh.write(f"[crafting-table] cwd={work_dir}\n")
                log_fh.write("[crafting-table] --- recipe output begin ---\n")
                log_fh.flush()
                try:
                    exit_code, timed_out = await self._exec_recipe(
                        cmd=cmd_str, cwd=str(work_dir), log_fh=log_fh, timeout=timeout
                    )
                    if timed_out:
                        terminal_status = "timed_out"
                    elif exit_code == 0:
                        terminal_status = "succeeded"
                    else:
                        terminal_status = "failed"
                except asyncio.CancelledError:
                    log_fh.write("[crafting-table] cancelled\n")
                    terminal_status = "cancelled"
                    exit_code = -4
                    # Record the state and clean up before re-raising so the
                    # dispatcher's task tracking sees cancellation.
                    await self.db.arun(
                        self.db.mark_job_finished,
                        job_id=job_id,
                        status=terminal_status,
                        exit_code=exit_code,
                    )
                    await self.workspace.cleanup(paths)
                    raise
                except Exception as e:
                    # The recipe could not be launched or its supervision
                    # failed. Record a failure instead of letting the error
                    # escape and strand the job in 'running'.
                    log_fh.write(f"[crafting-table] recipe launch error: {e}\n")
                    terminal_status = "failed"
                    exit_code = -5
                log_fh.write(f"[crafting-table] --- recipe output end (exit={exit_code}) ---\n")
                log_fh.flush()
                await self.workspace.cleanup(paths)
    await self.db.arun(
        self.db.mark_job_finished,
        job_id=job_id,
        status=terminal_status,
        exit_code=exit_code,
    )
    # Hook fan-out — wave 2 parsers + wave 8 digest hook into this.
    finished_event = {
        "job_id": job_id,
        "project_name": ctx.job["project_name"],
        "subproject_path": ctx.job["subproject_path"],
        "recipe": recipe_kind,
        "status": terminal_status,
        "exit_code": exit_code,
        "log_path": str(log_path),
        "finished_at": int(time.time()),
    }
    for hook in self._hooks:
        try:
            await hook(finished_event)
        except Exception as e:
            # One failing hook must not prevent the others from running.
            log.warning("jobs_finished hook failed: %s", e)
async def _exec_recipe(
    self, *, cmd: str, cwd: str, log_fh, timeout: int
) -> tuple[int, bool]:
    """Run cmd via /bin/sh -c, stream output to log_fh, return (exit, timed_out).

    Uses create_subprocess_shell because recipe strings are operator-trusted
    shell expressions (e.g. `cargo build && cargo test`). Stdout+stderr are
    merged into one stream to preserve interleaving order, which matters
    for log readability.

    Args:
        cmd: shell command line to run.
        cwd: working directory for the shell.
        log_fh: text file-like object; needs ``write(str)`` and ``flush()``.
        timeout: seconds the command may run before the group gets SIGTERM.

    Returns:
        ``(exit_code, timed_out)``. ``timed_out`` is True when the timeout
        expired and the process group had to be signalled.

    Raises:
        asyncio.CancelledError (or any other error) propagates to the caller,
        but only after a still-running process group has been SIGKILLed and
        the helper tasks cancelled, so nothing keeps running in a worktree
        the caller is about to remove.

    Important asyncio detail: we wrap proc.wait() in a single task and
    gate timeout with asyncio.wait() rather than wait_for(). wait_for
    cancels the underlying coroutine on timeout, which on Python 3.11
    marks the proc.wait() future as cancelled so a SECOND wait_for on
    the same proc would immediately raise CancelledError instead of
    returning the post-terminate exit code. Wrapping once with a
    long-lived task lets us await it twice cleanly.
    """
    # Local import: the file-level import block is not visible from here.
    import codecs

    # start_new_session=True puts the shell in its own process group so
    # we can signal the WHOLE group on timeout. Without this, terminate()
    # only signals the shell; long-running children (sleep, cargo build,
    # etc.) inherit init and keep stdout open, so the pump never EOFs.
    proc = await asyncio.create_subprocess_shell(
        cmd,
        cwd=cwd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.STDOUT,
        start_new_session=True,
    )
    assert proc.stdout is not None
    stdout = proc.stdout
    pgid = proc.pid  # equals the new session/group id since we just created it

    def _kill_group(sig: int) -> None:
        try:
            os.killpg(pgid, sig)
        except ProcessLookupError:
            pass

    async def pump() -> None:
        # Read fixed-size chunks instead of readline(): StreamReader.readline()
        # raises ValueError (and discards buffered data) when a single line
        # exceeds the reader limit, which would kill the pump on e.g. a huge
        # minified line. The incremental decoder keeps multi-byte characters
        # that straddle a chunk boundary intact.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
        while True:
            chunk = await stdout.read(65536)
            if not chunk:
                break
            text = decoder.decode(chunk)
            if text:
                log_fh.write(text)
                log_fh.flush()
        tail = decoder.decode(b"", final=True)
        if tail:
            log_fh.write(tail)
            log_fh.flush()

    pump_task = asyncio.create_task(pump())
    wait_task = asyncio.create_task(proc.wait())
    timed_out = False
    try:
        # Phase 1 — give the process up to `timeout` seconds to finish
        # naturally.
        done, _ = await asyncio.wait({wait_task}, timeout=timeout)
        if not done:
            timed_out = True
            log_fh.write(f"\n[crafting-table] timeout after {timeout}s — terminating\n")
            log_fh.flush()
            _kill_group(signal.SIGTERM)
            # Phase 2 — graceful shutdown grace period after SIGTERM
            done, _ = await asyncio.wait({wait_task}, timeout=10)
            if not done:
                # Phase 3 — escalate to SIGKILL on the group
                log_fh.write("[crafting-table] grace expired — SIGKILL\n")
                log_fh.flush()
                _kill_group(signal.SIGKILL)
                await wait_task
        # wait_task is now done — pull the rc out
        rc = wait_task.result()
        # Drain pump. With process-group kill stdout will EOF cleanly;
        # the wait_for guard is belt-and-braces against any orphan that
        # somehow survived (e.g. a child that escaped its group).
        try:
            await asyncio.wait_for(pump_task, timeout=2)
        except Exception:
            pump_task.cancel()
            try:
                await pump_task
            except (asyncio.CancelledError, Exception):
                pass
    except BaseException:
        # Cancellation (or an unexpected error) while the recipe is running:
        # do not leave the process group or the helper tasks behind. Only
        # signal while the leader is still alive so a recycled pid/pgid can
        # never be hit.
        if proc.returncode is None:
            _kill_group(signal.SIGKILL)
        pump_task.cancel()
        wait_task.cancel()
        raise
    return int(rc), timed_out
# ---------- helpers -----------------------------------------------------
async def _load_context(self, job_id: str) -> _JobContext | None:
    """Assemble everything needed to run ``job_id``, or None if it can't run.

    Loads the job row, decodes its frozen recipe snapshot, picks the
    subproject entry whose ``path`` matches the job, and loads the owning
    project row. Returns None (after a warning) when either the job or the
    project row no longer exists.
    """
    job_row = await self.db.arun(self.db.get_job, job_id)
    if job_row is None:
        log.warning("runner: job %s vanished before dispatch", job_id)
        return None

    snapshot = json.loads(job_row["recipe_snapshot_json"])
    # Locate the job's subproject inside the snapshot.
    candidates = snapshot.get("subprojects", [])
    selected = next(
        (sp for sp in candidates if sp.get("path") == job_row["subproject_path"]),
        None,
    )
    if selected is None:
        # Fall back to the first entry (or an empty dict) — should never
        # happen since the path is validated at enqueue time.
        selected = candidates[0] if candidates else {}

    project_row = await self.db.arun(self.db.get_project, job_row["project_name"])
    if project_row is None:
        # Project was deleted while the job sat in the queue. The FK cascade
        # in the schema would have removed the job row too, but the id may
        # have been popped off the queue before the cascade landed.
        log.warning("runner: project for job %s gone", job_id)
        return None

    return _JobContext(
        job_id=job_id,
        job=job_row,
        project=project_row,
        recipe=snapshot,
        subproject=selected,
    )

484
crafting_table/server.py Normal file
View file

@ -0,0 +1,484 @@
"""FastAPI app — port 8810. The HTTP surface for crafting-table.
Authentication model:
- Every request needs `Authorization: Bearer <token>`.
- The bearer is hashed and looked up in the tokens table.
- Tokens are flagged is_admin=1 or 0. Admin can do everything.
- Per-app tokens (is_admin=0) can register projects (becoming the owner)
and only see/touch projects where owner_token matches their name.
- Cross-token project access returns 404 (NOT 403) — the same existence-leak
guard clawdforge uses for sessions.
Endpoints:
- GET /healthz public-ish (still needs LAN IP)
- POST /admin/tokens admin only
- GET /admin/tokens admin only
- DELETE /admin/tokens/{name} admin only
- POST /projects any token (becomes owner)
- GET /projects caller's projects (or all if admin)
- GET /projects/{name} visibility-gated, 404 on cross-token
- PUT /projects/{name} owner or admin only
- DELETE /projects/{name} owner or admin only; cascades jobs+findings
- POST /projects/{name}/jobs visibility-gated; enqueues a job
- GET /jobs caller's jobs (or all if admin)
- GET /jobs/{id} owner or admin; returns last 200 log lines
- GET /jobs/{id}/log owner or admin; full log file stream
- GET /jobs/{id}/findings owner or admin; empty list in wave 1
"""
from __future__ import annotations
import asyncio
import json
import logging
import time
import uuid
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Annotated
from fastapi import FastAPI, Header, HTTPException, Request
from fastapi.responses import FileResponse, JSONResponse
from .auth import Auth, AppToken
from .config import Config, load
from .db import DB
from .models import (
CreateJobRequest,
Project,
TokenCreateRequest,
)
from .runner import Runner
from .workspace import WorkspaceManager
# Logger used by every handler in this module.
log = logging.getLogger("crafting_table")
if not log.handlers:
    # This logger has no handler of its own: fall back to a basic logging
    # configuration at INFO level with a timestamped format.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s")
# ---------- module-level singletons (rebuilt per test via fixture) ----------
# Constructed at import time, each one from the ones above it:
# config -> db -> auth / workspace -> runner.
cfg: Config = load()
db: DB = DB(cfg.db_path)
auth: Auth = Auth(db=db, lan_cidrs=cfg.lan_cidrs)
workspace: WorkspaceManager = WorkspaceManager(cfg.workspace_root)
# The runner receives its pool size, default timeout and workspace-gc
# settings straight from the loaded config.
runner: Runner = Runner(
    db=db,
    workspace=workspace,
    log_dir=cfg.log_dir,
    max_concurrent=cfg.max_concurrent_jobs,
    default_timeout_secs=cfg.default_job_timeout_secs,
    gc_interval_secs=cfg.workspace_gc_interval_secs,
    gc_age_secs=cfg.workspace_gc_age_secs,
)
# ---------- lifespan --------------------------------------------------------
@asynccontextmanager
async def _lifespan(app: FastAPI):
    """Run service startup before the app serves and shutdown afterwards.

    Startup calls ``auth.bootstrap_admin`` with the configured bearer path and
    starts the job runner; shutdown stops the runner again.
    """
    auth.bootstrap_admin(cfg.admin_bearer_path)
    await runner.start()
    startup_summary = (
        cfg.db_path,
        cfg.log_dir,
        cfg.max_concurrent_jobs,
        cfg.api_port,
    )
    log.info(
        "crafting-table startup: db=%s log_dir=%s max_concurrent=%d port=%d",
        *startup_summary,
    )
    try:
        yield
    finally:
        # Always stop the runner, even if the app exits with an error.
        await runner.stop()
        log.info("crafting-table shutdown complete")
# The application object all route decorators below attach to; `_lifespan`
# supplies its startup/shutdown behaviour.
app = FastAPI(title="crafting-table", version="0.1.0", lifespan=_lifespan)
# ---------- helpers ---------------------------------------------------------
def _project_visible(project_row: dict | None, token: AppToken) -> dict:
    """Hand back ``project_row`` when ``token`` may see it; otherwise raise 404.

    A row is visible to admin tokens and to the token whose name equals the
    row's ``owner_token``. A missing row and a row owned by someone else
    produce the identical 404, so callers cannot probe for project names.
    """
    may_see = project_row is not None and (
        token.is_admin or project_row["owner_token"] == token.name
    )
    if not may_see:
        raise HTTPException(404, "project not found")
    return project_row
def _job_visible(job_row: dict | None, token: AppToken) -> dict:
    """Hand back ``job_row`` when ``token`` may see it; otherwise raise 404.

    Admin tokens see every job. Other tokens see a job only when the job's
    project still exists and its ``owner_token`` equals the token's name.
    Missing and foreign jobs yield the same 404.
    """
    if job_row is None:
        raise HTTPException(404, "job not found")
    if not token.is_admin:
        owning_project = db.get_project(job_row["project_name"])
        owned_by_caller = (
            owning_project is not None
            and owning_project["owner_token"] == token.name
        )
        if not owned_by_caller:
            raise HTTPException(404, "job not found")
    return job_row
def _project_to_api(row: dict) -> dict:
    """Build the API-shaped project dict from a DB row.

    Decodes ``row["recipe_json"]`` and overlays the values that live in their
    own columns, so column values win over anything stored in the blob.
    """
    api_view = json.loads(row["recipe_json"])
    for column in ("name", "git_url", "default_branch", "created_at", "updated_at"):
        api_view[column] = row[column]
    return api_view
def _project_recipe_blob(p: Project) -> str:
    """Return the JSON text stored in the ``recipe_json`` column for ``p``.

    Only languages, subprojects, schedule and notify go into the blob; name,
    git_url, default_branch, created_at and updated_at are kept in their own
    columns and are deliberately left out.
    """
    blob: dict = {}
    blob["languages"] = p.languages
    blob["subprojects"] = [sub.model_dump() for sub in p.subprojects]
    blob["schedule"] = p.schedule.model_dump()
    blob["notify"] = p.notify.model_dump()
    return json.dumps(blob)
# ---------- endpoints -------------------------------------------------------
# GET /healthz — liveness probe. Only the global IP allowlist is enforced (no
# bearer). The response always carries "ok": True; the "db" field tells
# whether a cheap query against the database succeeded.
@app.get("/healthz")
async def healthz(request: Request):
    auth.require_global_ip(request)
    db_state = "ok"
    try:
        # Cheap query whose only purpose is to exercise the connection.
        await db.arun(db.applied_migrations)
    except Exception as exc:
        db_state = "fail"
        log.warning("healthz db check failed: %s", exc)
    return {
        "ok": True,
        "db": db_state,
        "runner": runner.stats(),
        "version": "0.1.0",
    }
# ---- /admin/tokens ---------------------------------------------------------
# POST /admin/tokens — admin only. Generates a fresh random bearer (prefixed
# "ctadmin_" for admin tokens, "ct_" otherwise), stores it through
# db.insert_token and echoes the plaintext bearer back in the response.
@app.post("/admin/tokens")
async def admin_create_token(
    request: Request,
    body: TokenCreateRequest,
    authorization: Annotated[str | None, Header()] = None,
):
    auth.require_admin(request, authorization)
    import secrets as _s

    prefix = "ctadmin_" if body.is_admin else "ct_"
    bearer = prefix + _s.token_urlsafe(32)
    new_token = {
        "name": body.name,
        "bearer": bearer,
        "is_admin": body.is_admin,
        "ip_cidrs": body.ip_cidrs or None,
    }
    try:
        await db.arun(db.insert_token, **new_token)
    except Exception as exc:
        # UNIQUE violation and friends: report only the exception type so no
        # DB internals leak to the client.
        raise HTTPException(409, f"token create failed: {type(exc).__name__}")
    return {
        "ok": True,
        "name": body.name,
        "bearer": bearer,
        "is_admin": body.is_admin,
        "ip_cidrs": body.ip_cidrs,
    }
# GET /admin/tokens — admin only. Returns whatever db.list_tokens yields.
@app.get("/admin/tokens")
async def admin_list_tokens(
    request: Request,
    authorization: Annotated[str | None, Header()] = None,
):
    auth.require_admin(request, authorization)
    token_rows = await db.arun(db.list_tokens)
    response = {"ok": True}
    response["tokens"] = token_rows
    return response
# DELETE /admin/tokens/{name} — admin only. The token literally named "admin"
# cannot be revoked through the API (400); a name that db.revoke_token does
# not revoke is reported as 404.
@app.delete("/admin/tokens/{name}")
async def admin_revoke_token(
    name: str,
    request: Request,
    authorization: Annotated[str | None, Header()] = None,
):
    auth.require_admin(request, authorization)
    if name == "admin":
        raise HTTPException(400, "cannot revoke the admin token via API")
    was_revoked = await db.arun(db.revoke_token, name)
    if was_revoked:
        return {"ok": True}
    raise HTTPException(404, "token not found or already revoked")
# ---- /projects -------------------------------------------------------------
# POST /projects — any token; the caller becomes the owner of the new project.
# An already-registered name is never overwritten here:
#   * a non-admin caller who does not own it gets 404, so the response does
#     not reveal that the name is taken under a different token;
#   * the owner, or an admin, gets 409 and should use PUT (or DELETE followed
#     by a fresh POST) instead.
@app.post("/projects")
async def register_project(
    request: Request,
    body: Project,
    authorization: Annotated[str | None, Header()] = None,
):
    tok = auth.require_app(request, authorization)
    existing = await db.arun(db.get_project, body.name)
    if existing is not None:
        foreign_owner = (not tok.is_admin) and existing["owner_token"] != tok.name
        if foreign_owner:
            raise HTTPException(404, "project not found")
        raise HTTPException(409, "project already exists; use PUT to update")
    stored = await db.arun(
        db.upsert_project,
        name=body.name,
        git_url=body.git_url,
        default_branch=body.default_branch,
        recipe_json=_project_recipe_blob(body),
        owner_token=tok.name,
    )
    return {"ok": True, "project": _project_to_api(stored)}
# PUT /projects/{name} — owner or admin. Visibility is checked first (404 for
# missing or foreign projects), then the body's name must equal the path name
# (400). Ownership is carried over from the existing row unchanged.
@app.put("/projects/{name}")
async def update_project(
    name: str,
    request: Request,
    body: Project,
    authorization: Annotated[str | None, Header()] = None,
):
    tok = auth.require_app(request, authorization)
    current = await db.arun(db.get_project, name)
    _project_visible(current, tok)
    if name != body.name:
        raise HTTPException(400, "name in body must match path")
    replacement = {
        "name": name,
        "git_url": body.git_url,
        "default_branch": body.default_branch,
        "recipe_json": _project_recipe_blob(body),
        "owner_token": current["owner_token"],
    }
    stored = await db.arun(db.upsert_project, **replacement)
    return {"ok": True, "project": _project_to_api(stored)}
# DELETE /projects/{name} — owner or admin. Answers 404 both when the project
# is not visible to the caller and when db.delete_project reports that it
# removed nothing.
@app.delete("/projects/{name}")
async def delete_project(
    name: str,
    request: Request,
    authorization: Annotated[str | None, Header()] = None,
):
    tok = auth.require_app(request, authorization)
    current = await db.arun(db.get_project, name)
    _project_visible(current, tok)
    was_deleted = await db.arun(db.delete_project, name)
    if was_deleted:
        return {"ok": True}
    raise HTTPException(404, "project not found")
# GET /projects — admins list every project; other tokens list only the
# projects whose owner_token is their own name.
@app.get("/projects")
async def list_projects(
    request: Request,
    authorization: Annotated[str | None, Header()] = None,
):
    tok = auth.require_app(request, authorization)
    if tok.is_admin:
        owner_filter = None
    else:
        owner_filter = tok.name
    project_rows = await db.arun(db.list_projects, owner_token=owner_filter)
    projects = []
    for project_row in project_rows:
        projects.append(_project_to_api(project_row))
    return {"ok": True, "projects": projects}
# GET /projects/{name} — visibility-gated; a missing project and a project
# owned by another token both answer 404.
@app.get("/projects/{name}")
async def get_project(
    name: str,
    request: Request,
    authorization: Annotated[str | None, Header()] = None,
):
    tok = auth.require_app(request, authorization)
    project_row = await db.arun(db.get_project, name)
    _project_visible(project_row, tok)
    api_project = _project_to_api(project_row)
    return {"ok": True, "project": api_project}
# ---- /projects/{name}/jobs -------------------------------------------------
# POST /projects/{name}/jobs — visibility-gated; inserts a job row and hands
# its id to the runner queue.
#
# Subproject selection:
#   * body.subproject given -> the entry whose "path" equals it (400 if none);
#   * otherwise             -> the first entry with a non-empty command for
#                              the requested recipe kind (400 if none).
# Either way the chosen entry must define the requested command (400 if not).
@app.post("/projects/{name}/jobs")
async def create_job(
    name: str,
    request: Request,
    body: CreateJobRequest,
    authorization: Annotated[str | None, Header()] = None,
):
    tok = auth.require_app(request, authorization)
    project_row = await db.arun(db.get_project, name)
    _project_visible(project_row, tok)

    recipe = json.loads(project_row["recipe_json"])
    subprojects = recipe.get("subprojects", [])
    if not subprojects:
        raise HTTPException(400, "project has no subprojects")

    if body.subproject is not None:
        chosen = next(
            (sp for sp in subprojects if sp.get("path") == body.subproject),
            None,
        )
        if chosen is None:
            raise HTTPException(400, f"subproject '{body.subproject}' not found in project")
    else:
        chosen = next((sp for sp in subprojects if sp.get(body.recipe)), None)
        if chosen is None:
            raise HTTPException(400, f"no subproject defines a '{body.recipe}' command")
    if not chosen.get(body.recipe):
        raise HTTPException(400, f"subproject '{chosen.get('path', '.')}' has no '{body.recipe}' command")

    job_id = str(uuid.uuid4())
    log_path = str(Path(cfg.log_dir) / f"{job_id}.log")
    branch = body.branch or project_row["default_branch"]

    # Freeze the recipe as it is right now. Later edits to the project must
    # not change what this job runs — every job carries its own copy.
    snapshot = dict(
        git_url=project_row["git_url"],
        default_branch=project_row["default_branch"],
        subprojects=subprojects,
        languages=recipe.get("languages", []),
    )
    job_fields = dict(
        job_id=job_id,
        project_name=name,
        subproject_path=chosen.get("path", "."),
        recipe=body.recipe,
        branch=branch,
        log_path=log_path,
        recipe_snapshot_json=json.dumps(snapshot),
    )
    job_row = await db.arun(db.insert_job, **job_fields)
    await runner.enqueue(job_id)
    return {"ok": True, "job_id": job_id, "status": "queued", "job": job_row}
# ---- /jobs -----------------------------------------------------------------
# GET /jobs — admins see all jobs, other tokens only jobs of projects they
# own. Optional `project` and `status` filters are passed straight through;
# `limit` is clamped to the range 1..500.
@app.get("/jobs")
async def list_jobs(
    request: Request,
    authorization: Annotated[str | None, Header()] = None,
    project: str | None = None,
    status: str | None = None,
    limit: int = 50,
):
    tok = auth.require_app(request, authorization)
    owner_filter = None if tok.is_admin else tok.name
    capped_limit = min(limit, 500)
    capped_limit = max(1, capped_limit)
    job_rows = await db.arun(
        db.list_jobs,
        project_name=project,
        status=status,
        owner_token=owner_filter,
        limit=capped_limit,
    )
    return {"ok": True, "jobs": job_rows}
# GET /jobs/{id} — owner or admin. Returns the job row plus up to the last
# 200 lines of its log file. A missing log file, or a failure while tailing
# it, simply yields an empty log_tail.
@app.get("/jobs/{id}")
async def get_job(
    id: str,
    request: Request,
    authorization: Annotated[str | None, Header()] = None,
):
    tok = auth.require_app(request, authorization)
    job_row = await db.arun(db.get_job, id)
    _job_visible(job_row, tok)

    log_file = Path(job_row["log_path"])
    tail: list[str] = []
    if log_file.exists():
        try:
            # Tail without reading the whole file into memory.
            tail = _tail_lines(log_file, 200)
        except Exception as exc:
            log.warning("log tail failed for %s: %s", job_row["log_path"], exc)
    return {"ok": True, "job": job_row, "log_tail": tail}
# GET /jobs/{id}/log — owner or admin. Serves the complete log file as
# text/plain under the download name "<id>.log"; 404 when the file is absent.
@app.get("/jobs/{id}/log")
async def get_job_log(
    id: str,
    request: Request,
    authorization: Annotated[str | None, Header()] = None,
):
    tok = auth.require_app(request, authorization)
    job_row = await db.arun(db.get_job, id)
    _job_visible(job_row, tok)
    log_file = Path(job_row["log_path"])
    if log_file.exists():
        return FileResponse(str(log_file), media_type="text/plain", filename=f"{id}.log")
    raise HTTPException(404, "log file not present")
# GET /jobs/{id}/findings — owner or admin. Returns the findings recorded for
# the job via db.list_findings.
@app.get("/jobs/{id}/findings")
async def get_job_findings(
    id: str,
    request: Request,
    authorization: Annotated[str | None, Header()] = None,
):
    tok = auth.require_app(request, authorization)
    job_row = await db.arun(db.get_job, id)
    _job_visible(job_row, tok)
    job_findings = await db.arun(db.list_findings, id)
    return dict(ok=True, findings=job_findings)
# ---------- helpers ---------------------------------------------------------
def _tail_lines(path: Path, n: int) -> list[str]:
    """Read the last n lines of a file without slurping the whole thing.

    Implementation: seek backwards in 4 KiB blocks until more than ``n``
    newline bytes have been collected (or the start of the file is reached),
    then decode and split. Good enough for log files in the MB range; if a
    single line is huge (rare) we'll read more than the strict minimum.

    Args:
        path: file to read; opened in binary mode.
        n: maximum number of lines wanted. Zero or negative yields ``[]``.

    Returns:
        Up to ``n`` trailing lines, oldest first, without line terminators.
        Bytes are decoded as UTF-8 with undecodable sequences replaced.
        Lines are split with ``str.splitlines()``, so ``\\r`` and the other
        Unicode line boundaries also end a line.
    """
    if n <= 0:
        # `lines[-0:]` would be the whole list, and a negative n slices from
        # the front — neither is a sensible "tail".
        return []
    BLOCK = 4096
    blocks: list[bytes] = []  # collected back-to-front
    newlines = 0
    with path.open("rb") as fh:
        fh.seek(0, 2)
        remaining = fh.tell()
        # Need MORE than n newlines so the oldest kept line is complete.
        while remaining > 0 and newlines <= n:
            step = min(BLOCK, remaining)
            remaining -= step
            fh.seek(remaining)
            block = fh.read(step)
            # Count only the new block instead of rescanning everything.
            newlines += block.count(b"\n")
            blocks.append(block)
    text = b"".join(reversed(blocks)).decode("utf-8", "replace")
    return text.splitlines()[-n:]

195
crafting_table/workspace.py Normal file
View file

@ -0,0 +1,195 @@
"""Workspace materialization — git clone + worktree + gc.
Layout (per project):
/workspace/<project>/.cache/ bare clone of the upstream
/workspace/<project>/<job_id>/ worktree for the requested branch+sha
Strategy:
- First time we see a project: `git clone --bare` into .cache/.
- Subsequent jobs: `git fetch` the cache, then `git worktree add` the
requested branch into the per-job dir.
- After the job finishes: `git worktree remove` the per-job dir. Bare clone
stays put for the next run.
- Periodic gc: any worktree dir older than CRAFTING_GC_AGE seconds gets
pruned (defends against orphans from runner crashes).
Why bare + worktree (not fresh full clones): cargo/maven/gradle caches live
in /caches, but the source tree itself is fast to materialize this way and
leaves zero cross-job contamination. Fresh git clone of a 100MB repo takes
seconds; worktree-add is milliseconds.
Recipe commands run in the worktree dir (subproject path resolved against
the worktree root).
"""
from __future__ import annotations
import asyncio
import logging
import shutil
import time
from dataclasses import dataclass
from pathlib import Path
log = logging.getLogger("crafting_table.workspace")
@dataclass
class WorkspacePaths:
    """The three filesystem locations involved in one job of one project."""

    project_root: Path  # <workspace root>/<project>
    cache_dir: Path  # .cache/ — bare clone
    worktree_dir: Path  # per-job worktree
class WorkspaceManager:
    """Creates, removes and garbage-collects per-job git worktrees.

    Every project gets ``<root>/<project>/.cache`` (a bare clone) and one
    ``<root>/<project>/<job_id>`` worktree per job.
    """

    def __init__(self, root: Path):
        """Remember ``root`` and create it (with parents) if it is missing."""
        self.root = Path(root)
        self.root.mkdir(parents=True, exist_ok=True)

    def paths_for(self, *, project: str, job_id: str) -> WorkspacePaths:
        """Compute the paths for ``project``/``job_id`` without touching disk."""
        project_root = self.root / project
        return WorkspacePaths(
            project_root=project_root,
            cache_dir=project_root / ".cache",
            worktree_dir=project_root / job_id,
        )

    async def materialize(
        self,
        *,
        project: str,
        job_id: str,
        git_url: str,
        branch: str,
        log_fh,
    ) -> WorkspacePaths:
        """Ensure the per-job worktree exists and is checked out at branch.

        Clones the bare cache on first use, otherwise fetches into it, then
        adds a worktree for ``branch``. Git output and progress markers are
        written to ``log_fh``.

        Raises:
            RuntimeError: a git step exited non-zero (raised by ``_git``).
        """
        paths = self.paths_for(project=project, job_id=job_id)
        paths.project_root.mkdir(parents=True, exist_ok=True)
        if not paths.cache_dir.exists():
            log_fh.write(f"[workspace] bare clone {git_url} -> {paths.cache_dir}\n")
            log_fh.flush()
            # "--" ends option parsing so a caller-supplied URL that starts
            # with "-" can never be read as a git option.
            await _git(
                ["clone", "--bare", "--", git_url, str(paths.cache_dir)],
                log_fh,
                cwd=str(paths.project_root),
            )
        else:
            log_fh.write(f"[workspace] fetching latest into {paths.cache_dir}\n")
            log_fh.flush()
            # --prune drops branches deleted upstream so worktree-add doesn't
            # silently land on a stale ref.
            # NOTE(review): newer git versions may refuse to fetch into a
            # branch that another worktree has checked out — confirm the
            # behaviour for concurrent jobs on the same branch.
            await _git(
                ["fetch", "--prune", "origin", "+refs/heads/*:refs/heads/*"],
                log_fh,
                cwd=str(paths.cache_dir),
            )
        if paths.worktree_dir.exists():
            # A previous run for the same job_id (replay or restart). Wipe it.
            log_fh.write(f"[workspace] removing existing worktree {paths.worktree_dir}\n")
            log_fh.flush()
            await self._cleanup_worktree(paths)
        log_fh.write(f"[workspace] worktree add {paths.worktree_dir} branch={branch}\n")
        log_fh.flush()
        # "--" for the same reason as above: `branch` comes from the request.
        await _git(
            ["worktree", "add", "--force", "--", str(paths.worktree_dir), branch],
            log_fh,
            cwd=str(paths.cache_dir),
        )
        return paths

    async def cleanup(self, paths: WorkspacePaths) -> None:
        """Remove a worktree post-job. Best-effort — failures logged, not raised."""
        try:
            await self._cleanup_worktree(paths)
        except Exception as e:
            log.warning("worktree cleanup failed for %s: %s", paths.worktree_dir, e)

    async def _cleanup_worktree(self, paths: WorkspacePaths) -> None:
        """Remove the worktree through git when possible, then by rmtree."""
        if paths.worktree_dir.exists() and paths.cache_dir.exists():
            await self._git_quiet(
                ["worktree", "remove", "--force", str(paths.worktree_dir)],
                cwd=paths.cache_dir,
            )
        # Fallback: rmtree if the worktree dir is still around.
        if paths.worktree_dir.exists():
            shutil.rmtree(paths.worktree_dir, ignore_errors=True)

    @staticmethod
    async def _git_quiet(args: list[str], *, cwd: Path) -> None:
        """Run a best-effort ``git <args>`` in ``cwd``, discarding its output.

        Output goes to DEVNULL rather than an unread PIPE: waiting on a
        process whose pipe nobody drains can block once the pipe fills up.
        Failure to launch git is logged at debug level and otherwise ignored;
        the exit status is not inspected.
        """
        try:
            proc = await asyncio.create_subprocess_exec(
                "git", *args,
                cwd=str(cwd),
                stdout=asyncio.subprocess.DEVNULL,
                stderr=asyncio.subprocess.DEVNULL,
            )
            await proc.wait()
        except Exception as e:
            log.debug("git %s in %s failed: %s", args[0], cwd, e)

    async def gc(self, *, age_secs: int) -> dict:
        """Sweep worktrees older than age_secs. Returns counters.

        A project sub-directory counts as old when its mtime is at or before
        ``now - age_secs``. The result is ``{"scanned": ..., "removed": ...}``
        where ``scanned`` counts every entry seen inside the project
        directories (including ``.cache`` and plain files).
        """
        cutoff = time.time() - age_secs
        removed = 0
        scanned = 0
        for project_dir in self.root.iterdir():
            if not project_dir.is_dir():
                continue
            cache_dir = project_dir / ".cache"
            for child in project_dir.iterdir():
                scanned += 1
                if child.name == ".cache" or not child.is_dir():
                    continue
                try:
                    mtime = child.stat().st_mtime
                except OSError:
                    continue
                if mtime > cutoff:
                    continue
                # Old worktree — prune.
                if cache_dir.exists():
                    await self._git_quiet(
                        ["worktree", "remove", "--force", str(child)],
                        cwd=cache_dir,
                    )
                shutil.rmtree(child, ignore_errors=True)
                removed += 1
            # Periodic `git gc` on the bare clone if it's been quiet for >7d
            if cache_dir.exists():
                try:
                    quiet_for = time.time() - cache_dir.stat().st_mtime
                except OSError:
                    continue
                if quiet_for > 7 * 86400:
                    await self._git_quiet(
                        ["gc", "--prune=now", "--quiet"], cwd=cache_dir
                    )
        return {"scanned": scanned, "removed": removed}
async def _git(args: list[str], log_fh, *, cwd: str | None = None) -> None:
    """Run ``git <args>``, copying its merged stdout/stderr into ``log_fh``.

    Output is forwarded line by line (decoded as UTF-8, bad bytes replaced)
    and flushed after every line.

    Raises:
        RuntimeError: git exited with a non-zero status.
    """
    child = await asyncio.create_subprocess_exec(
        "git",
        *args,
        cwd=cwd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.STDOUT,
    )
    assert child.stdout is not None
    # readline() returns b"" at EOF, which ends the loop.
    while raw := await child.stdout.readline():
        log_fh.write(raw.decode("utf-8", "replace"))
        log_fh.flush()
    status = await child.wait()
    if status != 0:
        raise RuntimeError(f"git {args[0]} exited {status}")