auth: retry guard on transient OIDC; admin-bearer alt for bulk-start

Two small adds:

(1) auth/callback no longer 500s on transient JWKS-fetch failures or
    on a stale state cookie from a prior failed callback. Catches three
    distinct failure modes and renders templates/auth_retry.html with
    a 503 (auth server unreachable) or 400 (stale state, OAuth error)
    status:
      - RequestsConnectionError + Timeout → "couldn't reach the auth
        server — usually a momentary DNS or network blip"
      - MismatchingStateError → "that login link expired (you probably
        retried after a blip). hit login again to start fresh"
      - OAuthError → "auth handshake failed: <detail>"
    All three paths pop _state_cauldron_authlib so the next /login
    starts with a fresh state token. Today's incident: cauldron
    container hit a brief auth.sulkta.com DNS resolution failure
    during OIDC callback, threw a stack trace, then the user's retry
    failed differently because of the leftover state cookie.

(2) /api/admin/sterilize/bulk-start (require_bearer) accepts
    {"started_by_sub": "user@example.com"} body. Resolves that user's
    household, decrypts their stored Mealie token, spawns the same
    preview thread the session-authed endpoint does. The job appears
    in /sterilize for the user as if they'd started it themselves.
    Plus admin-status (GET /api/admin/sterilize/jobs/<id>) and
    admin-cancel (POST /api/admin/sterilize/bulk-cancel/<id>) for
    operator symmetry. Existing user-session endpoints unchanged.

Net: kayos can kick off + cancel + monitor sterilize jobs from the
outside without piggybacking on Cobb's browser cookie, and a brief
DNS hiccup on the LAN no longer turns into a "your login is broken"
experience for the user.
This commit is contained in:
Kayos 2026-04-30 10:30:49 -07:00
parent 4707e6aacc
commit a0ad363915
2 changed files with 133 additions and 1 deletions

View file

@ -24,7 +24,10 @@ Routes (current):
from datetime import date, datetime, timedelta
from functools import wraps
import requests
from authlib.integrations.base_client.errors import MismatchingStateError, OAuthError
from flask import Flask, jsonify, redirect, render_template, request, session, url_for
from requests.exceptions import ConnectionError as RequestsConnectionError
from .config import load
from .crypto import TokenCrypto
@ -213,7 +216,38 @@ def create_app() -> Flask:
@app.get("/auth/callback")
def auth_callback():
token = oauth.cauldron.authorize_access_token()
# Wrap the OIDC exchange so transient DNS/JWKS hiccups (resolver
# blip on auth.sulkta.com → ConnectionError → 500) render a
# friendly retry page instead of dumping a stack trace, AND
# clear the stashed state so the user's retry doesn't trip the
# MismatchingState CSRF guard from a stale state cookie.
try:
token = oauth.cauldron.authorize_access_token()
except (RequestsConnectionError, requests.Timeout) as e:
app.logger.warning("OIDC callback: upstream unreachable: %s", e)
session.pop("_state_cauldron_authlib", None)
return render_template(
"auth_retry.html",
reason="upstream",
detail="couldn't reach the auth server — usually a momentary DNS or network blip.",
), 503
except MismatchingStateError:
# Stale state from a previous failed callback. Clear and ask
# the user to start a fresh login.
session.pop("_state_cauldron_authlib", None)
return render_template(
"auth_retry.html",
reason="stale",
detail="that login link expired (you probably retried after a blip). hit login again to start fresh.",
), 400
except OAuthError as e:
app.logger.warning("OIDC callback: oauth error: %s", e)
session.pop("_state_cauldron_authlib", None)
return render_template(
"auth_retry.html",
reason="oauth",
detail=f"auth handshake failed: {e}",
), 400
userinfo = token.get("userinfo") or oauth.cauldron.userinfo(token=token)
sub = userinfo.get("sub") or userinfo.get("email")
email = userinfo.get("email") or sub
@ -960,6 +994,74 @@ def create_app() -> Flask:
db.finalize_sterilize_job(job_id, state="cancelled")
return jsonify({"ok": True})
# ---------- admin sterilizer (bearer-auth, kick off on user's behalf) -
@app.post("/api/admin/sterilize/bulk-start")
@require_bearer
def admin_sterilize_bulk_start():
    """Bearer-authed alternate to /api/sterilize/bulk-start.

    Body:
        {"started_by_sub": "cobb@sulkta.com"}

    Resolves that user's household, decrypts their stored Mealie token,
    and spawns a preview thread. Lets cauldron operators kick off bulk
    runs without needing a Flask session — same job state and proposals
    the user will see in /sterilize.

    Responses (all JSON):
        200  {"ok": True, "job_id": ..., "total": ...}
        400  started_by_sub missing, empty, or not a string
        404  user has no household
        409  already_running (with the active job_id) or
             user_not_connected_to_mealie
        500  user_token_undecryptable
        502  mealie_unreachable (with detail)
    """
    body = request.get_json(silent=True)
    # Valid JSON is not necessarily an object: a list/string/number body
    # has no .get(), so treat anything but a dict as "no fields supplied".
    if not isinstance(body, dict):
        body = {}
    raw_sub = body.get("started_by_sub")
    # Non-string values (e.g. 123) would blow up on .strip(); reject them
    # with the same 400 as a missing field instead of a 500.
    sub = raw_sub.strip() if isinstance(raw_sub, str) else ""
    if not sub:
        return jsonify({"error": "started_by_sub required"}), 400

    hid = db.get_user_household_id(sub)
    if not hid:
        return jsonify({"error": "user has no household"}), 404

    # Refuse to start while the household already has a running job.
    active = db.running_sterilize_job_for_household(hid)
    if active:
        return jsonify({"error": "already_running", "job_id": active["id"]}), 409

    blob = db.get_user_mealie_token_blob(sub)
    if not blob:
        return jsonify({"error": "user_not_connected_to_mealie"}), 409
    try:
        tok = crypto.decrypt(blob)
    except Exception:
        # Boundary catch: report a stable error code to the caller, but
        # leave a trace in the log so the operator can see the cause.
        app.logger.exception(
            "admin bulk-start: stored Mealie token undecryptable for %s", sub
        )
        return jsonify({"error": "user_token_undecryptable"}), 500

    mealie = Mealie(base_url=cfg.mealie_api_url, api_token=tok)
    sterilizer = Sterilizer(mealie=mealie, forge=forge, model=cfg.default_model)
    try:
        # One-item page: only the reported total is read from the response.
        page1 = sterilizer.mealie.list_recipes(page=1, per_page=1)
    except MealieError as e:
        return jsonify({"error": "mealie_unreachable", "detail": str(e)}), 502
    total = int(page1.get("total") or page1.get("totalItems") or 0)

    job_id = db.create_sterilize_job(
        household_id=hid, started_by_sub=sub, total=total
    )
    bulk_sterilize.spawn_preview_thread(
        db=db, job_id=job_id, sterilizer=sterilizer
    )
    return jsonify({"ok": True, "job_id": job_id, "total": total})
@app.get("/api/admin/sterilize/jobs/<int:job_id>")
@require_bearer
def admin_sterilize_job_status(job_id: int):
    """Bearer-authed read of any sterilize job's state, for polling from outside.

    Returns {"job": <payload>} built by _job_payload when the job lookup
    succeeds, or {"error": "not_found"} with a 404 when it comes back empty.
    """
    record = db.get_sterilize_job(job_id)
    if record:
        return jsonify({"job": _job_payload(record)})
    return jsonify({"error": "not_found"}), 404
@app.post("/api/admin/sterilize/bulk-cancel/<int:job_id>")
@require_bearer
def admin_sterilize_bulk_cancel(job_id: int):
    """Bearer-authed cancel of a sterilize job by id.

    Responds 404 {"error": "not_found"} when the job lookup is empty, and
    409 {"error": "bad_state:<state>"} unless the job's state is one of
    "running", "review" or "applying". Otherwise the job is finalized
    with state "cancelled" and {"ok": True} is returned.
    """
    record = db.get_sterilize_job(job_id)
    if not record:
        return jsonify({"error": "not_found"}), 404

    current_state = record["state"]
    cancellable = current_state in ("running", "review", "applying")
    if not cancellable:
        return jsonify({"error": f"bad_state:{current_state}"}), 409

    db.finalize_sterilize_job(job_id, state="cancelled")
    return jsonify({"ok": True})
# ---------- v0.1 admin endpoints (carry over) ------------------------
@app.get("/api/recipes")

View file

@ -0,0 +1,30 @@
{% extends "_base.html" %}
{#
  Retry page shown when a login attempt is interrupted.

  Context variables read by this template:
    detail - one-line message, rendered as the page lede.
    reason - short code, rendered as the panel heading and used to pick
             the explanatory paragraph: 'upstream' and 'stale' have their
             own copy; any other value falls through to the generic
             OIDC-handshake text.

  The page ends with a link back to /login to start a new attempt.
#}
{% block title %}Try again · Cauldron{% endblock %}
{% block content %}
<div class="page-head">
<div class="crumb">// auth · interrupted</div>
<h1>almost <span class="accent">there</span></h1>
<div class="lede">{{ detail }}</div>
</div>
<section class="panel">
<div class="panel-head">
<h2>{{ reason }}</h2>
<span class="pill pill-warn">retry</span>
</div>
<p>
{% if reason == 'upstream' %}
the auth server (<code>auth.sulkta.com</code>) didn't answer in time.
usually means a brief DNS / network blip on the LAN. try again in a sec.
{% elif reason == 'stale' %}
your previous login attempt left a one-time token behind that's now expired.
we cleared it on our side — hit the button below to start fresh.
{% else %}
something didn't line up in the OIDC handshake. starting over usually fixes it.
{% endif %}
</p>
<p><a class="btn btn-purple" href="/login">↻ try login again</a></p>
</section>
{% endblock %}