From a0ad3639156474d1646080fbdb57d01b7e0b8fb3 Mon Sep 17 00:00:00 2001
From: Kayos
Date: Thu, 30 Apr 2026 10:30:49 -0700
Subject: [PATCH] auth: retry guard on transient OIDC; admin-bearer alt for bulk-start
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two small adds:

(1) auth/callback no longer 500s on transient JWKS-fetch failures or on
a stale state cookie from a prior failed callback. Catches three
distinct failure modes and renders templates/auth_retry.html with a
503 / 400 status:

  - RequestsConnectionError + Timeout → "couldn't reach the auth
    server — usually a momentary DNS or network blip"
  - MismatchingStateError → "that login link expired (you probably
    retried after a blip). hit login again to start fresh"
  - OAuthError → "auth handshake failed: <error>"

All three paths pop _state_cauldron_authlib so the next /login starts
with a fresh state token.

Today's incident: cauldron container hit a brief auth.sulkta.com DNS
resolution failure during OIDC callback, threw a stack trace, then the
user's retry failed differently because of the leftover state cookie.

(2) /api/admin/sterilize/bulk-start (require_bearer) accepts
{"started_by_sub": "user@example.com"} body. Resolves that user's
household, decrypts their stored Mealie token, spawns the same preview
thread the session-authed endpoint does. The job appears in /sterilize
for the user as if they'd started it themselves.

Plus admin-status (GET /api/admin/sterilize/jobs/<int:job_id>) and
admin-cancel (POST /api/admin/sterilize/bulk-cancel/<int:job_id>) for
operator symmetry. Existing user-session endpoints unchanged.

Net: kayos can kick off + cancel + monitor sterilize jobs from the
outside without piggybacking on Cobb's browser cookie, and a brief DNS
hiccup on the LAN no longer turns into a "your login is broken"
experience for the user.

---
 cauldron/server.py                 | 105 ++++++++++++++++++++++++++++-
 cauldron/templates/auth_retry.html |  30 +++++++++
 2 files changed, 134 insertions(+), 1 deletion(-)
 create mode 100644 cauldron/templates/auth_retry.html

diff --git a/cauldron/server.py b/cauldron/server.py
index 3412554..62baa1b 100644
--- a/cauldron/server.py
+++ b/cauldron/server.py
@@ -24,7 +24,10 @@ Routes (current):
 from datetime import date, datetime, timedelta
 from functools import wraps
 
+import requests
+from authlib.integrations.base_client.errors import MismatchingStateError, OAuthError
 from flask import Flask, jsonify, redirect, render_template, request, session, url_for
+from requests.exceptions import ConnectionError as RequestsConnectionError
 
 from .config import load
 from .crypto import TokenCrypto
@@ -213,7 +216,38 @@ def create_app() -> Flask:
 
     @app.get("/auth/callback")
     def auth_callback():
-        token = oauth.cauldron.authorize_access_token()
+        # Wrap the OIDC exchange so transient DNS/JWKS hiccups (resolver
+        # blip on auth.sulkta.com → ConnectionError → 500) render a
+        # friendly retry page instead of dumping a stack trace, AND
+        # clear the stashed state so the user's retry doesn't trip the
+        # MismatchingState CSRF guard from a stale state cookie.
+        try:
+            token = oauth.cauldron.authorize_access_token()
+        except (RequestsConnectionError, requests.Timeout) as e:
+            app.logger.warning("OIDC callback: upstream unreachable: %s", e)
+            session.pop("_state_cauldron_authlib", None)
+            return render_template(
+                "auth_retry.html",
+                reason="upstream",
+                detail="couldn't reach the auth server — usually a momentary DNS or network blip.",
+            ), 503
+        except MismatchingStateError:
+            # Stale state from a previous failed callback. Clear and ask
+            # the user to start a fresh login.
+            session.pop("_state_cauldron_authlib", None)
+            return render_template(
+                "auth_retry.html",
+                reason="stale",
+                detail="that login link expired (you probably retried after a blip). hit login again to start fresh.",
+            ), 400
+        except OAuthError as e:
+            app.logger.warning("OIDC callback: oauth error: %s", e)
+            session.pop("_state_cauldron_authlib", None)
+            return render_template(
+                "auth_retry.html",
+                reason="oauth",
+                detail=f"auth handshake failed: {e}",
+            ), 400
         userinfo = token.get("userinfo") or oauth.cauldron.userinfo(token=token)
         sub = userinfo.get("sub") or userinfo.get("email")
         email = userinfo.get("email") or sub
@@ -960,6 +994,75 @@ def create_app() -> Flask:
         db.finalize_sterilize_job(job_id, state="cancelled")
         return jsonify({"ok": True})
 
+    # ---------- admin sterilizer (bearer-auth, kick off on user's behalf) -
+
+    @app.post("/api/admin/sterilize/bulk-start")
+    @require_bearer
+    def admin_sterilize_bulk_start():
+        """Bearer-authed alternate to /api/sterilize/bulk-start. Body:
+            {"started_by_sub": "cobb@sulkta.com"}
+        Resolves that user's household + decrypts their stored Mealie
+        token + spawns a preview thread. Lets cauldron operators kick
+        off bulk runs without needing a Flask session — same job state
+        and proposals the user will see in /sterilize."""
+        body = request.get_json(silent=True) or {}
+        sub = (body.get("started_by_sub") or "").strip()
+        if not sub:
+            return jsonify({"error": "started_by_sub required"}), 400
+
+        hid = db.get_user_household_id(sub)
+        if not hid:
+            return jsonify({"error": "user has no household"}), 404
+
+        active = db.running_sterilize_job_for_household(hid)
+        if active:
+            return jsonify({"error": "already_running", "job_id": active["id"]}), 409
+
+        blob = db.get_user_mealie_token_blob(sub)
+        if not blob:
+            return jsonify({"error": "user_not_connected_to_mealie"}), 409
+        try:
+            tok = crypto.decrypt(blob)
+        except Exception:
+            return jsonify({"error": "user_token_undecryptable"}), 500
+        mealie = Mealie(base_url=cfg.mealie_api_url, api_token=tok)
+        sterilizer = Sterilizer(mealie=mealie, forge=forge, model=cfg.default_model)
+
+        try:
+            page1 = sterilizer.mealie.list_recipes(page=1, per_page=1)
+        except MealieError as e:
+            return jsonify({"error": "mealie_unreachable", "detail": str(e)}), 502
+        total = int(page1.get("total") or page1.get("totalItems") or 0)
+
+        job_id = db.create_sterilize_job(
+            household_id=hid, started_by_sub=sub, total=total
+        )
+        bulk_sterilize.spawn_preview_thread(
+            db=db, job_id=job_id, sterilizer=sterilizer
+        )
+        return jsonify({"ok": True, "job_id": job_id, "total": total})
+
+    @app.get("/api/admin/sterilize/jobs/<int:job_id>")
+    @require_bearer
+    def admin_sterilize_job_status(job_id: int):
+        """Bearer-authed read of any job's state — for poll-from-outside."""
+        job = db.get_sterilize_job(job_id)
+        if not job:
+            return jsonify({"error": "not_found"}), 404
+        return jsonify({"job": _job_payload(job)})
+
+    @app.post("/api/admin/sterilize/bulk-cancel/<int:job_id>")
+    @require_bearer
+    def admin_sterilize_bulk_cancel(job_id: int):
+        """Bearer-authed cancel: finalize a running/review/applying job as cancelled."""
+        job = db.get_sterilize_job(job_id)
+        if not job:
+            return jsonify({"error": "not_found"}), 404
+        if job["state"] not in ("running", "review", "applying"):
+            return jsonify({"error": f"bad_state:{job['state']}"}), 409
+        db.finalize_sterilize_job(job_id, state="cancelled")
+        return jsonify({"ok": True})
+
     # ---------- v0.1 admin endpoints (carry over) ------------------------
 
     @app.get("/api/recipes")
diff --git a/cauldron/templates/auth_retry.html b/cauldron/templates/auth_retry.html
new file mode 100644
index 0000000..9c38725
--- /dev/null
+++ b/cauldron/templates/auth_retry.html
@@ -0,0 +1,30 @@
+{% extends "_base.html" %}
+{% block title %}Try again · Cauldron{% endblock %}
+{% block content %}
+
+
+
// auth · interrupted
+

almost there

+
{{ detail }}
+
+ +
+
+

{{ reason }}

+ retry +
+

+ {% if reason == 'upstream' %} + the auth server (auth.sulkta.com) didn't answer in time. + usually means a brief DNS / network blip on the LAN. try again in a sec. + {% elif reason == 'stale' %} + your previous login attempt left a one-time token behind that's now expired. + we cleared it on our side — hit the button below to start fresh. + {% else %} + something didn't line up in the OIDC handshake. starting over usually fixes it. + {% endif %} +

+

↻ try login again

+
+ +{% endblock %}