auth: retry guard on transient OIDC; admin-bearer alt for bulk-start
Two small adds:
(1) auth/callback no longer 500s on transient JWKS-fetch failures or
on a stale state cookie left over from a prior failed callback. It
catches three distinct failure modes and renders templates/auth_retry.html
with a 503 (auth server unreachable) or 400 (stale state / OAuth error) status:
- RequestsConnectionError + Timeout → "couldn't reach the auth
server — usually a momentary DNS or network blip"
- MismatchingStateError → "that login link expired (you probably
retried after a blip). hit login again to start fresh"
- OAuthError → "auth handshake failed: <detail>"
All three paths pop _state_cauldron_authlib so the next /login
starts with a fresh state token. Today's incident: the cauldron
container hit a brief auth.sulkta.com DNS resolution failure
during the OIDC callback and threw a stack trace; the user's retry
then failed differently because of the leftover state cookie.
(2) /api/admin/sterilize/bulk-start (require_bearer) accepts
{"started_by_sub": "user@example.com"} body. Resolves that user's
household, decrypts their stored Mealie token, spawns the same
preview thread the session-authed endpoint does. The job appears
in /sterilize for the user as if they'd started it themselves.
Plus admin-status (GET /api/admin/sterilize/jobs/<id>) and
admin-cancel (POST /api/admin/sterilize/bulk-cancel/<id>) for
operator symmetry. Existing user-session endpoints unchanged.
Net: kayos can kick off + cancel + monitor sterilize jobs from the
outside without piggybacking on Cobb's browser cookie, and a brief
DNS hiccup on the LAN no longer turns into a "your login is broken"
experience for the user.
This commit is contained in:
parent
4707e6aacc
commit
a0ad363915
2 changed files with 133 additions and 1 deletions
|
|
@ -24,7 +24,10 @@ Routes (current):
|
|||
from datetime import date, datetime, timedelta
|
||||
from functools import wraps
|
||||
|
||||
import requests
|
||||
from authlib.integrations.base_client.errors import MismatchingStateError, OAuthError
|
||||
from flask import Flask, jsonify, redirect, render_template, request, session, url_for
|
||||
from requests.exceptions import ConnectionError as RequestsConnectionError
|
||||
|
||||
from .config import load
|
||||
from .crypto import TokenCrypto
|
||||
|
|
@ -213,7 +216,38 @@ def create_app() -> Flask:
|
|||
|
||||
@app.get("/auth/callback")
|
||||
def auth_callback():
|
||||
token = oauth.cauldron.authorize_access_token()
|
||||
# Wrap the OIDC exchange so transient DNS/JWKS hiccups (resolver
|
||||
# blip on auth.sulkta.com → ConnectionError → 500) render a
|
||||
# friendly retry page instead of dumping a stack trace, AND
|
||||
# clear the stashed state so the user's retry doesn't trip the
|
||||
# MismatchingState CSRF guard from a stale state cookie.
|
||||
try:
|
||||
token = oauth.cauldron.authorize_access_token()
|
||||
except (RequestsConnectionError, requests.Timeout) as e:
|
||||
app.logger.warning("OIDC callback: upstream unreachable: %s", e)
|
||||
session.pop("_state_cauldron_authlib", None)
|
||||
return render_template(
|
||||
"auth_retry.html",
|
||||
reason="upstream",
|
||||
detail="couldn't reach the auth server — usually a momentary DNS or network blip.",
|
||||
), 503
|
||||
except MismatchingStateError:
|
||||
# Stale state from a previous failed callback. Clear and ask
|
||||
# the user to start a fresh login.
|
||||
session.pop("_state_cauldron_authlib", None)
|
||||
return render_template(
|
||||
"auth_retry.html",
|
||||
reason="stale",
|
||||
detail="that login link expired (you probably retried after a blip). hit login again to start fresh.",
|
||||
), 400
|
||||
except OAuthError as e:
|
||||
app.logger.warning("OIDC callback: oauth error: %s", e)
|
||||
session.pop("_state_cauldron_authlib", None)
|
||||
return render_template(
|
||||
"auth_retry.html",
|
||||
reason="oauth",
|
||||
detail=f"auth handshake failed: {e}",
|
||||
), 400
|
||||
userinfo = token.get("userinfo") or oauth.cauldron.userinfo(token=token)
|
||||
sub = userinfo.get("sub") or userinfo.get("email")
|
||||
email = userinfo.get("email") or sub
|
||||
|
|
@ -960,6 +994,74 @@ def create_app() -> Flask:
|
|||
db.finalize_sterilize_job(job_id, state="cancelled")
|
||||
return jsonify({"ok": True})
|
||||
|
||||
# ---------- admin sterilizer (bearer-auth, kick off on user's behalf) -
|
||||
|
||||
@app.post("/api/admin/sterilize/bulk-start")
@require_bearer
def admin_sterilize_bulk_start():
    """Start a bulk sterilize preview on a user's behalf (bearer-authed).

    Bearer-authed alternate to /api/sterilize/bulk-start. Body:
        {"started_by_sub": "cobb@sulkta.com"}
    Resolves that user's household, decrypts their stored Mealie token
    and spawns a preview thread, so operators can kick off bulk runs
    without a Flask session.

    Responses:
        200 {"ok": True, "job_id": ..., "total": ...} once the job is created
        400 when the body is not a JSON object or ``started_by_sub`` is
            missing, blank, or not a string
        404 when the user has no household
        409 when a job is already running for the household, or the user
            has no stored Mealie token
        500 when the stored token cannot be decrypted
        502 when the Mealie probe request fails
    """
    # get_json() may return any JSON type (list, number, ...); only a JSON
    # object can carry the field, so anything else is treated as missing
    # instead of blowing up on .get() with a 500.
    body = request.get_json(silent=True)
    raw_sub = body.get("started_by_sub") if isinstance(body, dict) else None
    # A non-string value (e.g. a number) has no .strip(); reject it as 400.
    sub = raw_sub.strip() if isinstance(raw_sub, str) else ""
    if not sub:
        return jsonify({"error": "started_by_sub required"}), 400

    hid = db.get_user_household_id(sub)
    if not hid:
        return jsonify({"error": "user has no household"}), 404

    # Refuse to start a second job while one is still active for the household.
    active = db.running_sterilize_job_for_household(hid)
    if active:
        return jsonify({"error": "already_running", "job_id": active["id"]}), 409

    blob = db.get_user_mealie_token_blob(sub)
    if not blob:
        return jsonify({"error": "user_not_connected_to_mealie"}), 409
    try:
        tok = crypto.decrypt(blob)
    except Exception:
        # Keep the broad catch (the decrypt failure types are not visible
        # here) but leave a trace in the server log so the 500 is diagnosable.
        app.logger.exception(
            "admin bulk-start: stored Mealie token undecryptable for %s", sub
        )
        return jsonify({"error": "user_token_undecryptable"}), 500
    mealie = Mealie(base_url=cfg.mealie_api_url, api_token=tok)
    sterilizer = Sterilizer(mealie=mealie, forge=forge, model=cfg.default_model)

    # One-item probe: confirms Mealie answers and yields the recipe count.
    try:
        page1 = sterilizer.mealie.list_recipes(page=1, per_page=1)
    except MealieError as e:
        return jsonify({"error": "mealie_unreachable", "detail": str(e)}), 502
    total = int(page1.get("total") or page1.get("totalItems") or 0)

    job_id = db.create_sterilize_job(
        household_id=hid, started_by_sub=sub, total=total
    )
    bulk_sterilize.spawn_preview_thread(
        db=db, job_id=job_id, sterilizer=sterilizer
    )
    return jsonify({"ok": True, "job_id": job_id, "total": total})
|
||||
|
||||
@app.get("/api/admin/sterilize/jobs/<int:job_id>")
@require_bearer
def admin_sterilize_job_status(job_id: int):
    """Return the state of any sterilize job to a bearer-authed caller.

    Intended for polling from outside a browser session. Responds with
    {"job": <payload>} when the job exists, otherwise a 404 with
    {"error": "not_found"}.
    """
    record = db.get_sterilize_job(job_id)
    if record:
        return jsonify({"job": _job_payload(record)})
    return jsonify({"error": "not_found"}), 404
|
||||
|
||||
@app.post("/api/admin/sterilize/bulk-cancel/<int:job_id>")
@require_bearer
def admin_sterilize_bulk_cancel(job_id: int):
    """Cancel a sterilize job by id for a bearer-authed caller.

    Responds 404 when the job does not exist, 409 with
    "bad_state:<state>" when the job is not in one of the cancellable
    states (running, review, applying), and {"ok": True} after the job
    has been finalized as "cancelled".
    """
    record = db.get_sterilize_job(job_id)
    if not record:
        return jsonify({"error": "not_found"}), 404

    current = record["state"]
    if current in ("running", "review", "applying"):
        db.finalize_sterilize_job(job_id, state="cancelled")
        return jsonify({"ok": True})
    return jsonify({"error": f"bad_state:{current}"}), 409
|
||||
|
||||
# ---------- v0.1 admin endpoints (carry over) ------------------------
|
||||
|
||||
@app.get("/api/recipes")
|
||||
|
|
|
|||
30
cauldron/templates/auth_retry.html
Normal file
30
cauldron/templates/auth_retry.html
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
{% extends "_base.html" %}
|
||||
{% block title %}Try again · Cauldron{% endblock %}
|
||||
{% block content %}
|
||||
|
||||
<div class="page-head">
|
||||
<div class="crumb">// auth · interrupted</div>
|
||||
<h1>almost <span class="accent">there</span></h1>
|
||||
<div class="lede">{{ detail }}</div>
|
||||
</div>
|
||||
|
||||
<section class="panel">
|
||||
<div class="panel-head">
|
||||
<h2>{{ reason }}</h2>
|
||||
<span class="pill pill-warn">retry</span>
|
||||
</div>
|
||||
<p>
|
||||
{% if reason == 'upstream' %}
|
||||
the auth server (<code>auth.sulkta.com</code>) didn't answer in time.
|
||||
usually means a brief DNS / network blip on the LAN. try again in a sec.
|
||||
{% elif reason == 'stale' %}
|
||||
your previous login attempt left a one-time token behind that's now expired.
|
||||
we cleared it on our side — hit the button below to start fresh.
|
||||
{% else %}
|
||||
something didn't line up in the OIDC handshake. starting over usually fixes it.
|
||||
{% endif %}
|
||||
</p>
|
||||
<p><a class="btn btn-purple" href="/login">↻ try login again</a></p>
|
||||
</section>
|
||||
|
||||
{% endblock %}
|
||||
Loading…
Add table
Add a link
Reference in a new issue