patcher: robust extract_diff_json — handles 5 model-output shapes

The 4 patcher-fired-but-malformed_response failures showed that extract_diff_json was too strict: it required {"diff": "..."} as the top-level JSON shape, with at most one level of brace nesting (the matcher was regex-based). Real model output varies more.

Now handles:
1. Bare JSON: {"diff", "explanation", "confidence"}
2. Fenced JSON: ```json {…} ```
3. Fenced diff + prose: ```diff …unified diff… ``` + loose explanation
4. Bare unified diff (no JSON wrapper, no fence)
5. JSON with deeply nested {} inside the diff string (struct literals, function bodies)

Fixes:
- Replaced the regex-based balanced-{} matcher (capped at depth 1) with a string-aware, depth-tracking generator that handles arbitrary nesting and skips brace characters inside JSON string literals
- Walk all fenced blocks, not just the first; recognize the ```diff and ```patch language tags
- Fall back to fenced-diff-with-prose construction when no JSON form matches — a synthetic payload with the surrounding text as the explanation
- Final fallback for bare unified diffs (no fence, no wrapper) using a simple line-prefix detector
- Normalize alternate keys (patch, content, diff_text → diff)
- Always set confidence (defaults to medium when absent, low for bare diffs that have no model commentary)

Tests: 16 → 20 (4 new shape-coverage tests, plus 1 existing test updated for the confidence default). All green.
2026-04-29 15:37:24 -07:00 · 2026-04-29 15:37:24 -07:00 · 3273d66003
commit 3273d66003
parent 80c4eebf3b
2 changed files with 206 additions and 28 deletions
--- a/tests/test_patcher.py
+++ b/tests/test_patcher.py
@@ -205,7 +205,12 @@ def test_findings_were_actionable_cve():

 def test_extract_diff_json_plain():
    obj = extract_diff_json('{"diff": "x", "explanation": "y"}')
-    assert obj == {"diff": "x", "explanation": "y"}
+    # Parser normalizes — confidence defaults to "medium" when absent so
+    # downstream code can rely on the field always being present.
+    assert obj is not None
+    assert obj["diff"] == "x"
+    assert obj["explanation"] == "y"
+    assert obj["confidence"] == "medium"


 def test_extract_diff_json_fenced():
@@ -218,6 +223,56 @@ def test_extract_diff_json_returns_none_on_garbage():
    assert extract_diff_json("not even json") is None


+def test_extract_diff_json_fenced_diff_block():
+    """Real-world Opus shape: prose + a fenced ```diff block, no JSON wrapper."""
+    text = (
+        "Here is the fix:\n\n"
+        "```diff\n"
+        "--- a/src/lib.rs\n"
+        "+++ b/src/lib.rs\n"
+        "@@ -1 +1 @@\n"
+        "-old\n"
+        "+new\n"
+        "```\n\n"
+        "That should resolve the off-by-one."
+    )
+    obj = extract_diff_json(text)
+    assert obj is not None
+    assert "lib.rs" in obj["diff"]
+    assert "off-by-one" in obj["explanation"]
+
+
+def test_extract_diff_json_bare_unified_diff():
+    """No fence, no JSON wrapper — just the diff body."""
+    text = "--- a/x\n+++ b/x\n@@ -1 +1 @@\n-old\n+new\n"
+    obj = extract_diff_json(text)
+    assert obj is not None
+    assert obj["diff"].rstrip() == text.rstrip()  # parser strips trailing whitespace; semantic equivalence
+    assert obj["confidence"] == "low"  # bare diff is low-confidence — no model commentary to weigh
+
+
+def test_extract_diff_json_deeply_nested_braces_in_diff():
+    """The old regex was capped at one level of brace nesting; real diffs
+    contain struct literals etc. with arbitrary depth."""
+    deep = (
+        '{"diff": "--- a/x.rs\\n+++ b/x.rs\\n@@\\n'
+        '-fn x() { Some(Foo { a: 1 }) }\\n'
+        '+fn x() { Some(Foo { a: 2 }) }", '
+        '"explanation": "depth-2 nesting", "confidence": "high"}'
+    )
+    obj = extract_diff_json(deep)
+    assert obj is not None
+    assert obj["explanation"] == "depth-2 nesting"
+
+
+def test_extract_diff_json_alt_key():
+    """Models sometimes use 'patch' instead of 'diff'."""
+    obj = extract_diff_json('{"patch": "--- a\\n+++ b\\n@@\\n-x\\n+y", "explanation": "via alt key"}')
+    assert obj is not None
+    # Normalizer copies the alt key into the canonical 'diff' field
+    assert obj["diff"].startswith("--- a")
+
+
 def test_turn_text_concatenates_text_events():
    assert turn_text({"events": [
        {"type": "text", "content": "hello "},