- parsers/ package: rust / python / go / typescript / generic
- parser registry with language+recipe -> fallback resolution
- fingerprint hash (kind+file+line+code) for cross-run dedup
- runner.py post-exec hook: parse log, persist findings, count on job row
(extraction runs before mark_job_finished so callers polling on terminal
status see findings_count populated atomically)
- db.insert_finding / list_findings / increment_findings_count DAOs already
shipped in wave 1; wired here
- GET /jobs/{id}/findings now returns real data (server route already
shipped; was returning empty list because nothing populated the table)
- tests/test_parsers/: 6 modules + 11 fixtures (rust/python/go/typescript)
- tests/test_runner_findings.py: 3 integration tests
- README: tick steps 2-6, add Findings section
Suite: 108 passing (62 wave-1 + 46 new).
Spec: memory/spec-crafting-table.md
110 lines
3.8 KiB
Python
110 lines
3.8 KiB
Python
"""Parser protocol + Finding dataclass + fingerprint helper.
|
|
|
|
A Finding mirrors the columns in the `findings` table (see db.py migration
|
|
005). Parsers produce a list of these; the runner persists them with a
|
|
fingerprint computed via `fingerprint()` so the same lint reappearing across
|
|
nightly runs deduplicates cleanly.
|
|
|
|
The fingerprint deliberately excludes `message` because tool wording drifts
|
|
version-to-version (clippy especially loves to rephrase). The locator —
|
|
kind+file+line+code — is what makes "this is the same finding" stable.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
from dataclasses import dataclass, field
|
|
from typing import Protocol, runtime_checkable
|
|
|
|
|
|
@dataclass
class Finding:
    """One structured finding emitted by a parser.

    Only ``kind``, ``severity`` and ``message`` are required; the locator
    and detail fields all default to ``None`` and ``extras`` defaults to a
    fresh empty dict per instance.

    ``raw_json`` is an optional escape hatch: the full original JSON object
    from the tool, serialized, so that callers / later analysis can
    re-extract fields that were not broken out into their own attributes
    (e.g. clippy's `spans[]` array, audit's full advisory body).
    """

    # Category of the finding:
    # "lint" | "cve" | "test_fail" | "recipe_fail" | ...
    kind: str
    # One of: "info" | "warn" | "error" | "high" | "critical"
    severity: str
    # Human-readable text from the tool.
    message: str
    # Locator parts; any of them may be absent.
    file: str | None = None
    line: int | None = None
    code: str | None = None
    suggested_fix: str | None = None
    raw_json: str | None = None
    # Scratch space for structured metadata (package / version / fixed_in /
    # advisory) that has no dedicated attribute. Parsers stash it here
    # before serialization; the intent is that it ends up in raw_json so
    # the information is not lost.
    extras: dict = field(default_factory=dict)
|
|
|
|
|
|
@runtime_checkable
class Parser(Protocol):
    """Structural interface every per-language parser satisfies.

    Implementations carry no state: both operations are classmethods, so
    nothing is ever instantiated and the registry stores the classes
    themselves.
    """

    @classmethod
    def matches(cls, language: str, recipe: str) -> bool:
        """Report whether this parser claims the ``(language, recipe)`` pair.

        The registry calls this while resolving a parser. Leaving the
        decision to the parser lets a single class cover many recipes
        (e.g. RustParser handles all rust recipes).
        """
        ...

    @classmethod
    def parse(cls, raw_log: str, exit_code: int, recipe: str) -> list[Finding]:
        """Convert raw subprocess output into a list of Finding rows.

        Implementations must not raise on malformed input; they return
        ``[]`` instead and leave it to the caller to log the recipe as
        un-parsed.
        """
        ...
|
|
|
|
|
|
def fingerprint(
    kind: str,
    file: str | None,
    line: int | None,
    code: str | None,
    message: str,
) -> str:
    """Return a stable 16-hex-character identifier for a finding's locator.

    Only the locator (kind + file + line + code) is hashed. ``message`` is
    accepted but deliberately ignored: tool wording drifts from version to
    version, so including it would give the same underlying issue a
    different fingerprint on consecutive nightly runs.

    Missing parts are normalized before hashing: a falsy ``file`` or
    ``code`` becomes the empty string and a falsy ``line`` becomes ``0``,
    so ``None`` and ``0`` / ``""`` hash identically.

    Truncating the SHA-256 hex digest to 16 characters leaves 64 bits of
    collision space, more than enough for one project's findings table.
    """
    file_part = file or ""
    line_part = line or 0
    code_part = code or ""
    # Pipe-joined locator; the exact layout must never change, otherwise
    # previously stored fingerprints stop matching.
    locator = f"{kind}|{file_part}|{line_part}|{code_part}"
    digest = hashlib.sha256(locator.encode("utf-8")).hexdigest()
    return digest[:16]
|
|
|
|
|
|
def _safe_json_loads(s: str):
|
|
"""Try json.loads(s); return None on failure. Several parsers wrap this
|
|
so they can degrade to [] without bringing down the runner."""
|
|
import json
|
|
|
|
try:
|
|
return json.loads(s)
|
|
except (ValueError, TypeError):
|
|
return None
|
|
|
|
|
|
def _iter_jsonl(text: str):
|
|
"""Yield parsed JSON objects from NDJSON/JSON-Lines text, skipping any
|
|
line that doesn't parse. cargo clippy, mypy --output=json, and
|
|
govulncheck all emit JSON-lines."""
|
|
import json
|
|
|
|
for line in text.splitlines():
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
try:
|
|
yield json.loads(line)
|
|
except ValueError:
|
|
continue
|