gstack/test/redact-engine.test.ts

455 lines
19 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Unit tests for lib/redact-engine.ts + lib/redact-patterns.ts.
*
* One positive test per pattern, plus FP-filters, validators (Luhn/entropy/
* RFC1918), email allowlist, no-promotion visibility semantics, tool-fence
* degrade, normalization (zero-width / homoglyph / entity), oversize fail-closed,
* and pure-function purity.
*/
import { describe, test, expect } from "bun:test";
import {
scan,
exitCodeFor,
maskPreview,
normalizeWithMap,
redactFindingSpans,
type RepoVisibility,
} from "../lib/redact-engine";
import {
PATTERNS,
luhnValid,
shannonEntropy,
isPublicIPv4,
isPlaceholderSpan,
} from "../lib/redact-patterns";
function ids(text: string, vis: RepoVisibility = "private"): string[] {
return scan(text, { repoVisibility: vis }).findings.map((f) => f.id);
}
describe("HIGH credential patterns", () => {
const cases: Array<[string, string]> = [
["aws.access_key", "key = AKIA1234567890ABCDEF"],
["aws.secret_key", "aws_secret_access_key = AbCdEfGhIjKlMnOpQrStUvWxYz0123456789AbCd"],
["github.pat", "token ghp_" + "1234567890abcdefghijklmnopqrstuvwxyz"],
["github.oauth", "gho_" + "1234567890abcdefghijklmnopqrstuvwxyz"],
["github.server", "ghs_1234567890abcdefghijklmnopqrstuvwxyz"],
["github.fine_grained", "github_pat_" + "A".repeat(82)],
["anthropic.key", "sk-ant-" + "api03-abcdefghij1234567890XYZ"],
["openai.key", "sk-proj-" + "a".repeat(40)],
["sendgrid.key", "SG." + "a".repeat(22) + "." + "b".repeat(43)],
["stripe.secret", "sk_live_" + "a".repeat(30)],
["slack.token", "xox" + "b-1234567890-abcdefghijklmnop"],
["slack.webhook", "https://hooks.slack.com/services/T00000000/B11111111/" + "a".repeat(24)],
["discord.webhook", "https://discord.com/api/webhooks/123456789012345678/" + "a".repeat(60)],
["pem.private_key", "-----BEGIN RSA PRIVATE KEY-----"],
// #1946 coverage-gap additions
["gitlab.token", "remote: glpat-" + "Ab12Cd34Ef56Gh78Ij90"],
["gitlab.token", "trigger glptt-" + "a1b2c3d4e5f6a7b8c9d0e1f2"],
["gitlab.token", "deploy gldt-" + "Zy98Xw76Vu54Ts32Rq10"],
["huggingface.token", "hf_" + "AbCdEfGhIjKlMnOpQrStUvWxYz012345"],
["npm.token", "npm_" + "a1B2c3D4e5F6g7H8i9J0k1L2m3N4o5P6q7R8"],
["digitalocean.token", "dop_v1_" + "0123456789abcdef".repeat(4)],
[
"gcp.service_account",
'{"private_key_id": "abc123", "private_key": "-----BEGIN PRIVATE KEY-----\\nMIIE..."}',
],
];
for (const [id, text] of cases) {
test(`flags ${id}`, () => {
expect(ids(text)).toContain(id);
});
}
// #1868 — modern OpenAI keys use base64url bodies (with - and _). The old
// [A-Za-z0-9]{32,} regex stopped at the first separator and missed them all,
// failing a HIGH credential OPEN through the redaction gate.
test("openai.key flags modern sk-proj-/sk-svcacct-/sk-admin- shapes (#1868)", () => {
const missed = [
"sk-proj-Ab12_Cd34-Ef56Gh78Ij90Kl12Mn34Op56Qr78St90Uv",
"sk-svcacct-abc_def-ghijklmnopqrstuvwxyz0123456789ABCDEF",
"sk-admin-AAAA_BBBB-CCCC_DDDD-EEEE_FFFF-GGGG_HHHH1234",
];
for (const key of missed) {
expect(ids(`OPENAI_API_KEY=${key}`)).toContain("openai.key");
}
// legacy contiguous shape still flags
expect(ids("sk-proj-" + "a".repeat(40))).toContain("openai.key");
});
test("openai.key does not over-match prose / malformed sk- strings (#1868 calibration)", () => {
// HIGH tier BLOCKS, so false positives on prose are costly. None of these
// should flag as openai.key.
const benign = [
"the sk-learning-rate-schedule-was-tuned-carefully", // hyphenated prose
"sk--double-dash-typo-not-a-real-key",
"use sk-proj for the project prefix in docs", // no body
"sk-short", // too short, no prefix
];
for (const text of benign) {
expect(ids(text)).not.toContain("openai.key");
}
});
test("twilio.auth_token needs an SID nearby", () => {
const sid = "AC" + "a".repeat(32);
const tok = "b".repeat(32);
expect(ids(`account ${sid} token ${tok}`)).toContain("twilio.auth_token");
// bare 32-hex with no SID nearby should NOT flag as twilio
expect(ids(`random ${tok} here`)).not.toContain("twilio.auth_token");
});
test("db.url_with_password flags real password, skips placeholder/env-var", () => {
expect(ids("postgres://user:s3cretP@ss@db.example.com/app")).toContain("db.url_with_password");
expect(ids("postgres://user:${DB_PASSWORD}@host/app")).not.toContain("db.url_with_password");
});
test("all HIGH patterns block (exit 3)", () => {
const r = scan("AKIA1234567890ABCDEF", { repoVisibility: "private" });
expect(exitCodeFor(r)).toBe(3);
});
});
describe("MEDIUM demoted credential-shaped patterns (TENSION-1)", () => {
test("stripe.publishable is MEDIUM not HIGH", () => {
const f = scan("pk_live_" + "a".repeat(30), { repoVisibility: "private" }).findings.find(
(x) => x.id === "stripe.publishable",
);
expect(f?.tier).toBe("MEDIUM");
});
test("google.api_key is MEDIUM", () => {
const f = scan("AIza" + "a".repeat(35), { repoVisibility: "private" }).findings.find(
(x) => x.id === "google.api_key",
);
expect(f?.tier).toBe("MEDIUM");
});
test("jwt is MEDIUM", () => {
const jwt = "eyJhbGciOiJ.eyJzdWIiOiI." + "x".repeat(20);
const f = scan(jwt, { repoVisibility: "private" }).findings.find((x) => x.id === "jwt");
expect(f?.tier).toBe("MEDIUM");
});
test("env.kv fires on high-entropy, skips placeholder", () => {
expect(ids("API_TOKEN=8Fk2pQ9vXz4wL7mN3rT6yB1cD5eG0hJ")).toContain("env.kv");
expect(ids("API_KEY=changeme")).not.toContain("env.kv");
expect(ids("API_KEY=${MY_VAR}")).not.toContain("env.kv");
});
// #1946 — Bearer is the most FP-prone shape in the wave: docs and examples
// are full of "Authorization: Bearer <token>". MEDIUM + header proximity +
// the env.kv entropy recipe keep it calibrated.
test("auth.bearer fires on a high-entropy token in header context", () => {
const text = "curl -H 'Authorization: Bearer 8Fk2pQ9vXz4wL7mN3rT6yB1cD5eG0hJq'";
const f = scan(text, { repoVisibility: "private" }).findings.find(
(x) => x.id === "auth.bearer",
);
expect(f).toBeDefined();
expect(f?.tier).toBe("MEDIUM");
});
test("auth.bearer skips placeholders and env interpolations", () => {
expect(ids("Authorization: Bearer YOUR_TOKEN_HERE_PLACEHOLDER")).not.toContain("auth.bearer");
expect(ids("Authorization: Bearer ${ACCESS_TOKEN_FROM_ENV}")).not.toContain("auth.bearer");
});
test("auth.bearer requires header context (bare 'Bearer x' prose doesn't fire)", () => {
expect(ids("the Bearer 8Fk2pQ9vXz4wL7mN3rT6yB1cD5eG0hJq walked in")).not.toContain(
"auth.bearer",
);
});
});
describe("#1946 pattern negatives (placeholders never fire)", () => {
test("short or placeholder shapes don't trip the new HIGH patterns", () => {
expect(ids("glpat-xxxx")).not.toContain("gitlab.token");
expect(ids("hf_token")).not.toContain("huggingface.token");
expect(ids("npm_install")).not.toContain("npm.token");
expect(ids("dop_v1_short")).not.toContain("digitalocean.token");
// pem header WITHOUT the GCP JSON shape stays pem.private_key only.
expect(ids("-----BEGIN PRIVATE KEY-----")).not.toContain("gcp.service_account");
});
});
describe("PII patterns", () => {
test("email flags + is autoRedactable", () => {
const f = scan("ping alice@corp.io please", { repoVisibility: "private" }).findings.find(
(x) => x.id === "pii.email",
);
expect(f).toBeTruthy();
expect(f?.autoRedactable).toBe(true);
});
test("email allowlist: example.com, noreply, self, repo-public", () => {
expect(ids("see user@example.com")).not.toContain("pii.email");
expect(ids("from noreply@github.com")).not.toContain("pii.email");
expect(
scan("me@garry.dev", { repoVisibility: "private", selfEmail: "me@garry.dev" }).findings,
).toHaveLength(0);
expect(
scan("bob@acme.co", { repoVisibility: "private", repoPublicEmails: ["bob@acme.co"] }).findings,
).toHaveLength(0);
});
test("phone E.164", () => {
expect(ids("call +14155550123 now")).toContain("pii.phone.e164");
});
test("ssn flags valid, skips 000 octet", () => {
expect(ids("ssn 123-45-6789")).toContain("pii.ssn");
expect(ids("000-12-3456")).not.toContain("pii.ssn");
});
test("credit card needs Luhn", () => {
expect(ids("card 4111111111111111")).toContain("pii.cc");
expect(ids("num 4111111111111112")).not.toContain("pii.cc");
});
test("public IP flagged, RFC1918 skipped", () => {
expect(ids("connect 8.8.8.8")).toContain("pii.ip_public");
expect(ids("local 192.168.1.5")).not.toContain("pii.ip_public");
expect(ids("local 10.0.0.1")).not.toContain("pii.ip_public");
});
});
describe("internal + legal patterns", () => {
test("internal hostname", () => {
expect(ids("db1.corp internal host")).toContain("internal.hostname");
});
test("localhost url with path", () => {
expect(ids("hit http://localhost:8080/admin/secrets")).toContain("internal.url_private");
});
test("NDA marker", () => {
expect(ids("This is CONFIDENTIAL material")).toContain("legal.nda_marker");
});
test("named criticism needs a capitalized full name nearby", () => {
expect(ids("John Smith is incompetent at this")).toContain("legal.named_criticism");
expect(ids("the build is incompet019ently configured".replace("019", ""))).not.toContain(
"legal.named_criticism",
);
});
});
describe("LOW patterns surface only", () => {
test("user path is LOW", () => {
const f = scan("/Users/bob/secret/config", { repoVisibility: "private" }).findings.find(
(x) => x.id === "internal.user_path",
);
expect(f?.tier).toBe("LOW");
});
test("TODO marker is LOW", () => {
const f = scan("TODO(alice) fix later", { repoVisibility: "private" }).findings.find(
(x) => x.id === "hygiene.todo",
);
expect(f?.tier).toBe("LOW");
});
});
describe("placeholder suppression (per-span)", () => {
test("AWS docs EXAMPLE key not flagged", () => {
expect(ids("AKIAIOSFODNN7EXAMPLE")).not.toContain("aws.access_key");
});
test("your_ prefix not flagged", () => {
expect(isPlaceholderSpan("your_api_key")).toBe(true);
});
test("a real secret on a line that ALSO contains EXAMPLE still flags", () => {
// line-based suppression would wrongly skip this; per-span must catch it.
expect(ids("# EXAMPLE usage\nkey AKIA1234567890ABCDEF")).toContain("aws.access_key");
});
});
describe("no visibility-based tier promotion (TENSION-2-followup)", () => {
test("email stays MEDIUM on both private and public", () => {
const priv = scan("x@corp.io", { repoVisibility: "private" }).findings[0];
const pub = scan("x@corp.io", { repoVisibility: "public" }).findings[0];
expect(priv.tier).toBe("MEDIUM");
expect(pub.tier).toBe("MEDIUM");
expect(pub.severity).toBe("MEDIUM"); // NOT promoted to HIGH
expect(pub.repoVisibility).toBe("public"); // recorded for sterner wording
});
test("demoted credential patterns stay MEDIUM on public", () => {
const pub = scan("pk_live_" + "a".repeat(30), { repoVisibility: "public" }).findings[0];
expect(pub.severity).toBe("MEDIUM");
});
test("unknown visibility treated as public for wording, still no promotion", () => {
const r = scan("x@corp.io", { repoVisibility: "unknown" });
expect(r.findings[0].severity).toBe("MEDIUM");
});
});
describe("tool-attributed fence WARN-degrade (TENSION-3)", () => {
test("placeholder-shaped credential in tool fence → WARN", () => {
const text = "```codex-review\nfound your_aws_key AKIAIOSFODNN7EXAMPLE in code\n```";
const r = scan(text, { repoVisibility: "private" });
// the EXAMPLE key is suppressed as placeholder; verify a non-credential note doesn't block
expect(r.counts.HIGH).toBe(0);
});
test("live-format credential in tool fence STILL blocks", () => {
const text = "```codex-review\nleaked AKIA1234567890ABCDEF here\n```";
const r = scan(text, { repoVisibility: "private" });
expect(r.counts.HIGH).toBe(1); // not degraded — live format
});
test("AKIA outside any fence blocks", () => {
expect(exitCodeFor(scan("AKIA1234567890ABCDEF", {}))).toBe(3);
});
});
describe("normalization", () => {
test("zero-width chars inside a key are stripped before matching", () => {
const zwsp = "";
const broken = "AKIA1234567890" + zwsp + "ABCDEF";
expect(ids(broken)).toContain("aws.access_key");
});
test("HTML entity decode", () => {
const { normalized } = normalizeWithMap("a &amp; b");
expect(normalized).toBe("a & b");
});
test("offset map points back into original", () => {
const input = "xyz";
const { normalized, map } = normalizeWithMap(input);
expect(normalized).toBe("xyz");
// 'z' is at normalized index 2, original index 3
expect(map[2]).toBe(3);
});
});
describe("oversize fails CLOSED", () => {
test("input over the byte cap returns a single blocking HIGH finding", () => {
const big = "a".repeat(2000);
const r = scan(big, { maxBytes: 1000 });
expect(r.oversize).toBe(true);
expect(r.counts.HIGH).toBe(1);
expect(r.findings[0].id).toBe("engine.input_too_large");
expect(exitCodeFor(r)).toBe(3);
});
// #1824: a malformed --max-bytes used to reach the engine as NaN. `byteLen >
// NaN` is always false, silently disabling the fail-closed guard. The engine
// guardrail must fall back to the default cap for any non-finite / <= 0 value.
test("NaN maxBytes falls back to the default cap (does NOT disable the guard)", () => {
const big = "a".repeat(2 * 1024 * 1024); // > 1 MiB default cap
const r = scan(big, { maxBytes: NaN });
expect(r.oversize).toBe(true);
expect(r.findings[0].id).toBe("engine.input_too_large");
expect(exitCodeFor(r)).toBe(3);
});
test("negative / zero maxBytes falls back to the default cap", () => {
// negative would make `byteLen > -5` always true (block everything);
// the guardrail normalizes it to the default instead.
const small = "ok";
expect(scan(small, { maxBytes: -5 }).oversize).toBeFalsy();
expect(scan(small, { maxBytes: 0 }).oversize).toBeFalsy();
const big = "a".repeat(2 * 1024 * 1024);
expect(scan(big, { maxBytes: -5 }).oversize).toBe(true);
});
});
describe("validators", () => {
test("luhn", () => {
expect(luhnValid("4111111111111111")).toBe(true);
expect(luhnValid("4111111111111112")).toBe(false);
});
test("entropy", () => {
expect(shannonEntropy("aaaaaaaa")).toBeLessThan(1);
expect(shannonEntropy("8Fk2pQ9vXz4wL7mN")).toBeGreaterThan(3);
});
test("isPublicIPv4", () => {
expect(isPublicIPv4("8.8.8.8")).toBe(true);
expect(isPublicIPv4("10.1.2.3")).toBe(false);
expect(isPublicIPv4("172.16.5.5")).toBe(false);
expect(isPublicIPv4("999.1.1.1")).toBe(false);
});
});
describe("masking + purity", () => {
test("preview never leaks more than 4 leading chars", () => {
expect(maskPreview("AKIA1234567890ABCDEF")).toBe("AKIA********…");
expect(maskPreview("abc")).toBe("abc");
});
test("scan is pure — same input twice yields identical findings", () => {
const a = scan("AKIA1234567890ABCDEF x@corp.io", { repoVisibility: "public" });
const b = scan("AKIA1234567890ABCDEF x@corp.io", { repoVisibility: "public" });
expect(a).toEqual(b);
});
});
describe("redactFindingSpans — machine-egress masking (#1947)", () => {
test("clean input passes through unchanged", () => {
const text = "push failed: remote rejected the branch";
expect(redactFindingSpans(text, { repoVisibility: "private" })).toBe(text);
});
test("a single finding's span becomes <REDACTED-{id}>, context survives", () => {
const token = "ghp_" + "1234567890abcdefghijklmnopqrstuvwxyz";
const out = redactFindingSpans(`auth ${token} rejected`, { repoVisibility: "private" });
expect(out).toBe("auth <REDACTED-github.pat> rejected");
});
test("multiple findings are all replaced (right-to-left splice keeps offsets valid)", () => {
const pat = "ghp_" + "1234567890abcdefghijklmnopqrstuvwxyz";
const aws = "AKIA1234567890ABCDEF";
const out = redactFindingSpans(`first ${aws} then ${pat} end`, {
repoVisibility: "private",
});
expect(out).toBe("first <REDACTED-aws.access_key> then <REDACTED-github.pat> end");
});
test("fails closed (null) when a span cannot be relocated — never raw passthrough", () => {
// env.kv's span (the value) starts well past the regex match start (the
// var name), so locateSpan's rewind-2 re-exec misses it. The contract is
// null → caller drops the whole payload. The one thing that must never
// happen is the secret surviving in the output.
const secret = "8Fk2pQ9vXz4wL7mN3rT6yB1cD5eG0hJq";
const out = redactFindingSpans(`API_KEY=${secret}`, { repoVisibility: "private" });
if (out !== null) {
// If locateSpan ever learns to find context-prefixed spans, masking
// must actually mask.
expect(out).not.toContain(secret);
} else {
expect(out).toBeNull();
}
});
test("multiline input redacts a finding past the first line (locateSpan line/col path)", () => {
const token = "ghp_" + "1234567890abcdefghijklmnopqrstuvwxyz";
const out = redactFindingSpans(`line one\nline two has ${token}\nline three`, {
repoVisibility: "private",
});
expect(out).toBe("line one\nline two has <REDACTED-github.pat>\nline three");
});
// Pre-landing review CRITICAL: pem.private_key and gcp.service_account
// capture only the HEADER, not the key material — a span splice would
// redact the marker and forward the key body. Marker-only patterns must
// drop the whole payload.
test("PEM private key → null (header-only span must not forward the key body)", () => {
const msg =
"deploy failed: -----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC\n-----END PRIVATE KEY-----";
expect(redactFindingSpans(msg, { repoVisibility: "private" })).toBeNull();
});
test("GCP service-account JSON → null (key body follows the captured marker)", () => {
const msg =
'config dump: {"private_key_id": "abc123", "private_key": "-----BEGIN PRIVATE KEY-----\\nMIIEvQIBADANBg..."}';
expect(redactFindingSpans(msg, { repoVisibility: "private" })).toBeNull();
});
// Pre-landing review: overlapping spans (a Bearer token that is also a
// JWT) must coalesce — independent splices apply stale offsets and can
// leave trailing secret bytes or mangled markers.
test("overlapping spans (Bearer JWT fires auth.bearer + jwt) never leak and produce clean markers", () => {
const jwt = "eyJ" + "a".repeat(20) + ".eyJ" + "b".repeat(20) + "." + "c".repeat(20);
const out = redactFindingSpans(`Authorization: Bearer ${jwt}`, { repoVisibility: "private" });
expect(out).not.toBeNull();
expect(out!).not.toContain("eyJ");
expect(out!).not.toContain("aaaa");
expect(out!).not.toContain("cccc");
// One coalesced, well-formed marker — no truncated fragments.
expect(out!).toMatch(/^Authorization: Bearer <REDACTED-[a-z._+]+>$/);
});
});
describe("taxonomy integrity", () => {
test("every pattern has a unique id", () => {
const set = new Set(PATTERNS.map((p) => p.id));
expect(set.size).toBe(PATTERNS.length);
});
test("autoRedactable patterns have a redactToken", () => {
for (const p of PATTERNS) {
if (p.autoRedactable) expect(p.redactToken).toBeTruthy();
}
});
});