From cfd2b5d792bb4f1d94988d2aa361215d8f260324 Mon Sep 17 00:00:00 2001 From: drummerms Date: Sat, 16 May 2026 13:56:23 -0700 Subject: [PATCH] fix(gbrain-sync): cut source-id slugs on hyphen boundaries (+ #1357) Cherry-picked from #1481 by drummerms and extended with the explicit HTTPS-remote regression case for #1357 (decision D2=A). `constrainSourceId` truncated the slug with `slug.slice(-tailBudget)`, which cut mid-word when the boundary fell inside a token. For a repo where the combined `prefix-org-repo-pathhash` exceeded 32 chars, this produced embarrassing artifacts like `gstack-code-kill-270c0001-c32152` (from `drummerms-av-sow-wiz-skill-270c0001`). Two changes carried from #1481, adapted for the #1468 hostpathhash: 1. `constrainSourceId` now walks hyphen-separated tokens from the right, accumulating whole tokens until adding the next would exceed `tailBudget`. When no token fits, falls through to the existing `${prefix}-${hash}` form. 2. `deriveCodeSourceId` now retries with `repo-only-hostpathhash` (dropping the org segment) when the full `org-repo-hostpathhash` triggers truncation. Keeps the repo name readable when it fits at all. Plus a new test asserting the source id is period-free for the exact HTTPS-with-.git remote shape from #1357 (`https://github.com/foo/bar.git`). canonicalizeRemote strips `.git`; the sanitizer strips any residual non-alnum. The test closes #1357 by pinning the property. Closes #1357 Co-Authored-By: Claude --- bin/gstack-gbrain-sync.ts | 28 +++++++++++++- test/gstack-gbrain-sync.test.ts | 68 +++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/bin/gstack-gbrain-sync.ts b/bin/gstack-gbrain-sync.ts index 0f0263f12..21388aac7 100644 --- a/bin/gstack-gbrain-sync.ts +++ b/bin/gstack-gbrain-sync.ts @@ -186,7 +186,14 @@ function deriveCodeSourceId(repoPath: string): string { if (remote) { const segs = remote.split("/").filter(Boolean); const slugSource = segs.slice(-2).join("-"); - return constrainSourceId("gstack-code", `${slugSource}-${hostPathHash}`); + const fullId = constrainSourceId("gstack-code", `${slugSource}-${hostPathHash}`); + // If the org+repo+hostpathhash fits cleanly (suffix preserved), use it. + if (fullId.endsWith(`-${hostPathHash}`)) return fullId; + // Otherwise drop the org prefix and retry with just repo+hostpathhash so + // the repo name stays readable. If that still doesn't fit, + // constrainSourceId falls back to a deterministic hash-only form. + const repoOnly = segs[segs.length - 1] || "repo"; + return constrainSourceId("gstack-code", `${repoOnly}-${hostPathHash}`); } const base = repoPath.split("/").pop() || "repo"; return constrainSourceId("gstack-code", `${base}-${hostPathHash}`); @@ -383,6 +390,10 @@ export function removeOrphanedSource(oldId: string): boolean { * Build a gbrain-valid source id (1-32 lowercase alnum + interior hyphens). Sanitizes * `raw`, prefixes with `prefix`, and falls back to a hashed-tail form when total length * would exceed 32 chars. + * + * Truncation cuts on hyphen boundaries (whole-word units) from the right, never + * mid-word. Inputs like "drummerms-av-sow-wiz-skill-270c0001" produce + * "${prefix}-270c0001-", not "${prefix}-kill-270c0001-". */ function constrainSourceId(prefix: string, raw: string): string { const MAX = 32; @@ -401,7 +412,20 @@ function constrainSourceId(prefix: string, raw: string): string { // Total budget: prefix + "-" + tail + "-" + hash const tailBudget = MAX - prefix.length - 2 - hash.length; if (tailBudget < 1) return `${prefix}-${hash}`; - const tail = slug.slice(-tailBudget).replace(/^-+|-+$/g, ""); + // Cut on hyphen boundaries instead of mid-word. Walk tokens from the right, + // accumulating until adding the next token would exceed tailBudget. This + // preserves readable suffixes (pathhash, repo name) and avoids embarrassing + // mid-word artifacts like "skill" → "kill". + const tokens = slug.split("-").filter(Boolean); + const kept: string[] = []; + let len = 0; + for (let i = tokens.length - 1; i >= 0; i--) { + const add = kept.length === 0 ? tokens[i].length : tokens[i].length + 1; + if (len + add > tailBudget) break; + kept.unshift(tokens[i]); + len += add; + } + const tail = kept.join("-"); return tail ? `${prefix}-${tail}-${hash}` : `${prefix}-${hash}`; } diff --git a/test/gstack-gbrain-sync.test.ts b/test/gstack-gbrain-sync.test.ts index 74b0e2e8f..26711913d 100644 --- a/test/gstack-gbrain-sync.test.ts +++ b/test/gstack-gbrain-sync.test.ts @@ -728,6 +728,74 @@ describe("planHostnameFoldMigration", () => { }); }); +describe("constrainSourceId truncation (hyphen-boundary cut)", () => { + // PR #1481 (Drummerms): the old slug.slice(-tailBudget) cut mid-word when + // the boundary fell inside a token. For a long repo like + // `drummerms-av-sow-wiz-skill-270c0001` the truncated tail used to end in + // `kill-270c0001` (from `skill`). The new tokenized cut walks hyphen + // boundaries from the right and only keeps whole tokens. + // + // Exercised via the dry-run preview (`gbrain sources add gstack-code-…`), + // since constrainSourceId is module-private. + it("never produces mid-word truncation artifacts like `kill` (from `skill`)", () => { + const home = makeTestHome(); + const gstackHome = join(home, ".gstack"); + mkdirSync(gstackHome, { recursive: true }); + const repo = mkdtempSync(join(tmpdir(), "gstack-hyphen-cut-")); + spawnSync("git", ["init", "--quiet", "-b", "main"], { cwd: repo }); + // Remote chosen to be long enough that constrainSourceId truncates and + // the boundary lands inside the word `skill`. + spawnSync("git", ["remote", "add", "origin", "https://github.com/drummerms-av-sow-wiz/skill-270c0001.git"], { cwd: repo }); + + const r = spawnSync("bun", [SCRIPT, "--dry-run", "--code-only", "--quiet"], { + encoding: "utf-8", + timeout: 60000, + cwd: repo, + env: { ...process.env, HOME: home, GSTACK_HOME: gstackHome }, + }); + expect(r.status).toBe(0); + const id = (r.stdout || "").match(/gbrain sources add (\S+)/)?.[1]; + expect(id).toBeTruthy(); + // The id must not contain the mid-word fragment `kill` (left over from + // slicing inside `skill`). Tokens that survive truncation must be whole. + expect(id).not.toMatch(/(^|-)kill(-|$)/); + // Still gbrain-valid. + expect(id!.length).toBeLessThanOrEqual(32); + expect(id!).toMatch(/^[a-z0-9](?:[a-z0-9-]{0,30}[a-z0-9])?$/); + + rmSync(repo, { recursive: true, force: true }); + rmSync(home, { recursive: true, force: true }); + }); + + // Closes #1357: HTTPS remotes ending in `.git` used to pass periods through + // to the source id. canonicalizeRemote strips the `.git` suffix; the + // sanitizer also strips any residual non-alnum. Test asserts the source id + // is period-free for the exact case from the issue. + it("produces a period-free source id for HTTPS remotes ending in .git (#1357)", () => { + const home = makeTestHome(); + const gstackHome = join(home, ".gstack"); + mkdirSync(gstackHome, { recursive: true }); + const repo = mkdtempSync(join(tmpdir(), "gstack-https-period-")); + spawnSync("git", ["init", "--quiet", "-b", "main"], { cwd: repo }); + spawnSync("git", ["remote", "add", "origin", "https://github.com/foo/bar.git"], { cwd: repo }); + + const r = spawnSync("bun", [SCRIPT, "--dry-run", "--code-only", "--quiet"], { + encoding: "utf-8", + timeout: 60000, + cwd: repo, + env: { ...process.env, HOME: home, GSTACK_HOME: gstackHome }, + }); + expect(r.status).toBe(0); + const id = (r.stdout || "").match(/gbrain sources add (\S+)/)?.[1]; + expect(id).toBeTruthy(); + expect(id).not.toContain("."); + expect(id!).toMatch(/^[a-z0-9](?:[a-z0-9-]{0,30}[a-z0-9])?$/); + + rmSync(repo, { recursive: true, force: true }); + rmSync(home, { recursive: true, force: true }); + }); +}); + describe("sourceLocalPath", () => { let bindir: string; beforeEach(() => {