fix(gbrain-sync): cut source-id slugs on hyphen boundaries (+ #1357)

Cherry-picked from #1481 by drummerms and extended with the explicit
HTTPS-remote regression case for #1357 (decision D2=A).

`constrainSourceId` truncated the slug with `slug.slice(-tailBudget)`,
which cut mid-word when the boundary fell inside a token. For a repo
where the combined `prefix-org-repo-pathhash` exceeded 32 chars, this
produced embarrassing artifacts like `gstack-code-kill-270c0001-c32152`
(from `drummerms-av-sow-wiz-skill-270c0001`).

Two changes carried from #1481, adapted for the #1468 hostpathhash:

1. `constrainSourceId` now walks hyphen-separated tokens from the right,
   accumulating whole tokens until adding the next would exceed
   `tailBudget`. When no token fits, falls through to the existing
   `${prefix}-${hash}` form.

2. `deriveCodeSourceId` now retries with `repo-only-hostpathhash`
   (dropping the org segment) when the full `org-repo-hostpathhash`
   triggers truncation. Keeps the repo name readable when it fits at all.

Plus a new test asserting the source id is period-free for the exact
HTTPS-with-.git remote shape from #1357 (`https://github.com/foo/bar.git`).
canonicalizeRemote strips `.git`; the sanitizer strips any residual
non-alnum. The test closes #1357 by pinning the property.

Closes #1357

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
drummerms 2026-05-16 13:56:23 -07:00 committed by Garry Tan
parent cdd3a2f77a
commit cfd2b5d792
No known key found for this signature in database
GPG Key ID: C1F69E85C74EFE1D
2 changed files with 94 additions and 2 deletions

View File

@ -186,7 +186,14 @@ function deriveCodeSourceId(repoPath: string): string {
if (remote) {
const segs = remote.split("/").filter(Boolean);
const slugSource = segs.slice(-2).join("-");
return constrainSourceId("gstack-code", `${slugSource}-${hostPathHash}`);
const fullId = constrainSourceId("gstack-code", `${slugSource}-${hostPathHash}`);
// If the org+repo+hostpathhash fits cleanly (suffix preserved), use it.
if (fullId.endsWith(`-${hostPathHash}`)) return fullId;
// Otherwise drop the org prefix and retry with just repo+hostpathhash so
// the repo name stays readable. If that still doesn't fit,
// constrainSourceId falls back to a deterministic hash-only form.
const repoOnly = segs[segs.length - 1] || "repo";
return constrainSourceId("gstack-code", `${repoOnly}-${hostPathHash}`);
}
const base = repoPath.split("/").pop() || "repo";
return constrainSourceId("gstack-code", `${base}-${hostPathHash}`);
@ -383,6 +390,10 @@ export function removeOrphanedSource(oldId: string): boolean {
* Build a gbrain-valid source id (1-32 lowercase alnum + interior hyphens). Sanitizes
* `raw`, prefixes with `prefix`, and falls back to a hashed-tail form when total length
* would exceed 32 chars.
*
* Truncation cuts on hyphen boundaries (whole-word units) from the right, never
* mid-word. Inputs like "drummerms-av-sow-wiz-skill-270c0001" produce
* "${prefix}-270c0001-<hash>", not "${prefix}-kill-270c0001-<hash>".
*/
function constrainSourceId(prefix: string, raw: string): string {
const MAX = 32;
@ -401,7 +412,20 @@ function constrainSourceId(prefix: string, raw: string): string {
// Total budget: prefix + "-" + tail + "-" + hash
const tailBudget = MAX - prefix.length - 2 - hash.length;
if (tailBudget < 1) return `${prefix}-${hash}`;
const tail = slug.slice(-tailBudget).replace(/^-+|-+$/g, "");
// Cut on hyphen boundaries instead of mid-word. Walk tokens from the right,
// accumulating until adding the next token would exceed tailBudget. This
// preserves readable suffixes (pathhash, repo name) and avoids embarrassing
// mid-word artifacts like "skill" → "kill".
const tokens = slug.split("-").filter(Boolean);
const kept: string[] = [];
let len = 0;
for (let i = tokens.length - 1; i >= 0; i--) {
const add = kept.length === 0 ? tokens[i].length : tokens[i].length + 1;
if (len + add > tailBudget) break;
kept.unshift(tokens[i]);
len += add;
}
const tail = kept.join("-");
return tail ? `${prefix}-${tail}-${hash}` : `${prefix}-${hash}`;
}

View File

@ -728,6 +728,74 @@ describe("planHostnameFoldMigration", () => {
});
});
describe("constrainSourceId truncation (hyphen-boundary cut)", () => {
// PR #1481 (Drummerms): the old slug.slice(-tailBudget) cut mid-word when
// the boundary fell inside a token. For a long repo like
// `drummerms-av-sow-wiz-skill-270c0001` the truncated tail used to end in
// `kill-270c0001` (from `skill`). The new tokenized cut walks hyphen
// boundaries from the right and only keeps whole tokens.
//
// Exercised via the dry-run preview (`gbrain sources add gstack-code-…`),
// since constrainSourceId is module-private.
it("never produces mid-word truncation artifacts like `kill` (from `skill`)", () => {
const home = makeTestHome();
const gstackHome = join(home, ".gstack");
mkdirSync(gstackHome, { recursive: true });
const repo = mkdtempSync(join(tmpdir(), "gstack-hyphen-cut-"));
spawnSync("git", ["init", "--quiet", "-b", "main"], { cwd: repo });
// Remote chosen to be long enough that constrainSourceId truncates and
// the boundary lands inside the word `skill`.
spawnSync("git", ["remote", "add", "origin", "https://github.com/drummerms-av-sow-wiz/skill-270c0001.git"], { cwd: repo });
const r = spawnSync("bun", [SCRIPT, "--dry-run", "--code-only", "--quiet"], {
encoding: "utf-8",
timeout: 60000,
cwd: repo,
env: { ...process.env, HOME: home, GSTACK_HOME: gstackHome },
});
expect(r.status).toBe(0);
const id = (r.stdout || "").match(/gbrain sources add (\S+)/)?.[1];
expect(id).toBeTruthy();
// The id must not contain the mid-word fragment `kill` (left over from
// slicing inside `skill`). Tokens that survive truncation must be whole.
expect(id).not.toMatch(/(^|-)kill(-|$)/);
// Still gbrain-valid.
expect(id!.length).toBeLessThanOrEqual(32);
expect(id!).toMatch(/^[a-z0-9](?:[a-z0-9-]{0,30}[a-z0-9])?$/);
rmSync(repo, { recursive: true, force: true });
rmSync(home, { recursive: true, force: true });
});
// Closes #1357: HTTPS remotes ending in `.git` used to pass periods through
// to the source id. canonicalizeRemote strips the `.git` suffix; the
// sanitizer also strips any residual non-alnum. Test asserts the source id
// is period-free for the exact case from the issue.
it("produces a period-free source id for HTTPS remotes ending in .git (#1357)", () => {
const home = makeTestHome();
const gstackHome = join(home, ".gstack");
mkdirSync(gstackHome, { recursive: true });
const repo = mkdtempSync(join(tmpdir(), "gstack-https-period-"));
spawnSync("git", ["init", "--quiet", "-b", "main"], { cwd: repo });
spawnSync("git", ["remote", "add", "origin", "https://github.com/foo/bar.git"], { cwd: repo });
const r = spawnSync("bun", [SCRIPT, "--dry-run", "--code-only", "--quiet"], {
encoding: "utf-8",
timeout: 60000,
cwd: repo,
env: { ...process.env, HOME: home, GSTACK_HOME: gstackHome },
});
expect(r.status).toBe(0);
const id = (r.stdout || "").match(/gbrain sources add (\S+)/)?.[1];
expect(id).toBeTruthy();
expect(id).not.toContain(".");
expect(id!).toMatch(/^[a-z0-9](?:[a-z0-9-]{0,30}[a-z0-9])?$/);
rmSync(repo, { recursive: true, force: true });
rmSync(home, { recursive: true, force: true });
});
});
describe("sourceLocalPath", () => {
let bindir: string;
beforeEach(() => {