From bf7a31e42716d15d885bc0af5aafbd06b8569a68 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Mon, 18 May 2026 21:19:35 -0700 Subject: [PATCH] fix(codex): surface non-zero exits so wrappers stop reading as silent stalls (#1467, #1327) When codex exits non-zero (parse errors, arg-shape breaks, model API errors that propagate as non-zero status), the calling agent previously saw an empty output and burned 30-60 minutes misdiagnosing as a silent model/API stall. The hang-detection block only caught exit 124 (the timeout-wrapper signal). Adds elif blocks in all four codex invocation sites (Review default, Challenge, Consult new-session, Consult resume) that: - Echo "[codex exit N] " plus the first non-empty stderr line (or "no stderr captured" when stderr is empty or missing) to stdout - Indent the first 20 stderr lines for inline context - Log codex_nonzero_exit telemetry tagged with the call site Contributed by @genisis0x via #1467. Closes #1327. Co-Authored-By: Claude Opus 4.7 (1M context) --- codex/SKILL.md | 25 +++++++++++++++++++++++++ codex/SKILL.md.tmpl | 25 +++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/codex/SKILL.md b/codex/SKILL.md index d74a52588..dbc6bbcb6 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -955,6 +955,13 @@ if [ "$_CODEX_EXIT" = "124" ]; then _gstack_codex_log_event "codex_timeout" "330" _gstack_codex_log_hang "review" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)" echo "Codex stalled past 5.5 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/." +elif [ "$_CODEX_EXIT" != "0" ]; then + # Surface non-zero exits (parse errors, arg-shape breaks, etc.) so the + # calling agent doesn't read "no output" as a silent model/API stall and + # burn 30-60min misdiagnosing it. See #1327. grep -m1, not head -1: head exits 0 on an empty file, skipping the fallback.
+ echo "[codex exit $_CODEX_EXIT] $(grep -m1 . "$TMPERR" 2>/dev/null || echo "no stderr captured")" + head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true + _gstack_codex_log_event "codex_nonzero_exit" "review:$_CODEX_EXIT" fi ``` @@ -1245,6 +1252,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then _gstack_codex_log_event "codex_timeout" "600" _gstack_codex_log_hang "challenge" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)" echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/." +elif [ "$_CODEX_EXIT" != "0" ]; then + # Surface non-zero exits so the calling agent doesn't read "no output" as + # a silent model/API stall. See #1327. grep -m1, not head -1: head exits 0 on an empty file, skipping the fallback. + echo "[codex exit $_CODEX_EXIT] $(grep -m1 . "$TMPERR" 2>/dev/null || echo "no stderr captured")" + head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true + _gstack_codex_log_event "codex_nonzero_exit" "challenge:$_CODEX_EXIT" fi # Fix 2: surface auth errors from captured stderr instead of dropping them if grep -qiE "auth|login|unauthorized" "$TMPERR" 2>/dev/null; then @@ -1392,6 +1405,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then _gstack_codex_log_event "codex_timeout" "600" _gstack_codex_log_hang "consult" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)" echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/." +elif [ "$_CODEX_EXIT" != "0" ]; then + # Surface non-zero exits so the calling agent doesn't read "no output" as + # a silent model/API stall. See #1327. grep -m1, not head -1: head exits 0 on an empty file, skipping the fallback.
+ echo "[codex exit $_CODEX_EXIT] $(grep -m1 . "$TMPERR" 2>/dev/null || echo "no stderr captured")" + head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true + _gstack_codex_log_event "codex_nonzero_exit" "consult:$_CODEX_EXIT" fi ``` @@ -1414,6 +1433,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then _gstack_codex_log_event "codex_timeout" "600" _gstack_codex_log_hang "consult-resume" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)" echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/." +elif [ "$_CODEX_EXIT" != "0" ]; then + # Surface non-zero exits so the calling agent doesn't read "no output" as + # a silent model/API stall. See #1327. grep -m1, not head -1: head exits 0 on an empty file, skipping the fallback. + echo "[codex exit $_CODEX_EXIT] $(grep -m1 . "$TMPERR" 2>/dev/null || echo "no stderr captured")" + head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true + _gstack_codex_log_event "codex_nonzero_exit" "consult-resume:$_CODEX_EXIT" fi 5. Capture session ID from the streamed output. The parser prints `SESSION_ID:` diff --git a/codex/SKILL.md.tmpl b/codex/SKILL.md.tmpl index 6cda8a5a2..333de7d8d 100644 --- a/codex/SKILL.md.tmpl +++ b/codex/SKILL.md.tmpl @@ -183,6 +183,13 @@ if [ "$_CODEX_EXIT" = "124" ]; then _gstack_codex_log_event "codex_timeout" "330" _gstack_codex_log_hang "review" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)" echo "Codex stalled past 5.5 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/." +elif [ "$_CODEX_EXIT" != "0" ]; then + # Surface non-zero exits (parse errors, arg-shape breaks, etc.) so the + # calling agent doesn't read "no output" as a silent model/API stall and + # burn 30-60min misdiagnosing it. See #1327. grep -m1, not head -1: head exits 0 on an empty file, skipping the fallback.
+ echo "[codex exit $_CODEX_EXIT] $(grep -m1 . "$TMPERR" 2>/dev/null || echo "no stderr captured")" + head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true + _gstack_codex_log_event "codex_nonzero_exit" "review:$_CODEX_EXIT" fi ``` @@ -366,6 +373,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then _gstack_codex_log_event "codex_timeout" "600" _gstack_codex_log_hang "challenge" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)" echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/." +elif [ "$_CODEX_EXIT" != "0" ]; then + # Surface non-zero exits so the calling agent doesn't read "no output" as + # a silent model/API stall. See #1327. grep -m1, not head -1: head exits 0 on an empty file, skipping the fallback. + echo "[codex exit $_CODEX_EXIT] $(grep -m1 . "$TMPERR" 2>/dev/null || echo "no stderr captured")" + head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true + _gstack_codex_log_event "codex_nonzero_exit" "challenge:$_CODEX_EXIT" fi # Fix 2: surface auth errors from captured stderr instead of dropping them if grep -qiE "auth|login|unauthorized" "$TMPERR" 2>/dev/null; then @@ -513,6 +526,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then _gstack_codex_log_event "codex_timeout" "600" _gstack_codex_log_hang "consult" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)" echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/." +elif [ "$_CODEX_EXIT" != "0" ]; then + # Surface non-zero exits so the calling agent doesn't read "no output" as + # a silent model/API stall. See #1327. grep -m1, not head -1: head exits 0 on an empty file, skipping the fallback.
+ echo "[codex exit $_CODEX_EXIT] $(grep -m1 . "$TMPERR" 2>/dev/null || echo "no stderr captured")" + head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true + _gstack_codex_log_event "codex_nonzero_exit" "consult:$_CODEX_EXIT" fi ``` @@ -535,6 +554,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then _gstack_codex_log_event "codex_timeout" "600" _gstack_codex_log_hang "consult-resume" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)" echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/." +elif [ "$_CODEX_EXIT" != "0" ]; then + # Surface non-zero exits so the calling agent doesn't read "no output" as + # a silent model/API stall. See #1327. grep -m1, not head -1: head exits 0 on an empty file, skipping the fallback. + echo "[codex exit $_CODEX_EXIT] $(grep -m1 . "$TMPERR" 2>/dev/null || echo "no stderr captured")" + head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true + _gstack_codex_log_event "codex_nonzero_exit" "consult-resume:$_CODEX_EXIT" fi 5. Capture session ID from the streamed output. The parser prints `SESSION_ID:`