fix(codex): surface non-zero exits so wrappers stop reading as silent stalls (#1467, #1327)

When codex exits non-zero (parse errors, arg-shape breaks, model API
errors that propagate as non-zero status), the calling agent
previously saw empty output and burned 30-60 minutes misdiagnosing
it as a silent model/API stall. The hang-detection block only caught
exit 124 (the timeout-wrapper signal).

Adds elif blocks in all four codex invocation sites (Review default,
Challenge, Consult new-session, Consult resume) that:
- Echo "[codex exit N] <stderr first line>" to stdout
- Indent the first 20 stderr lines for inline context
- Log codex_nonzero_exit telemetry tagged with the call site

Contributed by @genisis0x via #1467. Closes #1327.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan 2026-05-18 21:19:35 -07:00
parent 75872b9541
commit bf7a31e427
No known key found for this signature in database
GPG Key ID: C1F69E85C74EFE1D
2 changed files with 50 additions and 0 deletions

View File

@ -955,6 +955,13 @@ if [ "$_CODEX_EXIT" = "124" ]; then
_gstack_codex_log_event "codex_timeout" "330"
_gstack_codex_log_hang "review" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
echo "Codex stalled past 5.5 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
elif [ "$_CODEX_EXIT" != "0" ]; then
# Surface non-zero exits (parse errors, arg-shape breaks, etc.) so the
# calling agent doesn't read "no output" as a silent model/API stall and
# burn 30-60min misdiagnosing it. See #1327.
#
# `head` exits 0 on an existing-but-empty file, so a `head ... || echo`
# fallback never fires when codex wrote nothing to stderr. Capture the
# first line and default it with ${var:-...} so an empty, missing, or
# unreadable $TMPERR still produces a readable message.
_CODEX_ERR_FIRST="$(head -n 1 "$TMPERR" 2>/dev/null || true)"
echo "[codex exit $_CODEX_EXIT] ${_CODEX_ERR_FIRST:-no stderr captured}"
# Indented excerpt (up to 20 stderr lines) for inline context; best-effort.
head -n 20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
# Record the event, tagged with the call site and the exit code.
_gstack_codex_log_event "codex_nonzero_exit" "review:$_CODEX_EXIT"
fi
```
@ -1245,6 +1252,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
_gstack_codex_log_event "codex_timeout" "600"
_gstack_codex_log_hang "challenge" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
elif [ "$_CODEX_EXIT" != "0" ]; then
# Surface non-zero exits so the calling agent doesn't read "no output" as
# a silent model/API stall. See #1327.
#
# `head` exits 0 on an existing-but-empty file, so a `head ... || echo`
# fallback never fires when codex wrote nothing to stderr. Capture the
# first line and default it with ${var:-...} so an empty, missing, or
# unreadable $TMPERR still produces a readable message.
_CODEX_ERR_FIRST="$(head -n 1 "$TMPERR" 2>/dev/null || true)"
echo "[codex exit $_CODEX_EXIT] ${_CODEX_ERR_FIRST:-no stderr captured}"
# Indented excerpt (up to 20 stderr lines) for inline context; best-effort.
head -n 20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
# Record the event, tagged with the call site and the exit code.
_gstack_codex_log_event "codex_nonzero_exit" "challenge:$_CODEX_EXIT"
fi
# Fix 2: surface auth errors from captured stderr instead of dropping them
if grep -qiE "auth|login|unauthorized" "$TMPERR" 2>/dev/null; then
@ -1392,6 +1405,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
_gstack_codex_log_event "codex_timeout" "600"
_gstack_codex_log_hang "consult" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
elif [ "$_CODEX_EXIT" != "0" ]; then
# Surface non-zero exits so the calling agent doesn't read "no output" as
# a silent model/API stall. See #1327.
#
# `head` exits 0 on an existing-but-empty file, so a `head ... || echo`
# fallback never fires when codex wrote nothing to stderr. Capture the
# first line and default it with ${var:-...} so an empty, missing, or
# unreadable $TMPERR still produces a readable message.
_CODEX_ERR_FIRST="$(head -n 1 "$TMPERR" 2>/dev/null || true)"
echo "[codex exit $_CODEX_EXIT] ${_CODEX_ERR_FIRST:-no stderr captured}"
# Indented excerpt (up to 20 stderr lines) for inline context; best-effort.
head -n 20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
# Record the event, tagged with the call site and the exit code.
_gstack_codex_log_event "codex_nonzero_exit" "consult:$_CODEX_EXIT"
fi
```
@ -1414,6 +1433,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
_gstack_codex_log_event "codex_timeout" "600"
_gstack_codex_log_hang "consult-resume" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
elif [ "$_CODEX_EXIT" != "0" ]; then
# Surface non-zero exits so the calling agent doesn't read "no output" as
# a silent model/API stall. See #1327.
#
# `head` exits 0 on an existing-but-empty file, so a `head ... || echo`
# fallback never fires when codex wrote nothing to stderr. Capture the
# first line and default it with ${var:-...} so an empty, missing, or
# unreadable $TMPERR still produces a readable message.
_CODEX_ERR_FIRST="$(head -n 1 "$TMPERR" 2>/dev/null || true)"
echo "[codex exit $_CODEX_EXIT] ${_CODEX_ERR_FIRST:-no stderr captured}"
# Indented excerpt (up to 20 stderr lines) for inline context; best-effort.
head -n 20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
# Record the event, tagged with the call site and the exit code.
_gstack_codex_log_event "codex_nonzero_exit" "consult-resume:$_CODEX_EXIT"
fi
5. Capture session ID from the streamed output. The parser prints `SESSION_ID:<id>`

View File

@ -183,6 +183,13 @@ if [ "$_CODEX_EXIT" = "124" ]; then
_gstack_codex_log_event "codex_timeout" "330"
_gstack_codex_log_hang "review" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
echo "Codex stalled past 5.5 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
elif [ "$_CODEX_EXIT" != "0" ]; then
# Surface non-zero exits (parse errors, arg-shape breaks, etc.) so the
# calling agent doesn't read "no output" as a silent model/API stall and
# burn 30-60min misdiagnosing it. See #1327.
#
# `head` exits 0 on an existing-but-empty file, so a `head ... || echo`
# fallback never fires when codex wrote nothing to stderr. Capture the
# first line and default it with ${var:-...} so an empty, missing, or
# unreadable $TMPERR still produces a readable message.
_CODEX_ERR_FIRST="$(head -n 1 "$TMPERR" 2>/dev/null || true)"
echo "[codex exit $_CODEX_EXIT] ${_CODEX_ERR_FIRST:-no stderr captured}"
# Indented excerpt (up to 20 stderr lines) for inline context; best-effort.
head -n 20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
# Record the event, tagged with the call site and the exit code.
_gstack_codex_log_event "codex_nonzero_exit" "review:$_CODEX_EXIT"
fi
```
@ -366,6 +373,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
_gstack_codex_log_event "codex_timeout" "600"
_gstack_codex_log_hang "challenge" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
elif [ "$_CODEX_EXIT" != "0" ]; then
# Surface non-zero exits so the calling agent doesn't read "no output" as
# a silent model/API stall. See #1327.
#
# `head` exits 0 on an existing-but-empty file, so a `head ... || echo`
# fallback never fires when codex wrote nothing to stderr. Capture the
# first line and default it with ${var:-...} so an empty, missing, or
# unreadable $TMPERR still produces a readable message.
_CODEX_ERR_FIRST="$(head -n 1 "$TMPERR" 2>/dev/null || true)"
echo "[codex exit $_CODEX_EXIT] ${_CODEX_ERR_FIRST:-no stderr captured}"
# Indented excerpt (up to 20 stderr lines) for inline context; best-effort.
head -n 20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
# Record the event, tagged with the call site and the exit code.
_gstack_codex_log_event "codex_nonzero_exit" "challenge:$_CODEX_EXIT"
fi
# Fix 2: surface auth errors from captured stderr instead of dropping them
if grep -qiE "auth|login|unauthorized" "$TMPERR" 2>/dev/null; then
@ -513,6 +526,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
_gstack_codex_log_event "codex_timeout" "600"
_gstack_codex_log_hang "consult" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
elif [ "$_CODEX_EXIT" != "0" ]; then
# Surface non-zero exits so the calling agent doesn't read "no output" as
# a silent model/API stall. See #1327.
#
# `head` exits 0 on an existing-but-empty file, so a `head ... || echo`
# fallback never fires when codex wrote nothing to stderr. Capture the
# first line and default it with ${var:-...} so an empty, missing, or
# unreadable $TMPERR still produces a readable message.
_CODEX_ERR_FIRST="$(head -n 1 "$TMPERR" 2>/dev/null || true)"
echo "[codex exit $_CODEX_EXIT] ${_CODEX_ERR_FIRST:-no stderr captured}"
# Indented excerpt (up to 20 stderr lines) for inline context; best-effort.
head -n 20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
# Record the event, tagged with the call site and the exit code.
_gstack_codex_log_event "codex_nonzero_exit" "consult:$_CODEX_EXIT"
fi
```
@ -535,6 +554,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
_gstack_codex_log_event "codex_timeout" "600"
_gstack_codex_log_hang "consult-resume" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
elif [ "$_CODEX_EXIT" != "0" ]; then
# Surface non-zero exits so the calling agent doesn't read "no output" as
# a silent model/API stall. See #1327.
#
# `head` exits 0 on an existing-but-empty file, so a `head ... || echo`
# fallback never fires when codex wrote nothing to stderr. Capture the
# first line and default it with ${var:-...} so an empty, missing, or
# unreadable $TMPERR still produces a readable message.
_CODEX_ERR_FIRST="$(head -n 1 "$TMPERR" 2>/dev/null || true)"
echo "[codex exit $_CODEX_EXIT] ${_CODEX_ERR_FIRST:-no stderr captured}"
# Indented excerpt (up to 20 stderr lines) for inline context; best-effort.
head -n 20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
# Record the event, tagged with the call site and the exit code.
_gstack_codex_log_event "codex_nonzero_exit" "consult-resume:$_CODEX_EXIT"
fi
5. Capture session ID from the streamed output. The parser prints `SESSION_ID:<id>`