mirror of https://github.com/garrytan/gstack.git
When codex exits non-zero (parse errors, arg-shape breaks, model API errors that propagate as non-zero status), the calling agent previously saw an empty output and burned 30-60 minutes misdiagnosing it as a silent model/API stall. The hang-detection block only caught exit 124 (the timeout-wrapper signal).

Adds elif blocks in all four codex invocation sites (Review default, Challenge, Consult new-session, Consult resume) that:
- Echo "[codex exit N] <stderr first line>" to stdout
- Indent the first 20 stderr lines for inline context
- Log codex_nonzero_exit telemetry tagged with the call site

Contributed by @genisis0x via #1467. Closes #1327.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
75872b9541
commit
bf7a31e427
|
|
@@ -955,6 +955,13 @@ if [ "$_CODEX_EXIT" = "124" ]; then
|
|||
_gstack_codex_log_event "codex_timeout" "330"
|
||||
_gstack_codex_log_hang "review" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
|
||||
echo "Codex stalled past 5.5 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
|
||||
elif [ "$_CODEX_EXIT" != "0" ]; then
|
||||
# Surface non-zero exits (parse errors, arg-shape breaks, etc.) so the
|
||||
# calling agent doesn't read "no output" as a silent model/API stall and
|
||||
# burn 30-60min misdiagnosing it. See #1327.
|
||||
echo "[codex exit $_CODEX_EXIT] $(head -1 "$TMPERR" 2>/dev/null || echo "no stderr captured")"
|
||||
head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
|
||||
_gstack_codex_log_event "codex_nonzero_exit" "review:$_CODEX_EXIT"
|
||||
fi
|
||||
```
|
||||
|
||||
|
|
@@ -1245,6 +1252,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
|
|||
_gstack_codex_log_event "codex_timeout" "600"
|
||||
_gstack_codex_log_hang "challenge" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
|
||||
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
|
||||
elif [ "$_CODEX_EXIT" != "0" ]; then
|
||||
# Surface non-zero exits so the calling agent doesn't read "no output" as
|
||||
# a silent model/API stall. See #1327.
|
||||
echo "[codex exit $_CODEX_EXIT] $(head -1 "$TMPERR" 2>/dev/null || echo "no stderr captured")"
|
||||
head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
|
||||
_gstack_codex_log_event "codex_nonzero_exit" "challenge:$_CODEX_EXIT"
|
||||
fi
|
||||
# Fix 2: surface auth errors from captured stderr instead of dropping them
|
||||
if grep -qiE "auth|login|unauthorized" "$TMPERR" 2>/dev/null; then
|
||||
|
|
@@ -1392,6 +1405,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
|
|||
_gstack_codex_log_event "codex_timeout" "600"
|
||||
_gstack_codex_log_hang "consult" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
|
||||
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
|
||||
elif [ "$_CODEX_EXIT" != "0" ]; then
|
||||
# Surface non-zero exits so the calling agent doesn't read "no output" as
|
||||
# a silent model/API stall. See #1327.
|
||||
echo "[codex exit $_CODEX_EXIT] $(head -1 "$TMPERR" 2>/dev/null || echo "no stderr captured")"
|
||||
head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
|
||||
_gstack_codex_log_event "codex_nonzero_exit" "consult:$_CODEX_EXIT"
|
||||
fi
|
||||
```
|
||||
|
||||
|
|
@@ -1414,6 +1433,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
|
|||
_gstack_codex_log_event "codex_timeout" "600"
|
||||
_gstack_codex_log_hang "consult-resume" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
|
||||
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
|
||||
elif [ "$_CODEX_EXIT" != "0" ]; then
|
||||
# Surface non-zero exits so the calling agent doesn't read "no output" as
|
||||
# a silent model/API stall. See #1327.
|
||||
echo "[codex exit $_CODEX_EXIT] $(head -1 "$TMPERR" 2>/dev/null || echo "no stderr captured")"
|
||||
head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
|
||||
_gstack_codex_log_event "codex_nonzero_exit" "consult-resume:$_CODEX_EXIT"
|
||||
fi
|
||||
|
||||
5. Capture session ID from the streamed output. The parser prints `SESSION_ID:<id>`
|
||||
|
|
|
|||
|
|
@@ -183,6 +183,13 @@ if [ "$_CODEX_EXIT" = "124" ]; then
|
|||
_gstack_codex_log_event "codex_timeout" "330"
|
||||
_gstack_codex_log_hang "review" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
|
||||
echo "Codex stalled past 5.5 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
|
||||
elif [ "$_CODEX_EXIT" != "0" ]; then
|
||||
# Surface non-zero exits (parse errors, arg-shape breaks, etc.) so the
|
||||
# calling agent doesn't read "no output" as a silent model/API stall and
|
||||
# burn 30-60min misdiagnosing it. See #1327.
|
||||
echo "[codex exit $_CODEX_EXIT] $(head -1 "$TMPERR" 2>/dev/null || echo "no stderr captured")"
|
||||
head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
|
||||
_gstack_codex_log_event "codex_nonzero_exit" "review:$_CODEX_EXIT"
|
||||
fi
|
||||
```
|
||||
|
||||
|
|
@@ -366,6 +373,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
|
|||
_gstack_codex_log_event "codex_timeout" "600"
|
||||
_gstack_codex_log_hang "challenge" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
|
||||
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
|
||||
elif [ "$_CODEX_EXIT" != "0" ]; then
|
||||
# Surface non-zero exits so the calling agent doesn't read "no output" as
|
||||
# a silent model/API stall. See #1327.
|
||||
echo "[codex exit $_CODEX_EXIT] $(head -1 "$TMPERR" 2>/dev/null || echo "no stderr captured")"
|
||||
head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
|
||||
_gstack_codex_log_event "codex_nonzero_exit" "challenge:$_CODEX_EXIT"
|
||||
fi
|
||||
# Fix 2: surface auth errors from captured stderr instead of dropping them
|
||||
if grep -qiE "auth|login|unauthorized" "$TMPERR" 2>/dev/null; then
|
||||
|
|
@@ -513,6 +526,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
|
|||
_gstack_codex_log_event "codex_timeout" "600"
|
||||
_gstack_codex_log_hang "consult" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
|
||||
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
|
||||
elif [ "$_CODEX_EXIT" != "0" ]; then
|
||||
# Surface non-zero exits so the calling agent doesn't read "no output" as
|
||||
# a silent model/API stall. See #1327.
|
||||
echo "[codex exit $_CODEX_EXIT] $(head -1 "$TMPERR" 2>/dev/null || echo "no stderr captured")"
|
||||
head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
|
||||
_gstack_codex_log_event "codex_nonzero_exit" "consult:$_CODEX_EXIT"
|
||||
fi
|
||||
```
|
||||
|
||||
|
|
@@ -535,6 +554,12 @@ if [ "$_CODEX_EXIT" = "124" ]; then
|
|||
_gstack_codex_log_event "codex_timeout" "600"
|
||||
_gstack_codex_log_hang "consult-resume" "$(wc -c < "$TMPERR" 2>/dev/null || echo 0)"
|
||||
echo "Codex stalled past 10 minutes. Common causes: model API stall, long prompt, network issue. Try re-running. If persistent, split the prompt or check ~/.codex/logs/."
|
||||
elif [ "$_CODEX_EXIT" != "0" ]; then
|
||||
# Surface non-zero exits so the calling agent doesn't read "no output" as
|
||||
# a silent model/API stall. See #1327.
|
||||
echo "[codex exit $_CODEX_EXIT] $(head -1 "$TMPERR" 2>/dev/null || echo "no stderr captured")"
|
||||
head -20 "$TMPERR" 2>/dev/null | sed 's/^/ /' || true
|
||||
_gstack_codex_log_event "codex_nonzero_exit" "consult-resume:$_CODEX_EXIT"
|
||||
fi
|
||||
|
||||
5. Capture session ID from the streamed output. The parser prints `SESSION_ID:<id>`
|
||||
|
|
|
|||
Loading…
Reference in New Issue