#!/bin/bash set -Eeuo pipefail # Paxel upload script # =================== # # What this does (up to 17 steps): # On your machine # 1. Check Docker is installed and running # 2. Sign you in (browser-based device auth) # 3. Pull or build the Paxel Docker image # # Inside the container — file bodies stay local; only aggregate metrics + # metadata (paths, commit numstat, session events) are uploaded # 4. Discover projects and sessions (Claude Code, Codex CLI, Cursor) # 5. Read your git history # 6. Parse transcripts # 7. Summarize each session (cloud Haiku via YC proxy) # 8. Group git commits by session # 9. Group sessions into multi-day work streams # 10. Extract steering traces # 11. Extract decision exchanges (cloud Haiku) # 12. Redact code before upload (regex pattern redaction) # 13. Link decisions to outcomes # 14. Analyze code quality (L1 deterministic) # 15. Score episodes across 5 axes (cloud Haiku) # 16. Assemble your report # 17. Upload redacted summaries + scores to the server # # Then: opens your results in the browser # # Some steps are skipped when there's nothing to do (no new sessions, no # work streams, no server to upload to). 17 is the ceiling; fewer can run. # # What stays on this machine: # File bodies (source code contents), full raw transcripts, and raw plan # file contents. Diffs never leave — only aggregate line counts do. # What gets uploaded: # Scores, behavioral summaries, narrative outputs, redacted decision # records, session metadata (including file paths your agent Read/Edited/ # Created and bash commands it ran), per-commit numstat (touched paths # with added/deleted line counts), git commit metadata (sha, author, # date, subject), aggregate velocity/LOC stats, and pipeline telemetry. # Transcript excerpts (including snippets of tool calls) flow to Claude # through the YC LLM proxy for narrative analysis — the proxy logs # request/response to our Postgres for anti-gaming verification. See # /data-handling for the field-by-field breakdown. # # Caches and re-runs: # LLM call results cache in a local Docker volume (paxel-cache-). # Re-running on the same repo (or after a mid-pipeline failure) typically # hits 95%+ cache and finishes in minutes instead of re-doing every step. # The container itself is --rm — nothing persists inside. Only the LLM # cache survives between runs. # # After the upload: # The server runs one more pass — cohort anomaly detection, cross-session # analysis via embeddings, narrative synthesis, and (after 3+ uploads) # builder profile updates. That takes ~1-5 minutes after the container # finishes. The results page polls automatically. # # Review it or ask your agent before running: # curl -fsSL 'https://paxel.ycombinator.com/upload/upload.sh' -o paxel-upload.sh && less paxel-upload.sh # curl -fsSL 'https://paxel.ycombinator.com/upload/upload.sh' | claude -p "explain what this bash script does" # curl -fsSL 'https://paxel.ycombinator.com/upload/upload.sh' | codex exec "walk me through this script" # # Usage: # upload # Docker mode: analyze locally, upload scores # upload --project NAME # select project by repo name # upload --since 2m # sessions from last 2 months (recommended) # upload --all # skip auto-detect; analyze every project # upload --no-repo # skip repo mount (transcripts only) # upload --no-sentry # disable client-side error reporting for this run # upload --clear-cache # clear project-remote cache CACHE_DIR="${HOME}/.paxel/cache" mkdir -p "$CACHE_DIR" # Owner-only on the ~/.paxel dirs is the real protection: a 0700 directory # blocks other users on a shared host from traversing to ANY file inside # (cache, logs, data, git_metrics.txt, the LLM cache + pending-upload stash on # the data mount), regardless of each file's own mode. We deliberately do NOT # tighten the files themselves with a global umask — several are bind-mounted # read-only into the client container, which runs as uid 1000, so owner-only # files would become unreadable there on Linux hosts whose uid != 1000. # Dir-level 0700 keeps container reads working while closing the exposure. chmod 700 "${HOME}/.paxel" "$CACHE_DIR" 2>/dev/null || true cleanup_temp_dirs() { rm -f "${HOME}/.paxel/git_metrics.txt" rm -rf "${HOME}/.paxel/cache/filtered-transcripts-$$" rm -rf "${HOME}/.paxel/cache/cursor_extracted-$$" rm -rf "${HOME}/.paxel/cache/codex_extracted-$$" rm -rf "${HOME}/.paxel/cache/opencode_extracted-$$" rm -rf "${HOME}/.paxel/cache/gemini_extracted-$$" rm -rf "${HOME}/.paxel/cache/filtered-codex-$$" # Use the same helper the scan + bind-mount consult, so a future # override of _DOCKER_ALL_SIDECAR_DIR (or a relocation of the path # convention) stays consistent across all three call sites. if declare -f _docker_all_sidecar_dir >/dev/null 2>&1; then rm -rf "$(_docker_all_sidecar_dir)" fi [ -n "${_RMDC_LOG_FILE:-}" ] && rm -f "$_RMDC_LOG_FILE" } trap cleanup_temp_dirs EXIT # ERR trap: fires only on unhandled command failures (not on explicit `exit N` # after a user-friendly banner, and not on `|| true`-protected commands). # Prints exit code, failing command, line number, and a function stack so the # user has something concrete to send us. `set -E` above propagates this into # functions and subshells. _paxel_on_error() { local ec=$? local failed_line="${BASH_LINENO[0]:-?}" local failed_cmd="${BASH_COMMAND:-?}" { echo "" echo "────────────────────────────────────────────────────────" echo "Paxel upload hit an unexpected error." echo "" echo " exit code: $ec" echo " line: $failed_line" echo " command: $failed_cmd" if [ "${#FUNCNAME[@]}" -gt 1 ]; then echo " stack:" local i=0 while [ "$i" -lt "${#FUNCNAME[@]}" ]; do local fn="${FUNCNAME[$i]:-main}" local ln="${BASH_LINENO[$i]:-?}" echo " at ${fn} (line ${ln})" i=$((i + 1)) done fi echo "" echo "Please email paxel@ycombinator.com with the above (and the" echo "last ~30 lines of output) so we can fix it." echo "────────────────────────────────────────────────────────" } >&2 } trap _paxel_on_error ERR find "${HOME}/.paxel/cache" -maxdepth 1 -name "filtered-transcripts-*" -mmin +1440 -exec rm -rf {} + 2>/dev/null || true find "${HOME}/.paxel/cache" -maxdepth 1 -name "cursor_extracted-*" -mmin +1440 -exec rm -rf {} + 2>/dev/null || true find "${HOME}/.paxel/cache" -maxdepth 1 -name "codex_extracted-*" -mmin +1440 -exec rm -rf {} + 2>/dev/null || true find "${HOME}/.paxel/cache" -maxdepth 1 -name "opencode_extracted-*" -mmin +1440 -exec rm -rf {} + 2>/dev/null || true find "${HOME}/.paxel/cache" -maxdepth 1 -name "gemini_extracted-*" -mmin +1440 -exec rm -rf {} + 2>/dev/null || true find "${HOME}/.paxel/cache" -maxdepth 1 -name "filtered-codex-*" -mmin +1440 -exec rm -rf {} + 2>/dev/null || true find "${HOME}/.paxel/cache" -maxdepth 1 -name "docker-all-sidecar-*" -mmin +1440 -exec rm -rf {} + 2>/dev/null || true # Sweep stale run logs (timestamped replay-*.log + per-run extract logs) after # 14 days — they persist for debugging but shouldn't accumulate forever. find "${HOME}/.paxel/logs" -maxdepth 1 -type f -mtime +14 -exec rm -f {} + 2>/dev/null || true CACHE_FILE="$CACHE_DIR/transcripts.tar.gz" UPLOAD_URL="${UPLOAD_URL:-https://paxel.ycombinator.com/upload}" CLAUDE_DIR="${TRANSCRIPT_DIR:-${CLAUDE_DIR:-$HOME/.claude/projects}}" CODEX_DIR="${CODEX_DIR:-$HOME/.codex/sessions}" if [ "$(uname)" = "Darwin" ]; then CURSOR_DIR="${CURSOR_DIR:-$HOME/Library/Application Support/Cursor/User/workspaceStorage}" CURSOR_GLOBAL_DB="${CURSOR_GLOBAL_DB:-$HOME/Library/Application Support/Cursor/User/globalStorage/state.vscdb}" else CURSOR_DIR="${CURSOR_DIR:-$HOME/.config/Cursor/User/workspaceStorage}" CURSOR_GLOBAL_DB="${CURSOR_GLOBAL_DB:-$HOME/.config/Cursor/User/globalStorage/state.vscdb}" fi # opencode stores sessions in a SQLite DB under the XDG data dir on BOTH macOS # and Linux (it uses the xdg-basedir convention, not ~/Library on macOS). # collect_opencode_sessions scans OPENCODE_DIR for opencode*.db (covers channel # DBs like opencode-beta.db; WAL files end in -wal/-shm and don't match). # Set OPENCODE_DB to point at a single DB explicitly (used by the test harness). OPENCODE_DIR="${OPENCODE_DIR:-${XDG_DATA_HOME:-$HOME/.local/share}/opencode}" # Gemini CLI stores one JSONL transcript per session under ~/.gemini/tmp//chats/ # on BOTH macOS and Linux (its home dir is always ~/.gemini — there is no env-var # override in gemini-cli). Each dir has a .project_root naming the repo. # collect_gemini_sessions copies these raw (the server GeminiNormalizer reconstructs # them). Override GEMINI_DIR to point at a fixture tree (used by the test harness). GEMINI_DIR="${GEMINI_DIR:-$HOME/.gemini/tmp}" DRY_RUN="${DRY_RUN:-0}" PAXEL_SERVER="${PAXEL_SERVER:-https://paxel.ycombinator.com}" PAXEL_LLM_PROXY="${PAXEL_LLM_PROXY:-https://paxel-llm.ycombinator.com}" PAXEL_TOKEN_FILE="${HOME}/.paxel/token" PAXEL_CLIENT_IMAGE="${PAXEL_CLIENT_IMAGE:-ghcr.io/yc-software/paxel-client:latest}" PAXEL_REPO_ROOT="${PAXEL_REPO_ROOT:-}" PAXEL_BAKED_TOKEN="" # Defaults PROJECT_NAME="" ALL_PROJECTS=0 SINCE_EPOCH="" OLDEST_SESSION_EPOCH="" NO_REPO=0 # Grouped project data (parallel arrays for bash 3.2 compat) GROUP_REMOTES=() GROUP_DISPLAYS=() GROUP_DIRS=() # pipe-separated dir names per group GROUP_COUNTS=() # session count per group GROUP_DIR_COUNTS=() # workspace count per group # Selected project dirs (set by auto-detect or interactive selection) PROJECT_DIRS=() # Multi-repo mode state MULTI_REPO_RUNNING=0 MULTI_REPO_MODE="" MULTI_REPO_SELECTED=0 # Zero-based indices into the CHILD_REPO_* arrays for the selected subset. # Populated by show_child_repo_menu; consumed by run_selected_child_repos. MULTI_REPO_SELECTED_LIST=() CHILD_REPO_DIRS=() CHILD_REPO_REMOTES=() CHILD_REPO_NAMES=() CHILD_REPO_SESSIONS=() CHILD_TRANSCRIPT_DIRS=() CHILD_CODEX_DIRS=() # --- Functions --- # Determine whether to emit ANSI escape codes. Evaluated ONCE here at module # load so `[ -t 1 ]` sees the script's actual stdout — evaluating inside a # function via `$(...)` would inherit the subshell's pipe and always be false. # NO_COLOR (https://no-color.org/) disables color regardless of TTY. if [ -t 1 ] && [ -z "${NO_COLOR:-}" ]; then _IS_TTY=1 else _IS_TTY=0 fi # Semantic color helpers. Scope is narrow on purpose: decision moments only. # Informational output stays uncolored so the color itself is a signal — # "your attention is wanted here." # Takes a color name and text; returns text unchanged when stdout isn't a tty # or NO_COLOR is set. osascript/notifier strings and any var that flows into # non-tty surfaces (logs, telemetry) MUST NOT be wrapped in these helpers — # wrap only at echo time. _color() { if [ "${_IS_TTY:-0}" != "1" ]; then printf '%s' "$2" return fi local code case "$1" in yellow) code='33' ;; green) code='32' ;; *) printf '%s' "$2"; return ;; esac printf '\033[%sm%s\033[0m' "$code" "$2" } _bold() { if [ "${_IS_TTY:-0}" != "1" ]; then printf '%s' "$1" return fi printf '\033[1m%s\033[0m' "$1" } # Build a copy-pasteable re-run command with flags rerun_cmd() { local flags="$*" # Always TOKENLESS. The live API token is never baked into a URL we echo to # the terminal — that would put a credential into scrollback, screen-shares, # teed logs, CI output, and pasted issue reports (the token-in-query-string # form is already known to land in Cloudflare request logs). A re-run picks up # the saved token from ${PAXEL_TOKEN_FILE} (chmod 600); a fresh user with no # saved token falls through to the normal interactive auth flow. echo "curl -fsSL '${PAXEL_SERVER}/upload.sh' | bash -s -- ${flags}" } # Env-aware remediation phrase for user-facing messages. Mirrors # AnthropicClient#rebuild_user_action / #auth_user_action # (app/models/concerns/anthropic_client.rb:581-598): bin/upload exports # PAXEL_CLIENT_MODE=dev, so devs get `bin/upload`-style commands; public # curl|bash runs leave it unset and get copy-pasteable curl commands via # rerun_cmd. # # `reauth` intentionally does NOT delegate to rerun_cmd in prod — a tokenless # rerun_cmd re-loads the SAVED token (${PAXEL_TOKEN_FILE}), which on an # AUTH_REQUIRED failure is exactly the credential we want the user to REPLACE. # The user needs a fresh login from their dashboard. rerun_phrase() { local kind="$1" local dev=0 [ "${PAXEL_CLIENT_MODE:-}" = "dev" ] && dev=1 case "$kind" in fresh) if [ "$dev" = 1 ]; then echo "Run bin/upload again for a fresh analysis." else echo "To re-run: $(rerun_cmd)" fi ;; next_upload) if [ "$dev" = 1 ]; then echo "Will retry on next bin/upload." else echo "Will retry on your next upload." fi ;; reauth) if [ "$dev" = 1 ]; then echo "Run bin/upload interactively to refresh your token." else echo "Re-login on your Paxel dashboard for a fresh upload command; pending uploads will retry on the next run." fi ;; bypass_replay) if [ "$dev" = 1 ]; then echo "Bypass with bin/upload --no-replay." else echo "To bypass: $(rerun_cmd --no-replay)" fi ;; *) echo "Re-run the upload command from your Paxel dashboard." echo "[paxel] internal: rerun_phrase called with unknown kind '$kind'" >&2 return 2 ;; esac } # After a replay-and-exit, hint that re-running analyzes any repo that didn't # finish. The replay gate runs BEFORE child-repo detection, so when a multi-repo # run leaves one repo's upload stashed and a sibling crashed mid-analysis (before # it could stash), the next run replays-and-exits and the crashed sibling is never # re-picked — the user thinks "re-running fixed it" while a report is still missing. # Only emitted when this directory actually holds >=2 child repos, so single-repo # replays stay quiet. Cheap: stat-only on immediate children, no git/cache scan. multi_repo_replay_hint() { local _n=0 local _c for _c in ./*/; do [ -d "$_c" ] || continue if [ -e "${_c}.git" ] || [ -e "${_c}.jj" ]; then _n=$((_n + 1)) [ "$_n" -ge 2 ] && break fi done [ "$_n" -ge 2 ] || return 0 echo "[paxel] Multiple repos here — re-run from this directory to analyze any that didn't finish." } # Detect a copy-on-write cp flag ONCE: --reflink=auto on GNU coreutils # (Linux/btrfs+XFS), -c (clonefile) on BSD/macOS+APFS. Both share storage instead # of byte-copying and both fall back to a normal copy when CoW isn't possible # (cross-volume, non-CoW fs), so the result is always a correct, independent copy. # Each flag is PROBED on a throwaway file rather than inferred from `cp --version`: # a flag the local cp rejects (pre-coreutils-7.5 GNU, pre-10.12 BSD, BusyBox) must # never reach the real copy, which would fail the repo. Always returns 0 (never # aborts the caller). _paxel_detect_cp_cow() { [ -n "${_PAXEL_CP_COW_DETECTED:-}" ] && return 0 _PAXEL_CP_COW_DETECTED=1 _PAXEL_CP_COW_FLAG="" local _t _t=$(mktemp -d 2>/dev/null) || return 0 if : > "$_t/probe" 2>/dev/null; then if cp --reflink=auto "$_t/probe" "$_t/r" 2>/dev/null; then _PAXEL_CP_COW_FLAG="--reflink=auto" # GNU coreutils elif cp -c "$_t/probe" "$_t/c" 2>/dev/null; then _PAXEL_CP_COW_FLAG="-c" # BSD/macOS clonefile fi fi rm -rf "$_t" 2>/dev/null || true return 0 } # Recursively copy a transcript dir, preferring the CoW clone above. Keeps -RLp on # every path: recurse, DEREF source symlinks (-L — the trees are deliberately # symlinked, so -a/-al would dangle them and the container would silently drop # those sessions), preserve mtime (-p, for the container's --since File.mtime # check). CoW makes 'analyze all' over a multi-GB ~/.claude stop byte-copying every # repo's sessions. Returns cp's exit status verbatim so callers fail loud (the # single-repo sites are bare under active errexit; the multi-repo site wraps it in # `if` and routes failure to failed_repos). _paxel_cp_transcripts() { _paxel_detect_cp_cow cp -RLp ${_PAXEL_CP_COW_FLAG:-} "$1" "$2" } # Count JSONL session files in a directory (excludes subagents, _git, _metadata). # Counts Claude-style JSONLs only (one file per session, flat layout). Codex # sessions use YYYY/MM/DD subdirs and are counted by the collect_* paths that # write them into the upload archive. count_sessions() { local dir="$1" # Guard a missing dir (e.g. a Codex/Cursor-only user with no ~/.claude/projects): # `find` on a nonexistent path exits non-zero, which under `set -Eeuo pipefail` # aborts the whole upload (CLIENT-F / kalyanprakash). `|| true` also absorbs a # mid-walk permission error on an existing tree. [ -d "$dir" ] || { echo 0; return 0; } { find "$dir" -name "*.jsonl" -not -name "_*" -not -path "*/_git/*" -not -path "*/subagents/*" -maxdepth 3 2>/dev/null || true; } | wc -l | tr -d ' ' } count_subagent_sessions() { local dir="$1" [ -d "$dir" ] || { echo 0; return 0; } { find "$dir" -path "*/subagents/*.jsonl" -maxdepth 5 2>/dev/null || true; } | wc -l | tr -d ' ' } # Returns data size in MB for display get_data_size() { local dir="$1" du -sm "$dir" 2>/dev/null | cut -f1 || echo "0" } # Estimate client-side pipeline time in minutes from session count. # Cloud-only model calibrated 2026-04-25 (cloud Haiku via proxy, no Ollama). # Dominant cost is 3 LLM steps (summarize, decisions, scoring) at 20-thread # parallelism. ~0.85s/session fresh, near-zero when cached. # Calibration: 1082 sessions → 947s actual (15m 47s). # # Segments (continuous at boundaries): # s <= 30: 30 + s*3 overhead dominates (parsing, git, upload) # 30 < s <= 200: 120 + (s-30)*1 LLM parallelism kicks in # s > 200: 290 + (s-200)*1 sustained ~1s/session # # SYNC: keep in sync with estimate_processing_time() in results_helper.rb estimate_time() { local s=$1 local total_secs if [ "$s" -le 30 ]; then total_secs=$((30 + s * 3)) elif [ "$s" -le 200 ]; then total_secs=$((120 + (s - 30) * 1)) else total_secs=$((290 + (s - 200) * 1)) fi local minutes=$(( (total_secs + 59) / 60 )) [ "$minutes" -lt 2 ] && minutes=2 echo "$minutes" } # Print time estimate with session count, data size, and email notice. # `codex_count` here is standalone Codex (the user invoked `codex` directly). # `codex_cross_tool_count` is Codex sessions launched by Claude via codex-companion # (or other tool); the caller folds these into `subagent_count` so the header math # (Found N sessions + M subagents) does not double-count. print_estimate() { local sessions=$1 local data_mb=$2 local claude_count=${3:-0} local codex_count=${4:-0} local codex_cross_tool_count=${5:-0} local project_name=${6:-} local subagent_count=${7:-0} local total=$((sessions + subagent_count)) local minutes minutes=$(estimate_time "$total") ESTIMATED_MINUTES="$minutes" echo "" local label="" if [ -n "$project_name" ]; then label=" for ${project_name}" fi local session_word="sessions" [ "$sessions" -eq 1 ] && session_word="session" local prefix="Found ${sessions} ${session_word}" if [ "$subagent_count" -gt 0 ]; then local sub_word="subagent" [ "$subagent_count" -gt 1 ] && sub_word="subagents" prefix="${prefix} + ${subagent_count} ${sub_word}" fi prefix="${prefix}${label} (${data_mb}MB)." echo "${prefix} Estimated time: ~${minutes} minutes." if [ "$claude_count" -gt 0 ]; then echo " Claude Code: ${claude_count} sessions" fi if [ "$codex_count" -gt 0 ]; then echo " Codex CLI: ${codex_count} sessions" fi if [ "$codex_cross_tool_count" -gt 0 ]; then echo " Codex launched by Claude: ${codex_cross_tool_count} sessions" fi echo "" echo " ★ You'll get an email when your report is ready." echo "" } user_read() { read "$@" &2 echo " $(rerun_cmd --project NAME)" >&2 echo " $(rerun_cmd --all)" >&2 exit 1 fi } # Parse --since value to epoch threshold parse_since() { local since_str="$1" local now_epoch now_epoch=$(date +%s) # Match relative durations: 6h, 7d, 2w, 1m case "$since_str" in *h) local hours="${since_str%h}" echo $(($now_epoch - $hours * 3600)) ;; *d) local days="${since_str%d}" echo $(($now_epoch - $days * 86400)) ;; *w) local weeks="${since_str%w}" echo $(($now_epoch - $weeks * 7 * 86400)) ;; *m) local months="${since_str%m}" echo $(($now_epoch - $months * 30 * 86400)) ;; *) # Try as absolute date (YYYY-MM-DD), interpreted as MIDNIGHT local time. # GNU `date -d YYYY-MM-DD` already means midnight; BSD `date -j -f "%Y-%m-%d"` # (no time component) fills in the CURRENT time-of-day — wrong (the cutoff # drifts by up to a day) and non-deterministic — so pin BSD to 00:00:00 with a # full format. BSD/GNU syntaxes are mutually incompatible (each exits 1 on the # other's flags); GNU is tried first to match the stat-order convention here. local epoch epoch=$(date -d "$since_str" "+%s" 2>/dev/null) \ || epoch=$(date -j -f "%Y-%m-%d %H:%M:%S" "$since_str 00:00:00" "+%s" 2>/dev/null) if [ -n "$epoch" ]; then echo "$epoch" return fi echo "Error: Invalid --since format: $since_str (use 6h, 7d, 2w, 1m, or YYYY-MM-DD)" >&2 exit 1 ;; esac } # Extract human-readable repo name from remote URL remote_display_name() { local remote_url="$1" # "git@github.com:example-org/example-repo.git" -> "example-repo" # "https://github.com/user/repo.git" -> "repo" local name name=$(echo "$remote_url" | sed 's/\.git$//' | sed 's|.*[/:]||') echo "$name" } # Extract real filesystem path from a project directory get_project_cwd() { local project_dir_name="$1" local project_dir="$CLAUDE_DIR/$project_dir_name" local index_file="$project_dir/sessions-index.json" # 1. Try sessions-index.json originalPath if [ -f "$index_file" ]; then local original_path="" if command -v jq &>/dev/null; then # Handle both array format and {version, entries} format original_path=$(jq -r ' if type == "array" then .[0].originalPath // empty elif type == "object" then (.entries // [])[0].originalPath // empty else empty end ' "$index_file" 2>/dev/null || true) fi if [ -z "$original_path" ]; then # grep fallback original_path=$(grep -o '"originalPath":"[^"]*"' "$index_file" 2>/dev/null | head -1 | sed 's/"originalPath":"//;s/"$//' || true) fi if [ -n "$original_path" ]; then echo "$original_path" return fi fi # 2. Scan first JSONL for cwd field (skip queue-operation lines) local first_jsonl first_jsonl=$(find "$project_dir" -name "*.jsonl" -maxdepth 1 -size +0 -print -quit 2>/dev/null || true) if [ -n "$first_jsonl" ]; then local cwd="" while IFS= read -r line; do # Skip queue-operation lines case "$line" in *'"type":"queue-operation"'*) continue ;; esac # Try to extract cwd local maybe_cwd maybe_cwd=$(echo "$line" | grep -o '"cwd":"[^"]*"' | head -1 | sed 's/"cwd":"//;s/"$//' || true) if [ -n "$maybe_cwd" ]; then cwd="$maybe_cwd" break fi done < "$first_jsonl" if [ -n "$cwd" ]; then echo "$cwd" return fi fi # 3. Fallback: decode from dir name echo "" } # Returns 0 when this Codex session should count toward the picker's session # total (N), 1 when it should count toward the subagent total (M). # # Picker semantics deliberately differ from cross_tool_linker.rb#STANDALONE_ORIGINATORS: # only "Claude Code"-originated Codex sessions are bucketed as cross-tool, # because CrossToolLinker only assigns triggered_by_id for Claude-origin # parents (cross_tool_linker.rb:113-127). Cursor / unknown / future launchers # stay as logical_roots server-side, so the picker must count them as sessions # to avoid undercounting + the zero-abort path on Cursor-only Codex users. # # Implicit accept list (catch-all): "" (empty, pre-detector-fix sessions), # codex_cli_rs / codex_exec / codex-tui (server STANDALONE_ORIGINATORS), # codex_cli (v0.92 flat format), Cursor, unknown. codex_originator_is_standalone() { case "$1" in "Claude Code") return 1 ;; *) return 0 ;; esac } # Read originator from a Codex JSONL first line. Independent helper — does NOT # refactor get_codex_session_remote / get_codex_session_cwd, which are on the # upload-extraction critical path (collect_codex_sessions). Reads # .payload.originator OR top-level .originator (v0.92 flat format). get_codex_session_originator() { local jsonl_file="$1" local first_line="" IFS= read -r first_line < "$jsonl_file" 2>/dev/null || true [ -z "$first_line" ] && echo "" && return if command -v jq >/dev/null 2>&1; then printf '%s' "$first_line" | jq -r '(.payload // .).originator // empty' 2>/dev/null || echo "" else # grep fallback — handles both nested and top-level originator since # the field key is the same. The trailing `|| true` mirrors sibling # helpers (get_codex_session_remote line ~514) and is critical: without # it, a Codex JSONL whose first line lacks `"originator":"…"` makes grep # exit 1 → pipeline fails under set -o pipefail → _paxel_on_error ERR # trap fires → upload aborts. Exact failure mode for jq-absent users # with even one malformed/partial session file in ~/.codex/sessions/. printf '%s' "$first_line" | grep -o '"originator":"[^"]*"' 2>/dev/null \ | head -1 | sed 's/.*":"//;s/"$//' || true fi } # Extract git remote URL from a Codex JSONL file's session_meta (first line). # Falls through to the session's cwd if repository_url is absent: live cwd # uses git-remote directly; dead cwd routes through resolve_remote_for_dead_cwd # so orphan Codex sessions group under their real repo. get_codex_session_remote() { local jsonl_file="$1" local first_line="" IFS= read -r first_line < "$jsonl_file" 2>/dev/null || true [ -z "$first_line" ] && echo "" && return local remote="" if command -v jq &>/dev/null; then remote=$(echo "$first_line" | jq -r '(.payload // .).git.repository_url // empty' 2>/dev/null || true) fi if [ -z "$remote" ]; then # grep fallback for repository_url remote=$(echo "$first_line" | grep -o '"repository_url":"[^"]*"' | sed 's/"repository_url":"//;s/"$//' 2>/dev/null || true) fi if [ -z "$remote" ]; then local cwd cwd=$(get_codex_session_cwd "$jsonl_file") if [ -n "$cwd" ]; then if [ -e "$cwd" ]; then remote=$(get_git_remote "$cwd") else remote=$(resolve_remote_for_dead_cwd "$cwd") fi fi fi # Normalize raw repository_url paths (jq/grep above); get_git_remote and # resolve_remote_for_dead_cwd already return normalized, so this is a no-op # for those branches. normalize_remote "$remote" } # Extract working directory from a Codex JSONL file's session_meta (first line) get_codex_session_cwd() { local jsonl_file="$1" local first_line="" IFS= read -r first_line < "$jsonl_file" 2>/dev/null || true [ -z "$first_line" ] && echo "" && return if command -v jq &>/dev/null; then local cwd cwd=$(echo "$first_line" | jq -r '(.payload // .).cwd // empty' 2>/dev/null || true) if [ -n "$cwd" ]; then echo "$cwd" return fi fi # grep fallback local cwd cwd=$(echo "$first_line" | grep -o '"cwd":"[^"]*"' | sed 's/"cwd":"//;s/"$//' 2>/dev/null || true) echo "$cwd" } # Emit 6 chars of stable hex over $1. Tries md5sum (Linux) then md5 (macOS), # falls back to "000000" if neither is available. Used to disambiguate # per-remote / per-workspace bucket directories that share a basename. stable_hash6() { if command -v md5sum >/dev/null 2>&1; then printf '%s' "$1" | md5sum | cut -c1-6 elif command -v md5 >/dev/null 2>&1; then printf '%s' "$1" | md5 | cut -c1-6 else echo "000000" fi } # Compute a stable per-remote bucket dir name for a Codex session. # Returns "_codex_unattributed" when the remote is empty, otherwise # "_codex__" where is a sanitized repo basename and # is 6 chars of md5 over the raw remote (stable across runs, # collision-safe across different repos that share a basename). codex_bucket_name() { local remote="$1" if [ -z "$remote" ]; then echo "_codex_unattributed" return fi local slug slug=$(basename "$remote" .git) # Sanitize to filesystem-safe chars, collapse runs of '-' slug=$(printf '%s' "$slug" | tr -c 'A-Za-z0-9_.-' '-' | sed 's/--*/-/g; s/^-//; s/-$//') [ -z "$slug" ] && slug="repo" echo "_codex_${slug}_$(stable_hash6 "$remote")" } # ── Cursor IDE helpers ── # Resolve workspace path from a Cursor workspaceStorage directory. # Each workspace dir has a workspace.json with a "folder" URI (file:///path/to/project). get_cursor_workspace_path() { local ws_dir="$1" local ws_json="$ws_dir/workspace.json" [ ! -f "$ws_json" ] && echo "" && return if command -v jq &>/dev/null; then local folder folder=$(jq -r '.folder // empty' "$ws_json" 2>/dev/null || true) if [ -n "$folder" ]; then # Strip file:// prefix echo "$folder" | sed 's|^file://||' return fi fi # grep fallback local folder folder=$(grep -o '"folder":"[^"]*"' "$ws_json" | sed 's/"folder":"//;s/"$//' 2>/dev/null || true) echo "$folder" | sed 's|^file://||' } # Extract Cursor sessions from a state.vscdb SQLite database into canonical JSONL. # Writes one JSONL file per composer session to $output_dir. # Requires sqlite3 and jq. extract_cursor_db() { local db_path="$1" local output_dir="$2" local selected_remote="${3:-}" local ws_dir ws_dir=$(dirname "$db_path") # Return codes: 0 = extracted sessions, 1 = no data (not an error), 2 = real error # Validate schema local tables tables=$(sqlite3 "$db_path" ".tables" 2>/dev/null || true) if ! echo "$tables" | grep -q "cursorDiskKV"; then echo " Warning: Could not read Cursor chat data in $(basename "$ws_dir")." >&2 echo " Your version of Cursor may store data differently. Skipping." >&2 return 2 fi # Check for composerData entries local composer_count composer_count=$(sqlite3 "$db_path" "SELECT COUNT(*) FROM cursorDiskKV WHERE key LIKE 'composerData:%'" 2>/dev/null || echo "0") if [ "$composer_count" -eq 0 ] 2>/dev/null; then return 1 fi # Resolve workspace path and git remote local workspace_path workspace_path=$(get_cursor_workspace_path "$ws_dir") local git_remote="" if [ -n "$workspace_path" ]; then if [ -e "$workspace_path" ]; then git_remote=$(get_git_remote "$workspace_path") else # workspace.json's folder points at a deleted path; route through # the resolver so the ancestor / sibling-worktree recovery strategies # can still attribute sessions. Mirrors the per-session site below # (see its comment block for the full rationale). git_remote=$(resolve_remote_for_dead_cwd "$workspace_path" || true) fi fi local extracted=0 # Dump composer rows to temp file to avoid subshell variable loss from piping local sqlite_out="$output_dir/.sqlite_dump" if ! sqlite3 "$db_path" "SELECT key, value FROM cursorDiskKV WHERE key LIKE 'composerData:%'" > "$sqlite_out"; then echo " Warning: Failed to read Cursor data from $(basename "$(dirname "$db_path")")" >&2 rm -f "$sqlite_out" return 2 fi # Iterate over composer sessions (reading from file, not pipe) while IFS='|' read -r key value; do [ -z "$value" ] && continue local composer_id composer_id=$(echo "$value" | jq -r '.composerId // empty' 2>/dev/null || true) [ -z "$composer_id" ] && continue # Filter by --since using createdAt (milliseconds epoch) if [ -n "$SINCE_EPOCH" ]; then local created_at_ms created_at_ms=$(echo "$value" | jq -r '.createdAt // 0' 2>/dev/null || echo "0") local created_at_s created_at_s=$(( created_at_ms / 1000 )) 2>/dev/null || created_at_s=0 if [ "$created_at_s" -lt "$SINCE_EPOCH" ] 2>/dev/null; then continue fi fi # Resolve per-session workspace from composerData (global DB has mixed workspaces) local session_ws="" session_ws=$(echo "$value" | jq -r '.workspaceIdentifier.uri.fsPath // empty' 2>/dev/null || true) # Recover the workspace when Cursor didn't record workspaceIdentifier.uri.fsPath # (common for global-DB sessions that outlived their workspace, so they got # silently dropped — PAXEL cursor-missed). Walk each file the session # referenced up to its enclosing git/jj root; adopt a root ONLY if every # referenced file agrees on it, so a stray cross-repo file selection can't # mis-attribute the session to the wrong repo (Opus review). if [ -z "$session_ws" ]; then local _cand_root="" _agree=1 _c while IFS= read -r _c; do [ -z "$_c" ] && continue # Stop at "/", "." and any idempotent root (Git Bash `dirname C:` == `C:`) # so a Windows drive-root path can't spin this walk forever. local _p="$_c" _root="" _prev_p="" while [ -n "$_p" ] && [ "$_p" != "/" ] && [ "$_p" != "." ] && [ "$_p" != "$_prev_p" ]; do if [ -e "$_p/.git" ] || [ -d "$_p/.jj" ]; then _root="$_p"; break; fi _prev_p="$_p" _p=$(dirname "$_p") done [ -z "$_root" ] && continue if [ -z "$_cand_root" ]; then _cand_root="$_root" elif [ "$_cand_root" != "$_root" ]; then _agree=0 break fi done < <(echo "$value" | jq -r '(.context.fileSelections[]?.uri.fsPath // empty), (.allAttachedFileCodeChunksUris[]? | sub("^file://"; ""))' 2>/dev/null || true) [ "$_agree" -eq 1 ] && session_ws="$_cand_root" fi [ -z "$session_ws" ] && session_ws="$workspace_path" local session_remote="$git_remote" if [ -n "$session_ws" ] && [ "$session_ws" != "$workspace_path" ]; then if [ -e "$session_ws" ]; then session_remote=$(get_git_remote "$session_ws") else # session_ws is a non-empty path but not on disk (deleted workspace, # moved repo). Ancestor-walk / sibling-worktree recovery via # resolve_remote_for_dead_cwd. Non-Conductor scope only — Conductor # paths short-circuit inside the resolver. get_git_remote (called # inside the resolver on live parents) already normalizes, so the # return value matches the normalized $selected_remote directly. # Stderr NOT suppressed: the resolver's `[paxel] Recovered remote` # log is load-bearing debug signal for users troubleshooting "why # didn't my Cursor session match?" — matches other call sites. session_remote=$(resolve_remote_for_dead_cwd "$session_ws" || true) fi # Conductor dead-workspace cache fallback: resolve_remote_for_dead_cwd # short-circuits */conductor/workspaces/* and */.conductor/* paths # because Conductor recovery needs sibling-worktree data (not ancestor # walk). list_projects_grouped's backfill_conductor_remotes pre-pass # writes sibling workspaces' remotes into the project cache. Iterate # cache rows and decode each dir_name's cwd via get_project_cwd to # find a TRUE sibling (exact parent-dir match). A prefix-match on the # Claude-encoded path would conflate sibling Conductor projects with # shared prefixes (e.g. "paxel" vs "paxel-v2") since the encoding # `[/.]→-` is lossy. if [ -z "$session_remote" ] && [ ! -e "$session_ws" ]; then local _normalized_ws="${session_ws%/}" local _ws_parent="" case "$_normalized_ws" in */conductor/workspaces/*/*) _ws_parent="${_normalized_ws%/*}" ;; */.conductor/*) _ws_parent="${_normalized_ws%%/.conductor/*}/.conductor" ;; esac if [ -n "$_ws_parent" ]; then local _cache_file="${HOME}/.paxel/cache/project-remotes-v2.tsv" if [ -f "$_cache_file" ]; then local _dir _key _rest _row_cwd while IFS=$'\t' read -r _dir _key _rest; do [ -z "$_key" ] && continue case "$_key" in name:*|local:*|unknown) continue ;; esac _row_cwd=$(get_project_cwd "$_dir" 2>/dev/null || true) [ -z "$_row_cwd" ] && continue if [ "${_row_cwd%/*}" = "$_ws_parent" ]; then session_remote="$_key" break fi done < "$_cache_file" fi fi fi fi # Filter by selected_remote (per-session, not per-DB) if [ -n "${selected_remote:-}" ]; then if [ -z "$session_remote" ] || [ "$session_remote" != "${selected_remote:-}" ]; then continue fi fi # Get bubble IDs from fullConversationHeadersOnly local bubble_ids bubble_ids=$(echo "$value" | jq -r '.fullConversationHeadersOnly[]? | .bubbleId' 2>/dev/null || true) [ -z "$bubble_ids" ] && continue # Write to per-workspace subdirectory (use path hash to avoid basename collisions) local ws_bucket="_cursor_unattributed" if [ -n "$session_ws" ]; then ws_bucket="_cursor_$(basename "$session_ws")_$(stable_hash6 "$session_ws")" fi mkdir -p "$output_dir/$ws_bucket" local session_file="$output_dir/$ws_bucket/${composer_id}.jsonl" [ -f "$session_file" ] && continue # dedupe: per-workspace DB wins over global DB # Batch the per-bubble work into ONE sqlite3 + ONE jq for the whole session. # The per-bubble version ran 2 sqlite3 + 2 jq + 1 date PER bubble; on a large # global DB (1.5GB, 1000+ bubbles/session) that was ~5 forks * tens of thousands # of bubbles, each sqlite3 re-opening the whole DB — it crawled to an effective # hang (PAXEL: Cursor extraction hangs, vanshgupta027). Output is byte-identical # to the per-bubble path (validated vs real Cursor DBs). The fallback bare-key # lookup (bubbleId: without composer prefix) is preserved below. local bubbles_dump="$output_dir/.cursor_bubbles_dump" # Fetch all of this composer's bubbles in TWO statements (one per key form), NOT a # single OR'd query. Two key forms exist: the modern composer-scoped # 'bubbleId::' and the legacy bare 'bubbleId:' (the # per-bubble path looked up both). NEITHER statement needs the SQLite JSON1 # extension — important because a single OR'd json_each query would fail to PREPARE # on a JSON1-less sqlite build and, under 2>/dev/null, silently drop the WHOLE # session (the old per-bubble path never needed JSON1 — it already had the ids). # # Modern form: a key RANGE (NOT LIKE/GLOB): LIKE is case-insensitive so SQLite can't # use the BINARY UNIQUE index and full-scans the whole DB; a >=/< range uses the # index, and ';' (0x3B) is the byte just after ':' (0x3A) so it's the exclusive # upper bound for the prefix. # # Legacy bare keys: build the id list HOST-side from the parsed session headers # (jq on the composerData $value we already hold), then look them up with a literal # IN(...) — no json_each, so no JSON1 dependency. ' is the SQL string quote, # written as a jq escape so no literal ' breaks this bash-single-quoted jq program. # Cursor bubble ids are UUIDs (no quotes); the SELECT goes over STDIN (printf is a # bash builtin → no argv, and sqlite3 reads SQL from stdin), so a huge id list can't # hit ARG_MAX. Both statements stay index-driven (~0.02s even on a 1.5GB DB). # (Edge: since all bare ids share one IN(...) statement, a single id carrying a ' # would make the whole statement a SQL error → 2>/dev/null drops this session's bare # keys as a set. Impossible for UUID ids; the modern range path is unaffected.) # # TAB-separated output, NOT -json: `sqlite3 -json` JSON-encodes the BLOB column in a # way that is pathologically slow (~9s to encode one large session's bubbles vs # ~0.02s to read them raw) — that, plus per-bubble fork overhead, was the real # "hang" (PAXEL: Cursor extraction hangs, vanshgupta027). Cursor stores each bubble # value as single-line minified JSON with no embedded tabs/newlines (verified across # all bubbles), so 'keyvalue' is one safe, parseable line per row. Caveat: a # value that ever did carry an embedded TAB/newline would split wrong and that one # bubble would be silently dropped (try/fromjson/catch null in the reduce below) — # graceful degradation of a single bubble, never a crash or a dropped session. sqlite3 -separator "$(printf '\t')" "$db_path" \ "SELECT key, value FROM cursorDiskKV WHERE key >= 'bubbleId:${composer_id}:' AND key < 'bubbleId:${composer_id};'" \ > "$bubbles_dump" 2>/dev/null || true local bare_in bare_in=$(printf '%s' "$value" | jq -r ' [ (.fullConversationHeadersOnly // [])[]? | .bubbleId | select(. != null) | "bubbleId:" + (. | tostring) ] | if length == 0 then empty else (map("\u0027" + . + "\u0027") | join(",")) end ' 2>/dev/null || true) if [ -n "$bare_in" ]; then printf 'SELECT key, value FROM cursorDiskKV WHERE key IN (%s);\n' "$bare_in" \ | sqlite3 -separator "$(printf '\t')" "$db_path" >> "$bubbles_dump" 2>/dev/null || true fi [ -f "$bubbles_dump" ] || : > "$bubbles_dump" local now_iso now_iso=$(date -u +%Y-%m-%dT%H:%M:%SZ) # composerData ($value) via stdin (avoids ARG_MAX on large sessions); the bubble # rows via --rawfile (parsed from the TAB-separated dump in jq below). local session_out session_out=$(printf '%s' "$value" | jq -c \ --rawfile brows "$bubbles_dump" \ --arg cid "$composer_id" \ --arg ws "${session_ws:-}" \ --arg remote "${session_remote:-}" \ --arg now "$now_iso" ' def toolmap: { "run_terminal_command_v2":"Bash","run_terminal_cmd":"Bash", "read_file_v2":"Read","read_file":"Read", "edit_file_v2":"Edit","edit_file":"Edit","search_replace":"Edit","apply_patch":"Edit","reapply":"Edit", "task_v2":"Task", "ripgrep_raw_search":"Grep","grep_search":"Grep","grep":"Grep", "glob_file_search":"Glob","file_search":"Glob","list_dir":"LS" }; def canon($n): if ($n|type)=="string" and ($n|length)>0 then (toolmap[$n] // $n) else "tool" end; def remap($name; $p): ($p // {}) | (if type=="object" then . else {} end) | del(.streamingContent) | if $name=="Read" and .targetFile then . + {file_path:.targetFile} elif $name=="Edit" and .relativeWorkspacePath then . + {file_path:.relativeWorkspacePath} else . end; def tsof($b): (($b.timingInfo.clientEndTime) // ($b.createdAt)) as $t | if ($t|type)=="number" then (($t/1000)|floor|todate) elif ($t|type)=="string" then (if ($t|test("^[0-9]+$")) then ((($t|tonumber)/1000)|floor|todate) else $t end) else $now end; def bubble2objs($b): tsof($b) as $ts | ($b.toolFormerData) as $tfd | ($tfd.toolCallId // null) as $tcid | (if (($b.type)|tostring)=="2" then "assistant" else "user" end) as $role | if $role=="user" and ($tfd==null) then (if (($b.text)//"")=="" then empty else {type:"user", message:{role:"user", content:($b.text)}, timestamp:$ts} end) else ( (if (($b.thinking)//"")!="" then [{type:"thinking",thinking:($b.thinking)}] else [] end) + (if (($b.text)//"")!="" then [{type:"text",text:($b.text)}] else [] end) + (if $tfd then (if (($tfd.params)|type)=="string" then (try (($tfd.params)|fromjson) catch null) else ($tfd.params) end) as $p | (canon($tfd.name)) as $tn | [ ({type:"tool_use", name:$tn, input:remap($tn;$p)} + (if $tcid then {id:$tcid} else {} end)) ] else [] end) ) as $content | if ($content|length)==0 then empty else ( {type:"assistant", message:{role:"assistant", content:$content}, timestamp:$ts} ), ( if $tfd and ((($tfd.result)//"")!="") then (if (($tfd.result)|type)=="string" then (try (($tfd.result)|fromjson) catch null) else ($tfd.result) end) as $rj | (if ($rj|type)=="object" then (($rj.output)//($rj.contents)//($rj.result)//($tfd.result)) else ($tfd.result) end) as $rt | {type:"user", message:{role:"user", content:[ ({type:"tool_result", content:(($rt|tostring)[0:4000])} + (if $tcid then {tool_use_id:$tcid} else {} end)) ]}, timestamp:$ts} else empty end ) end end; . as $cd | ($brows | split("\n") | map(select(length > 0) | split("\t"))) as $rows # Build id->bubble. Both key forms normalize to the bare bubble id, so a session # carrying BOTH a composer-scoped key and a bare key for the same id must PREFER # the scoped row (matching the per-bubble path order). Scoped rows always # overwrite; bare rows fill only ids not already present. | (reduce $rows[] as $r ({}; ($r[0]) as $k | ($k | sub("^bubbleId:[^:]*:"; "") | sub("^bubbleId:"; "")) as $bid | ($k | test("^bubbleId:[^:]+:")) as $scoped | if $scoped then .[$bid] = (try ($r[1]|fromjson) catch null) elif (.[$bid] == null) then .[$bid] = (try ($r[1]|fromjson) catch null) else . end)) as $bmap | {composerId:$cid, workspace:$ws, git_remote:$remote, agent_type:"cursor"} as $meta | [ ($cd.fullConversationHeadersOnly // [])[]? | .bubbleId | select(. != null) ] | [ .[] as $bid | ($bmap[$bid] // null) | select(. != null) | bubble2objs(.) ] | if length == 0 then empty else (.[0] += {_cursor_meta: $meta}) | .[] end ' 2>/dev/null || true) if [ -n "$session_out" ]; then printf '%s\n' "$session_out" >> "$session_file" extracted=$((extracted + 1)) fi done < "$sqlite_out" rm -f "$sqlite_out" rm -f "$output_dir/.cursor_bubbles_dump" [ "$extracted" -gt 0 ] && return 0 || return 1 } # Collect Cursor IDE sessions into the archive tmpdir. # Discovers all state.vscdb files, extracts sessions matching --since filter, # and writes canonical JSONL to $tmpdir/_cursor/. collect_cursor_sessions() { local tmpdir="$1" local selected_remote="${2:-}" # Dependency check if ! command -v sqlite3 &>/dev/null; then echo " Cursor: sqlite3 not found. Install with: brew install sqlite3 (macOS) or apt install sqlite3 (Linux)" >&2 return 0 fi if ! command -v jq &>/dev/null; then echo " Cursor: jq not found. Install with: brew install jq (macOS) or apt install jq (Linux)" >&2 return 0 fi if [ ! -d "$CURSOR_DIR" ] && [ ! -f "$CURSOR_GLOBAL_DB" ]; then return 0 fi local cursor_count=0 local cursor_bytes=0 local _cursor_errors=0 # 1. Extract from per-workspace state.vscdb files if [ -d "$CURSOR_DIR" ]; then while IFS= read -r db_file; do [ -z "$db_file" ] && continue local _erc=0 extract_cursor_db "$db_file" "$tmpdir" "$selected_remote" || _erc=$? [ "$_erc" -eq 2 ] && _cursor_errors=$((_cursor_errors + 1)) done < <(find "$CURSOR_DIR" -name "state.vscdb" -maxdepth 2 2>/dev/null) fi # 2. Extract from globalStorage/state.vscdb (most composer data lives here) if [ -f "$CURSOR_GLOBAL_DB" ]; then local _erc=0 extract_cursor_db "$CURSOR_GLOBAL_DB" "$tmpdir" "$selected_remote" || _erc=$? [ "$_erc" -eq 2 ] && _cursor_errors=$((_cursor_errors + 1)) fi # Count extracted files across all _cursor_* subdirs cursor_count=$(find "$tmpdir" -maxdepth 2 -path "*/_cursor_*/*.jsonl" 2>/dev/null | wc -l | tr -d ' ') if [ "$cursor_count" -gt 0 ]; then cursor_bytes=$(find "$tmpdir" -maxdepth 2 -path "*/_cursor_*/*.jsonl" -exec cat {} + 2>/dev/null | wc -c | tr -d ' ') local ws_count ws_count=$(find "$tmpdir" -maxdepth 1 -type d -name "_cursor_*" 2>/dev/null | wc -l | tr -d ' ') echo " Cursor IDE: ${cursor_count} sessions, $(($cursor_bytes / 1024))KB (${ws_count} workspaces)" >&2 # Add per-workspace entries to sidecar metadata [ ! -f "$tmpdir/_metadata.json" ] && echo '{"directories":{}}' > "$tmpdir/_metadata.json" if command -v jq &>/dev/null && [ -f "$tmpdir/_metadata.json" ]; then for ws_dir in "$tmpdir"/_cursor_*/; do [ -d "$ws_dir" ] || continue local bucket_name bucket_name=$(basename "$ws_dir") local first_file first_file=$(find "$ws_dir" -name "*.jsonl" -maxdepth 1 -print -quit 2>/dev/null || true) [ -z "$first_file" ] && continue local bucket_remote bucket_remote=$(head -1 "$first_file" | jq -r '._cursor_meta.git_remote // empty' 2>/dev/null || true) local bucket_cwd bucket_cwd=$(head -1 "$first_file" | jq -r '._cursor_meta.workspace // empty' 2>/dev/null || true) local updated updated=$(jq \ --arg bucket "$bucket_name" \ --arg remote "${bucket_remote:-}" \ --arg cwd "${bucket_cwd:-}" \ '.directories[$bucket] = {"git_remote": $remote, "cwd": $cwd}' \ "$tmpdir/_metadata.json" 2>/dev/null) [ -n "$updated" ] && echo "$updated" > "$tmpdir/_metadata.json" done fi else # Clean up empty directories rmdir "$tmpdir"/_cursor_* 2>/dev/null || true fi # If we tried extractions but got zero files, signal failure to caller if [ "$cursor_count" -eq 0 ] && [ "$_cursor_errors" -gt 0 ]; then echo " Cursor: extraction failed for $_cursor_errors database(s)" >&2 return 1 fi } # Collect Codex sessions from $CODEX_DIR (~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl) # into per-remote buckets at $tmpdir/_codex__/ (or _codex_unattributed/ # for sessions without a repository_url). Writes _metadata.json sidecar entries so # TranscriptDiscoverer merges each bucket into the Claude project with a matching # git_remote (one Project per repo, not one per agent). # # Signature mirrors collect_cursor_sessions: # collect_codex_sessions [selected_remote] # # When selected_remote is set (single-project / multi-repo single-child), only # sessions whose get_codex_session_remote normalizes to selected_remote are # included — the others belong to different Claude projects and would widen the # upload's scope beyond what the user asked for. # # When selected_remote is empty (--all mode), every Codex session is bucketed by # its own remote. Sessions without a remote land in _codex_unattributed/. # # Reusable from: # * run_docker_analysis — produces the dir mounted as /codex_sessions:ro # (the container's analyze_local.rake merges _codex_* dirs into # transcript_dir, mirroring the Cursor merge). # * dev/test archive staging (via collect_all_projects / # collect_project_group / collect_single_project) — those paths still # have inline Codex logic today; consolidating onto this helper is a # separate cleanup (not scoped here to keep the Docker fix minimal). collect_codex_sessions() { local tmpdir="$1" local selected_remote="${2:-}" [ -d "$CODEX_DIR" ] || return 0 local selected_remote_norm="" if [ -n "$selected_remote" ]; then selected_remote_norm=$(normalize_remote "$selected_remote") fi local codex_count=0 while IFS= read -r f; do [ -z "$f" ] && continue local remote remote=$(get_codex_session_remote "$f") # Single-project filter: skip sessions whose remote doesn't match. if [ -n "$selected_remote_norm" ]; then local remote_norm remote_norm=$(normalize_remote "$remote") [ "$remote_norm" != "$selected_remote_norm" ] && continue fi # Apply --since filter via file mtime (parity with DRY_RUN archive path # at collect_project_group:2616-2620). if [ -n "$SINCE_EPOCH" ]; then local file_mtime file_mtime=$(stat -c %Y "$f" 2>/dev/null || stat -f %m "$f" 2>/dev/null || echo "0") [ "$file_mtime" -lt "$SINCE_EPOCH" ] 2>/dev/null && continue fi local bucket bucket=$(codex_bucket_name "$remote") mkdir -p "$tmpdir/$bucket" cp "$f" "$tmpdir/$bucket/" codex_count=$((codex_count + 1)) done < <(find "$CODEX_DIR" -name "*.jsonl" -maxdepth 6 2>/dev/null) if [ "$codex_count" -eq 0 ]; then return 0 fi local repo_bucket_count repo_bucket_count=$(find "$tmpdir" -maxdepth 1 -type d -name "_codex_*" -not -name "_codex_unattributed" 2>/dev/null | wc -l | tr -d ' ') echo " Codex: ${codex_count} sessions (${repo_bucket_count} repos)" >&2 # Write per-bucket sidecar so TranscriptDiscoverer reads git_remote + cwd # for each Codex bucket and merges it with the matching Claude project. # Without jq, skip silently — the bucketed dirs still ship, but each # Codex repo becomes its own Project (graceful degradation). command -v jq &>/dev/null || return 0 [ ! -f "$tmpdir/_metadata.json" ] && echo '{"version":1,"directories":{}}' > "$tmpdir/_metadata.json" local bucket_dir for bucket_dir in "$tmpdir"/_codex_*/; do [ -d "$bucket_dir" ] || continue local bname bname=$(basename "$bucket_dir") local first_file first_file=$(find "$bucket_dir" -name "*.jsonl" -maxdepth 1 -print -quit 2>/dev/null || true) [ -z "$first_file" ] && continue local bremote bremote=$(get_codex_session_remote "$first_file") local bcwd bcwd=$(get_codex_session_cwd "$first_file") local updated updated=$(jq \ --arg bucket "$bname" \ --arg remote "${bremote:-}" \ --arg cwd "${bcwd:-}" \ '.directories[$bucket] = {"git_remote": $remote, "cwd": $cwd}' \ "$tmpdir/_metadata.json" 2>/dev/null) [ -n "$updated" ] && echo "$updated" > "$tmpdir/_metadata.json" done } # Write one opencode-native JSONL file (meta line + message lines) for a single # opencode session. Shared by extract_opencode_db's top-level and child # (subagent) passes. Returns 1 (and removes the file) if the meta line fails. # args: <version> <agent> <model_raw> <git_remote> _opencode_write_session_jsonl() { local db_path="$1" sid="$2" out_file="$3" directory="$4" title="$5" version="$6" agent="$7" model_raw="$8" remote="$9" # first_prompt = first user text part. JSON paths are escaped (\$) so bash # leaves them literal for sqlite. Truncated server-side; title is the # server-side fallback when this is empty. local first_prompt first_prompt=$(sqlite3 "$db_path" "SELECT json_extract(p.data,'\$.text') FROM part p JOIN message m ON p.message_id=m.id WHERE m.session_id='$sid' AND json_extract(m.data,'\$.role')='user' AND json_extract(p.data,'\$.type')='text' ORDER BY m.time_created, p.time_created LIMIT 1" 2>/dev/null || true) # Informational model string "<providerID>/<id>" from the session model JSON. local model="" if [ -n "$model_raw" ]; then model=$(printf '%s' "$model_raw" | jq -r 'if type=="object" then ((.providerID // "") + "/" + (.id // "")) else "" end' 2>/dev/null || true) model="${model#/}"; model="${model%/}" fi # Line 1: opencode_session_meta marker. if ! jq -cn \ --arg id "$sid" --arg title "$title" --arg fp "${first_prompt}" \ --arg dir "$directory" --arg remote "${remote:-}" \ --arg model "$model" --arg agent "$agent" --arg version "$version" \ '{type:"opencode_session_meta", id:$id, title:$title, first_prompt:($fp[0:1000]), directory:$dir, git_remote:$remote, model:$model, agent:$agent, version:$version}' \ > "$out_file" 2>/dev/null; then rm -f "$out_file" return 1 fi # Message lines: one per message, parts inlined with DB time_created (`t`). sqlite3 "$db_path" "SELECT json_object('type','opencode_message','message',json(m.data),'parts',(SELECT json_group_array(json_object('t',p.time_created,'p',json(p.data))) FROM part p WHERE p.message_id=m.id)) FROM message m WHERE m.session_id='$sid' ORDER BY m.time_created" >> "$out_file" 2>/dev/null || true return 0 } # Build a SELECT column expression that tolerates opencode session-table columns # added across versions. opencode grows the session schema over time (agent + # model landed 2026-05-01 in the "next_venus" migration; metadata 2026-05-11), so # a SELECT that hard-references a column dies with "no such column: agent" on any # older DB -> extract_opencode_db returns 2 -> "extraction failed for N # database(s)" even though the session is perfectly readable. Given a |-delimited # column list and a column name, emit COALESCE(<name>,<default>) when present, # else just <default>. See SESSION_DETECTION.md §3d. _oc_coalesce_col() { local cols="$1" name="$2" default="$3" case "$cols" in *"|$name|"*) printf 'COALESCE(%s,%s)' "$name" "$default" ;; *) printf '%s' "$default" ;; esac } # Extract opencode sessions from a SQLite DB into "opencode-native" JSONL. # opencode stores sessions relationally (session/message/part tables, content in # JSON columns), not as JSONL like Claude/Codex. We dump each top-level session # to one file per bucket: line 1 is an opencode_session_meta marker (drives # server-side format detection + the discoverer index + the sidecar), then one # opencode_message line per message with its parts inlined. Each part carries # its DB time_created as `t`; the server-side OpencodeNormalizer sorts by `t` # and converts to canonical (SQLite's json_group_array does NOT reliably sort # aggregate input, so we deliberately do NOT order parts in SQL). # # opencode `task`-tool subagents are child sessions (parent_id set). We emit each # one at <bucket>/<parent_id>/subagents/<child_id>.jsonl — the SAME layout Claude # subagents use — so TranscriptDiscoverer + find_jsonl link them as is_subagent # children with ZERO server changes. Children are only emitted when their parent # top-level session was extracted (its <bucket>/<parent_id>.jsonl exists), which # keeps --since / --project scoping consistent. # # Buckets are keyed by the session's workspace directory (like Cursor), so two # opencode workspaces in the same repo don't collide; the sidecar maps each # bucket to a git_remote and the server collapses same-remote buckets into one # Project. Requires sqlite3 (with JSON1) + jq. # Return codes: 0 = extracted sessions, 1 = no data (not an error), 2 = real error. extract_opencode_db() { local db_path="$1" local output_dir="$2" local selected_remote="${3:-}" # Validate schema (opencode 1.x: session/message/part tables). local tables tables=$(sqlite3 "$db_path" ".tables" 2>/dev/null || true) if ! echo "$tables" | grep -q "session" || ! echo "$tables" | grep -q "message" || ! echo "$tables" | grep -q "part"; then echo " Warning: $(basename "$db_path") is not a recognized opencode database. Skipping." >&2 return 2 fi # JSON1 probe — extraction relies on json_object/json_group_array/json(). local json_ok json_ok=$(sqlite3 "$db_path" "SELECT json_valid('{}')" 2>/dev/null || echo "0") if [ "$json_ok" != "1" ]; then echo " Warning: this sqlite3 build lacks JSON support; cannot read opencode data. Skipping." >&2 return 2 fi # Probe the session table's actual columns so the SELECTs below degrade across # opencode schema versions instead of failing whole-DB (see _oc_coalesce_col). # opencode adds session columns over time (agent + model arrived 2026-05-01 in # the "next_venus" migration); directory/title/version/time_created — and the # id/parent_id structural keys — are baseline since its first SQLite schema. # Wrapped in leading/trailing '|' for substring membership tests. local oc_cols oc_cols=$(sqlite3 "$db_path" "SELECT '|'||COALESCE(group_concat(name,'|'),'')||'|' FROM pragma_table_info('session')" 2>/dev/null || true) if [ -z "$oc_cols" ] || [ "$oc_cols" = "||" ]; then echo " Warning: could not read the opencode session schema from $(basename "$db_path"). Skipping." >&2 return 2 fi # Resolve each content column against the probe so older DBs (e.g. pre-agent/model) # still read. id + parent_id stay hard-referenced as structural invariants, so the # top-level vs subagent split below matches count_opencode_sessions exactly. local sel_dir sel_title sel_ver sel_agent sel_model sel_tc sel_dir=$(_oc_coalesce_col "$oc_cols" directory "''") sel_title=$(_oc_coalesce_col "$oc_cols" title "''") sel_ver=$(_oc_coalesce_col "$oc_cols" version "''") sel_agent=$(_oc_coalesce_col "$oc_cols" agent "''") sel_model=$(_oc_coalesce_col "$oc_cols" model "''") sel_tc=$(_oc_coalesce_col "$oc_cols" time_created 0) local session_count session_count=$(sqlite3 "$db_path" "SELECT COUNT(*) FROM session WHERE parent_id IS NULL" 2>/dev/null || echo "0") [ "$session_count" -eq 0 ] 2>/dev/null && return 1 local selected_remote_norm="" [ -n "$selected_remote" ] && selected_remote_norm=$(normalize_remote "$selected_remote") # Dump one JSON object per top-level session. Using json_object avoids # separator pitfalls with arbitrary directory/title text; model stays raw # text (model_raw) for jq to parse below. local sqlite_out="$output_dir/.opencode_sessions.$$" if ! sqlite3 "$db_path" "SELECT json_object('id',id,'directory',$sel_dir,'title',$sel_title,'version',$sel_ver,'agent',$sel_agent,'model_raw',$sel_model,'time_created',$sel_tc) FROM session WHERE parent_id IS NULL" > "$sqlite_out" 2>/dev/null; then echo " Warning: failed to read opencode sessions from $(basename "$db_path")" >&2 rm -f "$sqlite_out" return 2 fi local extracted=0 local row while IFS= read -r row; do [ -z "$row" ] && continue local sid directory title version agent model_raw created_ms sid=$(printf '%s' "$row" | jq -r '.id // empty' 2>/dev/null || true) [ -z "$sid" ] && continue directory=$(printf '%s' "$row" | jq -r '.directory // empty' 2>/dev/null || true) title=$(printf '%s' "$row" | jq -r '.title // empty' 2>/dev/null || true) version=$(printf '%s' "$row" | jq -r '.version // empty' 2>/dev/null || true) agent=$(printf '%s' "$row" | jq -r '.agent // empty' 2>/dev/null || true) model_raw=$(printf '%s' "$row" | jq -r '.model_raw // empty' 2>/dev/null || true) created_ms=$(printf '%s' "$row" | jq -r '.time_created // 0' 2>/dev/null || echo "0") # --since filter (createdAt is milliseconds epoch). if [ -n "${SINCE_EPOCH:-}" ]; then local created_s=$(( created_ms / 1000 )) 2>/dev/null || created_s=0 [ "$created_s" -lt "$SINCE_EPOCH" ] 2>/dev/null && continue fi # Resolve remote from the session directory (cwd). Live dir -> get_git_remote; # deleted dir -> resolve_remote_for_dead_cwd (ancestor / sibling-worktree # walk), then a Conductor dead-workspace cache fallback. Mirrors # extract_cursor_db's per-session resolution (see its comment block). local session_remote="" if [ -n "$directory" ]; then if [ -e "$directory" ]; then session_remote=$(get_git_remote "$directory") else session_remote=$(resolve_remote_for_dead_cwd "$directory" || true) if [ -z "$session_remote" ]; then local _normalized_ws="${directory%/}" local _ws_parent="" case "$_normalized_ws" in */conductor/workspaces/*/*) _ws_parent="${_normalized_ws%/*}" ;; */.conductor/*) _ws_parent="${_normalized_ws%%/.conductor/*}/.conductor" ;; esac if [ -n "$_ws_parent" ]; then local _cache_file="${HOME}/.paxel/cache/project-remotes-v2.tsv" if [ -f "$_cache_file" ]; then local _dir _key _rest _row_cwd while IFS=$'\t' read -r _dir _key _rest; do [ -z "$_key" ] && continue case "$_key" in name:*|local:*|unknown) continue ;; esac _row_cwd=$(get_project_cwd "$_dir" 2>/dev/null || true) [ -z "$_row_cwd" ] && continue if [ "${_row_cwd%/*}" = "$_ws_parent" ]; then session_remote="$_key" break fi done < "$_cache_file" fi fi fi fi fi # Per-session --project filter. if [ -n "$selected_remote_norm" ]; then local rn rn=$(normalize_remote "$session_remote") [ "$rn" != "$selected_remote_norm" ] && continue fi # Bucket by workspace directory path (md5[:6] keeps same-repo workspaces apart). local bucket="_opencode_unattributed" if [ -n "$directory" ]; then bucket="_opencode_$(basename "$directory")_$(stable_hash6 "$directory")" fi mkdir -p "$output_dir/$bucket" local session_file="$output_dir/$bucket/${sid}.jsonl" [ -f "$session_file" ] && continue # dedupe across multiple DBs if _opencode_write_session_jsonl "$db_path" "$sid" "$session_file" "$directory" "$title" "$version" "$agent" "$model_raw" "$session_remote"; then extracted=$((extracted + 1)) fi done < "$sqlite_out" rm -f "$sqlite_out" # Pass 2: child (subagent) sessions, emitted at # <bucket>/<parent_id>/subagents/<child_id>.jsonl. A child is bucketed by its # own workspace directory (= the parent's, in practice); we only emit it if the # parent's top-level file already exists in that bucket (so filtered-out parents # don't leave orphan subagents, and grandchildren whose parent is itself a child # are skipped). No --since filter on children — they belong to an included parent. local child_out="$output_dir/.opencode_children.$$" if sqlite3 "$db_path" "SELECT json_object('id',id,'parent_id',COALESCE(parent_id,''),'directory',$sel_dir,'title',$sel_title,'version',$sel_ver,'agent',$sel_agent,'model_raw',$sel_model) FROM session WHERE parent_id IS NOT NULL" > "$child_out" 2>/dev/null; then local crow while IFS= read -r crow; do [ -z "$crow" ] && continue local c_sid c_parent c_dir c_title c_ver c_agent c_model_raw c_sid=$(printf '%s' "$crow" | jq -r '.id // empty' 2>/dev/null || true) c_parent=$(printf '%s' "$crow" | jq -r '.parent_id // empty' 2>/dev/null || true) [ -z "$c_sid" ] && continue [ -z "$c_parent" ] && continue c_dir=$(printf '%s' "$crow" | jq -r '.directory // empty' 2>/dev/null || true) [ -z "$c_dir" ] && continue local c_bucket="_opencode_$(basename "$c_dir")_$(stable_hash6 "$c_dir")" # Gate on the parent's extracted top-level file existing in this bucket. [ -f "$output_dir/$c_bucket/${c_parent}.jsonl" ] || continue local sub_dir="$output_dir/$c_bucket/$c_parent/subagents" mkdir -p "$sub_dir" local child_file="$sub_dir/${c_sid}.jsonl" [ -f "$child_file" ] && continue c_title=$(printf '%s' "$crow" | jq -r '.title // empty' 2>/dev/null || true) c_ver=$(printf '%s' "$crow" | jq -r '.version // empty' 2>/dev/null || true) c_agent=$(printf '%s' "$crow" | jq -r '.agent // empty' 2>/dev/null || true) c_model_raw=$(printf '%s' "$crow" | jq -r '.model_raw // empty' 2>/dev/null || true) # git_remote left empty on children: the discoverer links them to the # parent by path, not by remote, so it's unused for subagents. if _opencode_write_session_jsonl "$db_path" "$c_sid" "$child_file" "$c_dir" "$c_title" "$c_ver" "$c_agent" "$c_model_raw" ""; then extracted=$((extracted + 1)) fi done < "$child_out" fi rm -f "$child_out" [ "$extracted" -gt 0 ] && return 0 || return 1 } # Collect opencode sessions into the archive/extraction tmpdir. Scans # OPENCODE_DIR for opencode*.db (or honors an explicit OPENCODE_DB override), # extracts each, and writes per-bucket sidecar entries so TranscriptDiscoverer # merges each bucket into the Project with the matching git_remote. # Signature mirrors collect_cursor_sessions / collect_codex_sessions: # collect_opencode_sessions <output_tmpdir> [selected_remote] collect_opencode_sessions() { local tmpdir="$1" local selected_remote="${2:-}" if ! command -v sqlite3 &>/dev/null; then echo " opencode: sqlite3 not found. Install with: brew install sqlite3 (macOS) or apt install sqlite3 (Linux)" >&2 return 0 fi if ! command -v jq &>/dev/null; then echo " opencode: jq not found. Install with: brew install jq (macOS) or apt install jq (Linux)" >&2 return 0 fi # Candidate DBs: a non-empty OPENCODE_DB is AUTHORITATIVE — if it's set we use # only it (and warn + bail if it's missing rather than silently scanning the # user's real default DBs). Otherwise scan OPENCODE_DIR for every opencode*.db # (multi-channel; WAL/shm end in -wal/-shm, not .db). local -a dbs=() if [ -n "${OPENCODE_DB:-}" ]; then if [ -f "${OPENCODE_DB}" ]; then dbs+=("$OPENCODE_DB") else echo " opencode: OPENCODE_DB set but not found: ${OPENCODE_DB}" >&2 return 0 fi elif [ -d "$OPENCODE_DIR" ]; then local db while IFS= read -r db; do [ -n "$db" ] && dbs+=("$db") done < <(find "$OPENCODE_DIR" -maxdepth 1 -name 'opencode*.db' 2>/dev/null) fi [ "${#dbs[@]}" -eq 0 ] && return 0 local _oc_errors=0 local db for db in "${dbs[@]}"; do local _erc=0 extract_opencode_db "$db" "$tmpdir" "$selected_remote" || _erc=$? [ "$_erc" -eq 2 ] && _oc_errors=$((_oc_errors + 1)) done local oc_count oc_count=$(find "$tmpdir" -maxdepth 2 -path "*/_opencode_*/*.jsonl" 2>/dev/null | wc -l | tr -d ' ') if [ "$oc_count" -gt 0 ]; then local oc_bytes ws_count oc_bytes=$(find "$tmpdir" -maxdepth 2 -path "*/_opencode_*/*.jsonl" -exec cat {} + 2>/dev/null | wc -c | tr -d ' ') ws_count=$(find "$tmpdir" -maxdepth 1 -type d -name "_opencode_*" 2>/dev/null | wc -l | tr -d ' ') echo " opencode: ${oc_count} sessions, $(($oc_bytes / 1024))KB (${ws_count} workspaces)" >&2 [ ! -f "$tmpdir/_metadata.json" ] && echo '{"version":1,"directories":{}}' > "$tmpdir/_metadata.json" local ws_dir for ws_dir in "$tmpdir"/_opencode_*/; do [ -d "$ws_dir" ] || continue local bucket_name first_file bremote bcwd updated bucket_name=$(basename "$ws_dir") first_file=$(find "$ws_dir" -name "*.jsonl" -maxdepth 1 -print -quit 2>/dev/null || true) [ -z "$first_file" ] && continue bremote=$(head -1 "$first_file" | jq -r '.git_remote // empty' 2>/dev/null || true) bcwd=$(head -1 "$first_file" | jq -r '.directory // empty' 2>/dev/null || true) updated=$(jq \ --arg bucket "$bucket_name" \ --arg remote "${bremote:-}" \ --arg cwd "${bcwd:-}" \ '.directories[$bucket] = {"git_remote": $remote, "cwd": $cwd}' \ "$tmpdir/_metadata.json" 2>/dev/null) [ -n "$updated" ] && echo "$updated" > "$tmpdir/_metadata.json" done # The resolver may have logged dead-cwd recoveries during extraction; keep # the sidecar's orphan_recovery_count honest (mirrors collect_cursor_sessions). if declare -f _refresh_orphan_recovery_count >/dev/null 2>&1; then _refresh_orphan_recovery_count "$tmpdir/_metadata.json" fi else rmdir "$tmpdir"/_opencode_* 2>/dev/null || true fi if [ "$oc_count" -eq 0 ] && [ "$_oc_errors" -gt 0 ]; then echo " opencode: extraction failed for $_oc_errors database(s)" >&2 return 1 fi return 0 } # Count top-level opencode sessions for the time estimate + the single-dir # zero-session guard. With a selected_remote, counts only sessions whose # workspace directory resolves to that remote (each distinct directory resolved # once). Prelude display only — no downstream gate beyond avoiding a false # "No sessions found" abort for an opencode-only user. Echoes 0 on any problem. count_opencode_sessions() { local selected_remote="${1:-}" command -v sqlite3 &>/dev/null || { echo 0; return; } command -v jq &>/dev/null || { echo 0; return; } # Non-empty OPENCODE_DB is authoritative (kept in sync with # collect_opencode_sessions): a set-but-missing override counts 0 rather than # scanning the user's real DBs. local -a dbs=() if [ -n "${OPENCODE_DB:-}" ]; then if [ -f "${OPENCODE_DB}" ]; then dbs+=("$OPENCODE_DB") else echo 0; return fi elif [ -d "$OPENCODE_DIR" ]; then local db while IFS= read -r db; do [ -n "$db" ] && dbs+=("$db") done < <(find "$OPENCODE_DIR" -maxdepth 1 -name 'opencode*.db' 2>/dev/null) fi [ "${#dbs[@]}" -eq 0 ] && { echo 0; return; } local sel_norm="" [ -n "$selected_remote" ] && sel_norm=$(normalize_remote "$selected_remote") # --since filter: mirror extract_opencode_db (time_created is a baseline ms-epoch # column). Without it the count over-reports sessions older than --since that the # extractor drops, which can make an opencode-only repo wrongly visible/selectable. local _oc_since="" [ -n "${SINCE_EPOCH:-}" ] && _oc_since="AND time_created >= $((SINCE_EPOCH * 1000))" local total=0 local db for db in "${dbs[@]}"; do sqlite3 "$db" "SELECT 1 FROM session LIMIT 1" >/dev/null 2>&1 || continue if [ -z "$sel_norm" ]; then local c c=$(sqlite3 "$db" "SELECT COUNT(*) FROM session WHERE parent_id IS NULL $_oc_since" 2>/dev/null || echo 0) [ -n "$c" ] && total=$((total + c)) || true else # Resolve each distinct workspace directory once, then add the session # count for directories whose remote matches. local line dir n remote rn while IFS=$'\t' read -r dir n; do [ -z "$dir" ] && continue if [ -e "$dir" ]; then remote=$(get_git_remote "$dir") else remote=$(resolve_remote_for_dead_cwd "$dir" || true) fi rn=$(normalize_remote "$remote") [ "$rn" = "$sel_norm" ] && total=$((total + n)) done < <(sqlite3 -separator "$(printf '\t')" "$db" "SELECT directory, COUNT(*) FROM session WHERE parent_id IS NULL AND directory IS NOT NULL AND directory <> '' $_oc_since GROUP BY directory" 2>/dev/null) fi done echo "$total" } # Copy one Gemini CLI chats/ dir into a bucket, keyed by the real sessionId so the # discoverer's <parent>/subagents/<child> linking lines up. Reads sessionId via sed # (no jq/sqlite3 dependency — gemini sessions are plain JSONL we copy verbatim; the # server GeminiNormalizer does all reconstruction). Returns 0 if anything emitted. _gemini_extract_chats() { local chats_dir="$1" local bucket_out="$2" [ -d "$chats_dir" ] || return 1 mkdir -p "$bucket_out" local emitted=0 # Top-level sessions: session-<ISO>-<short>.jsonl. Name the copy by the full # sessionId from the header (the filename only carries an 8-char prefix), so the # subagent dirs below — named by the parent's full sessionId — match the parent # file's basename, which is how TranscriptDiscoverer pairs parent <-> subagent. local f sid dest fm for f in "$chats_dir"/session-*.jsonl; do [ -f "$f" ] || continue # --since filter via file mtime (portable: GNU stat -c, then BSD stat -f). # Parity with the Codex/Cursor/opencode collectors. Only top-level sessions # are filtered; an included parent's subagents come along (the gate below # requires the parent's file to exist). if [ -n "${SINCE_EPOCH:-}" ]; then fm=$(stat -c %Y "$f" 2>/dev/null || stat -f %m "$f" 2>/dev/null || echo "0") [ "$fm" -lt "$SINCE_EPOCH" ] 2>/dev/null && continue fi sid=$(sed -n '1s/.*"sessionId":"\([^"]*\)".*/\1/p' "$f" 2>/dev/null || true) [ -z "$sid" ] && continue # fail-closed: never emit a stray _gemini_/.jsonl dest="$bucket_out/${sid}.jsonl" [ -f "$dest" ] && continue # dedupe across slug dirs cp "$f" "$dest" && emitted=$((emitted + 1)) done # Subagents: chats/<parentSessionId>/<childSessionId>.jsonl -> re-laid at # <bucket>/<parentSessionId>/subagents/<childSessionId>.jsonl (the Claude layout # the discoverer links). Gated on the parent's top-level file existing, so a # filtered-out parent leaves no orphan subagents (mirrors extract_opencode_db). local pdir parent cf c_sid sub_dir cdest for pdir in "$chats_dir"/*/; do [ -d "$pdir" ] || continue parent=$(basename "$pdir") [ -f "$bucket_out/${parent}.jsonl" ] || continue sub_dir="$bucket_out/$parent/subagents" for cf in "$pdir"*.jsonl; do [ -f "$cf" ] || continue c_sid=$(sed -n '1s/.*"sessionId":"\([^"]*\)".*/\1/p' "$cf" 2>/dev/null || true) [ -z "$c_sid" ] && c_sid=$(basename "$cf" .jsonl) [ -z "$c_sid" ] && continue mkdir -p "$sub_dir" cdest="$sub_dir/${c_sid}.jsonl" [ -f "$cdest" ] && continue cp "$cf" "$cdest" && emitted=$((emitted + 1)) done done [ "$emitted" -gt 0 ] && return 0 || return 1 } # Write the per-bucket remote into the sidecar so TranscriptDiscoverer merges the # bucket onto the matching repo Project. jq-OPTIONAL: without jq the sessions still # upload and analyze — they just attribute to a bucket-named Project instead of # collapsing onto the git remote (the discoverer's no-sidecar fallback). _gemini_write_sidecar() { local tmpdir="$1" bucket="$2" remote="${3:-}" cwd="${4:-}" command -v jq >/dev/null 2>&1 || return 0 [ ! -f "$tmpdir/_metadata.json" ] && echo '{"version":1,"directories":{}}' > "$tmpdir/_metadata.json" local updated updated=$(jq --arg bucket "$bucket" --arg remote "$remote" --arg cwd "$cwd" \ '.directories[$bucket] = {"git_remote": $remote, "cwd": $cwd}' \ "$tmpdir/_metadata.json" 2>/dev/null) [ -n "$updated" ] && echo "$updated" > "$tmpdir/_metadata.json" } # Collect Gemini CLI sessions into the archive/extraction tmpdir. Each # ~/.gemini/tmp/<slug>/ dir is one project (its .project_root names the repo); # we resolve the remote, bucket as _gemini_<basename>_<hash>, copy sessions + # subagents, and write a sidecar entry. Signature mirrors collect_opencode_sessions: # collect_gemini_sessions <output_tmpdir> [selected_remote] # No sqlite3/jq hard dependency (extraction is sed/cp). collect_gemini_sessions() { local tmpdir="$1" local selected_remote="${2:-}" [ -d "$GEMINI_DIR" ] || return 0 local sel_norm="" [ -n "$selected_remote" ] && sel_norm=$(normalize_remote "$selected_remote") local slug_dir for slug_dir in "$GEMINI_DIR"/*/; do [ -d "${slug_dir}chats" ] || continue local project_root="" [ -f "${slug_dir}.project_root" ] && project_root=$(head -1 "${slug_dir}.project_root" 2>/dev/null || true) # Resolve remote from the project root: live -> get_git_remote, deleted -> # ancestor/sibling-worktree recovery (mirrors the SQLite collectors). local session_remote="" if [ -n "$project_root" ]; then if [ -e "$project_root" ]; then session_remote=$(get_git_remote "$project_root") else session_remote=$(resolve_remote_for_dead_cwd "$project_root" || true) fi fi # Per-project filter. if [ -n "$sel_norm" ]; then local rn rn=$(normalize_remote "$session_remote") [ "$rn" != "$sel_norm" ] && continue fi local bucket="_gemini_unattributed" [ -n "$project_root" ] && bucket="_gemini_$(basename "$project_root")_$(stable_hash6 "$project_root")" _gemini_extract_chats "${slug_dir}chats" "$tmpdir/$bucket" || true if [ -n "$(find "$tmpdir/$bucket" -maxdepth 1 -name '*.jsonl' -print -quit 2>/dev/null)" ]; then _gemini_write_sidecar "$tmpdir" "$bucket" "$session_remote" "$project_root" fi done local g_count g_count=$(find "$tmpdir" -maxdepth 2 -path "*/_gemini_*/*.jsonl" 2>/dev/null | wc -l | tr -d ' ') if [ "$g_count" -gt 0 ]; then local g_bytes ws_count g_bytes=$(find "$tmpdir" -path "*/_gemini_*/*.jsonl" -exec cat {} + 2>/dev/null | wc -c | tr -d ' ') ws_count=$(find "$tmpdir" -maxdepth 1 -type d -name "_gemini_*" 2>/dev/null | wc -l | tr -d ' ') echo " Gemini CLI: ${g_count} sessions, $(($g_bytes / 1024))KB (${ws_count} workspaces)" >&2 if declare -f _refresh_orphan_recovery_count >/dev/null 2>&1; then _refresh_orphan_recovery_count "$tmpdir/_metadata.json" fi else rmdir "$tmpdir"/_gemini_* 2>/dev/null || true fi return 0 } # Count top-level Gemini sessions for the time estimate + zero-session guard. # With a selected_remote, counts only slug dirs whose .project_root resolves to it. # Prelude display only. Echoes 0 on any problem. No jq/sqlite3 needed. count_gemini_sessions() { local selected_remote="${1:-}" [ -d "$GEMINI_DIR" ] || { echo 0; return; } local sel_norm="" [ -n "$selected_remote" ] && sel_norm=$(normalize_remote "$selected_remote") local total=0 slug_dir for slug_dir in "$GEMINI_DIR"/*/; do [ -d "${slug_dir}chats" ] || continue if [ -n "$sel_norm" ]; then local project_root="" remote rn [ -f "${slug_dir}.project_root" ] && project_root=$(head -1 "${slug_dir}.project_root" 2>/dev/null || true) [ -z "$project_root" ] && continue if [ -e "$project_root" ]; then remote=$(get_git_remote "$project_root") else remote=$(resolve_remote_for_dead_cwd "$project_root" || true) fi rn=$(normalize_remote "$remote") [ "$rn" != "$sel_norm" ] && continue fi # Count top-level sessions, honoring --since via file mtime (parity with the # extraction filter above, so the estimate matches what actually uploads). local c=0 sf fm while IFS= read -r sf; do [ -z "$sf" ] && continue if [ -n "${SINCE_EPOCH:-}" ]; then fm=$(stat -c %Y "$sf" 2>/dev/null || stat -f %m "$sf" 2>/dev/null || echo "0") [ "$fm" -lt "$SINCE_EPOCH" ] 2>/dev/null && continue fi c=$((c + 1)) done < <(find "${slug_dir}chats" -maxdepth 1 -name 'session-*.jsonl' 2>/dev/null) total=$((total + c)) done echo "$total" } # Fallback: read origin URL from a jj workspace when git's standard probe # can't find it (non-colocated jj checkout with no .git dir). get_jj_remote() { local cwd="$1" [ -z "$cwd" ] && echo "" && return [ ! -d "$cwd" ] && echo "" && return [ ! -d "$cwd/.jj" ] && echo "" && return command -v jj >/dev/null 2>&1 || { echo ""; return; } local jj_remotes jj_remotes=$(jj git remote list --repository "$cwd" 2>/dev/null || true) local remote remote=$(echo "$jj_remotes" | awk '$1 == "origin" { if ($2 == "<no" && $3 == "URL>") print "<no URL>"; else print $2; exit }') [ "$remote" = "<no URL>" ] && echo "" || echo "$remote" } # Get canonical git remote URL (passes through normalize_remote below so that # every caller's grouping key is the same canonical form the server uses in # Repository.normalize_remote). Falls back to jj when git has no origin. get_git_remote() { local cwd="$1" [ -z "$cwd" ] && echo "" && return [ ! -d "$cwd" ] && echo "" && return local remote remote=$(git -C "$cwd" remote get-url origin 2>/dev/null || true) if [ -z "$remote" ]; then remote=$(get_jj_remote "$cwd") fi normalize_remote "$remote" } # Normalize a git remote URL to the same canonical form the server uses in # Repository.normalize_remote (app/models/repository.rb). This lets the # client's exact-string filters (Codex repository_url vs the selected project's # origin) treat https:// and git@: as the same repo — otherwise a user with # mixed https/ssh remotes silently loses Codex coverage for their project. # Port of app/models/repository.rb:16-34. Returns the normalized form, or # empty string when the input is blank. normalize_remote() { local url="$1" [ -z "$url" ] && echo "" && return local n="$url" # Strip leading/trailing whitespace n=$(printf '%s' "$n" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') # Strip ssh:// and http(s):// schemes n="${n#ssh://}" n="${n#https://}" n="${n#http://}" # Convert git@host:path -> host/path n=$(printf '%s' "$n" | sed 's|^git@\([^:]*\):|\1/|') # Strip trailing .git n="${n%.git}" # Strip trailing / n="${n%/}" # Strip any remaining leading user@ (e.g. ssh://other-user@host/path after scheme strip) n=$(printf '%s' "$n" | sed 's|^[^@/]*@||') printf '%s' "$n" } # Encode an absolute path the way Claude Code names its project dirs: replace # BOTH "/" AND "." with "-" (e.g. /Users/a/x70.one -> -Users-a-x70-one). Used by # Strategy-2 auto-detect to match the current dir against ~/.claude/projects/. # Matching only "/" silently broke detection for any cwd containing a dot # (x70.one, qerdp.co.uk, macOS /var/folders/.../T/tmp.X) — see SESSION_DETECTION.md # §3a. "." is a literal inside the [] bracket expression on BSD + GNU sed. encode_claude_dir_name() { printf '%s' "$1" | sed 's|[/.]|-|g' } # Backfill missing remotes for Conductor workspaces by finding siblings. # Conductor creates worktrees in two path patterns: # New: ~/conductor/workspaces/{project}/{workspace} # Old: {any_path}/.conductor/{workspace} # When a workspace is cleaned up, the CWD no longer exists and git remote # resolution fails. We recover by finding a sibling workspace for the same # project that DID resolve and reusing its remote. # Operates on global arrays: _bfc_cwds[], _bfc_remotes[] # Mutates _bfc_remotes[] in place. backfill_conductor_remotes() { local j=0 local total=${#_bfc_cwds[@]} while [ $j -lt $total ]; do if [ -z "${_bfc_remotes[$j]}" ]; then local cwd="${_bfc_cwds[$j]}" local conductor_pattern="" if [[ "$cwd" == */conductor/workspaces/*/* ]]; then # New pattern: ~/conductor/workspaces/{project}/* conductor_pattern="*/conductor/workspaces/$(echo "$cwd" | sed 's|.*/conductor/workspaces/||; s|/.*||')/*" elif [[ "$cwd" == */.conductor/* ]]; then # Old pattern: {any_path}/.conductor/* local project_root="${cwd%%/.conductor/*}" conductor_pattern="${project_root}/.conductor/*" fi if [ -n "$conductor_pattern" ]; then local k=0 while [ $k -lt $total ]; do if [ -n "${_bfc_remotes[$k]}" ] && [[ "${_bfc_cwds[$k]}" == ${conductor_pattern} ]]; then _bfc_remotes[$j]="${_bfc_remotes[$k]}" _log_recovery_source "$cwd" "conductor-backfill" break fi k=$((k + 1)) done # For new pattern, also check for a non-Conductor dir with the same # project name (e.g., ~/infra for conductor/workspaces/infra/*) if [ -z "${_bfc_remotes[$j]}" ] && [[ "$cwd" == */conductor/workspaces/*/* ]]; then local project_name project_name=$(echo "$cwd" | sed 's|.*/conductor/workspaces/||; s|/.*||') local k=0 while [ $k -lt $total ]; do if [ -n "${_bfc_remotes[$k]}" ]; then local sibling_basename sibling_basename=$(basename "${_bfc_cwds[$k]}") if [ "$sibling_basename" = "$project_name" ] && [[ "${_bfc_cwds[$k]}" != */conductor/workspaces/* ]]; then _bfc_remotes[$j]="${_bfc_remotes[$k]}" _log_recovery_source "$cwd" "conductor-backfill" break fi fi k=$((k + 1)) done fi # For old pattern, also check the project root dir (e.g., ~/code) if [ -z "${_bfc_remotes[$j]}" ] && [[ "$cwd" == */.conductor/* ]]; then local project_root="${cwd%%/.conductor/*}" local k=0 while [ $k -lt $total ]; do if [ -n "${_bfc_remotes[$k]}" ] && [ "${_bfc_cwds[$k]}" = "$project_root" ]; then _bfc_remotes[$j]="${_bfc_remotes[$k]}" _log_recovery_source "$cwd" "conductor-backfill" break fi k=$((k + 1)) done fi fi fi j=$((j + 1)) done } # Disk-backed dedup log for orphan-cwd recovery activity. The resolver # runs inside `$(...)` subshells at every call site, so in-memory counters # and assoc arrays can't survive across invocations. A file append does. # # Schema: `<cwd><TAB><source>` per line. `<source>` is one of: # ancestor, worktree-list, jj-workspace-list, project-cache, # conductor-backfill, unresolvable. # # Two readers consume this log: # - _rmdc_recovery_count_unique: counts unique cwds for the legacy # orphan_recovery_count metric. Filters out `unresolvable` rows # so the metric keeps its "successful recoveries" semantics. # - _recovery_source_breakdown: emits a per-source CSV for the # recovery_breakdown telemetry field (all sources including # unresolvable). # # Summary line in list_projects_grouped reads unique lines; sidecar # writes (PR #566) can compute scoped deltas via snapshot-before/after. # If mktemp fails, the log stays empty and the summary reports 0 — the # per-recovery stderr lines still fire, so activity is visible. # # In the Functions section so bats `extract_functions` picks it up; # cleanup trap lives at script top-level. _RMDC_LOG_FILE="" if _rmdc_tmp=$(mktemp -t paxel_recoveries.XXXXXX 2>/dev/null); then _RMDC_LOG_FILE="$_rmdc_tmp" fi unset _rmdc_tmp 2>/dev/null || true # Docker --all sidecar staging path. Computed lazily at call time (not # at script load) so bats tests that override $HOME after source see the # correct per-test path. Deterministic $$-naming mirrors the existing # $$-suffixed tmpdirs at cleanup_temp_dirs:82-86; both the parent shell # and _docker_all_host_scan_for_recovery's ( ... ) subshell compute the # same path independently. Tests override via `export # _DOCKER_ALL_SIDECAR_DIR=...` before calling the helper. _docker_all_sidecar_dir() { printf '%s' "${_DOCKER_ALL_SIDECAR_DIR:-${HOME}/.paxel/cache/docker-all-sidecar-$$}" } # `--all` git extraction. Writes per-repo numstat + commit-count into the # sidecar's _git/ dir so the container can SUM them into one combined # git_metrics (ClientPipeline#collect_git_data_aggregate). Only numstat + # commit_count are emitted — the aggregate deliberately skips recent_commits / # author files to avoid cross-repo episode mislinking server-side. Deduped by # git_remote (worktrees of one repo extract once); jq-independent (git only). # get_project_cwd / get_git_remote resolve against CLAUDE_DIR, so we pin it to # the passed dir for the duration (dynamic scope reaches the called helpers). _docker_all_extract_git_data() { local CLAUDE_DIR="$1" [ -d "$CLAUDE_DIR" ] || return 0 local git_out git_out="$(_docker_all_sidecar_dir)/_git" # mkdir our own _git/ — the recovery scan only creates the sidecar root when # jq is present, but git extraction needs no jq. mkdir -p "$git_out" local since_flag="" if [ -n "${SINCE_EPOCH:-}" ]; then since_flag="--since=$(date -r "$SINCE_EPOCH" '+%Y-%m-%d' 2>/dev/null || date -d "@$SINCE_EPOCH" '+%Y-%m-%d' 2>/dev/null || echo '')" fi local seen_keys="|" local extracted=0 local proj_dir for proj_dir in "$CLAUDE_DIR"/*/; do [ -d "$proj_dir" ] || continue local pname pname=$(basename "$proj_dir") local pcwd pcwd=$(get_project_cwd "$pname") if [ -z "$pcwd" ] || [ ! -e "$pcwd/.git" ]; then continue fi local premote premote=$(get_git_remote "$pcwd") # Dedup by remote (fall back to cwd for no-origin repos) so worktrees of one # repo aren't extracted N times. local key="${premote:-$pcwd}" case "$seen_keys" in *"|${key}|"*) continue ;; esac seen_keys="${seen_keys}${key}|" local encoded encoded=$(echo "$pname" | sed 's/[^a-zA-Z0-9_-]/_/g') git -C "$pcwd" rev-list --count HEAD \ > "${git_out}/${encoded}_commit_count.txt" 2>/dev/null || true git -C "$pcwd" log -${COMMIT_LIMIT:-1000} $since_flag \ --format='COMMIT_BOUNDARY %H %aI %aN <%aE>' --numstat \ > "${git_out}/${encoded}_numstat.txt" 2>/dev/null || true extracted=$((extracted + 1)) done if [ "$extracted" -gt 0 ]; then local _rl="repos" [ "$extracted" -eq 1 ] && _rl="repo" echo " Extracted git history from ${extracted} ${_rl}" fi # Always succeed — a zero-repo result (all dead cwds) must NOT return 1, or the # caller's `set -Eeuo pipefail` would fire the ERR trap and abort the upload. return 0 } # Append a recovery-source row to _RMDC_LOG_FILE. Single writer for # the log; resolver, backfill, project-cache and P1's unresolvable-warning # all flow through here. # # Init-order dependency: _RMDC_LOG_FILE is set at top-level load time # above. The empty-var guard returns 0 silently if the helper is called # before init (e.g. when sourced by bats), so tests can pre-set # _RMDC_LOG_FILE themselves without the mktemp path firing. # # Source names must match [a-z_-]+ — downstream CSV/JSON transforms # split on ':' and ',' and use a hyphen→underscore rewrite. A source # name containing ':' or ',' silently corrupts the sidecar's # recovery_breakdown JSON. Keep additions lowercase with hyphens. _log_recovery_source() { local cwd="$1" local source="$2" [ -z "${_RMDC_LOG_FILE:-}" ] && return 0 [ -z "$cwd" ] && return 0 [ -z "$source" ] && return 0 printf '%s\t%s\n' "$cwd" "$source" >> "$_RMDC_LOG_FILE" 2>/dev/null || true } # Emit a user-visible warning when a Conductor dead-cwd exhausted every # attribution strategy (resolver short-circuits Conductor paths, sibling # walks + ancestor probe don't apply, project-cache had no prior row). # Closes the §9d support-triage gap documented in # docs/designs/SESSION_DETECTION.md — without this, the dir's sessions # silently land under an encoded-name orphan Project, and the recovery # path (run from a live workspace to warm the cache) is undiscoverable. # # Mode-neutral phrasing ("re-run this upload") works for both # `curl | bash` and `bin/upload` invocation paths. # # Suppresses entirely under PAXEL_NO_ORPHAN_RECOVERY=1. The project-cache # fallback that would have populated the cache is also gated on that # flag, so when the user opts out the cache was never consulted — # emitting "no cached remote" would misrepresent state. The opt-out is # a power-user escape hatch; honor "be quiet." # # Gated on the Conductor path pattern. Standalone ~/code/foo deletions # hit the same premote-empty/pcwd-dead shape but are expected to orphan # silently (encoded_name grouping is working as designed for those). # # Logs `<pcwd>\tunresolvable` to _RMDC_LOG_FILE so # _recovery_source_breakdown includes an `unresolvable` bucket. _warn_unresolvable_conductor_cwd() { [ "${PAXEL_NO_ORPHAN_RECOVERY:-0}" = "1" ] && return 0 local pname="$1" local pcwd="$2" case "$pcwd" in */conductor/workspaces/*/*|*/.conductor/*) ;; *) return 0 ;; esac echo "[paxel] warning: couldn't attribute ${pname}'s sessions to a repo." >&2 echo " cwd: ${pcwd} (deleted)" >&2 echo " no cached remote for this project, and no live sibling workspace to" >&2 echo " infer from. To recover, re-run this upload from inside a live" >&2 echo " workspace of this project." >&2 _log_recovery_source "$pcwd" "unresolvable" } # Host-side recovery-detection pass for Docker --all mode. Mirrors the # detection subset of collect_all_projects's Claude loop (:3220-3262): # walk $CLAUDE_DIR's encoded project dirs, compute pcwd + try # get_git_remote, and for dead cwds fall through resolve_remote_for_dead_cwd # → project-cache fallback → _warn_unresolvable_conductor_cwd. # # Scope: detection ONLY. Does NOT write archive sidecars (run_docker_mode # bind-mounts $CLAUDE_DIR into the container, which reads it directly # without a sidecar handoff) and does NOT persist cache TSV rows # (self-warming Docker --all is a separate design question — surprising # behavior to have bin/upload mutate ~/.paxel/cache without opt-in). # # Writes to _RMDC_LOG_FILE via _log_recovery_source only, so the # env-var passthrough at run_docker_analysis's PAXEL_RECOVERY_BREAKDOWN # block can forward non-zero counts to the container. # # Drift risk: if collect_all_projects's recovery strategies change (new # source, reordered gates), mirror the change here. Long-term, extract # a shared detection helper — out of scope for this follow-up. # # Runs the detection walk in a ( ... ) subshell so the CLAUDE_DIR # mutation is scoped — no manual save/restore, and exception-safe # under set -Eeuo pipefail if a future edit adds a command that can # trip set -e inside the loop. _RMDC_LOG_FILE writes propagate out of # the subshell via the filesystem (same pattern as the resolver's # $(...) callers); stderr warnings propagate via fd inheritance. # # Self-warming cache persistence (PR after-#690): every project dir # (live OR recovered) writes a row to $_cache_rows_file, which is # merged into ~/.paxel/cache/project-remotes-v2.tsv at the end of the # scan. This closes the §9d first-run-after-delete gap for Docker # --all users — a workspace that was live during a prior bin/upload # is recoverable from the cache after it's deleted, symmetric with # how legacy --all (collect_all_projects:3376) already warms the # cache. Row format mirrors collect_all_projects:3371. _docker_all_host_scan_for_recovery() { local claude_dir="${1:-${CLAUDE_DIR:-}}" [ -d "$claude_dir" ] || return 0 # No jq guard: get_project_cwd has grep/JSONL fallbacks (:734, :754), # so jq-less hosts still resolve pcwd correctly. Verified by pre-ship # review of this change. ( cd "$claude_dir" || return 0 CLAUDE_DIR="$(pwd)" local _cache_rows_file _cache_rows_file=$(mktemp) # Sidecar for the container's TranscriptDiscoverer.read_sidecar # secondary-path fallback. Docker --all bind-mounts $CLAUDE_DIR # read-only, so the container has no archive sidecar; we write one # here to a host-side staging dir and run_docker_analysis bind-mounts # it at /paxel_sidecar:ro. jq-less hosts skip the write (container # falls back to encoded_name, same as today — no regression). local _sidecar_root="" _sidecar_tmp="" if command -v jq &>/dev/null; then _sidecar_root="$(_docker_all_sidecar_dir)" if mkdir -p "$_sidecar_root" 2>/dev/null; then _sidecar_tmp="$_sidecar_root/_metadata.json" printf '%s' '{"version":1,"directories":{}}' > "$_sidecar_tmp" 2>/dev/null || _sidecar_tmp="" fi fi local proj_dir pname pcwd premote _p_inferred for proj_dir in */; do [ -d "$proj_dir" ] || continue pname="${proj_dir%/}" pcwd=$(get_project_cwd "$pname") premote=$(get_git_remote "$pcwd") _p_inferred=0 if [ -z "$premote" ] && [ -n "$pcwd" ] && [ ! -e "$pcwd" ]; then # resolve_remote_for_dead_cwd's "[paxel] Recovered remote ..." # stderr (ancestor/worktree-list/jj-workspace-list at :1766, # :1827, :1850) flows through — Docker --all now attributes # sessions end-to-end via the host-written sidecar, so the # message is accurate rather than misleading. premote=$(resolve_remote_for_dead_cwd "$pcwd" || true) [ -n "$premote" ] && _p_inferred=1 if [ -z "$premote" ] && [ "${PAXEL_NO_ORPHAN_RECOVERY:-0}" != "1" ]; then case "$pcwd" in */conductor/workspaces/*/*|*/.conductor/*) # `|| true` matches resolve_remote_for_dead_cwd's style above # for defense-in-depth under set -Eeuo pipefail, even though # _project_cache_read_remote can't fail on its current impl. premote=$(_project_cache_read_remote "$pname" || true) if [ -n "$premote" ]; then echo "[paxel] Recovered remote for $pcwd via project-cache($pname) -> $premote" >&2 _log_recovery_source "$pcwd" "project-cache" _p_inferred=1 fi ;; esac fi if [ -z "$premote" ]; then _warn_unresolvable_conductor_cwd "$pname" "$pcwd" fi fi # Cache-row skip gates — preserve any existing cached row for this # dir when the scan couldn't fully evaluate the current state. # Without these, the merge in _project_cache_persist_rows would # overwrite a valid warmed remote with an empty one, erasing the # only recovery signal available to future runs. # # 1. Both pcwd + premote empty = no routing signal at all. # Mirrors legacy collect_all_projects:3344's skip, which # continues past this dir entirely. # 2. PAXEL_NO_ORPHAN_RECOVERY=1 + empty premote + DEAD cwd = # recovery paths (resolver, cache fallback) were SKIPPED by the # opt-out, so "empty" means "we didn't look" rather than # "verified unresolvable". Writing empty here would clobber # the user's warmed cache. Preserve existing rows. # # Gate must be narrow: opt-out + empty premote + LIVE cwd # means get_git_remote DID run and verified the live workspace # has no origin — a legitimate clear-stale-cache case. Keep # writing in that branch (matching the non-opt-out live path). # Legacy collect_all_projects:3438 mirrors this exact gate (PR # #712) so both --all paths converge on the same semantics. [ -z "$premote" ] && [ -z "$pcwd" ] && continue if [ "${PAXEL_NO_ORPHAN_RECOVERY:-0}" = "1" ] \ && [ -z "$premote" ] \ && [ -n "$pcwd" ] \ && [ ! -e "$pcwd" ]; then continue fi # Record a cache row. Persist empty-remote rows ONLY when we # actually verified emptiness (non-opt-out, premote-tried-and-failed) # — they clear stale cache entries from a prior live run whose # remote has since disappeared; without this, a dir that loses # its remote would keep the stale one forever. Mirrors # collect_all_projects:3355-3371's row format. # BSD find/stat parse leading `-` of encoded Claude dir names # (`-Users-...`) as option flags; `./$pname` forces path interpretation. local _pd_sessions _pd_mtime _pd_sessions=$(find "./$pname" -maxdepth 3 -name "*.jsonl" -not -name "_*" 2>/dev/null | wc -l | tr -d ' ') _pd_mtime=$(stat -c %Y "./$pname" 2>/dev/null || stat -f %m "./$pname" 2>/dev/null || echo "0") printf '%s\t%s\t%s\t%s\t%s\n' "$pname" "$premote" "${_pd_sessions:-0}" "${_pd_mtime:-0}" "$_p_inferred" >> "$_cache_rows_file" # Sidecar entry for TranscriptDiscoverer.read_sidecar fallback. # Only write when we resolved a non-empty remote — empty-remote # dirs server-side already fall back to encoded_name, which is # correct for them. `|| true` on the jq invocation so a stubbed # jq (CJ10e) that exits non-zero doesn't abort the scan under # set -Eeuo pipefail. Mirrors the legacy sidecar write at :3412. if [ -n "$_sidecar_tmp" ] && [ -n "$premote" ]; then local _sc_updated _sc_updated=$(jq \ --arg dir "$pname" \ --arg remote "$premote" \ --arg cwd "${pcwd:-}" \ '.directories[$dir] = {"git_remote": $remote, "cwd": $cwd}' \ "$_sidecar_tmp" 2>/dev/null || true) [ -n "$_sc_updated" ] && printf '%s' "$_sc_updated" > "$_sidecar_tmp" fi done # Merge collected rows into the project-remote cache. Same helper # collect_all_projects uses at :3376; unconditional of # PAXEL_NO_ORPHAN_RECOVERY (that flag gates READS from the cache, # not writes — consistent with legacy behavior). _project_cache_persist_rows "$_cache_rows_file" rm -f "$_cache_rows_file" ) } # Recover a git or jj remote for a session whose cwd no longer exists on # disk. Used for orphan Claude/Codex/Cursor sessions from deleted # subdirectories or sibling worktrees. Returns "" on miss; callers decide # fallback. # # Strategies, in order: # 1. Ancestor walk — if a parent of $cwd is a live git repo (.git # present) or pure-jj workspace (.jj/ present), use its remote. # Low false-positive risk: a session inside a subdirectory usually # belongs to the enclosing repo. Pure-jj matches bypass # get_git_remote (which would walk past the marker via git's own # discovery) and call get_jj_remote directly, anchoring the lookup. # 2. Sibling-worktree cross-reference — for stems of $(basename $cwd), # check if $parent/$stem is a git repo whose `git worktree list # --porcelain` still mentions $cwd. Catches worktrees removed with # `rm -rf` before `git worktree prune`. Git-only: jj's workspace # list semantics don't plug into this verifier directly. # # Conductor paths short-circuit at the top — those are handled by # backfill_conductor_remotes, which understands fork/branch-specific # remote semantics the ancestor walk would flatten. # # On successful recovery, appends $cwd to $_RMDC_LOG_FILE so callers can # read a deduped count post-hoc. Counters-in-parent-scope don't work here: # the resolver is invoked via $(...), which creates a subshell — in-function # variable mutations don't propagate. A file append does. # Cross-reference a dead cwd against a sibling jj repo's workspace list. # Returns 0 if any of the sibling's workspaces has a root path matching the # dead cwd (either as an on-disk root or via a resolved error-line path for # a workspace whose dir was rm -rf'd without `jj workspace forget`). # # jj's workspace list template has no keyword for the stored relative path — # `root` resolves it, and errors out inline when the path is missing. The # error embeds the `../..` form (e.g. `<cand>/.jj/repo/../../../dead-ws`), # which is bash-normalizable to the canonical path. Covers the # non-Conductor dead-jj-sibling case: workspace dir deleted via filesystem # rm, never through `jj workspace forget`. Conductor paths short-circuit at # the top of `resolve_remote_for_dead_cwd` and route through # PR #614's `project-remotes-v2.tsv` cache instead — this verifier is not # reached for `*/conductor/workspaces/*` or `*/.conductor/*` cwds. # # Args: $1 = candidate jj repo dir, $2 = original dead cwd, $3 = canonical # dead cwd. Returns 0/1. Silent on success; prints nothing (caller logs). _jj_sibling_workspace_match() { local cand="$1" local cwd="$2" local canonical_cwd="$3" local out out=$(jj workspace list --repository "$cand" -T 'root ++ "\n"' 2>/dev/null) || return 1 [ -z "$out" ] && return 1 local line while IFS= read -r line; do [ -z "$line" ] && continue case "$line" in /*) # Valid root line: exact match against dead cwd wins. (Prefix-matching # to catch "dead subdir of live workspace" is already handled by the # ancestor walk above, so we don't need it here.) if [ "$line" = "$cwd" ] || [ "$line" = "$canonical_cwd" ]; then return 0 fi ;; *'<Error: Failed to resolve workspace root:'*) # Error line format (jj 0.40.x): # <Error: Failed to resolve workspace root: <name>: <abs-path>: <os-err>> # Anchor on `: /` (workspace paths are always absolute) instead of # counting `: ` separators — jj allows workspace names containing # `: ` (e.g. `jj workspace add -n "feat: bugfix" path`), and a # separator-count parser drops the tail of the name into the extracted # path and produces a non-absolute string that silently misses. local raw raw=$(printf '%s' "$line" | awk ' { i = index($0, "workspace root: ") if (i == 0) next rest = substr($0, i + length("workspace root: ")) sub(/>$/, "", rest) # trim trailing close-angle # Locate the `: ` whose RHS starts with `/` — that is the separator # right before the absolute path. Walk from the front; the first # occurrence is correct because a workspace name starting with `: /` # after a `: ` would itself begin with `/`, which is pathologically # rare enough not to worry about here. path_start = 0 for (k = 1; k <= length(rest) - 2; k++) { if (substr(rest, k, 2) == ": " && substr(rest, k + 2, 1) == "/") { path_start = k + 2 break } } if (path_start == 0) next tail = substr(rest, path_start) # The path ends at the FINAL `: ` (the one before the os-err tail). # os-err text comes from the std::io::Error Display impl — on Unix # it is single-line and has no `: ` inside, so last-match is safe. last = 0 for (k = 1; k <= length(tail) - 1; k++) { if (substr(tail, k, 2) == ": ") last = k } if (last > 0) print substr(tail, 1, last - 1) }') [ -z "$raw" ] && continue local resolved resolved=$(_normpath_absolute "$raw") [ -z "$resolved" ] && continue if [ "$resolved" = "$cwd" ] || [ "$resolved" = "$canonical_cwd" ]; then return 0 fi ;; esac done <<< "$out" return 1 } # Collapse `.` and `..` components in an already-absolute path without # consulting the filesystem (the target may not exist — the whole point # of calling this from the jj sibling walk is that jj emits `../..` paths # for removed workspaces). Returns empty if input isn't absolute. _normpath_absolute() { local input="$1" case "$input" in /*) ;; *) return ;; esac printf '%s' "$input" | awk -F/ ' { n = 0 for (i = 1; i <= NF; i++) { if ($i == "" || $i == ".") continue if ($i == "..") { if (n > 0) n-- ; continue } parts[++n] = $i } if (n == 0) { printf "/" } else { for (i = 1; i <= n; i++) printf "/%s", parts[i] } }' } resolve_remote_for_dead_cwd() { local cwd="$1" [ -z "$cwd" ] && return 0 [ -e "$cwd" ] && return 0 [ "${PAXEL_NO_ORPHAN_RECOVERY:-0}" = "1" ] && return 0 case "$cwd" in */conductor/workspaces/*|*/.conductor/*) return 0 ;; esac # _rmdc_source tracks which strategy succeeded so the single log # write at the function's exit can emit `<cwd>\t<source>`. Scoped # to this function's $(...) subshell — the file-append at the exit # escapes the subshell via fd inheritance (not via shell variable # propagation, which wouldn't work here). local recovered="" local _rmdc_source="" # Strategy 1: Ancestor walk. Matches on .git (dir or file — worktree/submodule # marker) or .jj (pure-jj workspace marker). # # For .git matches, get_git_remote's `git -C <p> remote get-url origin` is # repo-discovering and tolerates a .git at $p or at an enclosing ancestor — # either way it anchors to an actual git repo root. For .jj-only matches # (no .git at $p), get_git_remote would still run git's discovery and could # walk PAST $p to an enclosing git repo's origin, silently misattributing a # nested-jj-in-git layout. Call get_jj_remote directly for that branch to # anchor the lookup at $p. local p="$cwd" local home_guard="${HOME:-/nonexistent}" while :; do local _prev_p="$p" p="$(dirname "$p")" case "$p" in /|.|"$home_guard") break ;; esac # Stop at a root where dirname is idempotent: on Windows Git Bash # `dirname C:` returns `C:` (there is no leading "/"), so without this guard # the walk never reaches "/" and spins forever (PAXEL: aaryansr, Git Bash). [ "$p" = "$_prev_p" ] && break local r="" if [ -d "$p/.git" ] || [ -f "$p/.git" ]; then r=$(get_git_remote "$p") elif [ -d "$p/.jj" ]; then # Pure-jj ancestor: bypass get_git_remote to avoid git's repo-discovery # walking past $p. Normalize the raw URL to match the server's canonical # form (get_jj_remote does not normalize; get_git_remote does). local jj_raw jj_raw=$(get_jj_remote "$p") [ -n "$jj_raw" ] && r=$(normalize_remote "$jj_raw") fi if [ -n "$r" ]; then recovered="$r" _rmdc_source="ancestor" echo "[paxel] Recovered remote for $cwd via ancestor $p -> $r" >&2 break fi done # Strategy 2: Sibling-worktree cross-reference. if [ -z "$recovered" ]; then local parent base parent="$(dirname "$cwd")" base="$(basename "$cwd")" # Canonicalize so the awk compare survives symlinked home dirs # (e.g. /home/x -> /data/x). The parent is still on-disk even when # the cwd itself is gone, so `cd && pwd -P` works. local canonical_cwd="$cwd" if [ -d "$parent" ]; then local real_parent real_parent=$(cd "$parent" 2>/dev/null && pwd -P 2>/dev/null) || real_parent="" [ -n "$real_parent" ] && canonical_cwd="$real_parent/$base" fi local -a stems=() # Digit strip: code1 -> code. %%[0-9]* removes longest trailing-digit run. local ds="${base%%[0-9]*}" [ -n "$ds" ] && [ "$ds" != "$base" ] && stems+=("$ds") # Dash walk: code-frontend-tests -> code-frontend -> code. # Longest-first order means a more-specific parent is tried before a # less-specific one if both exist (e.g. ~/code-frontend wins over ~/code # when the dead cwd is ~/code-frontend-tests). local ds2="$base" while [[ "$ds2" == *-* ]]; do ds2="${ds2%-*}" [ -n "$ds2" ] && stems+=("$ds2") done # Underscore walk: same pattern. local ds3="$base" while [[ "$ds3" == *_* ]]; do ds3="${ds3%_*}" [ -n "$ds3" ] && stems+=("$ds3") done local stem for stem in ${stems[@]+"${stems[@]}"}; do local cand="$parent/$stem" local has_git=0 has_jj=0 { [ -d "$cand/.git" ] || [ -f "$cand/.git" ]; } && has_git=1 [ -d "$cand/.jj" ] && has_jj=1 [ "$has_git" -eq 0 ] && [ "$has_jj" -eq 0 ] && continue # Try git worktree list first when .git is present. Match full worktree # path, not awk $2 — paths can contain spaces. Compare against both the # original cwd and the symlink-resolved form. if [ "$has_git" -eq 1 ]; then if git -C "$cand" worktree list --porcelain 2>/dev/null \ | awk -v t1="$cwd" -v t2="$canonical_cwd" ' /^worktree / { sub(/^worktree /, ""); if ($0 == t1 || $0 == t2) { found=1; exit } } END { exit !found } '; then local r r=$(get_git_remote "$cand") if [ -n "$r" ]; then recovered="$r" _rmdc_source="worktree-list" echo "[paxel] Recovered remote for $cwd via worktree-list($stem) -> $r" >&2 break fi fi fi # Fall through to jj workspace check when git didn't match. jj 0.40+ # always creates `.git/` alongside `.jj/` on `jj git init`, so relying # on `.git`-absence to route to the jj branch fails in practice. The # jj check fires whenever .jj exists AND we haven't already recovered. # Covers non-Conductor rm -rf'd jj sibling workspaces: the removed # workspace is still in `jj workspace list` output as an inline # `<Error: …>` row whose path is bash-normalizable to the dead cwd. # Conductor cwds short-circuit above (see :1389-1391) and are handled # by PR #614's cache; this branch is unreachable for Conductor paths. if [ -z "$recovered" ] && [ "$has_jj" -eq 1 ]; then command -v jj >/dev/null 2>&1 || continue if _jj_sibling_workspace_match "$cand" "$cwd" "$canonical_cwd"; then local r r=$(get_jj_remote "$cand") if [ -n "$r" ]; then recovered=$(normalize_remote "$r") _rmdc_source="jj-workspace-list" echo "[paxel] Recovered remote for $cwd via jj-workspace-list($stem) -> $recovered" >&2 break fi fi fi done fi if [ -n "$recovered" ]; then # Single write site for the resolver; fd inheritance propagates the # append out of the $(...) subshell so callers see a complete log. _log_recovery_source "$cwd" "$_rmdc_source" fi printf '%s' "$recovered" } # Count unique recovered cwds logged so far in this run. Preserves the # legacy "successful recoveries" semantics: rows whose source is # `unresolvable` (from P1's warning helper) are excluded — those are # failures, not recoveries, and reporting them as recoveries would # misrepresent the support-triage metric. # # Optional second arg: a previous-snapshot count, in which case we return # the delta (unique cwds recovered since the snapshot was taken). _rmdc_recovery_count_unique() { local prev="${1:-0}" local cur=0 if [ -n "${_RMDC_LOG_FILE:-}" ] && [ -s "$_RMDC_LOG_FILE" ]; then cur=$(awk -F'\t' '$2 != "unresolvable" && !seen[$1]++' "$_RMDC_LOG_FILE" 2>/dev/null | wc -l | tr -d ' ') fi echo "$((cur - prev))" } # Emit a per-source CSV breakdown of recovery activity. Counts unique # (cwd, source) pairs — since a cwd is resolved via at most one source # in one run, this matches per-source cwd counts. # # Output shape: `ancestor:2,worktree_list:1,project_cache:3,unresolvable:1` # Zero-count buckets are skipped; the server-side reader # (ClientPipeline#read_recovery_breakdown) fills defaults for absent # keys so downstream consumers always see the full 6-key hash. # # Hyphen→underscore transform happens inside the awk END block so # future hyphenated source names (e.g. `docker-bind`) don't need a # paired sed allowlist update. JSON-friendly keys without the # character-brittleness of a fixed allowlist. # # Emits an empty string when the log is absent/empty — callers # (_refresh_orphan_recovery_count, Docker env-var fallback) treat that # as "no recoveries yet" and fall back to defaults. _recovery_source_breakdown() { [ -z "${_RMDC_LOG_FILE:-}" ] && { echo ""; return 0; } [ ! -s "$_RMDC_LOG_FILE" ] && { echo ""; return 0; } awk -F'\t' ' !seen[$1]++ { c[$2]++ } END { first = 1 for (k in c) { key = k gsub(/-/, "_", key) if (!first) printf "," printf "%s:%d", key, c[k] first = 0 } } ' "$_RMDC_LOG_FILE" 2>/dev/null } # Convert a `k1:v1,k2:v2` CSV to a `{"k1":v1,"k2":v2}` JSON object for # jq --argjson consumption. Returns `{}` when given empty input. _recovery_breakdown_csv_to_json() { local csv="$1" if [ -z "$csv" ]; then echo "{}" return 0 fi printf '%s' "$csv" | awk -F',' ' { printf "{" for (i = 1; i <= NF; i++) { split($i, kv, ":") if (i > 1) printf "," printf "\"%s\":%d", kv[1], kv[2] } printf "}" } ' } # Refresh orphan_recovery_count in a sidecar to reflect resolver calls that # fired AFTER the archive sidecar was initially written. The three archive # write sites (collect_project_group, prepare_and_run_for_repo, run_docker_mode) # all emit the counter BEFORE collect_cursor_sessions runs — but extract_cursor_db # can call resolve_remote_for_dead_cwd for deleted Cursor workspaces, which # appends to _RMDC_LOG_FILE after the snapshot. Callers invoke this helper # after every collect_cursor_sessions so the on-disk counter stays honest. # # Semantics: aggregate run activity, NOT archive-exact. The resolver logs a # recovery the moment it fires (inside extract_cursor_db, BEFORE the # selected_remote filter at :980). Under `--project X`, a mixed Cursor DB # can recover remotes for sessions that are then filtered out of the # archive — the counter still increments. Matches the pre-existing comment # near collect_project_group's initial write. # # Idempotent; safe to call with a non-existent sidecar path (no-op when # the sidecar was never written, e.g. --all Docker mode which skips the # archive sidecar entirely). _refresh_orphan_recovery_count() { local sidecar="$1" [ -f "$sidecar" ] || return 0 command -v jq &>/dev/null || return 0 local total total=$(_rmdc_recovery_count_unique) local breakdown breakdown=$(_recovery_source_breakdown) local breakdown_json breakdown_json=$(_recovery_breakdown_csv_to_json "$breakdown") # _recovery_breakdown_csv_to_json always returns a valid JSON object # (empty-input path returns {}); no need for a default fallback. local updated updated=$(jq \ --argjson r "${total:-0}" \ --argjson b "$breakdown_json" \ '.orphan_recovery_count = $r | .recovery_breakdown = $b' \ "$sidecar" 2>/dev/null) [ -n "$updated" ] && printf '%s\n' "$updated" > "$sidecar" } # Group project directories by git remote list_projects_grouped() { echo "Scanning projects..." >&2 # Collect all Claude project dirs with jsonl files local all_dirs=() if [ -d "$CLAUDE_DIR" ]; then for dir in "$CLAUDE_DIR"/*/; do [ -d "$dir" ] || continue local name name=$(basename "$dir") local has_jsonl # `find -print -quit` SIGPIPEs on first match → ERR trap. Same family # as the OLDEST_SESSION_EPOCH fix (PR #389). See find_has_jsonl below. has_jsonl=$(find "$dir" -name "*.jsonl" -maxdepth 3 -print -quit 2>/dev/null || true) if [ -n "$has_jsonl" ]; then all_dirs+=("$name") fi done fi # Check for Codex sessions local has_codex=0 if [ -d "$CODEX_DIR" ]; then local codex_check codex_check=$(find "$CODEX_DIR" -name "*.jsonl" -maxdepth 6 -print -quit 2>/dev/null || true) [ -n "$codex_check" ] && has_codex=1 fi # Check for Cursor sessions (workspace DBs or global DB) local has_cursor=0 if command -v sqlite3 &>/dev/null; then if [ -d "$CURSOR_DIR" ]; then local cursor_check cursor_check=$(find "$CURSOR_DIR" -name "state.vscdb" -maxdepth 2 -print -quit 2>/dev/null || true) [ -n "$cursor_check" ] && has_cursor=1 fi [ -f "$CURSOR_GLOBAL_DB" ] && has_cursor=1 fi # Check for opencode sessions (any opencode*.db, or the explicit override). # Scan-only like Cursor: opencode sessions are upload-bucketed separately, not # surfaced as their own picker groups — but their presence must keep the # "no projects found" guard from blocking an opencode-only user. local has_opencode=0 if command -v sqlite3 &>/dev/null; then if [ -n "${OPENCODE_DB:-}" ] && [ -f "${OPENCODE_DB:-}" ]; then has_opencode=1 elif [ -d "$OPENCODE_DIR" ]; then local opencode_check opencode_check=$(find "$OPENCODE_DIR" -maxdepth 1 -name 'opencode*.db' -print -quit 2>/dev/null || true) [ -n "$opencode_check" ] && has_opencode=1 fi fi # Check for Gemini CLI sessions (any chats/session-*.jsonl). Scan-only like # opencode — bucketed separately at upload time, but its presence must keep the # "no projects found" guard from blocking a Gemini-only user. No sqlite3 needed. local has_gemini=0 if [ -d "$GEMINI_DIR" ]; then local gemini_check gemini_check=$(find "$GEMINI_DIR" -maxdepth 3 -name 'session-*.jsonl' -print -quit 2>/dev/null || true) [ -n "$gemini_check" ] && has_gemini=1 fi if [ ${#all_dirs[@]} -eq 0 ] && [ "$has_codex" -eq 0 ] && [ "$has_cursor" -eq 0 ] && [ "$has_opencode" -eq 0 ] && [ "$has_gemini" -eq 0 ]; then echo "Error: No projects with transcripts found" >&2 echo "Checked: $CLAUDE_DIR, $CODEX_DIR, $CURSOR_DIR, $OPENCODE_DIR, $GEMINI_DIR" >&2 exit 1 fi # Resolve CWD and git remote for each dir local dir_cwds=() local dir_remotes=() for name in "${all_dirs[@]}"; do local cwd cwd=$(get_project_cwd "$name") dir_cwds+=("$cwd") local remote remote=$(get_git_remote "$cwd") dir_remotes+=("$remote") done # Backfill remotes for deleted Conductor workspaces _bfc_cwds=("${dir_cwds[@]}") _bfc_remotes=("${dir_remotes[@]}") backfill_conductor_remotes # Recover remotes for non-Conductor orphan cwds # (deleted subdirs of existing repos, sibling worktrees) local _bfc_j=0 while [ $_bfc_j -lt ${#_bfc_cwds[@]} ]; do if [ -z "${_bfc_remotes[$_bfc_j]}" ]; then local _bfc_recovered _bfc_recovered=$(resolve_remote_for_dead_cwd "${_bfc_cwds[$_bfc_j]}") [ -n "$_bfc_recovered" ] && _bfc_remotes[$_bfc_j]="$_bfc_recovered" fi _bfc_j=$((_bfc_j + 1)) done dir_remotes=("${_bfc_remotes[@]}") # Group by remote (or cwd for no-remote, or "unknown") GROUP_REMOTES=() GROUP_DISPLAYS=() GROUP_DIRS=() GROUP_COUNTS=() GROUP_DIR_COUNTS=() local i=0 for name in "${all_dirs[@]}"; do local remote="${dir_remotes[$i]}" local cwd="${dir_cwds[$i]}" # Determine group key local group_key if [ -n "$remote" ]; then group_key="$remote" elif [ -n "$cwd" ]; then group_key="local:$cwd" else group_key="unknown" fi # Determine display name local display if [ -n "$remote" ]; then display=$(remote_display_name "$remote") elif [ -n "$cwd" ]; then display=$(basename "$cwd") else display="${name##*-}" fi # Count sessions in this dir local session_count session_count=$({ find "$CLAUDE_DIR/$name" -name "*.jsonl" -maxdepth 3 2>/dev/null || true; } | wc -l | tr -d ' ') # Find existing group or create new one local found=0 local g=0 while [ $g -lt ${#GROUP_REMOTES[@]} ]; do if [ "${GROUP_REMOTES[$g]}" = "$group_key" ]; then GROUP_DIRS[$g]="${GROUP_DIRS[$g]}|$name" GROUP_COUNTS[$g]=$((${GROUP_COUNTS[$g]} + $session_count)) GROUP_DIR_COUNTS[$g]=$((${GROUP_DIR_COUNTS[$g]} + 1)) found=1 break fi g=$((g + 1)) done if [ "$found" -eq 0 ]; then GROUP_REMOTES+=("$group_key") GROUP_DISPLAYS+=("$display") GROUP_DIRS+=("$name") GROUP_COUNTS+=("$session_count") GROUP_DIR_COUNTS+=("1") fi i=$((i + 1)) done # Discover Codex sessions and merge into groups by git remote if [ "$has_codex" -eq 1 ]; then echo "Scanning Codex sessions..." >&2 while IFS= read -r codex_file; do [ -z "$codex_file" ] && continue local remote remote=$(get_codex_session_remote "$codex_file") [ -z "$remote" ] && continue # Find existing group by remote or create new one local found=0 local g=0 while [ $g -lt ${#GROUP_REMOTES[@]} ]; do if [ "${GROUP_REMOTES[$g]}" = "$remote" ]; then GROUP_COUNTS[$g]=$((${GROUP_COUNTS[$g]} + 1)) found=1 break fi g=$((g + 1)) done if [ "$found" -eq 0 ]; then local display display=$(remote_display_name "$remote") GROUP_REMOTES+=("$remote") GROUP_DISPLAYS+=("$display") GROUP_DIRS+=("") GROUP_COUNTS+=("1") GROUP_DIR_COUNTS+=("0") fi done < <(find "$CODEX_DIR" -name "*.jsonl" -maxdepth 6 2>/dev/null) fi local _rmdc_total _rmdc_total=$(_rmdc_recovery_count_unique) if [ "${_rmdc_total:-0}" -gt 0 ]; then echo "[paxel] Orphan recovery: ${_rmdc_total} dir(s) remapped via ancestor walk or sibling worktree (see '[paxel] Recovered' lines above for detail)" >&2 fi } # Build or refresh the project-remote cache at ~/.paxel/cache/project-remotes-v2.tsv # Each line: dir_name\tkey\tsession_count\tlatest_mtime # Only re-resolves dirs whose newest JSONL changed since last cache write. # After this function, CACHED_KEYS / CACHED_DIRS / CACHED_SESSIONS arrays are populated. # v2 bumps the cache filename after keys switched from raw git URLs to # normalized form (normalize_remote / Repository.normalize_remote parity). # Old v1 rows would under-dedupe until every dir's mtime changed. CACHED_KEYS=() CACHED_DIRS=() CACHED_SESSIONS=() _cache_loaded=0 load_project_cache() { [ "$_cache_loaded" -eq 1 ] && return 0 local cache_file="${HOME}/.paxel/cache/project-remotes-v2.tsv" mkdir -p "${HOME}/.paxel/cache" # Step 1: Get current dir listing with mtimes local has_cache=0 [ -f "$cache_file" ] && [ -s "$cache_file" ] && has_cache=1 if [ "$has_cache" -eq 1 ]; then echo " Checking for new sessions..." >&2 else echo " Scanning your coding sessions (first run, this may take a minute)..." >&2 fi local dir_list_file dir_list_file=$(mktemp) local _dcr_total=0 for dir in "$CLAUDE_DIR"/*/; do [ -d "$dir" ] || continue _dcr_total=$((_dcr_total + 1)) local dn dn=$(basename "$dir") local dm dm=$(stat -c %Y "$dir" 2>/dev/null || stat -f %m "$dir" 2>/dev/null || echo "0") printf '%s\t%s\n' "$dn" "$dm" >> "$dir_list_file" done # Step 2: awk joins dir listing against cache in one pass. # Outputs HIT lines (cached, mtime matches, non-empty key) and MISS lines # (need resolution). Empty-key rows always miss so they self-heal next run: # resolution can fail transiently (missing sessions-index.json, queue-only # jsonls) and stick forever if we trust them on matching mtime alone, which # silently orphans later-populated subpath projects (e.g. ~/code/bookface) # from their parent repo group. local hit_file miss_file hit_file=$(mktemp) miss_file=$(mktemp) if [ "$has_cache" -eq 1 ]; then # Schema: dir<TAB>key<TAB>sessions<TAB>mtime<TAB>inferred. # Inferred entries (key came from orphan resolver) always miss so we # re-verify: if the ancestor repo or sibling worktree has been removed # since the last run, the cached key becomes wrong. Memoization claim # (from an earlier revision) was wrong — the resolver runs in a # $(...) subshell, so its assoc-array cache dies with the subshell. # Every inferred row triggers a fresh ancestor walk each run; that's # a few stats + one git call per orphan, tolerable for the common # case of <50 orphans. # Miss-line format: dir<TAB>mtime<TAB>prior_inferred (prior_inferred # lets the miss-resolution loop preserve the inferred flag on # transient misses, so one bad run doesn't permanently detach.) awk -F'\t' ' NR==FNR { cache[$1] = $2 "\t" $3 "\t" $4 "\t" $5; next } { dir = $1; mtime = $2 if (dir in cache) { split(cache[dir], c, "\t") if (c[3] == mtime && c[1] != "" && c[4] != "1") { print dir "\t" c[1] "\t" c[2] "\t" mtime "\t" c[4] > "'"$hit_file"'" } else { print dir "\t" mtime "\t" c[4] > "'"$miss_file"'" } } else { print dir "\t" mtime "\t" > "'"$miss_file"'" } } ' "$cache_file" "$dir_list_file" else # No cache: everything is a miss with empty prior_inferred. awk -F'\t' '{ print $0 "\t" }' "$dir_list_file" > "$miss_file" fi local hit_count miss_count hit_count=$(wc -l < "$hit_file" | tr -d ' ') miss_count=$(wc -l < "$miss_file" | tr -d ' ') if [ "$miss_count" -gt 0 ] && [ "$hit_count" -gt 0 ]; then echo " ${miss_count} new or changed, ${hit_count} cached." >&2 fi # Step 3: Load all cache hits into CACHED_KEYS/DIRS/SESSIONS local new_cache="" while IFS=$'\t' read -r _cn _ck _cs _cm _ci; do [ -z "$_cn" ] && continue new_cache="${new_cache}${_cn} ${_ck} ${_cs} ${_cm} ${_ci} " [ -z "$_ck" ] || [ "${_cs:-0}" -le 0 ] 2>/dev/null && continue local found=0 local k=0 while [ $k -lt ${#CACHED_KEYS[@]} ]; do if [ "${CACHED_KEYS[$k]}" = "$_ck" ]; then CACHED_DIRS[$k]="${CACHED_DIRS[$k]}|$_cn" CACHED_SESSIONS[$k]=$((${CACHED_SESSIONS[$k]} + _cs)) found=1 break fi k=$((k + 1)) done if [ "$found" -eq 0 ]; then CACHED_KEYS+=("$_ck") CACHED_DIRS+=("$_cn") CACHED_SESSIONS+=("$_cs") fi done < "$hit_file" # Step 4: Resolve only the misses (new or changed dirs) local _miss_i=0 while IFS=$'\t' read -r dir_name dir_mtime prior_inferred; do [ -z "$dir_name" ] && continue _miss_i=$((_miss_i + 1)) if [ $((_miss_i % 200)) -eq 0 ]; then echo " ...${_miss_i}/${miss_count} resolved" >&2 fi local dir="$CLAUDE_DIR/$dir_name" local has_jsonl # `find -print -quit` SIGPIPEs on first match → ERR trap. Same family # as the OLDEST_SESSION_EPOCH fix (PR #389). has_jsonl=$(find "$dir" -name "*.jsonl" -maxdepth 3 -print -quit 2>/dev/null || true) if [ -z "$has_jsonl" ]; then new_cache="${new_cache}${dir_name} 0 ${dir_mtime} " continue fi local cwd cwd=$(get_project_cwd "$dir_name") local remote remote=$(get_git_remote "$cwd") # Recover orphan remote BEFORE key assignment — otherwise # deleted-subdir cwds fall into name:<basename> at line ~1342 and # resolver never fires for the cached-run hot path. # _inferred="1" on a successful recovery so the cache re-verifies this # row next run (inferred rows always miss the awk hit predicate). # Preserves the inferred flag on a transient miss (prior_inferred=1 + # new resolve returns empty) so one bad run doesn't permanently detach # the session from its real repo. local _inferred="" if [ -z "$remote" ] && [ -n "$cwd" ] && [ ! -e "$cwd" ]; then remote=$(resolve_remote_for_dead_cwd "$cwd") if [ -n "$remote" ]; then _inferred="1" elif [ "${prior_inferred:-}" = "1" ]; then # Transient miss (e.g. --no-orphan-recovery, or filesystem hiccup). # Keep the row flagged so we retry next run. _inferred="1" fi fi local key="" if [ -n "$remote" ]; then key="$remote" elif [ -n "$cwd" ] && [ -d "$cwd" ]; then key="local:$cwd" elif [ -n "$cwd" ]; then local _proj_name="" if [[ "$cwd" == */conductor/workspaces/*/* ]]; then _proj_name=$(echo "$cwd" | sed 's|.*/conductor/workspaces/||' | sed 's|/.*||') elif [[ "$cwd" != */.gstack/* ]] && [[ "$cwd" != */.claude/* ]]; then _proj_name=$(basename "$cwd") fi [ -n "$_proj_name" ] && key="name:$_proj_name" fi if [ -z "$key" ]; then new_cache="${new_cache}${dir_name} 0 ${dir_mtime} " continue fi local session_count session_count=$({ find "$dir" -name "*.jsonl" -not -name "_*" -not -path "*/_git/*" -not -path "*/subagents/*" -maxdepth 3 2>/dev/null || true; } | wc -l | tr -d ' ') new_cache="${new_cache}${dir_name} ${key} ${session_count} ${dir_mtime} ${_inferred} " if [ -n "$key" ] && [ "$session_count" -gt 0 ]; then local found=0 local k=0 while [ $k -lt ${#CACHED_KEYS[@]} ]; do if [ "${CACHED_KEYS[$k]}" = "$key" ]; then CACHED_DIRS[$k]="${CACHED_DIRS[$k]}|$dir_name" CACHED_SESSIONS[$k]=$((${CACHED_SESSIONS[$k]} + session_count)) found=1 break fi k=$((k + 1)) done if [ "$found" -eq 0 ]; then CACHED_KEYS+=("$key") CACHED_DIRS+=("$dir_name") CACHED_SESSIONS+=("$session_count") fi fi done < "$miss_file" # Step 5: Write updated cache and clean up printf '%s' "$new_cache" > "$cache_file" rm -f "$dir_list_file" "$hit_file" "$miss_file" _cache_loaded=1 if [ "$miss_count" -eq 0 ]; then echo " All sessions cached, ready to go." >&2 else local _total_sess=0 local _si=0 while [ $_si -lt ${#CACHED_SESSIONS[@]} ]; do _total_sess=$((_total_sess + ${CACHED_SESSIONS[$_si]})) _si=$((_si + 1)) done local _repo_list="" local _ri=0 while [ $_ri -lt ${#CACHED_KEYS[@]} ]; do [ -n "$_repo_list" ] && _repo_list="${_repo_list}, " _repo_list="${_repo_list}$(remote_display_name "${CACHED_KEYS[$_ri]}")" _ri=$((_ri + 1)) done # This count is Claude Code only — CACHED_* never holds Codex/Cursor/opencode/ # Gemini. For a Codex/opencode-only user it would otherwise print a misleading # "Resolved 0 sessions across 0 repos" before those tools are even scanned, so # suppress the line entirely when there are no Claude repos and let the # per-tool scans speak; otherwise label it as Claude-specific. if [ "${#CACHED_KEYS[@]}" -gt 0 ]; then echo " Resolved ${_total_sess} Claude Code sessions across ${#CACHED_KEYS[@]} repos: ${_repo_list}" >&2 fi fi } # Detect child git repos in the current directory that have transcript data. # Populates CHILD_REPO_* parallel arrays. Returns 0 if any repos found. detect_child_repos() { local current_dir current_dir=$(pwd) # Build/refresh the project cache (fast on subsequent runs) load_project_cache # Use cached data as t_keys/t_dirs/t_sessions. Use the empty-safe expansion # idiom (cf. :2494): on macOS's default bash 3.2, copying an empty array via # `("${CACHED_KEYS[@]}")` aborts under `set -u` ("unbound variable"), which # crashed detect_child_repos for any user with no Claude cache (Codex/opencode- # only). The `[@]+"..."` form yields a genuinely-empty array (NOT `("${a[@]:-}")`, # which on 3.2 yields a 1-element "" array → a spurious downstream child_key). local t_keys=("${CACHED_KEYS[@]+"${CACHED_KEYS[@]}"}") local t_dirs=("${CACHED_DIRS[@]+"${CACHED_DIRS[@]}"}") local t_sessions=("${CACHED_SESSIONS[@]+"${CACHED_SESSIONS[@]}"}") # Pre-compute Codex session remotes (key → pipe-separated file paths). # Phase 3.5 — track cross-tool subset alongside total count so the picker # can label "(N sessions, M Codex by Claude)" per child repo. local codex_keys=() local codex_file_lists=() local codex_counts=() local codex_cross_tool_counts=() if [ -d "$CODEX_DIR" ]; then while IFS= read -r codex_file; do [ -z "$codex_file" ] && continue local remote remote=$(get_codex_session_remote "$codex_file") [ -z "$remote" ] && continue local _origin _origin=$(get_codex_session_originator "$codex_file") local _is_cross_tool=0 codex_originator_is_standalone "$_origin" || _is_cross_tool=1 local found=0 local k=0 while [ $k -lt ${#codex_keys[@]} ]; do if [ "${codex_keys[$k]}" = "$remote" ]; then codex_file_lists[$k]="${codex_file_lists[$k]}|$codex_file" codex_counts[$k]=$((${codex_counts[$k]} + 1)) [ "$_is_cross_tool" -eq 1 ] && codex_cross_tool_counts[$k]=$((${codex_cross_tool_counts[$k]} + 1)) found=1 break fi k=$((k + 1)) done if [ "$found" -eq 0 ]; then codex_keys+=("$remote") codex_file_lists+=("$codex_file") codex_counts+=("1") codex_cross_tool_counts+=("$_is_cross_tool") fi done < <(find "$CODEX_DIR" -name "*.jsonl" -maxdepth 6 2>/dev/null) fi # Scan child directories for .git (use -e to catch worktrees where .git is a file) echo " Looking for repos in ${current_dir}/" >&2 CHILD_REPO_DIRS=() CHILD_REPO_REMOTES=() CHILD_REPO_NAMES=() CHILD_REPO_SESSIONS=() # Phase 3.5 — parallel array tracking cross-tool subset of CHILD_REPO_SESSIONS # so the --all picker can show "(N sessions, M Codex by Claude)". CHILD_REPO_CROSS_TOOL_SESSIONS=() CHILD_TRANSCRIPT_DIRS=() CHILD_CODEX_DIRS=() local seen_remotes="" # Tracks name:<base> cache-group indices already claimed by an earlier child, so # the same unresolved (deleted-cwd) transcript group can't be attributed to two # same-basename children (e.g. 'my-app' + 'my_app', which both match name:my-app # via the _<->- alt-form) and uploaded under two reports. Delimited-token idiom, # same as seen_remotes / the per-repo failed-index set. local consumed_name_idx="" for child in "$current_dir"/*/; do [ -d "$child" ] || continue [ -e "$child/.git" ] || [ -e "$child/.jj" ] || continue local child_path child_path=$(cd "$child" && pwd) local child_remote child_remote=$(get_git_remote "$child_path") local child_key if [ -n "$child_remote" ]; then child_key="$child_remote" else child_key="local:$child_path" fi # Deduplicate by key case "$seen_remotes" in *"|$child_key|"*) continue ;; esac seen_remotes="${seen_remotes}|${child_key}|" # Match against pre-computed transcript dirs (by remote, local path, or project name) local matched_dirs="" local matched_sessions=0 local child_basename child_basename=$(basename "$child_path") # Also try with underscores replaced by hyphens and vice versa local child_basename_alt child_basename_alt=$(echo "$child_basename" | tr '_' '-') local child_basename_alt2 child_basename_alt2=$(echo "$child_basename" | tr '-' '_') local k=0 while [ $k -lt ${#t_keys[@]} ]; do local match=0 local is_name_match=0 if [ "${t_keys[$k]}" = "$child_key" ]; then match=1 elif [[ "${t_keys[$k]}" == name:* ]]; then # Name-based matching for unresolved (deleted-cwd) transcript dirs. Skip a # name: group already claimed by an earlier child so it isn't double-counted; # an exact (remote/local-key) match above is unaffected and a live repo still # inherits its own unclaimed name: group (the common Conductor case). case "$consumed_name_idx" in *"|$k|"*) : ;; # already claimed by an earlier same-basename child *) local t_name="${t_keys[$k]#name:}" if [ "$t_name" = "$child_basename" ] || [ "$t_name" = "$child_basename_alt" ] || [ "$t_name" = "$child_basename_alt2" ]; then match=1 is_name_match=1 fi ;; esac fi if [ "$match" -eq 1 ]; then # Safe to mark consumed before the total_sessions==0 skip below: a name # match always adds t_sessions[k], and load_project_cache only emits # name: groups with sessions>0, so a name-matched child is never dropped # there — the index can't be consumed by a child that then disappears. [ "$is_name_match" -eq 1 ] && consumed_name_idx="${consumed_name_idx}|${k}|" if [ -z "$matched_dirs" ]; then matched_dirs="${t_dirs[$k]}" else matched_dirs="${matched_dirs}|${t_dirs[$k]}" fi matched_sessions=$((matched_sessions + ${t_sessions[$k]})) fi k=$((k + 1)) done # Match Codex sessions local matched_codex="" local codex_session_count=0 local codex_cross_tool_count=0 local k=0 while [ $k -lt ${#codex_keys[@]} ]; do if [ "${codex_keys[$k]}" = "$child_key" ]; then matched_codex="${codex_file_lists[$k]}" codex_session_count=${codex_counts[$k]} codex_cross_tool_count=${codex_cross_tool_counts[$k]:-0} break fi k=$((k + 1)) done local total_sessions=$((matched_sessions + codex_session_count)) # No Claude/Codex sessions? Check opencode/Gemini before skipping — otherwise a # repo worked ONLY in those tools is invisible in the picker, unlike single-repo # auto-detect which folds them in. Only zero-Claude/Codex children pay this, and # the count helpers return 0 immediately when the tool isn't installed. (Cursor # has no count helper anywhere — the single-repo prelude omits it too — so a # Cursor-only repo stays a known gap. A future prescan could also fold these into # the displayed count for MIXED repos, which today show Claude+Codex only.) if [ "$total_sessions" -eq 0 ] && [ -n "$child_remote" ]; then local _oc_n _gm_n _oc_n=$(count_opencode_sessions "$child_remote") _gm_n=$(count_gemini_sessions "$child_remote") total_sessions=$((_oc_n + _gm_n)) fi # Skip repos with no transcript data at all [ "$total_sessions" -eq 0 ] && continue local display_name if [ -n "$child_remote" ]; then display_name=$(remote_display_name "$child_remote") else display_name=$(basename "$child_path") fi CHILD_REPO_DIRS+=("$child_path") CHILD_REPO_REMOTES+=("$child_key") CHILD_REPO_NAMES+=("$display_name") CHILD_REPO_SESSIONS+=("$total_sessions") CHILD_REPO_CROSS_TOOL_SESSIONS+=("$codex_cross_tool_count") CHILD_TRANSCRIPT_DIRS+=("$matched_dirs") CHILD_CODEX_DIRS+=("$matched_codex") done [ ${#CHILD_REPO_DIRS[@]} -eq 0 ] && return 1 # Sort by session count descending (bubble sort, fine for <50 items) local n=${#CHILD_REPO_DIRS[@]} local i=0 while [ $i -lt $((n - 1)) ]; do local j=0 while [ $j -lt $((n - i - 1)) ]; do local next=$((j + 1)) if [ "${CHILD_REPO_SESSIONS[$j]}" -lt "${CHILD_REPO_SESSIONS[$next]}" ]; then # Swap all parallel arrays — keep CHILD_REPO_CROSS_TOOL_SESSIONS in # sync or `--all` mode display gets scrambled when sort reorders rows. local tmp tmp="${CHILD_REPO_DIRS[$j]}"; CHILD_REPO_DIRS[$j]="${CHILD_REPO_DIRS[$next]}"; CHILD_REPO_DIRS[$next]="$tmp" tmp="${CHILD_REPO_REMOTES[$j]}"; CHILD_REPO_REMOTES[$j]="${CHILD_REPO_REMOTES[$next]}"; CHILD_REPO_REMOTES[$next]="$tmp" tmp="${CHILD_REPO_NAMES[$j]}"; CHILD_REPO_NAMES[$j]="${CHILD_REPO_NAMES[$next]}"; CHILD_REPO_NAMES[$next]="$tmp" tmp="${CHILD_REPO_SESSIONS[$j]}"; CHILD_REPO_SESSIONS[$j]="${CHILD_REPO_SESSIONS[$next]}"; CHILD_REPO_SESSIONS[$next]="$tmp" tmp="${CHILD_REPO_CROSS_TOOL_SESSIONS[$j]}"; CHILD_REPO_CROSS_TOOL_SESSIONS[$j]="${CHILD_REPO_CROSS_TOOL_SESSIONS[$next]}"; CHILD_REPO_CROSS_TOOL_SESSIONS[$next]="$tmp" tmp="${CHILD_TRANSCRIPT_DIRS[$j]}"; CHILD_TRANSCRIPT_DIRS[$j]="${CHILD_TRANSCRIPT_DIRS[$next]}"; CHILD_TRANSCRIPT_DIRS[$next]="$tmp" tmp="${CHILD_CODEX_DIRS[$j]}"; CHILD_CODEX_DIRS[$j]="${CHILD_CODEX_DIRS[$next]}"; CHILD_CODEX_DIRS[$next]="$tmp" fi j=$((j + 1)) done i=$((i + 1)) done return 0 } # Show interactive menu for child repo selection show_child_repo_menu() { require_tty local repo_count=${#CHILD_REPO_NAMES[@]} local repo_label="repos" [ "$repo_count" -eq 1 ] && repo_label="repo" local all_idx=$((repo_count + 1)) local cancel_idx=$((repo_count + 2)) echo "" echo "Found ${repo_count} ${repo_label} with transcript data in this directory:" echo "" local g=0 while [ $g -lt ${#CHILD_REPO_NAMES[@]} ]; do local total="${CHILD_REPO_SESSIONS[$g]}" local cross_tool="${CHILD_REPO_CROSS_TOOL_SESSIONS[$g]:-0}" local main_total=$((total - cross_tool)) local session_label="sessions" [ "$main_total" -eq 1 ] && session_label="session" if [ "$cross_tool" -gt 0 ]; then echo " $((g + 1))) ${CHILD_REPO_NAMES[$g]} (${main_total} ${session_label} + ${cross_tool} Codex by Claude)" else echo " $((g + 1))) ${CHILD_REPO_NAMES[$g]} (${main_total} ${session_label})" fi g=$((g + 1)) done echo "" echo " ${all_idx}) Analyze all repos (one report per repo)" echo " ${cancel_idx}) Cancel" echo "" local choice user_read -rp "Choose [1-${cancel_idx}] (comma-separate for multiple, e.g. 1,3): " choice # Strip spaces so "1, 2" parses the same as "1,2". choice="${choice// /}" MULTI_REPO_SELECTED_LIST=() # Meta items first (exact string match, so they can't be mixed into a list). if [ "$choice" = "$all_idx" ]; then MULTI_REPO_MODE="all" local g=0 while [ "$g" -lt "$repo_count" ]; do MULTI_REPO_SELECTED_LIST+=("$g") g=$((g + 1)) done _confirm_selected_child_repos || { echo "Cancelled."; exit 0; } return 0 fi if [ "$choice" = "$cancel_idx" ]; then echo "Cancelled." exit 0 fi # One or more repo numbers, comma-separated. Each token bounded to 10 digits so # long digit-only paste (exceeds bash's 64-bit integer range) is rejected by # the regex rather than printing raw "integer expression expected" arithmetic. if ! [[ "$choice" =~ ^[0-9]{1,10}(,[0-9]{1,10})*$ ]]; then echo "Invalid choice." >&2 exit 1 fi local -a _toks IFS=',' read -ra _toks <<< "$choice" local tok for tok in "${_toks[@]}"; do if [ "$tok" -lt 1 ] || [ "$tok" -gt "$repo_count" ]; then echo "Invalid choice." >&2 exit 1 fi MULTI_REPO_SELECTED_LIST+=("$((tok - 1))") done # Dedup + sort the selected indices (process substitution keeps the array # write in the current shell — a `| while` subshell would lose it). local -a _uniq=() local i while IFS= read -r i; do [ -n "$i" ] && _uniq+=("$i") done < <(printf '%s\n' "${MULTI_REPO_SELECTED_LIST[@]}" | sort -n | uniq) MULTI_REPO_SELECTED_LIST=("${_uniq[@]}") if [ "${#MULTI_REPO_SELECTED_LIST[@]}" -eq 1 ]; then MULTI_REPO_MODE="single" MULTI_REPO_SELECTED="${MULTI_REPO_SELECTED_LIST[0]}" echo "Selected: ${CHILD_REPO_NAMES[${MULTI_REPO_SELECTED_LIST[0]}]}" else MULTI_REPO_MODE="subset" _confirm_selected_child_repos || { echo "Cancelled."; exit 0; } fi } # Show the combined time estimate for the repos in MULTI_REPO_SELECTED_LIST and # ask the user to confirm. Returns 0 to proceed, 1 to cancel. Used by the "all" # and multi-select subset paths; a single selection skips the estimate. _confirm_selected_child_repos() { local sel_count=${#MULTI_REPO_SELECTED_LIST[@]} local label="repos" [ "$sel_count" -eq 1 ] && label="repo" local total_minutes=0 echo "" echo "${sel_count} ${label} selected." local idx for idx in "${MULTI_REPO_SELECTED_LIST[@]}"; do local mins mins=$(estimate_time "${CHILD_REPO_SESSIONS[$idx]}") total_minutes=$((total_minutes + mins)) local _ct="${CHILD_REPO_CROSS_TOOL_SESSIONS[$idx]:-0}" local _main=$((CHILD_REPO_SESSIONS[$idx] - _ct)) if [ "$_ct" -gt 0 ]; then printf " %-20s ~%d min (%d sessions + %d Codex by Claude)\n" "${CHILD_REPO_NAMES[$idx]}" "$mins" "$_main" "$_ct" else printf " %-20s ~%d min (%d sessions)\n" "${CHILD_REPO_NAMES[$idx]}" "$mins" "$_main" fi done echo "" echo "Total estimated time: ~${total_minutes} minutes" echo "" local confirm user_read -rp "Continue? [Y/n]: " confirm case "$confirm" in [Nn]*) return 1 ;; esac return 0 } # Run prepare_and_run_for_repo for each repo in MULTI_REPO_SELECTED_LIST # (0-based indices into the CHILD_REPO_* arrays), print the multi-repo summary, # and exit. Shared by the two child-repo entry points (auto-detect override and # the Strategy 3 parent-dir picker) so they can't drift. run_selected_child_repos() { pull_client_image MULTI_REPO_RUNNING=1 local original_claude_dir="$CLAUDE_DIR" local original_codex_dir="$CODEX_DIR" local success_count=0 local failed_repos="" # space-joined names, for the human-readable "Failed:" line local failed_idx="" # "|idx|"-delimited, for exact per-repo ✓/✗ (no name-substring collision) local total=${#MULTI_REPO_SELECTED_LIST[@]} # Set expectations before the slow part (the single-repo reassurance in # run_docker_analysis is suppressed under MULTI_REPO_RUNNING). Steps 1-3 above # were one-time setup; each repo below re-runs steps 4-17 on the same /17 scale. echo "" echo "Analyzing ${total} repos — this is the slow part. Each repo runs steps 4-17 below." local n=0 local idx for idx in "${MULTI_REPO_SELECTED_LIST[@]}"; do n=$((n + 1)) echo "" echo "═══ [${n}/${total}] Analyzing: ${CHILD_REPO_NAMES[$idx]} ═══" if prepare_and_run_for_repo "${CHILD_REPO_DIRS[$idx]}" "${CHILD_REPO_NAMES[$idx]}" "${CHILD_REPO_REMOTES[$idx]}" "${CHILD_TRANSCRIPT_DIRS[$idx]}" "${CHILD_CODEX_DIRS[$idx]}"; then success_count=$((success_count + 1)) else failed_repos="${failed_repos} ${CHILD_REPO_NAMES[$idx]}" failed_idx="${failed_idx}|${idx}|" fi done # End summary echo "" echo "═══ Multi-repo analysis complete ═══" for idx in "${MULTI_REPO_SELECTED_LIST[@]}"; do local status_icon="✓" # Match by exact index token, not a name substring — otherwise a succeeded # repo whose name is a substring of a failed one (e.g. "app" vs "app-web") # is wrongly shown as ✗. case "$failed_idx" in *"|${idx}|"*) status_icon="✗" ;; esac echo " ${status_icon} ${CHILD_REPO_NAMES[$idx]}" done echo "" echo "${success_count}/${total} repos analyzed successfully." if [ -n "$failed_repos" ]; then echo "Failed:${failed_repos}" echo " Per-repo logs: ${HOME}/.paxel/logs/<repo>-*.log" fi echo "" echo "Results: ${PAXEL_SERVER}/reports" # Single notification at the end printf '\a' if [ "$(uname -s)" = "Darwin" ]; then osascript -e "display notification \"${success_count}/${total} repos analyzed.\" with title \"Paxel\"" 2>/dev/null || true fi CLAUDE_DIR="$original_claude_dir" CODEX_DIR="$original_codex_dir" [ -z "$failed_repos" ] && exit 0 || exit 1 } # Get sessions sorted by createdAt from sessions-index.json, falling back to mtime get_sorted_sessions() { local project_dir="$1" local index_file="$project_dir/sessions-index.json" if [ -f "$index_file" ] && command -v jq &>/dev/null; then # Handle both array format and {version, entries} format jq -r ' (if type == "array" then . elif type == "object" then (.entries // []) else [] end) | sort_by(.createdAt) | reverse | .[].sessionId ' "$index_file" 2>/dev/null else # Fallback: find .jsonl files sorted by mtime (newest first). # GNU stat first (BSD stat -f on Linux silently prints filesystem info). find "$project_dir" -name "*.jsonl" -maxdepth 1 2>/dev/null \ | while read -r f; do local m m=$(stat -c '%Y' "$f" 2>/dev/null || stat -f '%m' "$f" 2>/dev/null || echo 0) printf '%s %s\n' "$m" "$f" done \ | sort -rn \ | awk '{print $2}' \ | xargs -I{} basename {} .jsonl fi } # Get session timestamp (epoch) from sessions-index.json or file mtime get_session_timestamp() { local project_dir="$1" local session_id="$2" local index_file="$project_dir/sessions-index.json" if [ -f "$index_file" ] && command -v jq &>/dev/null; then local ts ts=$(jq -r --arg sid "$session_id" ' (if type == "array" then . elif type == "object" then (.entries // []) else [] end) | map(select(.sessionId == $sid)) | .[0].createdAt // empty ' "$index_file" 2>/dev/null || true) if [ -n "$ts" ]; then # Convert ISO date to epoch. GNU date first (exits cleanly on BSD), # then BSD date. Same order-sensitivity rule applies to stat. date -d "${ts%%.*}" "+%s" 2>/dev/null \ || date -j -f "%Y-%m-%dT%H:%M:%S" "${ts%%.*}" "+%s" 2>/dev/null \ || stat -c %Y "$project_dir/${session_id}.jsonl" 2>/dev/null \ || stat -f %m "$project_dir/${session_id}.jsonl" 2>/dev/null \ || echo "0" return fi fi # Fallback to file mtime. GNU -c %Y first (BSD stat -f with %m as literal # prints filesystem info and exits 0, poisoning the output), then BSD. stat -c %Y "$project_dir/${session_id}.jsonl" 2>/dev/null \ || stat -f %m "$project_dir/${session_id}.jsonl" 2>/dev/null \ || echo "0" } # Collect sessions across multiple dirs with merge-sort collect_project_group() { local tmpdir="$1" shift local dirs=("$@") # Declared at function scope so collect_cursor_sessions "$tmpdir" "$selected_remote" # (line ~3066, outside the CODEX_DIR conditional that previously owned the # `local` declaration) has a defined value when CODEX_DIR is absent. Without # this, `set -u` trips with "selected_remote: unbound variable" under any # invocation path where the Claude project group has no Codex directory # to mirror. Exposed by the DRY_RUN staging path; also latent in legacy. local selected_remote="" # Build session list: (timestamp, dir_name, session_id, file_size) local session_list_file session_list_file=$(mktemp) for dir_name in "${dirs[@]}"; do local project_dir="$CLAUDE_DIR/$dir_name" [ -d "$project_dir" ] || continue # Copy sessions-index.json mkdir -p "$tmpdir/$dir_name" if [ -f "$project_dir/sessions-index.json" ]; then cp "$project_dir/sessions-index.json" "$tmpdir/$dir_name/" fi while IFS= read -r session_id; do [ -z "$session_id" ] && continue local jsonl_file="$project_dir/${session_id}.jsonl" [ -f "$jsonl_file" ] || continue local ts ts=$(get_session_timestamp "$project_dir" "$session_id") local file_size file_size=$(wc -c < "$jsonl_file" | tr -d ' ') echo "$ts $dir_name $session_id $file_size" >> "$session_list_file" done < <(get_sorted_sessions "$project_dir") done # Sort by timestamp (newest first) and process local sorted_file sorted_file=$(mktemp) sort -rn "$session_list_file" > "$sorted_file" local accumulated_bytes=0 local session_count=0 local pr_links_json="[]" local dir_metadata_json="{}" # Build directory metadata for sidecar for dir_name in "${dirs[@]}"; do local cwd cwd=$(get_project_cwd "$dir_name") local remote remote=$(get_git_remote "$cwd") # Orphan recovery: server-side TranscriptDiscoverer consumes this # remote to find-or-create the Project; empty means per-dir Project. if [ -z "$remote" ] && [ -n "$cwd" ] && [ ! -e "$cwd" ]; then remote=$(resolve_remote_for_dead_cwd "$cwd") fi if [ -n "$remote" ] || [ -n "$cwd" ]; then dir_metadata_json=$(echo "$dir_metadata_json" | jq \ --arg dir "$dir_name" \ --arg remote "$remote" \ --arg cwd "$cwd" \ '. + {($dir): {"git_remote": $remote, "cwd": $cwd}}' 2>/dev/null || echo "$dir_metadata_json") fi done while IFS=' ' read -r ts dir_name session_id file_size; do [ -z "$session_id" ] && continue # Apply --since filter if [ -n "$SINCE_EPOCH" ] && [ "$ts" -lt "$SINCE_EPOCH" ] 2>/dev/null; then continue fi # Track oldest session timestamp (post-filter) for author-filtered git collection if [ -z "$OLDEST_SESSION_EPOCH" ] || [ "$ts" -lt "$OLDEST_SESSION_EPOCH" ] 2>/dev/null; then OLDEST_SESSION_EPOCH="$ts" fi local project_dir="$CLAUDE_DIR/$dir_name" local jsonl_file="$project_dir/${session_id}.jsonl" mkdir -p "$tmpdir/$dir_name" cp "$jsonl_file" "$tmpdir/$dir_name/" accumulated_bytes=$(($accumulated_bytes + $file_size)) session_count=$((session_count + 1)) # Scan for PR links (grep for pr-link pattern in JSONL) # Claude Code JSONL uses camelCase: "prNumber", "prUrl", "prRepository" local pr_match pr_match=$(grep -o '"pr-link","sessionId":"[^"]*","prNumber":[0-9]*,"prUrl":"[^"]*","prRepository":"[^"]*"' "$jsonl_file" 2>/dev/null | head -1 || true) if [ -n "$pr_match" ]; then local pr_num pr_url pr_repo pr_num=$(echo "$pr_match" | grep -o '"prNumber":[0-9]*' | sed 's/"prNumber"://') pr_url=$(echo "$pr_match" | grep -o '"prUrl":"[^"]*"' | sed 's/"prUrl":"//;s/"$//') pr_repo=$(echo "$pr_match" | grep -o '"prRepository":"[^"]*"' | sed 's/"prRepository":"//;s/"$//') if [ -n "$pr_num" ] && command -v jq &>/dev/null; then pr_links_json=$(echo "$pr_links_json" | jq \ --arg sid "$session_id" \ --arg dir "$dir_name" \ --argjson num "$pr_num" \ --arg url "$pr_url" \ --arg repo "$pr_repo" \ '. + [{"session_id": $sid, "dir": $dir, "pr_number": $num, "pr_url": $url, "pr_repo": $repo}]' 2>/dev/null || echo "$pr_links_json") fi fi # Copy subagents directory if present local subagents_dir="$project_dir/${session_id}/subagents" if [ -d "$subagents_dir" ]; then mkdir -p "$tmpdir/$dir_name/${session_id}/subagents" cp "$subagents_dir"/*.jsonl "$tmpdir/$dir_name/${session_id}/subagents/" 2>/dev/null || true fi # Copy tool-results directory if present local tool_results_dir="$project_dir/${session_id}/tool-results" if [ -d "$tool_results_dir" ]; then mkdir -p "$tmpdir/$dir_name/${session_id}/tool-results" cp -r "$tool_results_dir/"* "$tmpdir/$dir_name/${session_id}/tool-results/" 2>/dev/null || true fi done < "$sorted_file" # Write _metadata.json sidecar. The recoveries count is the cumulative # unique orphan recoveries during this script run (via the dedup log); # for single-repo uploads this is exact, for multi-repo it over-counts # later children (acceptable — admins see aggregate run activity). local _rmdc_total _rmdc_total=$(_rmdc_recovery_count_unique) if command -v jq &>/dev/null; then jq -n \ --argjson dirs "$dir_metadata_json" \ --argjson prs "$pr_links_json" \ --argjson recoveries "${_rmdc_total:-0}" \ '{"version": 1, "directories": $dirs, "pr_links": $prs, "orphan_recovery_count": $recoveries}' \ > "$tmpdir/_metadata.json" fi rm -f "$session_list_file" "$sorted_file" echo " Claude Code: ${session_count} sessions across ${#dirs[@]} workspaces, $(($accumulated_bytes / 1024 / 1024))MB" >&2 # Collect matching Codex sessions for the same git remote # Determine the selected project's git remote — used by BOTH Codex and # Cursor helpers below, so hoist out of any $CODEX_DIR guard. local selected_remote="" for dir_name in "${dirs[@]}"; do local cwd cwd=$(get_project_cwd "$dir_name") local remote remote=$(get_git_remote "$cwd") if [ -z "$remote" ] && [ -n "$cwd" ] && [ ! -e "$cwd" ]; then remote=$(resolve_remote_for_dead_cwd "$cwd") fi if [ -n "$remote" ]; then selected_remote="$remote" break fi done # Codex: gate on non-empty remote. An empty selected_remote tells the helper # to include every repo's sessions (--all mode), which would widen a scoped # upload. Handoff gotcha #5. if [ -n "$selected_remote" ]; then collect_codex_sessions "$tmpdir" "$selected_remote" fi # Collect matching Cursor IDE sessions for the same git remote collect_cursor_sessions "$tmpdir" "$selected_remote" # Collect matching opencode sessions for the same git remote collect_opencode_sessions "$tmpdir" "$selected_remote" # Collect matching Gemini CLI sessions for the same git remote collect_gemini_sessions "$tmpdir" "$selected_remote" _refresh_orphan_recovery_count "$tmpdir/_metadata.json" } list_projects() { if [ ! -d "$CLAUDE_DIR" ]; then echo "Error: Claude projects directory not found at $CLAUDE_DIR" >&2 exit 1 fi # List directories that contain .jsonl files local projects=() for dir in "$CLAUDE_DIR"/*/; do [ -d "$dir" ] || continue local name name=$(basename "$dir") # Check if directory has any .jsonl files. # `find -print -quit` SIGPIPEs on first match → ERR trap. Same family # as the OLDEST_SESSION_EPOCH fix (PR #389). local has_jsonl has_jsonl=$(find "$dir" -name "*.jsonl" -maxdepth 3 -print -quit 2>/dev/null || true) if [ -n "$has_jsonl" ]; then projects+=("$name") fi done if [ ${#projects[@]} -eq 0 ]; then echo "Error: No projects with transcripts found in $CLAUDE_DIR" >&2 exit 1 fi printf '%s\n' "${projects[@]}" } # Read a cached normalized remote for an encoded Claude dir from # ~/.paxel/cache/project-remotes-v2.tsv. Skips the "name:*" / "local:*" / # exact-match "unknown" fallback keys that list_projects_grouped writes for # unresolvable dirs — anchor "unknown" at end (via `$2 != "unknown"`) so a # real remote like "unknownhost.io/org/repo" isn't silently filtered out. # Cursor's dead-ws fallback (:1016) uses shell glob `unknown)` with no # trailing wildcard, matching this semantic. # # Used by collect_all_projects as a last-resort fallback for Conductor dead # cwds, which short-circuit inside resolve_remote_for_dead_cwd and have no # other local signal to recover from once all siblings are deleted. _project_cache_read_remote() { local dir="$1" local cache="${HOME}/.paxel/cache/project-remotes-v2.tsv" [ -z "$dir" ] && return 0 [ ! -f "$cache" ] && return 0 awk -F'\t' -v d="$dir" ' $1 == d && $2 != "" && $2 !~ /^(name:|local:)/ && $2 != "unknown" { print $2; exit } ' "$cache" 2>/dev/null || true } # Merge a TSV of (dir, remote, sessions, mtime, inferred) rows into the # project-remote cache. Rows for dirs we saw this run overwrite any existing # row; rows for dirs we didn't see are preserved (list_projects_grouped may # have written them on an earlier --project run). Creates the cache if # missing. Called at the end of collect_all_projects so the next --all run # can recover a Conductor workspace whose dir has since been deleted. _project_cache_persist_rows() { local new_rows_file="$1" [ ! -s "$new_rows_file" ] && return 0 local cache="${HOME}/.paxel/cache/project-remotes-v2.tsv" mkdir -p "$(dirname "$cache")" local merged awk_ok=0 merged=$(mktemp) if [ -f "$cache" ]; then if awk -F'\t' -v new_file="$new_rows_file" ' BEGIN { while ((getline line < new_file) > 0) { n = split(line, f, "\t") if (n >= 1) new_rows[f[1]] = line } close(new_file) } { if ($1 in new_rows) { print new_rows[$1] delete new_rows[$1] } else { print $0 } } END { for (d in new_rows) print new_rows[d] } ' "$cache" > "$merged" 2>/dev/null; then awk_ok=1 fi else cp "$new_rows_file" "$merged" && awk_ok=1 fi # Only overwrite the real cache if the merge succeeded AND produced non- # empty output. A mid-stream awk failure (signal, disk-full) could leave a # truncated $merged with data — swapping it in would corrupt the cache; bail. if [ "$awk_ok" = "1" ] && [ -s "$merged" ]; then mv "$merged" "$cache" else rm -f "$merged" fi } collect_all_projects() { local tmpdir="$1" # Collect Claude Code sessions if [ -d "$CLAUDE_DIR" ]; then cd "$CLAUDE_DIR" local claude_count=0 find . \( -name "*.jsonl" -o -name "sessions-index.json" \) | while read -r f; do mkdir -p "$tmpdir/$(dirname "$f")" cp "$f" "$tmpdir/$f" done claude_count=$(find . -name "*.jsonl" -not -name "_*" -not -path "*/_git/*" -not -path "*/subagents/*" -maxdepth 3 2>/dev/null | wc -l | tr -d ' ') if [ "$claude_count" -gt 0 ]; then echo " Claude Code: ${claude_count} sessions" >&2 fi # Copy tool-results directories find . -type d -name "tool-results" | while read -r d; do mkdir -p "$tmpdir/$d" cp -r "$d/"* "$tmpdir/$d/" 2>/dev/null || true done fi # Write _metadata.json entries for each Claude project dir so the server's # TranscriptDiscoverer merges worktrees that share a git remote into one # Project (find_or_create_by!(git_remote: ...)) instead of scattering by # encoded_name. Without this, users with N Conductor worktrees of the same # repo see N separate Projects in --all uploads. Mirrors the pattern # collect_project_group already uses at line 2015-2032. # # `cd "$CLAUDE_DIR"` above has already run (claude_count > 0 implies the # Claude dir existed), so we pin CLAUDE_DIR to $PWD (absolute form) for the # duration of the loop — get_project_cwd reads $CLAUDE_DIR internally and # would double-resolve a relative value under the cd'd cwd. if [ "${claude_count:-0}" -gt 0 ] && command -v jq &>/dev/null; then [ ! -f "$tmpdir/_metadata.json" ] && echo '{"version":1,"directories":{}}' > "$tmpdir/_metadata.json" local _orig_claude_dir="$CLAUDE_DIR" CLAUDE_DIR="$PWD" local claude_sidecar_count=0 # Cache rows to persist at end of loop. Populated with every dir whose # remote resolved (live OR via resolver/cache recovery). Persisted to # ~/.paxel/cache/project-remotes-v2.tsv so that if the workspace is # later deleted, the next --all run can still attribute its sessions. local _cache_rows_file _cache_rows_file=$(mktemp) for proj_dir in */; do [ -d "$proj_dir" ] || continue local pname pcwd premote _p_inferred=0 # Parameter expansion over `basename "$proj_dir"` — Claude encoded # project dir names like `-Users-...` trip basename's leading-dash # flag parsing ("illegal option -- U"). pname="${proj_dir%/}" pcwd=$(get_project_cwd "$pname") premote=$(get_git_remote "$pcwd") # Dead cwd recovery (deleted worktrees, removed subdirs) if [ -z "$premote" ] && [ -n "$pcwd" ] && [ ! -e "$pcwd" ]; then premote=$(resolve_remote_for_dead_cwd "$pcwd" 2>/dev/null || true) [ -n "$premote" ] && _p_inferred=1 # Conductor dead-cwd cache fallback. resolve_remote_for_dead_cwd # short-circuits */conductor/workspaces/*|*/.conductor/* paths # (PR #647 comment: ancestor + sibling-walk strategies can't span # Conductor-project boundaries correctly). If the cache has a prior # remote for this encoded dir — written by a past run where the # workspace WAS live — use it. Unblocks the first-run-after-delete # Conductor+jj case that no other strategy can reach. Honor the # orphan-recovery opt-out: PAXEL_NO_ORPHAN_RECOVERY=1 disables this # fallback alongside the ancestor/sibling walks it already gates. if [ -z "$premote" ] && [ "${PAXEL_NO_ORPHAN_RECOVERY:-0}" != "1" ]; then case "$pcwd" in */conductor/workspaces/*/*|*/.conductor/*) premote=$(_project_cache_read_remote "$pname") if [ -n "$premote" ]; then echo "[paxel] Recovered remote for $pcwd via project-cache($pname) -> $premote" >&2 _log_recovery_source "$pcwd" "project-cache" _p_inferred=1 fi ;; esac fi fi # Unresolvable Conductor dead-cwd path: every strategy above # (resolver, sibling walks, project-cache) was tried and came back # empty. Tell the user what happened and how to recover; otherwise # the sessions ship under an encoded-name orphan Project and the # root cause is invisible to both user and support. if [ -z "$premote" ] && [ -n "$pcwd" ] && [ ! -e "$pcwd" ]; then _warn_unresolvable_conductor_cwd "$pname" "$pcwd" fi # Skip if we can't produce any routing signal at all [ -z "$premote" ] && [ -z "$pcwd" ] && continue local updated updated=$(jq \ --arg dir "$pname" \ --arg remote "${premote:-}" \ --arg cwd "${pcwd:-}" \ '.directories[$dir] = {"git_remote": $remote, "cwd": $cwd}' \ "$tmpdir/_metadata.json" 2>/dev/null) [ -n "$updated" ] && echo "$updated" > "$tmpdir/_metadata.json" # Count every sidecar write so the status line matches what landed. claude_sidecar_count=$((claude_sidecar_count + 1)) # Record for cache persistence so future --all runs can recover dead # Conductor cwds after the workspace is deleted. Persist empty-remote # rows too — they clear stale cache entries from a prior live run # whose remote has since disappeared (without this, a dir that loses # its remote would keep the stale one forever). _p_inferred tracks # whether $premote came from live resolution (0) or the resolver/ # cache fallback (1) so load_project_cache can force a re-verify on # the next non-all run (matching list_projects_grouped:2057 semantics). # # Opt-out + empty premote + DEAD cwd: recovery paths (resolver + cache # fallback above) were SKIPPED by PAXEL_NO_ORPHAN_RECOVERY=1, so # "empty" means "we didn't look" rather than "verified unresolvable". # Writing empty here would clobber the user's warmed cache. Preserve # existing rows. Mirrors the Docker --all gate in # _docker_all_host_scan_for_recovery:1550-1555. Legacy's sidecar # write above already landed (the sidecar is the primary attribution # signal here; cache is best-effort for future-run recovery). local _legacy_skip_cache_write=0 if [ "${PAXEL_NO_ORPHAN_RECOVERY:-0}" = "1" ] \ && [ -z "$premote" ] \ && [ -n "$pcwd" ] \ && [ ! -e "$pcwd" ]; then _legacy_skip_cache_write=1 fi if [ -n "$pname" ] && [ "$_legacy_skip_cache_write" != "1" ]; then # BSD find/stat on macOS parse leading `-` of encoded Claude dir # names (e.g. `-Users-...`, `-conductor-workspaces-...`) as option # flags; prefix with `./` to force path interpretation. Same family # of bug as the basename-leading-dash comment on the outer loop. local _pd_sessions _pd_mtime _pd_sessions=$(find "./$pname" -maxdepth 3 -name "*.jsonl" -not -name "_*" 2>/dev/null | wc -l | tr -d ' ') _pd_mtime=$(stat -c %Y "./$pname" 2>/dev/null || stat -f %m "./$pname" 2>/dev/null || echo "0") printf '%s\t%s\t%s\t%s\t%s\n' "$pname" "$premote" "${_pd_sessions:-0}" "${_pd_mtime:-0}" "$_p_inferred" >> "$_cache_rows_file" fi done CLAUDE_DIR="$_orig_claude_dir" # Merge newly-resolved rows into the project-remote cache. _project_cache_persist_rows "$_cache_rows_file" rm -f "$_cache_rows_file" [ "$claude_sidecar_count" -gt 0 ] && echo " Sidecar: ${claude_sidecar_count} Claude workspaces with git_remote/cwd" >&2 fi # Collect Codex sessions via collect_codex_sessions helper. Empty second # arg = --all mode: buckets per-session remote into _codex_<slug>_<hash>/ # (or _codex_unattributed/ for sessions with no repository_url), writes # per-bucket sidecar entries, and applies --since filtering. collect_codex_sessions "$tmpdir" # Collect all Cursor IDE sessions (no remote filter — --all mode). # collect_cursor_sessions already buckets per-workspace (_cursor_<basename>_<hash>/) # and writes _metadata.json entries with each bucket's git_remote. Without this # call, the legacy archive flow missed Cursor entirely in --all uploads — the # Docker flow mounts /cursor_sessions separately but the archive didn't. # # Guard the call: collect_cursor_sessions returns 1 if every DB extraction # fails (stale/schema-changed state.vscdb). Under `set -e` that would abort # the whole upload and throw away the Claude/Codex data we already collected. # Docker mode does the same best-effort wrap at line 3670. if ! collect_cursor_sessions "$tmpdir" ""; then echo " Warning: Cursor session extraction had errors. Continuing with other sessions." >&2 fi if ! collect_opencode_sessions "$tmpdir" ""; then echo " Warning: opencode session extraction had errors. Continuing with other sessions." >&2 fi if ! collect_gemini_sessions "$tmpdir" ""; then echo " Warning: Gemini session extraction had errors. Continuing with other sessions." >&2 fi _refresh_orphan_recovery_count "$tmpdir/_metadata.json" } collect_single_project() { local tmpdir="$1" local project="$2" local project_dir="$CLAUDE_DIR/$project" if [ ! -d "$project_dir" ]; then echo "Error: Project directory not found: $project_dir" exit 1 fi mkdir -p "$tmpdir/$project" # Copy sessions-index.json if present if [ -f "$project_dir/sessions-index.json" ]; then cp "$project_dir/sessions-index.json" "$tmpdir/$project/" fi local accumulated_bytes=0 local session_count=0 # Get sessions sorted by createdAt while IFS= read -r session_id; do [ -z "$session_id" ] && continue local jsonl_file="$project_dir/${session_id}.jsonl" [ -f "$jsonl_file" ] || continue local file_size file_size=$(wc -c < "$jsonl_file" | tr -d ' ') cp "$jsonl_file" "$tmpdir/$project/" accumulated_bytes=$(($accumulated_bytes + $file_size)) session_count=$((session_count + 1)) # Copy subagents directory if present local subagents_dir="$project_dir/${session_id}/subagents" if [ -d "$subagents_dir" ]; then mkdir -p "$tmpdir/$project/${session_id}/subagents" cp "$subagents_dir"/*.jsonl "$tmpdir/$project/${session_id}/subagents/" 2>/dev/null || true fi # Copy tool-results directory if present local tool_results_dir="$project_dir/${session_id}/tool-results" if [ -d "$tool_results_dir" ]; then mkdir -p "$tmpdir/$project/${session_id}/tool-results" cp -r "$tool_results_dir/"* "$tmpdir/$project/${session_id}/tool-results/" 2>/dev/null || true fi done < <(get_sorted_sessions "$project_dir") echo " Collected: ${session_count} sessions, $(($accumulated_bytes / 1024 / 1024))MB" # Derive the selected project's git remote once for both Codex and Cursor # filtering below. Empty remote (unresolved cwd, no origin) means no filter # gets applied — Codex's `-n "$selected_remote"` guard below short-circuits, # and Cursor's extract_cursor_db treats empty as "match all" per its own logic. local cwd cwd=$(get_project_cwd "$project") local selected_remote selected_remote=$(get_git_remote "$cwd") # Dead cwd recovery (deleted subdirs, moved workspaces) — parity with # collect_project_group and collect_all_projects. Non-Conductor only # (resolve_remote_for_dead_cwd short-circuits */conductor/workspaces/* # and */.conductor/* at :1297 because those need sibling-worktree data, # not ancestor walk). if [ -z "$selected_remote" ] && [ -n "$cwd" ] && [ ! -e "$cwd" ]; then selected_remote=$(resolve_remote_for_dead_cwd "$cwd" 2>/dev/null || true) fi # Conductor dead-workspace fallback: list_projects_grouped's # backfill_conductor_remotes pre-pass walks sibling Conductor workspaces # and writes the recovered remote into ~/.paxel/cache/project-remotes-v2.tsv # (schema: dir<TAB>key<TAB>sessions<TAB>mtime<TAB>inferred). Single-line # awk lookup is cheap — avoids the full load_project_cache scan when # --project X is invoked directly. Skips fallback keys (name:/local:/unknown) # which would fail the exact-string Codex/Cursor filter downstream anyway. # # Gated on dead cwd (matching the upstream resolver gate above). A live # project with no origin shouldn't resurrect a stale cached key — the # correct behavior there is fail-closed, same as pre-PR. # # Staleness: load_project_cache revalidates `inferred=1` rows on its next # run by forcing them into the miss set. This fallback reads raw TSV # without revalidation. Accepted ceiling — the inferred remote was # correct at last backfill; users who suspect staleness can # `bin/upload --clear-cache` or run `--all` to refresh. if [ -z "$selected_remote" ] && [ -n "$cwd" ] && [ ! -e "$cwd" ]; then local _cache_file="${HOME}/.paxel/cache/project-remotes-v2.tsv" if [ -f "$_cache_file" ]; then local _cached_key _cached_key=$(awk -F'\t' -v d="$project" '$1==d{print $2; exit}' "$_cache_file" 2>/dev/null || true) case "$_cached_key" in name:*|local:*|unknown|'') ;; *) selected_remote="$_cached_key" ;; esac fi fi # Collect matching Codex sessions via collect_codex_sessions helper. Gate on # non-empty remote — an empty selected_remote tells the helper to include # every repo's sessions (--all mode), which would widen a scoped upload. # Handoff gotcha #5. if [ -n "$selected_remote" ]; then collect_codex_sessions "$tmpdir" "$selected_remote" fi # Write a _metadata.json entry for the selected Claude project dir so # the server's TranscriptDiscoverer merges it with the matching Codex # and Cursor buckets (all three discovered by shared git_remote) instead # of creating a separate Project keyed by encoded_name. if [ -n "$selected_remote" ] && command -v jq &>/dev/null; then [ ! -f "$tmpdir/_metadata.json" ] && echo '{"version":1,"directories":{}}' > "$tmpdir/_metadata.json" local claude_updated claude_updated=$(jq \ --arg dir "$project" \ --arg remote "$selected_remote" \ --arg cwd "${cwd:-}" \ '.directories[$dir] = {"git_remote": $remote, "cwd": $cwd}' \ "$tmpdir/_metadata.json" 2>/dev/null) [ -n "$claude_updated" ] && echo "$claude_updated" > "$tmpdir/_metadata.json" fi # Collect matching Cursor IDE sessions for the same git remote. Skip # Cursor entirely when selected_remote is empty — empty here means the # project's cwd is resolvable-live but has no origin, OR is dead AND # the ancestor/sibling recovery above also failed (or was skipped for # Conductor paths). extract_cursor_db treats empty as "no filter, include # all workspaces", which would silently upload every Cursor workspace on # the machine to a scoped single-project archive. Codex is also skipped # in this case (:2822), keeping archive behavior consistent. if [ -n "$selected_remote" ]; then if ! collect_cursor_sessions "$tmpdir" "$selected_remote"; then echo " Warning: Cursor session extraction had errors. Continuing with other sessions." >&2 fi if ! collect_opencode_sessions "$tmpdir" "$selected_remote"; then echo " Warning: opencode session extraction had errors. Continuing with other sessions." >&2 fi if ! collect_gemini_sessions "$tmpdir" "$selected_remote"; then echo " Warning: Gemini session extraction had errors. Continuing with other sessions." >&2 fi fi _refresh_orphan_recovery_count "$tmpdir/_metadata.json" } # Resolve --project NAME to PROJECT_DIRS via grouped data resolve_project_name() { local name="$1" # First try: match against group display names local g=0 while [ $g -lt ${#GROUP_DISPLAYS[@]} ]; do if [ "${GROUP_DISPLAYS[$g]}" = "$name" ]; then IFS='|' read -ra PROJECT_DIRS <<< "${GROUP_DIRS[$g]}" return 0 fi g=$((g + 1)) done # Second try: match against encoded dir names (backward compat) g=0 while [ $g -lt ${#GROUP_DIRS[@]} ]; do local dirs_str="${GROUP_DIRS[$g]}" IFS='|' read -ra check_dirs <<< "$dirs_str" for d in "${check_dirs[@]}"; do if [ "$d" = "$name" ]; then IFS='|' read -ra PROJECT_DIRS <<< "$dirs_str" return 0 fi done g=$((g + 1)) done return 1 } # --- Docker orchestration --- check_docker() { local os arch os="$(uname -s)" arch="$(uname -m)" # Step 1: Check Docker is installed if ! command -v docker &>/dev/null; then echo "Paxel runs analysis locally in a Docker container to keep your code private." >&2 echo "" >&2 echo "Error: Docker is not installed or not in PATH." >&2 echo "" >&2 case "$os" in Darwin) echo "Install Docker Desktop for Mac:" >&2 if [ "$arch" = "arm64" ]; then echo " https://desktop.docker.com/mac/main/arm64/Docker.dmg" >&2 else echo " https://desktop.docker.com/mac/main/amd64/Docker.dmg" >&2 fi echo "" >&2 echo "Or visit: https://www.docker.com/products/docker-desktop/" >&2 if [ -c /dev/tty ]; then printf "Open download page in browser? [Y/n] " >&2 local answer read -r answer </dev/tty case "$answer" in n|N|no|No) ;; *) open "https://www.docker.com/products/docker-desktop/" ;; esac fi ;; Linux) echo "Install Docker via the convenience script:" >&2 echo " curl -fsSL https://get.docker.com | sh" >&2 echo "" >&2 echo "Or install via your package manager:" >&2 echo " Ubuntu/Debian: sudo apt-get install docker.io" >&2 echo " Fedora: sudo dnf install docker-ce" >&2 echo " Arch: sudo pacman -S docker" >&2 ;; *) echo "Install Docker: https://docs.docker.com/get-docker/" >&2 ;; esac exit 1 fi # Step 2: Check daemon is running — auto-launch on macOS if ! docker info &>/dev/null 2>&1; then case "$os" in Darwin) echo "Docker is installed but its daemon isn't running. Trying to start Docker Desktop..." >&2 # `open -a Docker` fails when the docker CLI is present without the # Docker Desktop app — colima, OrbStack, Rancher Desktop, a Homebrew # docker client, or a partial install. Guard it in an `if` (exempt from # set -e / the ERR trap) so the failure prints an actionable message # instead of tripping the generic _paxel_on_error "unexpected error". if ! open -a Docker 2>/dev/null; then echo "" >&2 echo "Error: Docker's daemon isn't running, and we couldn't auto-start Docker Desktop (the Docker app isn't installed)." >&2 echo "" >&2 echo "You have the docker CLI but no running engine. Start whichever Docker runtime you use, then re-run this script:" >&2 echo " Docker Desktop: open from Applications, or install at https://www.docker.com/products/docker-desktop/" >&2 echo " colima: colima start" >&2 echo " OrbStack: open -a OrbStack" >&2 echo " Rancher Desktop: open -a 'Rancher Desktop'" >&2 exit 1 fi local waited=0 while [ $waited -lt 60 ]; do if docker info &>/dev/null 2>&1; then echo "" >&2 break fi printf "." >&2 sleep 2 waited=$((waited + 2)) done if [ $waited -ge 60 ]; then echo "" >&2 echo "Error: Docker's daemon didn't come up within 60 seconds." >&2 echo "Start your Docker runtime manually (Docker Desktop, colima, OrbStack, …) and re-run this script." >&2 exit 1 fi ;; Linux) echo "Error: Docker daemon is not running." >&2 echo "Start it with: sudo systemctl start docker" >&2 exit 1 ;; *) echo "Error: Docker daemon is not running." >&2 echo "Start Docker Desktop or run: sudo systemctl start docker" >&2 exit 1 ;; esac fi # Step 3: Check Docker version >= 20.10 local docker_version docker_version=$(docker version --format '{{.Server.Version}}' 2>/dev/null || echo "0.0") local major minor major=$(echo "$docker_version" | cut -d. -f1) minor=$(echo "$docker_version" | cut -d. -f2) if [ "${major:-0}" -lt 20 ] || { [ "${major:-0}" -eq 20 ] && [ "${minor:-0}" -lt 10 ]; }; then echo "Error: Docker version $docker_version is too old (minimum: 20.10)." >&2 echo "Update Docker: https://docs.docker.com/engine/install/" >&2 exit 1 fi echo "[1/17] Checking prerequisites — Docker version $docker_version ✓" } save_token() { mkdir -p "$(dirname "$PAXEL_TOKEN_FILE")" echo "$YC_TOKEN" > "$PAXEL_TOKEN_FILE" chmod 600 "$PAXEL_TOKEN_FILE" echo " Token saved to $PAXEL_TOKEN_FILE" } # Validate a token against /api/v1/token/check. Returns 0 on 200, 1 otherwise # (including network errors — we treat unreachable as invalid to fail pre-Docker # rather than 5 minutes in at upload time). # # Also sets `$_PAXEL_LAST_TOKEN_CHECK_CODE` to one of: # "skip" — PAXEL_SKIP_TOKEN_VALIDATION=1 short-circuit # "empty" — empty token argument # "000" — curl failure (DNS / connect / timeout) # <http> — HTTP status code from /api/v1/token/check # Callers can inspect this to tailor error messages (401/403 = revoked, # 5xx/000 = server blip). The return value stays binary so the escape # hatch (PAXEL_SKIP_TOKEN_VALIDATION=1) and the happy path stay simple. # # PAXEL_SKIP_TOKEN_VALIDATION=1 bypasses the curl. All three token sources # (env-var, baked-from-URL, saved-file) now validate via this endpoint. Escape # hatch mirrors PAXEL_SKIP_PREFLIGHT. _PAXEL_LAST_TOKEN_CHECK_CODE="" validate_token() { local token="$1" if [ "${PAXEL_SKIP_TOKEN_VALIDATION:-0}" = "1" ]; then _PAXEL_LAST_TOKEN_CHECK_CODE="skip" return 0 fi if [ -z "$token" ]; then _PAXEL_LAST_TOKEN_CHECK_CODE="empty" return 1 fi # The `|| http_code="000"` fallback has to live OUTSIDE the command # substitution: curl writes its `%{http_code}` output ("000" on # connection failure) to stdout AND exits non-zero. An `|| echo "000"` # INSIDE `$(...)` concatenates to "000000", which the 000) case arm # below would silently miss. PR #655 dual-review (Codex + Opus) # caught this after the initial commit — tests stubbed `return 7` # with no stdout, which didn't match real curl behavior. local http_code http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 \ -H "X-YC-Token: ${token}" \ "${PAXEL_SERVER}/api/v1/token/check" 2>/dev/null) || http_code="000" _PAXEL_LAST_TOKEN_CHECK_CODE="$http_code" [ "$http_code" = "200" ] } # Try browser-based device auth flow. Opens browser, polls for token. # Returns 0 on success (YC_TOKEN set), 1 on failure (fall back to manual). try_device_auth() { # Generate 8-char alphanumeric code local code code=$(LC_ALL=C tr -dc 'A-Z0-9' < /dev/urandom 2>/dev/null | head -c 8 || true) if [ ${#code} -lt 8 ]; then return 1 fi # Register the code with the server local register_response register_http register_response=$(curl -s -w "\n%{http_code}" -X POST \ -H "Content-Type: application/json" \ -d "{\"code\":\"${code}\"}" \ "${PAXEL_SERVER}/auth/cli/register" 2>/dev/null) register_http=$(echo "$register_response" | tail -1) if [ "$register_http" != "201" ]; then return 1 fi # Open browser local auth_url="${PAXEL_SERVER}/auth/cli?code=${code}" echo "" echo "[2/17] Signing you in — opening browser..." echo " If the browser doesn't open, visit: $auth_url" echo " Authorize the CLI in your browser. If you're asked to sign in, we'll email you a login link — check spam if it's slow." echo "" if command -v open &>/dev/null; then open "$auth_url" elif command -v xdg-open &>/dev/null; then xdg-open "$auth_url" &>/dev/null & else echo " Open this URL in your browser: $auth_url" fi # Poll for token local poll_url="${PAXEL_SERVER}/auth/cli/poll?code=${code}" local waited=0 local max_wait=600 local poll_interval=2 printf " Waiting for browser authorization" while [ $waited -lt $max_wait ]; do local poll_response poll_status poll_response=$(curl -s "$poll_url" 2>/dev/null) # || true: empty poll_status is expected while the user hasn't clicked yet; # the case `*)` branch below treats it as "keep waiting". grep no-match under # pipefail would otherwise fire a false ERR banner each poll. poll_status=$(echo "$poll_response" | grep -o '"status":"[^"]*"' | head -1 | cut -d'"' -f4 || true) case "$poll_status" in complete) # Server said complete but may have returned an unexpected shape. Use # || true so grep no-match doesn't fire the ERR banner, then check and # surface a specific message if the token is missing. YC_TOKEN=$(echo "$poll_response" | grep -o '"token":"[^"]*"' | head -1 | cut -d'"' -f4 || true) if [ -z "$YC_TOKEN" ]; then echo "" echo " Server returned complete without a token — falling back to manual sign-in." >&2 return 1 fi echo "" echo " Authenticated!" save_token return 0 ;; expired|not_found) echo "" echo " Session expired." >&2 return 1 ;; *) printf "." sleep $poll_interval waited=$((waited + poll_interval)) ;; esac done echo "" echo " Browser auth timed out." >&2 return 1 } # Manual fallback: prompt user to paste token manual_token_entry() { echo "" echo "[2/17] Signing you in..." echo " Get your token at: ${PAXEL_SERVER}/auth/login" echo " After logging in, copy the CLI token from your dashboard." echo "" printf " Paste your token: " user_read -r YC_TOKEN YC_TOKEN=$(echo "$YC_TOKEN" | tr -d '[:space:]') if [ -z "$YC_TOKEN" ]; then echo "Error: No token provided." >&2 exit 1 fi save_token } # Send local git identity to server for pre-matching register_git_identity() { local git_name git_email git_name=$(git config user.name 2>/dev/null || true) git_email=$(git config user.email 2>/dev/null || true) [ -z "$git_name" ] && [ -z "$git_email" ] && return 0 # Escape values for JSON safety (handles quotes, backslashes in git names) local json_name json_email json_name=$(printf '%s' "$git_name" | sed 's/\\/\\\\/g; s/"/\\"/g') json_email=$(printf '%s' "$git_email" | sed 's/\\/\\\\/g; s/"/\\"/g') curl -s -X POST "${PAXEL_SERVER}/api/v1/identity/register" \ -H "X-YC-Token: ${YC_TOKEN}" \ -H "Content-Type: application/json" \ -d "{\"git_name\":\"${json_name}\",\"git_email\":\"${json_email}\"}" \ >/dev/null 2>&1 || true } # Detect the user's git email(s) for a specific repo. # Fully client-side — no data sent to server. # Returns pipe-delimited list of emails, or empty string if none found. # # Three signals: # 1. git config user.email — the configured email for this repo # 2. Session SHAs — commits made during Claude Code sessions, cross-referenced # with git log to find the author email (definitively the user's commits) # 3. Name match — all emails in git log where author name matches git config # user.name (case-insensitive), catching other emails used by the same person detect_author_emails() { local dir_cwd="$1" local session_dir="$2" # directory containing session JSONL files local emails=() # Source 1: git config for THIS repo (not global cwd) local config_email config_email=$(git -C "$dir_cwd" config user.email 2>/dev/null || true) [ -n "$config_email" ] && emails+=("$config_email") # Source 2: Session SHAs → git log author email # Extract short SHAs from git_commit events in session transcripts, # then look up the author email for each from the local git repo. if [ -d "$session_dir" ]; then local sha_emails # || true: grep no-match exits 1 under pipefail, which fires the ERR trap # even though an empty result is expected here. Do not remove. sha_emails=$(grep -roh '"type":"git_commit"[^}]*"sha":"[a-f0-9]*"' "$session_dir" 2>/dev/null \ | grep -o '"sha":"[a-f0-9]*"' | sed 's/"sha":"//;s/"//' | sort -u \ | while read -r sha; do git -C "$dir_cwd" log "$sha" -1 --format='%aE' 2>/dev/null done | sort -u || true) for e in $sha_emails; do [ -n "$e" ] && emails+=("$e") done fi # Source 3: Name match — find all emails where git author name matches # the user's configured name (case-insensitive exact match). # Catches multi-email cases like alice@gmail.com when name is "Alice Example". # Assumes author names are unique per person in a repo (validated against prod data). local config_name config_name=$(git -C "$dir_cwd" config user.name 2>/dev/null || true) if [ -n "$config_name" ]; then # Escape regex special chars in the name for safe grep local escaped_name escaped_name=$(printf '%s' "$config_name" | sed 's/[.+*?^$[\]\\]/\\&/g') local name_emails # || true: grep no-match exits 1 under pipefail → false ERR banner when # no git author name matches user.name (e.g. brand-new repo). Do not remove. name_emails=$(git -C "$dir_cwd" log --format='%aN|%aE' 2>/dev/null | sort -u \ | grep -i "^${escaped_name}|" | cut -d'|' -f2 | sort -u || true) for e in $name_emails; do [ -n "$e" ] && emails+=("$e") done fi # Deduplicate and return pipe-delimited if [ ${#emails[@]} -gt 0 ]; then printf '%s\n' "${emails[@]}" | sort -u | tr '\n' '|' | sed 's/|$//' fi } # Collect author-filtered commits for episode linking. # Writes _author_commits.jsonl and _author_numstat.txt alongside existing files. collect_author_commits() { local dir_cwd="$1" local git_data_dir="$2" local encoded="$3" local author_emails="$4" # pipe-delimited local oldest_session_date="$5" # ISO date or empty [ -z "$author_emails" ] && return 0 # Build --author flags with regex-safe escaping # git --author is a regex match on "Author Name <email>", # so we anchor to <email> to avoid partial matches on . and + local author_flags=() IFS='|' read -ra email_arr <<< "$author_emails" for email in "${email_arr[@]}"; do [ -z "$email" ] && continue local escaped escaped=$(printf '%s' "$email" | sed 's/[.+*?^$[\]\\]/\\&/g') author_flags+=(--author="<${escaped}>") done [ ${#author_flags[@]} -eq 0 ] && return 0 local since_flag="" [ -n "$oldest_session_date" ] && since_flag="--since=$oldest_session_date" # Author-filtered commits (full session date range). TAB-separated, subject # LAST: git's %s is raw, so a literal quote/backslash/control char in a commit # subject would corrupt a JSON line and silently drop that commit (audit C13). # A fixed-order TSV is robust to any subject content. Parsed by # ClientPipeline#parse_commits_tsv (the .jsonl extension is historical). git -C "$dir_cwd" log "${author_flags[@]}" $since_flag \ --format='%H%x09%h%x09%aN%x09%aE%x09%aI%x09%s' \ > "${git_data_dir}/${encoded}_author_commits.jsonl" 2>/dev/null || true # Author-filtered numstat (needed by CommitGrouper for LOC stats) git -C "$dir_cwd" log "${author_flags[@]}" $since_flag \ --format='COMMIT_BOUNDARY %H %aI %aN <%aE>' --numstat \ > "${git_data_dir}/${encoded}_author_numstat.txt" 2>/dev/null || true # Log what we collected local author_count author_count=$(wc -l < "${git_data_dir}/${encoded}_author_commits.jsonl" 2>/dev/null | tr -d ' ') if [ "${author_count:-0}" -gt 5000 ]; then echo " Warning: ${author_count} author commits collected (large)" >&2 fi if [ "${author_count:-0}" -gt 0 ]; then local email_count=${#email_arr[@]} local email_label="email" [ "$email_count" -ne 1 ] && email_label="emails" local email_list email_list=$(IFS=', '; echo "${email_arr[*]}") echo " Git: ${author_count} author-filtered commits (${email_count} ${email_label}: ${email_list})" >&2 fi } load_or_request_token() { # 1. Check environment variable. Validate against the server so an expired # env var fails pre-Docker instead of 5 minutes into the pipeline. # PAXEL_SKIP_TOKEN_VALIDATION=1 restores the pre-validation behavior for # CI / tests / air-gapped runs where the server is unreachable by design. if [ -n "${YC_TOKEN:-}" ]; then if validate_token "$YC_TOKEN"; then echo "[2/17] Signed in via environment token ✓" return fi echo "" >&2 # Copy reflects the actual failure mode — "invalid or expired" overpromises # diagnostic confidence when the real cause is a 5xx or network blip. case "$_PAXEL_LAST_TOKEN_CHECK_CODE" in 401|403) echo "Error: YC_TOKEN env var is invalid or expired." >&2 echo " Refresh at: ${PAXEL_SERVER}/auth/login" >&2 ;; 000) echo "Error: Couldn't reach ${PAXEL_SERVER} to verify YC_TOKEN." >&2 echo " Check your network connection and try again." >&2 ;; 5[0-9][0-9]) echo "Error: ${PAXEL_SERVER} couldn't verify YC_TOKEN (server returned ${_PAXEL_LAST_TOKEN_CHECK_CODE})." >&2 echo " This is usually temporary — wait a minute and try again." >&2 ;; *) echo "Error: YC_TOKEN failed validation (code: ${_PAXEL_LAST_TOKEN_CHECK_CODE})." >&2 echo " Refresh at: ${PAXEL_SERVER}/auth/login" >&2 ;; esac echo " Or unset YC_TOKEN to use your saved token / browser auth." >&2 echo " CI / air-gapped: set PAXEL_SKIP_TOKEN_VALIDATION=1 to bypass." >&2 exit 1 fi # 2. Baked token from URL takes priority — it's the freshest, user-specific token. # Validate before trusting: a stale URL (bookmarked `bin/upload` target, expired # session, revoked token) can bake a dead token. On validation failure, fall # through to the saved-token path — do NOT overwrite a valid saved token with # an invalid baked one. if [ -n "${PAXEL_BAKED_TOKEN:-}" ]; then if validate_token "$PAXEL_BAKED_TOKEN"; then local saved_token="" [ -f "$PAXEL_TOKEN_FILE" ] && saved_token=$(cat "$PAXEL_TOKEN_FILE" | tr -d '[:space:]') if [ "$saved_token" = "$PAXEL_BAKED_TOKEN" ]; then # Saved token matches the URL token — use it YC_TOKEN="$PAXEL_BAKED_TOKEN" echo "[2/17] Signed in ✓" return elif [ -n "$saved_token" ] && [ "$saved_token" != "$PAXEL_BAKED_TOKEN" ]; then # Saved token is for a DIFFERENT account — replace it echo "[2/17] Switching to your account..." YC_TOKEN="$PAXEL_BAKED_TOKEN" save_token return else # No saved token — save the baked one YC_TOKEN="$PAXEL_BAKED_TOKEN" save_token echo "[2/17] Signed in ✓" return fi fi # Differentiate revoked-token from server-blip — "Sign-in link expired" is # misleading when the real cause is a 5xx or network failure. case "$_PAXEL_LAST_TOKEN_CHECK_CODE" in 401|403) echo "[2/17] Sign-in link expired — trying saved token or browser auth..." ;; 000) echo "[2/17] Couldn't reach ${PAXEL_SERVER} to verify sign-in link — trying saved token or browser auth..." ;; 5[0-9][0-9]) echo "[2/17] Sign-in link check failed (server ${_PAXEL_LAST_TOKEN_CHECK_CODE}) — trying saved token or browser auth..." ;; *) echo "[2/17] Sign-in link check failed (code ${_PAXEL_LAST_TOKEN_CHECK_CODE}) — trying saved token or browser auth..." ;; esac fi # 3. Check for existing saved token — validate it against the current server if [ -f "$PAXEL_TOKEN_FILE" ]; then local saved_token saved_token=$(cat "$PAXEL_TOKEN_FILE" | tr -d '[:space:]') if [ -n "$saved_token" ]; then if validate_token "$saved_token"; then YC_TOKEN="$saved_token" echo "[2/17] Signed in ✓" return else echo "[2/17] Session expired, signing you in again..." rm -f "$PAXEL_TOKEN_FILE" fi fi fi # 3. Try device auth flow (browser-based), fall back to manual paste require_tty if ! try_device_auth; then manual_token_entry fi } pull_client_image() { # In dev mode (localhost), build from local Dockerfile unless USE_LIVE_DOCKER=1 if [ -n "$PAXEL_REPO_ROOT" ] && [ -f "$PAXEL_REPO_ROOT/Dockerfile.client" ] && [ "${USE_LIVE_DOCKER:-0}" != "1" ]; then if [ "${PAXEL_QUIET_PULL:-0}" = "1" ]; then echo "[paxel] Preparing replay container..." else echo "[3/17] Setting up analysis container..." echo " Cloud: gpt-5.5 (via YC proxy) for summaries and scoring." echo " YC covers all analysis costs — no API keys or subscriptions needed." echo " File bodies stay local; aggregate scores + metadata (paths, commit numstat, session events) upload." fi echo " Building from: $PAXEL_REPO_ROOT/Dockerfile.client" local build_log build_log=$(mktemp) if DOCKER_BUILDKIT=1 docker build --build-arg CACHE_BUST="$(date +%s)" -f "$PAXEL_REPO_ROOT/Dockerfile.client" -t paxel-client "$PAXEL_REPO_ROOT" > "$build_log" 2>&1; then grep -E '^(#[0-9]+ (DONE|exporting|naming)|Step |Successfully)' "$build_log" || true PAXEL_CLIENT_IMAGE="paxel-client" echo " Built: paxel-client (local)" else echo "Error: Docker build failed." >&2 tail -20 "$build_log" >&2 rm -f "$build_log" exit 1 fi rm -f "$build_log" else pull_from_ghcr fi } # Pull our PUBLIC image without triggering a Docker credential helper / keychain # prompt. `docker pull` resolves registry creds from ~/.docker/config.json: a # `credsStore` ("osxkeychain" on macOS, "secretservice"/"pass" on Linux) makes the # CLI shell out to that helper to look up creds for ghcr.io — popping a keychain # prompt (or hanging) for anyone who has logged into ghcr.io before. Our image is # public and needs no auth, so we pull with a THROWAWAY config that (a) drops the # credsStore and (b) carries an explicit EMPTY `auths` entry for the image's # registry. Dropping credsStore ALONE is not enough: OrbStack (and potentially # other runtimes) fall back to the system keychain when a registry has no creds # info in the config at all, so the prompt still fires. The empty `auths` entry # tells docker "anonymous creds already exist for this registry," so it uses them # and never consults any helper. We also carry over the active Docker context so # the daemon connection is preserved. The real ~/.docker/config.json is never # touched; portable across macOS and Linux — pure shell, no external JSON parser. # Returns non-zero on any failure so the caller falls back. docker_pull_credfree() { local src_cfg tmp ctx registry rc=0 src_cfg="${DOCKER_CONFIG:-$HOME/.docker}" # Throwaway scratch for the stripped-down config. mktemp -d is the right tool # here: it's 0700, created and removed within this one call, and the OS sweeps # it if we're killed mid-pull — so unlike the ~/.paxel/cache/*-$$ dirs this # needs no cleanup_temp_dirs / 24h-sweeper wiring. It lives only for one pull. tmp=$(mktemp -d) || return 1 # Registry host the image lives on — the JSON `auths` key docker matches the # pull against. It's the first /-segment of the ref, but only when that segment # is actually a hostname (has a dot or port colon, or is localhost); a bare # `name` or `library/name` ref is on Docker Hub with no host prefix, and dev's # local `paxel-client` build never reaches this function. Hostnames are # [a-zA-Z0-9.:_-] only, so no JSON escaping needed. case "$PAXEL_CLIENT_IMAGE" in */*) registry="${PAXEL_CLIENT_IMAGE%%/*}" case "$registry" in *.*|*:*|localhost) ;; *) registry="" ;; esac ;; *) registry="" ;; esac # currentContext + the contexts/ metadata dir are all docker needs to resolve # a named context (colima/orbstack/desktop-linux/...) and reach the daemon. # Context names are restricted to [a-zA-Z0-9_.+-], so no JSON escaping needed. ctx=$(docker context show 2>/dev/null || echo default) if [ -n "$registry" ]; then printf '{"currentContext":"%s","auths":{"%s":{}}}\n' "$ctx" "$registry" > "$tmp/config.json" || { rm -rf "$tmp"; return 1; } else printf '{"currentContext":"%s"}\n' "$ctx" > "$tmp/config.json" || { rm -rf "$tmp"; return 1; } fi cp -R "$src_cfg/contexts" "$tmp/" 2>/dev/null || true docker --config "$tmp" pull "$PAXEL_CLIENT_IMAGE" >/dev/null 2>&1 || rc=$? rm -rf "$tmp" return "$rc" } pull_from_ghcr() { if [ "${PAXEL_QUIET_PULL:-0}" = "1" ]; then echo "[paxel] Preparing replay container..." else echo "[3/17] Setting up analysis container..." echo " Cloud: gpt-5.5 (via YC proxy) for summaries and scoring." echo " YC covers all analysis costs — no API keys or subscriptions needed." echo " File bodies stay local; aggregate scores + metadata (paths, commit numstat, session events) upload." echo " LLM results are cached locally — reruns skip completed work and pick up where you left off." echo "" fi printf " Downloading container image..." # Prefer a credential-helper-free pull (our image is public, so no auth is # needed). If it can't run or fails, fall back to a normal pull with your # default Docker config — same daemon and cached layers, so the first attempt # isn't wasted. The fallback can surface a credential prompt; it is the safe # last resort, not a failure. if docker_pull_credfree; then printf "\r Downloaded: %s\n" "$PAXEL_CLIENT_IMAGE" return 0 fi printf "\n" echo " Retrying with your default Docker settings..." if docker pull "$PAXEL_CLIENT_IMAGE" >/dev/null 2>/dev/null; then echo " Downloaded: $PAXEL_CLIENT_IMAGE" return 0 fi echo " Using cached image (pull failed, may be offline)" if ! docker image inspect "$PAXEL_CLIENT_IMAGE" &>/dev/null; then echo "Error: Image not found locally either. Check your connection." >&2 exit 1 fi } # Prepare filtered transcripts and run Docker analysis for a single repo. # Used by the multi-repo Strategy 3 path. # Args: repo_root repo_name repo_remote transcript_dirs_str codex_files_str prepare_and_run_for_repo() { local repo_root="$1" local repo_name="$2" local repo_remote="$3" local transcript_dirs_str="$4" local codex_files_str="$5" # Save state local saved_claude_dir="$CLAUDE_DIR" local saved_codex_dir="$CODEX_DIR" local saved_repo_root="${REPO_ROOT:-}" local saved_estimate="${PAXEL_HOST_ESTIMATE_MINUTES:-}" local saved_estimated="${ESTIMATED_MINUTES:-}" local saved_selected_remote="${selected_remote:-}" # Create filtered transcript dir (PID-scoped to avoid races with concurrent runs) local filtered_dir="${HOME}/.paxel/cache/filtered-transcripts-$$" rm -rf "$filtered_dir" mkdir -p "$filtered_dir" # Copy matching transcript dirs local match_count=0 local copy_failed=0 if [ -n "$transcript_dirs_str" ]; then IFS='|' read -ra tdirs <<< "$transcript_dirs_str" for dir_name in "${tdirs[@]}"; do [ -z "$dir_name" ] && continue if [ -d "$saved_claude_dir/$dir_name" ]; then # _paxel_cp_transcripts: CoW clone where possible, preserving mtime (-p) so # the container's --since filter (File.mtime in analyze_local.rake) reflects # each session's real age, not the copy's. Guarded: this function runs # errexit-suppressed (called as `if prepare_and_run_for_repo`), so a bare cp # failure would be swallowed and the repo analyzed on partial data — track # it and fail loud below. if _paxel_cp_transcripts "$saved_claude_dir/$dir_name" "$filtered_dir/$dir_name"; then match_count=$((match_count + 1)) else echo " Failed to copy Claude transcripts for ${repo_name} (${dir_name})." >&2 copy_failed=1 fi fi done fi # Create filtered Codex dir (per-repo Codex filtering, Codex fix #2). # PID-suffixed so cleanup_temp_dirs EXIT trap + 24h stale sweeper cover it # (3-site rule per reference_extract_dir_three_site_cleanup.md). local filtered_codex_dir="${HOME}/.paxel/cache/filtered-codex-$$" rm -rf "$filtered_codex_dir" mkdir -p "$filtered_codex_dir" local codex_count=0 local codex_cross_tool_count=0 if [ -n "$codex_files_str" ]; then IFS='|' read -ra cfiles <<< "$codex_files_str" for codex_file in "${cfiles[@]}"; do [ -z "$codex_file" ] && continue [ -f "$codex_file" ] || continue local codex_basename codex_basename=$(basename "$codex_file") # -p preserves mtime so collect_codex_sessions's --since filter # (compares file mtime vs SINCE_EPOCH) reflects the session's actual # age, not the copy's recency. Without -p, every file ends up with # "now" as its mtime and --since is effectively a no-op. if cp -p "$codex_file" "$filtered_codex_dir/$codex_basename"; then codex_count=$((codex_count + 1)) # Phase 3.5 — track cross-tool subset for honest session-count display # below (matches picker bucketing: only Claude-launched Codex counts # as subagent; standalone Codex counts as a main session). local _origin _origin=$(get_codex_session_originator "$codex_file") codex_originator_is_standalone "$_origin" || codex_cross_tool_count=$((codex_cross_tool_count + 1)) else echo " Failed to copy Codex session for ${repo_name}." >&2 copy_failed=1 fi done fi # Fail loud on a copy failure instead of silently analyzing incomplete data. # Nothing global is mutated yet (CLAUDE_DIR et al. are reassigned just below), so # a bare return is clean; the EXIT trap reaps the PID-scoped filtered dirs. The # caller (run_selected_child_repos) records the non-zero return as a failed repo. if [ "$copy_failed" -eq 1 ]; then echo " ✗ ${repo_name}: session copy failed — skipping to avoid an incomplete report." >&2 return 1 fi CLAUDE_DIR="$filtered_dir" CLAUDE_MOUNT_SCOPE="filtered" CODEX_MOUNT_SCOPE="filtered" MOUNT_LABEL="$repo_name" if [ "$codex_count" -gt 0 ]; then CODEX_DIR="$filtered_codex_dir" else CODEX_DIR="${HOME}/.paxel/empty-codex" mkdir -p "$CODEX_DIR" fi # Count sessions. Phase 3.5 — Claude-launched Codex (cross-tool) counts # toward subagent_total (the "M" in "N sessions + M subagent"), not the # main session_count. session_count drives the time estimate + user-visible # "${session_count} sessions" line; subagent_total appears separately so # users see honest "N sessions + M subagent" math matching the picker. local claude_count claude_count=$({ find "$filtered_dir" -name "*.jsonl" -not -name "_*" -not -path "*/_git/*" -not -path "*/subagents/*" -maxdepth 3 2>/dev/null || true; } | wc -l | tr -d ' ') local codex_main_count=$((codex_count - codex_cross_tool_count)) local session_count=$((claude_count + codex_main_count)) # If no Claude/Codex sessions, fold in opencode/Gemini (same helpers + condition as # the picker) so an opencode/Gemini-only repo isn't dropped by the zero-session # early-return below; run_docker_analysis extracts them (scoped to repo_remote) # regardless. Helpers honor their own filters and return 0 for a local:/non-match. if [ "$session_count" -eq 0 ]; then local _oc_main _gm_main _oc_main=$(count_opencode_sessions "$repo_remote") _gm_main=$(count_gemini_sessions "$repo_remote") session_count=$((_oc_main + _gm_main)) fi # Project-scoped: scan filtered_dir (matched-to-this-repo only) NOT $CLAUDE_DIR # global; otherwise the per-repo display includes cross-project subagents. local subagent_count subagent_count=$(count_subagent_sessions "$filtered_dir") local subagent_total=$((subagent_count + codex_cross_tool_count)) if [ "$session_count" -eq 0 ] && [ "$subagent_total" -eq 0 ]; then echo " No sessions found for ${repo_name}, skipping." # Restore state CLAUDE_DIR="$saved_claude_dir" CODEX_DIR="$saved_codex_dir" REPO_ROOT="$saved_repo_root" PAXEL_HOST_ESTIMATE_MINUTES="$saved_estimate" ESTIMATED_MINUTES="$saved_estimated" return 0 fi # Bound author-commit collection to this repo's session window. run_docker_mode # computes OLDEST_SESSION_EPOCH only on the single-repo path, so the picker path # left it empty and collect_author_commits ran UNBOUNDED over the uploader's full # history (payload bloat + spurious commit-group clusters). Compute a # function-local floor from the copied sessions' mtimes — the CoW transcript copy # (-p) and cp -p preserve them — like run_docker_mode's source-mtime scan (this one also covers Codex # sessions, which the single-repo scan does not). Trailing `|| true` swallows # head's SIGPIPE under set -o pipefail (same as run_docker_mode's computation). local oldest_epoch="" oldest_epoch=$(find "$filtered_dir" "$filtered_codex_dir" -name "*.jsonl" -not -name "_*" -maxdepth 3 2>/dev/null \ | while read -r _f; do stat -c '%Y' "$_f" 2>/dev/null || stat -f '%m' "$_f" 2>/dev/null; done \ | sort -n | head -1 || true) # When --since is active the host hasn't filtered the copied sessions yet (the # container applies SINCE_EPOCH via File.mtime), so the raw min-mtime floor can # predate the requested window. Clamp UP to SINCE_EPOCH so the author-commit # window matches the analyzed-session window, not the absolute-oldest session. if [ -n "$SINCE_EPOCH" ]; then if [ -z "$oldest_epoch" ] || [ "$oldest_epoch" -lt "$SINCE_EPOCH" ]; then oldest_epoch="$SINCE_EPOCH" fi fi # Collect git metadata local git_data_dir="${filtered_dir}/_git" mkdir -p "$git_data_dir" local docker_metadata="{}" for dir in "$filtered_dir"/*/; do [ -d "$dir" ] || continue local dir_name dir_name=$(basename "$dir") [ "$dir_name" = "_git" ] && continue local dir_cwd dir_cwd=$(get_project_cwd "$dir_name") local dir_remote dir_remote=$(get_git_remote "$dir_cwd") if [ -z "$dir_remote" ] && [ -n "$dir_cwd" ] && [ ! -e "$dir_cwd" ]; then dir_remote=$(resolve_remote_for_dead_cwd "$dir_cwd") fi if [ -n "$dir_remote" ] || [ -n "$dir_cwd" ]; then docker_metadata=$(echo "$docker_metadata" | jq \ --arg dir "$dir_name" \ --arg remote "$dir_remote" \ --arg cwd "$dir_cwd" \ '. + {($dir): {"git_remote": $remote, "cwd": $cwd}}' 2>/dev/null || echo "$docker_metadata") fi if [ -n "$dir_cwd" ] && [ -e "$dir_cwd/.git" ]; then local encoded encoded=$(echo "$dir_name" | sed 's/[^a-zA-Z0-9_-]/_/g') local since_flag="" [ -n "$SINCE_EPOCH" ] && since_flag="--since=$(date -r "$SINCE_EPOCH" '+%Y-%m-%d' 2>/dev/null || date -d "@$SINCE_EPOCH" '+%Y-%m-%d' 2>/dev/null || echo '')" # Total commit count (cheap, no data) for the Git Metrics "N of M commits" # context — parity with the single-repo block. git -C "$dir_cwd" rev-list --count HEAD \ > "${git_data_dir}/${encoded}_commit_count.txt" 2>/dev/null || true # -${COMMIT_LIMIT:-1000} honors --commits and matches the single-repo default # (was hardcoded -500). TSV (subject LAST) — robust to quotes/backslashes (C13). git -C "$dir_cwd" log -${COMMIT_LIMIT:-1000} $since_flag \ --format='%H%x09%h%x09%aN%x09%aE%x09%aI%x09%s' \ > "${git_data_dir}/${encoded}_commits.jsonl" 2>/dev/null || true git -C "$dir_cwd" log -${COMMIT_LIMIT:-1000} $since_flag \ --format='COMMIT_BOUNDARY %H %aI %aN <%aE>' --numstat \ > "${git_data_dir}/${encoded}_numstat.txt" 2>/dev/null || true # Author-filtered commits for episode linking, bounded to this repo's session # window (oldest_epoch, computed above) so we don't pull the uploader's entire # git history into the upload. local author_emails author_emails=$(detect_author_emails "$dir_cwd" "$filtered_dir") if [ -n "$author_emails" ]; then local oldest_date="" if [ -n "$oldest_epoch" ]; then oldest_date=$(date -r "$oldest_epoch" '+%Y-%m-%d' 2>/dev/null || date -d "@$oldest_epoch" '+%Y-%m-%d' 2>/dev/null || echo '') fi collect_author_commits "$dir_cwd" "$git_data_dir" "$encoded" "$author_emails" "$oldest_date" fi fi done # Write _metadata.json local _rmdc_total _rmdc_total=$(_rmdc_recovery_count_unique) if command -v jq &>/dev/null; then jq -n --argjson dirs "$docker_metadata" \ --argjson recoveries "${_rmdc_total:-0}" \ '{"version": 1, "directories": $dirs, "orphan_recovery_count": $recoveries}' \ > "${filtered_dir}/_metadata.json" \ || echo " Warning: could not write attribution metadata for ${repo_name}; the container will fall back to name-based attribution." >&2 fi # Set repo root for code quality if [ -d "$repo_root" ]; then export REPO_ROOT="$repo_root" else unset REPO_ROOT 2>/dev/null || true fi # Set selected_remote so Cursor extraction filters by this repo's remote. selected_remote="$repo_remote" # A remote-less child carries a "local:/abs/path" grouping key (detect_child_repos), # NOT a real git remote. The non-Claude collectors filter by normalize_remote, # and "local:…" normalizes to itself and matches no session — so leaving it set # makes Codex/Cursor/opencode/Gemini silently drop every session AND run wasted # work. Blank it so run_docker_analysis's "skipped (no resolved remote)" guards # fire instead — honest, and matching the single-repo path (which sets an empty # selected_remote for a no-origin repo). Full path-based scoping for remote-less # repos is a follow-up. case "$selected_remote" in local:*|"") selected_remote="" ;; esac # Estimate and pass to Docker. Match print_estimate's `sessions + subagent_count` # math (line 352): AnalyzeSessionJob runs every logical-root + subagent + cross-tool # session, so the wall-clock estimate must include the same set the user sees in # "${session_count} sessions + ${subagent_total} subagent" below. local mins mins=$(estimate_time $((session_count + subagent_total))) ESTIMATED_MINUTES="$mins" export PAXEL_HOST_ESTIMATE_MINUTES="$mins" if [ "$subagent_total" -gt 0 ]; then echo " ${session_count} sessions + ${subagent_total} subagent, ~${mins} min" else echo " ${session_count} sessions, ~${mins} min" fi # Run Docker run_docker_analysis local result=$? # Restore state CLAUDE_DIR="$saved_claude_dir" CODEX_DIR="$saved_codex_dir" if [ -n "$saved_repo_root" ]; then REPO_ROOT="$saved_repo_root" else unset REPO_ROOT 2>/dev/null || true fi PAXEL_HOST_ESTIMATE_MINUTES="$saved_estimate" ESTIMATED_MINUTES="$saved_estimated" selected_remote="$saved_selected_remote" return $result } # Does any non-Claude tool (Codex, Cursor, opencode, Gemini) have a session for $remote? # Lets single-project auto-detect scope a repo the user worked in WITHOUT Claude # Code, instead of falling through to the "none of your sessions match" prompt. # Reuses the real collectors against a throwaway probe dir (so per-remote # matching has ONE source of truth), and only runs on the uncommon path where # Claude produced no match. Recovery logging is redirected to a throwaway so the # probe can't inflate orphan_recovery_count for the real run. remote_has_agent_sessions() { local remote="$1" [ -z "$remote" ] && return 1 local probe probe=$(mktemp -d 2>/dev/null) || return 1 local _saved_rmdc="${_RMDC_LOG_FILE:-}" _RMDC_LOG_FILE="$probe/.rmdc" local rc=1 # Gemini probe FIRST — its extraction needs no jq (sed/cp), so a repo with only # Gemini sessions can still be scoped on a host without jq. collect_gemini_sessions "$probe" "$remote" >/dev/null 2>&1 || true [ -n "$(find "$probe" -path '*/_gemini_*/*.jsonl' -print -quit 2>/dev/null)" ] && rc=0 # Codex/Cursor/opencode probes require jq; only run them if we haven't matched. if [ "$rc" -ne 0 ] && command -v jq >/dev/null 2>&1; then collect_codex_sessions "$probe" "$remote" >/dev/null 2>&1 || true collect_cursor_sessions "$probe" "$remote" >/dev/null 2>&1 || true collect_opencode_sessions "$probe" "$remote" >/dev/null 2>&1 || true if [ -n "$(find "$probe" -path '*/_codex_*/*.jsonl' -print -quit 2>/dev/null)" ] \ || [ -n "$(find "$probe" -path '*/_cursor_*/*.jsonl' -print -quit 2>/dev/null)" ] \ || [ -n "$(find "$probe" -path '*/_opencode_*/*.jsonl' -print -quit 2>/dev/null)" ]; then rc=0 fi fi _RMDC_LOG_FILE="$_saved_rmdc" rm -rf "$probe" return $rc } run_docker_analysis() { if [ "$MULTI_REPO_RUNNING" -eq 0 ]; then echo "Analyzing your coding sessions — this is the slow part (steps 4-17 below):" fi # When PAXEL_SERVER points to localhost, the container can't reach the host's # localhost directly. Rewrite to host.docker.internal (works on macOS/Windows # Docker Desktop and Linux with --add-host). local docker_server="$PAXEL_SERVER" # Mount logs directory for persistent output on the host local log_dir="${HOME}/.paxel/logs" # Mirror the label analyze_local.rake uses for the log filename — PAXEL_LOG_LABEL # (i.e. MOUNT_LABEL when set), else PROJECT_NAME, else "all" — with the SAME # sanitize, so the "Log: …" messages below point at the file actually written. # Without this the glob mismatches for --project runs (MOUNT_LABEL unset → file is # <project>-*.log, not all-*.log) and for labels with spaces/special chars. local _log_label _log_label=$(printf '%s' "${MOUNT_LABEL:-${PROJECT_NAME:-all}}" | tr -c 'A-Za-z0-9._-' '_') mkdir -p "$log_dir" # Pending-upload stash dir (resumable uploads). Stays a host bind mount so the # stash payload keeps its disclosed 0600-in-0700 posture in the user's home. local data_dir="${HOME}/.paxel/data" mkdir -p "$data_dir" # Honor the disclosed 0700 on ~/.paxel/data (and the logs dir) even if a # pre-change run created them 0755. The 0700 data dir is what shields the # pending-upload stash from other host users. chmod 700 "$log_dir" "$data_dir" 2>/dev/null || true # Persistent LLM result cache lives in a dedicated Docker NAMED VOLUME, not a # host bind mount. The container runs non-root (uid 1000); a host-owned 0700 # bind mount is unwritable when the host uid != 1000 (native Linux / CI / # cloud) — that silently disabled the cache and caused full re-spend every run # (no cross-run reuse → large histories burn the daily cost cap with zero # output). A fresh named volume inherits the image dir's uid-1000 ownership # (Dockerfile.client chowns /rails/cache), so the container can always write # it. Per-host-user by default (isolation); override with PAXEL_CACHE_VOLUME. local cache_volume cache_volume="${PAXEL_CACHE_VOLUME:-paxel-cache-$(id -u)}" # --clean flag: drop cached LLM results to force fresh analysis. The cache has # its own volume (nothing else lives there), so removing the whole volume is # safe; guarded so a "volume in use" / missing-volume case can't trip the # global ERR trap. Next run re-creates it fresh (correctly uid-1000-owned). # Also clears any orphaned legacy cache file from the old ~/.paxel/data path. if [ "${CLEAN:-0}" = "1" ]; then docker volume rm "$cache_volume" >/dev/null 2>&1 || true rm -f "${data_dir}/llm_cache.sqlite3" "${data_dir}/llm_cache.sqlite3-wal" "${data_dir}/llm_cache.sqlite3-shm" 2>/dev/null || true echo " Cleaned cached analysis data" fi local docker_args=(--rm -v "${log_dir}:/logs" -v "${data_dir}:/rails/data" -v "${cache_volume}:/rails/cache") # Mount Claude Code transcripts if [ -d "$CLAUDE_DIR" ]; then docker_args+=(-v "${CLAUDE_DIR}:/transcripts:ro") local cc_count cc_count=$(count_sessions "$CLAUDE_DIR") if [ "${CLAUDE_MOUNT_SCOPE:-all}" = "filtered" ]; then echo " Claude Code: ${cc_count} sessions (matched to ${MOUNT_LABEL})" else echo " Claude Code: ${cc_count} sessions" fi else # Create empty mount point so container doesn't fail docker_args+=(-v "${HOME}/.paxel/empty:/transcripts:ro") mkdir -p "${HOME}/.paxel/empty" fi # Extract Codex sessions on host, mount extracted dir at /codex_sessions:ro. # Mirrors the Cursor pattern below: host walks $CODEX_DIR, buckets per-session # remote into _codex_<slug>_<hash>/ (or _codex_unattributed/ for sessions # without a repository_url), writes a _metadata.json sidecar, and mounts the # result. The container's analyze_local.rake merges /codex_sessions into # transcript_dir so TranscriptDiscoverer creates a Codex Project per remote. # # Historical note: PR #604 removed a prior $CODEX_DIR bind-mount because # analyze_local.rake had no merge logic — Codex sessions were silently dropped # in Docker mode from 2026-02 to 2026-04. Restored here with the missing # container-side consumer (see analyze_local.rake:~134). if [ -d "$CODEX_DIR" ]; then # Scope guard: in filtered (single-project) Docker mode, refuse to # extract Codex if we couldn't resolve the project's git_remote. # collect_codex_sessions treats an empty selected_remote as --all, # so without this guard an auto-detected single-project upload for a # repo with no origin would pull in every Codex session across every # repo on the machine, widening scope far beyond what the user asked # for. In --all mode (CLAUDE_MOUNT_SCOPE=all or unset), empty # selected_remote is correct — we want every session. if [ "${CLAUDE_MOUNT_SCOPE:-all}" = "filtered" ] && [ -z "${selected_remote:-}" ]; then echo " Codex CLI: skipped (single-project scope but no resolved remote)" else local codex_extract_dir="${HOME}/.paxel/cache/codex_extracted-$$" rm -rf "$codex_extract_dir" mkdir -p "$codex_extract_dir" local codex_log="${HOME}/.paxel/logs/codex-extract.log" mkdir -p "$(dirname "$codex_log")" if ! collect_codex_sessions "$codex_extract_dir" "${selected_remote:-}" 2>"$codex_log"; then echo " Warning: Codex session extraction had errors. Continuing with other sessions." [ -s "$codex_log" ] && echo " Details: $codex_log" fi local codex_jsonl_count codex_jsonl_count=$(find "$codex_extract_dir" -maxdepth 2 -path "*/_codex_*/*.jsonl" 2>/dev/null | wc -l | tr -d ' ') if [ "$codex_jsonl_count" -gt 0 ]; then docker_args+=(-v "${codex_extract_dir}:/codex_sessions:ro") # Phase 3.5 — split by originator so the prelude matches the picker # bucketing the user just saw. Standalone (user-launched Codex) = # "Codex CLI"; Claude-launched = "Codex launched by Claude". local codex_standalone_count=0 local codex_cross_tool_count=0 local _cef while IFS= read -r _cef; do [ -z "$_cef" ] && continue local _ceo _ceo=$(get_codex_session_originator "$_cef") if codex_originator_is_standalone "$_ceo"; then codex_standalone_count=$((codex_standalone_count + 1)) else codex_cross_tool_count=$((codex_cross_tool_count + 1)) fi done < <(find "$codex_extract_dir" -maxdepth 2 -path "*/_codex_*/*.jsonl" 2>/dev/null) local match_suffix="" [ "${CLAUDE_MOUNT_SCOPE:-all}" = "filtered" ] && match_suffix=" (matched to ${MOUNT_LABEL})" if [ "$codex_standalone_count" -gt 0 ]; then echo " Codex CLI: ${codex_standalone_count} sessions${match_suffix}" fi if [ "$codex_cross_tool_count" -gt 0 ]; then echo " Codex launched by Claude: ${codex_cross_tool_count} sessions${match_suffix}" fi fi fi fi # Extract Cursor IDE sessions on host (SQLite → JSONL), mount extracted dir if { [ -d "$CURSOR_DIR" ] || [ -f "$CURSOR_GLOBAL_DB" ]; } && command -v sqlite3 &>/dev/null && command -v jq &>/dev/null; then # collect_cursor_sessions treats an empty selected_remote as --all, so without # this guard an auto-detected single-project upload for a repo with no resolved # remote would pull in every Cursor session on the machine. In --all mode # (CLAUDE_MOUNT_SCOPE=all/unset) empty is correct. Mirrors the Codex/opencode/ # Gemini blocks (Cursor was the only collector missing this guard). if [ "${CLAUDE_MOUNT_SCOPE:-all}" = "filtered" ] && [ -z "${selected_remote:-}" ]; then echo " Cursor IDE: skipped (single-project scope but no resolved remote)" else local cursor_extract_dir="${HOME}/.paxel/cache/cursor_extracted-$$" rm -rf "$cursor_extract_dir" mkdir -p "$cursor_extract_dir" echo " Extracting Cursor IDE sessions..." local cursor_log="${HOME}/.paxel/logs/cursor-extract.log" mkdir -p "$(dirname "$cursor_log")" if ! collect_cursor_sessions "$cursor_extract_dir" "${selected_remote:-}" 2>"$cursor_log"; then echo " Warning: Cursor session extraction had errors. Continuing with other sessions." [ -s "$cursor_log" ] && echo " Details: $cursor_log" fi local cursor_jsonl_count cursor_jsonl_count=$(find "$cursor_extract_dir" -maxdepth 2 -path "*/_cursor_*/*.jsonl" 2>/dev/null | wc -l | tr -d ' ') if [ "$cursor_jsonl_count" -gt 0 ]; then docker_args+=(-v "${cursor_extract_dir}:/cursor_sessions:ro") if [ "${CLAUDE_MOUNT_SCOPE:-all}" = "filtered" ]; then echo " Cursor IDE: ${cursor_jsonl_count} sessions (matched to ${MOUNT_LABEL})" else echo " Cursor IDE: ${cursor_jsonl_count} sessions" fi fi # $filtered_dir is dynamic-scoped from prepare_and_run_for_repo / run_docker_mode # (the two functions that wrote the archive sidecar with orphan_recovery_count). # Cursor extraction above may have triggered resolver calls; update the counter # before the container reads the sidecar. _refresh_orphan_recovery_count "${filtered_dir:-}/_metadata.json" fi fi # Extract opencode sessions on host (SQLite → opencode-native JSONL), mount # extracted dir at /opencode_sessions. Mirrors the Cursor block above; the # container's analyze_local merge folds _opencode_* buckets into transcript_dir. if { [ -d "$OPENCODE_DIR" ] || { [ -n "${OPENCODE_DB:-}" ] && [ -f "${OPENCODE_DB:-}" ]; }; } && command -v sqlite3 &>/dev/null && command -v jq &>/dev/null; then # collect_opencode_sessions treats an empty selected_remote as --all, so # without this guard an auto-detected single-project upload for a repo with # no resolved remote would pull in every opencode session on the machine. # In --all mode (CLAUDE_MOUNT_SCOPE=all/unset) empty is correct. Mirrors the # Codex block above. if [ "${CLAUDE_MOUNT_SCOPE:-all}" = "filtered" ] && [ -z "${selected_remote:-}" ]; then echo " opencode: skipped (single-project scope but no resolved remote)" else local opencode_extract_dir="${HOME}/.paxel/cache/opencode_extracted-$$" rm -rf "$opencode_extract_dir" mkdir -p "$opencode_extract_dir" echo " Extracting opencode sessions..." local opencode_log="${HOME}/.paxel/logs/opencode-extract.log" mkdir -p "$(dirname "$opencode_log")" if ! collect_opencode_sessions "$opencode_extract_dir" "${selected_remote:-}" 2>"$opencode_log"; then echo " Warning: opencode session extraction had errors. Continuing with other sessions." [ -s "$opencode_log" ] && echo " Details: $opencode_log" fi local opencode_jsonl_count opencode_jsonl_count=$(find "$opencode_extract_dir" -maxdepth 2 -path "*/_opencode_*/*.jsonl" 2>/dev/null | wc -l | tr -d ' ') if [ "$opencode_jsonl_count" -gt 0 ]; then docker_args+=(-v "${opencode_extract_dir}:/opencode_sessions:ro") if [ "${CLAUDE_MOUNT_SCOPE:-all}" = "filtered" ]; then echo " opencode: ${opencode_jsonl_count} sessions (matched to ${MOUNT_LABEL})" else echo " opencode: ${opencode_jsonl_count} sessions" fi fi _refresh_orphan_recovery_count "${filtered_dir:-}/_metadata.json" fi fi # Extract Gemini CLI sessions on host (raw JSONL copy + subagent relayout), mount # at /gemini_sessions. Unlike the SQLite tools above, gemini sessions are plain # JSONL — extraction needs neither sqlite3 nor jq (jq only enriches the sidecar), # so the gate is just the dir existing. if [ -d "$GEMINI_DIR" ]; then # collect_gemini_sessions treats an empty selected_remote as --all; guard the # single-project-no-remote case so we don't pull every gemini session (mirrors # the Codex/opencode blocks). if [ "${CLAUDE_MOUNT_SCOPE:-all}" = "filtered" ] && [ -z "${selected_remote:-}" ]; then echo " Gemini CLI: skipped (single-project scope but no resolved remote)" else local gemini_extract_dir="${HOME}/.paxel/cache/gemini_extracted-$$" rm -rf "$gemini_extract_dir" mkdir -p "$gemini_extract_dir" echo " Extracting Gemini CLI sessions..." local gemini_log="${HOME}/.paxel/logs/gemini-extract.log" mkdir -p "$(dirname "$gemini_log")" if ! collect_gemini_sessions "$gemini_extract_dir" "${selected_remote:-}" 2>"$gemini_log"; then echo " Warning: Gemini session extraction had errors. Continuing with other sessions." [ -s "$gemini_log" ] && echo " Details: $gemini_log" fi local gemini_jsonl_count gemini_jsonl_count=$(find "$gemini_extract_dir" -maxdepth 2 -path "*/_gemini_*/*.jsonl" 2>/dev/null | wc -l | tr -d ' ') if [ "$gemini_jsonl_count" -gt 0 ]; then docker_args+=(-v "${gemini_extract_dir}:/gemini_sessions:ro") if [ "${CLAUDE_MOUNT_SCOPE:-all}" = "filtered" ]; then echo " Gemini CLI: ${gemini_jsonl_count} sessions (matched to ${MOUNT_LABEL})" else echo " Gemini CLI: ${gemini_jsonl_count} sessions" fi fi _refresh_orphan_recovery_count "${filtered_dir:-}/_metadata.json" fi fi # Docker --all host-side recovery detection. Populates _RMDC_LOG_FILE # so the env-var passthrough below forwards non-zero project_cache / # unresolvable / ancestor / worktree_list / jj_workspace_list counts # to the container. # # ORDERING IS LOAD-BEARING: this scan MUST run before the # _rmdc_recovery_count_unique / _recovery_source_breakdown reads # below, or the env vars ship empty. Pinned by CJ10f bats test. if [ "${ALL_PROJECTS:-0}" -eq 1 ]; then _docker_all_host_scan_for_recovery "$CLAUDE_DIR" fi # Extract per-repo git history for --all so the container can sum it into one # combined git_metrics (collect_git_data_aggregate). --no-repo opts out, the # same way it suppresses the single-repo mount below. if [ "${ALL_PROJECTS:-0}" -eq 1 ] && [ "${NO_REPO:-0}" != "1" ]; then _docker_all_extract_git_data "$CLAUDE_DIR" fi # Pass orphan_recovery_count + recovery_breakdown as Docker env vars. # ClientPipeline's readers fall back to these env vars when the archive # sidecar lacks the fields. Host-side activity only — in-container # recoveries flow through the container's own path. # # The scan above also self-warms ~/.paxel/cache/project-remotes-v2.tsv # (via _project_cache_persist_rows) so future runs can recover a # deleted Conductor workspace that was live during this run. # Symmetric with legacy --all's behavior in collect_all_projects:3376. if [ -n "${_RMDC_LOG_FILE:-}" ]; then local _rmdc_for_env _rmdc_for_env=$(_rmdc_recovery_count_unique) docker_args+=(-e "PAXEL_ORPHAN_RECOVERY_COUNT=${_rmdc_for_env:-0}") local _rbrk_for_env _rbrk_for_env=$(_recovery_source_breakdown) docker_args+=(-e "PAXEL_RECOVERY_BREAKDOWN=${_rbrk_for_env}") fi # Bind-mount the host-written sidecar read-only into the container so # TranscriptDiscoverer.read_sidecar can resolve Claude workspace # git_remotes for Conductor dead-cwds (host-scan cache hits). Docker # --all bind-mounts $CLAUDE_DIR read-only at /transcripts, so there's # no archive sidecar to carry these; this secondary mount closes the # attribution gap end-to-end. Gated on ALL_PROJECTS=1 and existence of # the host-written file (jq-less hosts skip the write, and we skip the # mount here too — the container falls back to encoded_name, same as # the pre-sidecar baseline). if [ "${ALL_PROJECTS:-0}" -eq 1 ]; then local _dall_sidecar _dall_sidecar="$(_docker_all_sidecar_dir)" # OR _git/ so jq-less hosts (which skip the _metadata.json write) still ship aggregate git. if [ -f "${_dall_sidecar}/_metadata.json" ] || [ -d "${_dall_sidecar}/_git" ]; then docker_args+=(-v "${_dall_sidecar}:/paxel_sidecar:ro") fi fi # Repo mount for on-device code quality analysis (on by default, --no-repo to skip) if [ "${NO_REPO:-0}" != "1" ]; then local repo_root # `|| true` is LOAD-BEARING: when REPO_ROOT is unset (e.g. "Analyze ALL # projects" chosen from a non-repo dir like $HOME), git rev-parse exits 128. # `2>/dev/null` hides stderr but NOT the exit code, so under `set -Eeuo # pipefail` + `set -E` the failure fires the ERR trap (once inside the $() # subshell, once for the outer assignment) and aborts the whole upload. The # `|| true` keeps the substitution empty-on-failure so the `[ -n ]` guard # below simply skips the repo mount. See _paxel_on_error trap above. repo_root="${REPO_ROOT:-$(git rev-parse --show-toplevel 2>/dev/null || true)}" if [ -n "$repo_root" ]; then local mount_repo=1 if [ "$mount_repo" = "1" ]; then echo " Mounting repo from ${repo_root} (read-only)" docker_args+=(-v "${repo_root}:/repo:ro") # Extract git metrics on host (bounded to 5000 commits) local git_metrics_file="${HOME}/.paxel/git_metrics.txt" mkdir -p "${HOME}/.paxel" git -C "$repo_root" log -n 5000 --no-merges --numstat --format='%H|%aI|%s' > "$git_metrics_file" 2>/dev/null || true # No file-level chmod: git_metrics.txt is bind-mounted read-only into the # container (uid 1000) and is already shielded from other host users by # the 0700 ~/.paxel parent dir. if [ -s "$git_metrics_file" ]; then docker_args+=(-v "${git_metrics_file}:/git_metrics.txt:ro") echo " Extracted git metrics ($(wc -l < "$git_metrics_file" | tr -d ' ') lines)" fi fi fi fi local is_localhost=0 if echo "$PAXEL_SERVER" | grep -qE 'https?://(localhost|127\.0\.0\.1)'; then docker_server=$(echo "$PAXEL_SERVER" | sed -E 's/(localhost|127\.0\.0\.1)/host.docker.internal/') # Linux needs explicit host mapping; macOS/Windows Docker Desktop has it built-in docker_args+=(--add-host=host.docker.internal:host-gateway) is_localhost=1 fi # LLM proxy URL: rewrite localhost to host.docker.internal for Docker networking local llm_proxy_url="${PAXEL_LLM_PROXY}" if [ "$is_localhost" -eq 1 ]; then llm_proxy_url=$(echo "$PAXEL_LLM_PROXY" | sed -E 's/(localhost|127\.0\.0\.1)/host.docker.internal/') fi # Allocate a pseudo-TTY when our OUTPUT is a terminal so the container can # render a live progress UI (sticky footer + animated bars). Gate on stdout # ([ -t 1 ]), NOT stdin ([ -t 0 ]): under `curl … | bash` stdin is the script # pipe and never a TTY, but stdout is the user's terminal. We pass -t only # (never -i) — the container never reads stdin; all prompts are host-side via # /dev/tty — so interactive selection is unaffected. Non-TTY output (CI, # `> file`, `| pipe`) skips -t and the pipeline falls back to plain logging. if [ -t 1 ]; then docker_args+=(-t) fi docker_args+=( -e "YC_TOKEN=${YC_TOKEN}" -e "YC_API_KEY=${YC_TOKEN}" -e "YC_LLM_PROXY_URL=${llm_proxy_url}" -e "YC_RESULTS_ENDPOINT=${docker_server}/api/v1/results" -e "PAXEL_LOG_DIR=/logs" ) # Unified [Step N/17] counter on BOTH paths: bash owns steps 1-3 # (prereq/sign-in/pull, printed once up front), the container owns steps 4-17. # Multi-repo runs the container once per repo, so each repo re-runs steps 4-17 # under its own "═══ [n/total] Analyzing: NAME ═══" banner — same /17 scale as # the pull line. Previously multi-repo left the offset unset, so each repo reset # to a disjoint [Step 1/14]..[14/14] that clashed with the /17 pull line and read # like a restart/stall (risking a needless Ctrl-C mid-batch). docker_args+=(-e "PAXEL_STEP_OFFSET=3") # Localhost = dev mode, enable verbose logging if [ "$is_localhost" -eq 1 ]; then docker_args+=(-e "PAXEL_VERBOSE=1") fi # Propagate PAXEL_CLIENT_MODE into the container. AnthropicClient uses it to # pick env-aware remediation text (bin/upload for dev, `curl | bash` for # public users) when a Fatal LLM error hits the rake footer. # bin/upload sets this to "dev"; public curl|bash runs leave it unset. if [ -n "${PAXEL_CLIENT_MODE:-}" ]; then docker_args+=(-e "PAXEL_CLIENT_MODE=${PAXEL_CLIENT_MODE}") fi # Escape hatch for the preflight-signatures handshake (reviewed by # AnthropicClient.preflight_signatures!). Host-side env needs explicit # `docker -e` forwarding; without this line a host-side # `PAXEL_SKIP_PREFLIGHT=1 bin/upload` is a no-op because the container # never sees the value. if [ "${PAXEL_SKIP_PREFLIGHT:-0}" = "1" ]; then docker_args+=(-e "PAXEL_SKIP_PREFLIGHT=1") fi # Credential-scrub bypass (SecretScrubber + ToolInputSummarizer + related # EventExtractor hooks). Default on inside the container. Set # PAXEL_TOOL_OUTPUT_SCRUB=0 on the host before bin/upload to disable for # admin debugging — chunks.content then carries raw tool_use / text content # untouched. Not intended as a user-facing toggle; only forwarded when # explicitly set, otherwise container uses its own default (on). if [ -n "${PAXEL_TOOL_OUTPUT_SCRUB:-}" ]; then docker_args+=(-e "PAXEL_TOOL_OUTPUT_SCRUB=${PAXEL_TOOL_OUTPUT_SCRUB}") fi # Pass through optional filters if [ -n "$SINCE_EPOCH" ]; then docker_args+=(-e "SINCE_EPOCH=${SINCE_EPOCH}") fi if [ -n "$PROJECT_NAME" ]; then docker_args+=(-e "PROJECT_NAME=${PROJECT_NAME}") fi # Signal to the container that the host already filtered transcripts by # git remote (more accurate than the container's encoded-name substring # match) so analyze_local.rake skips its redundant in-container filter. # Without this, the sidecar has been getting dropped on re-filter. if [ "${CLAUDE_MOUNT_SCOPE:-all}" = "filtered" ]; then docker_args+=(-e "CLAUDE_MOUNT_SCOPE=filtered") fi # Per-repo log label so each multi-repo container writes an identifiable # ${MOUNT_LABEL}-<ts>.log instead of a colliding all-<ts>.log (analyze_local.rake # prefers PAXEL_LOG_LABEL over PROJECT_NAME for the filename). if [ -n "${MOUNT_LABEL:-}" ]; then docker_args+=(-e "PAXEL_LOG_LABEL=${MOUNT_LABEL}") fi # Pass host-side time estimate for telemetry calibration if [ -n "${PAXEL_HOST_ESTIMATE_MINUTES:-}" ]; then docker_args+=(-e "PAXEL_HOST_ESTIMATE_MINUTES=${PAXEL_HOST_ESTIMATE_MINUTES}") fi # Dev-tuning overrides: bump client-side concurrency for faster local runs. # Published defaults (20 / 20 / 20 / 20) are production-safe. The YC LLM # proxy + Anthropic tier still cap upstream rate, and AnthropicClient # retries 429s with backoff — so bumping these is safe, just effective. # DB_POOL must scale with concurrency because each worker checks out an # AR connection for session.update! / LlmCall writes. for var in PAXEL_NARRATIVE_CONCURRENCY PAXEL_EPISODE_CONCURRENCY \ PAXEL_NARRATIVE_AGGREGATOR_CONCURRENCY PAXEL_CROSS_SESSION_CONCURRENCY \ DB_POOL; do val="${!var:-}" if [ -n "$val" ]; then docker_args+=(-e "${var}=${val}") fi done # --no-sentry overrides the baked-in DSN with an empty string. The client # initializer returns early when CLIENT_SENTRY_DSN is empty, so no events # leave your machine. Without this flag, the image's baked DSN (set via # --build-arg during prod publish) is used. if [ "${NO_SENTRY:-0}" = "1" ]; then docker_args+=(-e "CLIENT_SENTRY_DSN=") elif [ -n "${CLIENT_SENTRY_DSN:-}" ]; then # Host env wins over baked default — useful for dev testing with a scratch DSN. docker_args+=(-e "CLIENT_SENTRY_DSN=${CLIENT_SENTRY_DSN}") fi # Run the client container (disable set -e so we can capture exit code and show a friendly message) local exit_code=0 docker run "${docker_args[@]}" "$PAXEL_CLIENT_IMAGE" || exit_code=$? # Exit code 3 == ClientPipeline::EXIT_NO_ANALYZABLE_SESSIONS (kept in sync with # the Ruby constant by comment only). Benign: the repo had sessions but they # were all too short to analyze — not a failure, so never print the scary # "email us" banner. The container already printed the friendly explanation; # in --all mode add a "skipping" line and move on to the next repo (mirrors the # zero-session pre-skip at the top of the multi-repo flow), in single-repo mode # just exit cleanly. if [ $exit_code -eq 3 ]; then if [ "$MULTI_REPO_RUNNING" -eq 1 ]; then echo "" echo "No analyzable sessions for ${MOUNT_LABEL:-this project} (sessions too short to analyze) — skipping." return 0 fi exit 0 fi # exit 4 (EXIT_UPLOAD_DEFERRED): analysis finished, the upload failed, but it was # stashed for replay. Honest message instead of "Upload complete!" — the results # aren't lost and the next run auto-uploads them; nothing lands at /reports until then. if [ $exit_code -eq 4 ]; then echo "" echo "⚠ Analysis finished, but the upload to the server didn't go through — your results are saved locally." echo " They'll upload automatically the next time you run, or re-run now to retry." echo " Your report won't appear at ${PAXEL_SERVER}/reports until it uploads." if [ "$MULTI_REPO_RUNNING" -eq 1 ]; then return 0; fi exit 0 fi # exit 5 (EXIT_UPLOAD_FAILED): analysis finished but the upload failed with no replay # artifact (a permanent rejection or an un-stashable result). There's nothing to # auto-retry, so tell the user to re-run rather than promising an automatic upload. if [ $exit_code -eq 5 ]; then echo "" echo "⚠ Analysis finished, but the upload to the server didn't go through and couldn't be queued for an automatic retry." echo " Re-run to try the upload again. Your report won't appear at ${PAXEL_SERVER}/reports until it uploads." if [ "$MULTI_REPO_RUNNING" -eq 1 ]; then return 0; fi exit 0 fi if [ $exit_code -ne 0 ]; then echo "" # Name the repo + point to its (now per-repo-named) log so a mid-batch failure is # diagnosable — the "═══ Analyzing: NAME ═══" header scrolls away, and the # generic message used to give the user nothing to act on. echo "Analysis failed for ${MOUNT_LABEL:-this project} (exit code: $exit_code)." >&2 echo " Log: ${log_dir}/${_log_label}-*.log" >&2 echo "Try again, or email paxel@ycombinator.com with that log if the problem persists." >&2 if [ "$MULTI_REPO_RUNNING" -eq 1 ]; then return $exit_code fi exit $exit_code fi if [ "$MULTI_REPO_RUNNING" -eq 0 ]; then echo "Upload complete! Check your results at: ${PAXEL_SERVER}/reports" echo " Logs saved to: ${log_dir}/" else echo " Done — log: ${log_dir}/${_log_label}-*.log" fi # Terminal bell + macOS notification on completion (skip in multi-repo mode, one at the end) if [ "$MULTI_REPO_RUNNING" -eq 0 ]; then printf '\a' if [ "$(uname -s)" = "Darwin" ]; then local notify_msg="Analysis uploaded." if [ -n "${ESTIMATED_MINUTES:-}" ]; then notify_msg="Analysis uploaded. Results in ~${ESTIMATED_MINUTES} minutes." fi osascript -e "display notification \"${notify_msg}\" with title \"Paxel\"" 2>/dev/null || true fi fi } run_docker_mode() { echo "" echo "YC Paxel — coding agent analysis" echo "Scanning for coding agent transcripts (Claude Code, Codex, Cursor)..." echo "" check_docker load_or_request_token register_git_identity # Pending-upload replay (from prior failed runs). # Replay-and-exit: if a stash exists, replay it and exit WITHOUT running # the fresh pipeline. The user re-runs `bin/upload` for a fresh analysis. # --no-replay (PAXEL_SKIP_REPLAY) bypasses. # PAXEL_IN_REPLAY guards against the rake task re-entering replay. if [ -z "${PAXEL_SKIP_REPLAY:-}" ] && [ -z "${PAXEL_IN_REPLAY:-}" ]; then local _pending_dir="$HOME/.paxel/data/pending-uploads" local _pending_count=0 if [ -d "$_pending_dir" ]; then _pending_count=$(find "$_pending_dir" -maxdepth 1 -type f -name '*.meta.json' 2>/dev/null | wc -l | tr -d ' ') fi if [ "$_pending_count" -gt 0 ] || [ -n "${PAXEL_RESUME_PENDING_ONLY:-}" ]; then if [ "$_pending_count" -eq 0 ]; then # --resume-pending with nothing pending — explicit exit (no fresh pipeline). echo "[paxel] No pending uploads — nothing to resume." exit 0 fi echo "" echo "[paxel] Found $_pending_count pending upload(s) from a prior failed run." # Pre-replay banner is intentionally neutral — the outcome-specific case # statement below emits the correct remediation command. Embedding # rerun_phrase here would show the user a pre-baked-token curl BEFORE # we know if the replay needs re-authentication (in which case that # token is the one to replace). Caught in review by both Codex and Opus. echo "[paxel] Replaying, then exiting." echo "" # Populate $PAXEL_CLIENT_IMAGE for this branch. Reuses the existing helper so # dev mode uses the locally-built tag and prod uses GHCR. PAXEL_QUIET_PULL=1 # suppresses the step-indexed banner + cost-coverage blurb, which are # misleading during a replay-and-exit (no 17-step pipeline is about to run). PAXEL_QUIET_PULL=1 pull_client_image # Mirror the fresh-pipeline's localhost→host.docker.internal rewrite. # Uses sed -E for macOS/BSD compatibility (no \b word boundary). local _replay_endpoint="${YC_RESULTS_ENDPOINT:-${PAXEL_SERVER}/api/v1/results}" _replay_endpoint=$(printf '%s' "$_replay_endpoint" | sed -E 's/(localhost|127\.0\.0\.1)/host.docker.internal/') mkdir -p "$HOME/.paxel/logs" local _replay_log local _replay_exit local _reauth_attempted=0 # Loop runs at most twice: once with the cached/current token; if that # exits 2 (reauth_required), clear the cached token, re-prompt via # load_or_request_token, and retry ONCE. A second-attempt 2 falls # through to the normal case statement below (user intervention # required). PAXEL_SKIP_REAUTH_RETRY=1 disables the loop for CI / # scripted runs that prefer an explicit exit-2 signal over an # interactive prompt. while :; do # Token: prefer shell env, fall back to the token file (matches load_or_request_token). # Re-read each iteration because load_or_request_token on retry replaces it. # `|| true` is required: `2>/dev/null` suppresses cat's stderr but NOT # its non-zero exit. Under `set -E`, the ERR trap fires on cat failure # inside the command substitution even though `local var=$(…)` masks # set -e exit propagation. Without it, users whose token file was # deleted (or never created — YC_TOKEN env-only path) would see the # "email us" banner before the replay runs. Empty `_replay_token` is # a valid state: the replay container 401s, falls through to exit=2 # (reauth), and the user gets the "re-authentication" message. local _replay_token="${YC_TOKEN:-$(cat "${PAXEL_TOKEN_FILE:-$HOME/.paxel/token}" 2>/dev/null || true)}" # Timestamped per-attempt so first-attempt output isn't clobbered on retry. # `.$$` (PID) suffix guards against a same-second collision if two # bin/upload invocations race each other — timestamp is second-resolution # alone, so concurrent runs would otherwise tee into the same file. if [ "$_reauth_attempted" = "1" ]; then _replay_log="$HOME/.paxel/logs/replay-$(date +%Y%m%d-%H%M%S)-retry.$$.log" else _replay_log="$HOME/.paxel/logs/replay-$(date +%Y%m%d-%H%M%S).$$.log" fi local _replay_args=( --rm -v "$HOME/.paxel/data:/rails/data:rw" -e YC_TOKEN="$_replay_token" -e YC_RESULTS_ENDPOINT="$_replay_endpoint" -e PAXEL_PENDING_UPLOAD_DIR=/rails/data/pending-uploads -e PAXEL_IN_REPLAY=1 ) # Propagate PAXEL_CLIENT_MODE into the replay container. Symmetric to # the fresh-pipeline forward at :4597 — AnthropicClient's env-aware # user_action helpers (rebuild / auth / input_too_large / # model_not_allowed / system_prompt_missing) need this to pick # dev-appropriate remediation text when a Fatal LLM error hits the # replay rake's log. (Added in PR #726.) if [ -n "${PAXEL_CLIENT_MODE:-}" ]; then _replay_args+=(-e "PAXEL_CLIENT_MODE=${PAXEL_CLIENT_MODE}") fi # Honor --no-sentry in the replay container (same policy as fresh-path at :4081). # The client initializer short-circuits when CLIENT_SENTRY_DSN is empty, so # forcing "" here disables telemetry even though the DSN is baked into the image. if [ "${NO_SENTRY:-0}" = "1" ]; then _replay_args+=(-e "CLIENT_SENTRY_DSN=") elif [ -n "${CLIENT_SENTRY_DSN:-}" ]; then _replay_args+=(-e "CLIENT_SENTRY_DSN=${CLIENT_SENTRY_DSN}") fi if [ "$(uname -s)" = "Linux" ]; then _replay_args+=(--add-host=host.docker.internal:host-gateway) fi _replay_args+=( --entrypoint /bin/bash "$PAXEL_CLIENT_IMAGE" -c 'bin/rails pending_uploads:replay' ) # Run and preserve the docker exit code through the tee pipe. # `|| true` is LOAD-BEARING: without it, under `set -Eeuo pipefail`, a # non-zero docker exit (e.g. deferred=1 on HTTP 504, or a crashed daemon) # aborts the LHS group BEFORE `echo "__EXIT__:$?"` runs, skipping the # sentinel and firing the ERR trap's "email us" banner twice (once per # pipeline command). With `|| true`, bash's set-e exception suppresses # errexit for the whole pipeline AND inside the LHS group, so the # sentinel is emitted and the case statement below handles the exit code. { docker run "${_replay_args[@]}"; echo "__EXIT__:$?"; } 2>&1 | tee "$_replay_log" || true # `2>/dev/null || true` guards the fallback below: if tee never wrote # the log (disk full, unwritable dir), awk returns non-zero and would # otherwise trip the ERR trap before the `-z` fallback can assign 99. _replay_exit=$(awk -F: '/^__EXIT__:/ { print $2; exit }' "$_replay_log" 2>/dev/null || true) # Defensive fallback — if the sentinel is missing (docker crashed before # printing, tee write failed, etc.), default to a generic failure code # rather than passing "" to `exit` (which bash rejects with "numeric # argument required" and leaves the user with a confusing error). [ -z "$_replay_exit" ] && _replay_exit=99 # Strip the sentinel from the log for cleanliness. sed -i.bak '/^__EXIT__:/d' "$_replay_log" 2>/dev/null && rm -f "${_replay_log}.bak" # Auto-re-auth on reauth_required (exit 2), once. Without this the # user has to manually re-run bin/upload after re-authing; the replay # itself already knows what needs fixing. if [ "$_replay_exit" = "2" ] \ && [ "$_reauth_attempted" = "0" ] \ && [ "${PAXEL_SKIP_REAUTH_RETRY:-0}" != "1" ]; then echo "" echo "[paxel] Your session expired — re-authenticating..." # Clear the invalid cached token so load_or_request_token falls # through to the baked-token / browser-auth paths. Unset the env # var too, since it takes precedence in that function. rm -f "${PAXEL_TOKEN_FILE:-$HOME/.paxel/token}" 2>/dev/null || true unset YC_TOKEN _reauth_attempted=1 load_or_request_token echo "" echo "[paxel] Retrying replay with refreshed credentials..." continue fi break done echo "" case "$_replay_exit" in 0) echo "[paxel] Replay complete. $(rerun_phrase fresh)" # All stashes cleared, so the next run skips this gate and reaches the # picker — re-analyzing any repo a prior multi-repo run didn't finish. multi_repo_replay_hint ;; 1) echo "[paxel] Replay partial — some stashes deferred. $(rerun_phrase next_upload)" ;; 2) if [ "$_reauth_attempted" = "1" ]; then echo "[paxel] Replay still needs re-authentication after one retry — your credentials may be revoked." echo "[paxel] Sign in again at: ${PAXEL_SERVER}/auth/login" else echo "[paxel] Replay needs re-authentication. $(rerun_phrase reauth)" fi ;; *) echo "[paxel] Replay failed with exit code $_replay_exit. $(rerun_phrase bypass_replay)" ;; esac echo "[paxel] Log: $_replay_log" exit "$_replay_exit" fi fi local match_label="" # Auto-scope to current project unless --all was passed. # Filtering happens HERE on the host (not in Docker — the container can't # see host git repos, so it only has substring matching as a fallback). # # Even when --project is explicit, we still run this block when invoked # from inside a git repo. If the filter produces a match, the resulting # sidecar gives TranscriptDiscoverer the git_remote it needs to collapse # Conductor worktree scatter. If PROJECT_NAME is set but doesn't match # the matched repo's name (e.g. user ran --project yc-backend from inside # paxel), we discard the filter below and let the container substring-match. if [ "$ALL_PROJECTS" -eq 0 ]; then local filtered_dir="${HOME}/.paxel/cache/filtered-transcripts-$$" rm -rf "$filtered_dir" mkdir -p "$filtered_dir" local match_count=0 match_label="" # Strategy 1: match by git remote URL (most accurate for repos with multiple workspaces) local cwd_remote cwd_remote=$(get_git_remote "$(pwd)") if [ -n "$cwd_remote" ]; then local repo_name repo_name=$(echo "$cwd_remote" | sed 's|.*[:/]||' | sed 's/\.git$//') match_label="$repo_name" if [ -d "$CLAUDE_DIR" ]; then # First pass: collect all dir names, CWDs, and remotes local _bfc_names=() _bfc_cwds=() _bfc_remotes=() local _scan_total=0 local _scan_count=0 for d in "$CLAUDE_DIR"/*/; do [ -d "$d" ] && _scan_total=$((_scan_total + 1)); done echo " Finding your coding sessions..." >&2 for dir in "$CLAUDE_DIR"/*/; do [ -d "$dir" ] || continue _scan_count=$((_scan_count + 1)) if [ $((_scan_count % 500)) -eq 0 ]; then echo " ...${_scan_count}/${_scan_total} checked" >&2 fi local dir_name dir_name=$(basename "$dir") local dir_cwd dir_cwd=$(get_project_cwd "$dir_name") local dir_remote dir_remote=$(get_git_remote "$dir_cwd") _bfc_names+=("$dir_name") _bfc_cwds+=("$dir_cwd") _bfc_remotes+=("$dir_remote") done # Backfill remotes for deleted Conductor workspaces backfill_conductor_remotes # Recover remotes for non-Conductor orphan cwds local _orphan_j=0 while [ $_orphan_j -lt ${#_bfc_cwds[@]} ]; do if [ -z "${_bfc_remotes[$_orphan_j]}" ]; then local _orphan_recovered _orphan_recovered=$(resolve_remote_for_dead_cwd "${_bfc_cwds[$_orphan_j]}") [ -n "$_orphan_recovered" ] && _bfc_remotes[$_orphan_j]="$_orphan_recovered" fi _orphan_j=$((_orphan_j + 1)) done # Second pass: filter by matching remote local i=0 while [ $i -lt ${#_bfc_names[@]} ]; do if [ "${_bfc_remotes[$i]}" = "$cwd_remote" ]; then # CoW clone, mtime-preserving (-p) so --since (container File.mtime # filter) works. Bare under active errexit → a copy failure aborts loud. _paxel_cp_transcripts "$CLAUDE_DIR/${_bfc_names[$i]}" "$filtered_dir/${_bfc_names[$i]}" match_count=$((match_count + 1)) fi i=$((i + 1)) done fi fi # Strategy 2: match by current directory path (works without git) if [ "$match_count" -eq 0 ]; then local current_dir current_dir=$(pwd) # Encode current path the same way Claude does: it replaces BOTH "/" AND # "." with "-" (e.g. /a/b.c -> -a-b-c). Matching only "/" (the old # `sed 's|/|-|g'`) silently broke Strategy-2 path matching for any cwd # containing a dot — a repo/domain/dir like x70.one or qerdp.co.uk, even a # macOS /var/folders/.../T/tmp.X path — so a no-remote repo fell through to # "None of your sessions match this directory." See SESSION_DETECTION.md # §3a. ("." is a literal inside the [] bracket expression on BSD + GNU sed.) local encoded_cwd encoded_cwd=$(encode_claude_dir_name "$current_dir") match_label="$(basename "$current_dir")" if [ -d "$CLAUDE_DIR" ]; then for dir in "$CLAUDE_DIR"/*/; do [ -d "$dir" ] || continue local dir_name dir_name=$(basename "$dir") # Exact match (this workspace) or prefix match (subdirectory) if [ "$dir_name" = "$encoded_cwd" ]; then # CoW clone, mtime-preserving (-p) so --since (container File.mtime # filter) works. Bare under active errexit → a copy failure aborts loud. _paxel_cp_transcripts "$dir" "$filtered_dir/$dir_name" match_count=$((match_count + 1)) fi done fi fi # PROJECT_NAME override safety: if the user passed --project NAME and the # auto-filter matched a DIFFERENT repo (e.g. ran --project yc-backend from # inside paxel), discard the filter so the container's substring match can # pick the right thing. Matches repo_name loosely (case-insensitive, -/_ # treated the same). if [ "$match_count" -gt 0 ] && [ -n "$PROJECT_NAME" ]; then local _pn_norm _pn_norm=$(echo "$PROJECT_NAME" | tr '[:upper:]' '[:lower:]' | tr '_' '-') local _rn_norm _rn_norm=$(echo "${match_label}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') case "$_rn_norm" in *"$_pn_norm"*) : ;; # matched project-name substring — keep the filter *) echo " --project $PROJECT_NAME given, but current dir is '${match_label}'. Deferring to container substring match." >&2 rm -rf "$filtered_dir" mkdir -p "$filtered_dir" match_count=0 ;; esac fi if [ "$match_count" -gt 0 ]; then # Check: if match_count is low AND this directory has child repos with more data, # prefer the multi-repo picker over a weak single-project match. # This handles the case where ~/git itself is a repo but the user really wants # to analyze child repos inside ~/git. local child_repo_override=0 if [ "$match_count" -le 2 ]; then local self_sessions self_sessions=$(count_sessions "$filtered_dir") if detect_child_repos && [ ${#CHILD_REPO_NAMES[@]} -ge 2 ]; then # Sum child repo sessions local child_total=0 local cr=0 while [ $cr -lt ${#CHILD_REPO_SESSIONS[@]} ]; do child_total=$((child_total + ${CHILD_REPO_SESSIONS[$cr]})) cr=$((cr + 1)) done # If child repos have significantly more data, prefer multi-repo if [ "$child_total" -gt $((self_sessions * 3)) ]; then child_repo_override=1 fi fi fi if [ "$child_repo_override" -eq 1 ]; then # Discard the weak single-project match, use multi-repo path instead rm -rf "$filtered_dir" show_child_repo_menu run_selected_child_repos fi local orig_claude_dir="$CLAUDE_DIR" # Compute OLDEST_SESSION_EPOCH from source files (before cp changed mtime) # Uses original CLAUDE_DIR files, not filtered_dir copies if [ -z "$OLDEST_SESSION_EPOCH" ]; then local _oldest # Trailing `|| true` swallows the SIGPIPE (141) from `head -1` closing # upstream `stat` writes under `set -euo pipefail` — with 4000+ jsonls, # head reliably closes before stat drains and without this the whole # pipeline returns 141, killing the script mid-scan. # GNU stat -c %Y first: BSD stat -f on Linux exits 0 with filesystem # info, so BSD-first would silently poison the result on the CI runner. _oldest=$(find "$orig_claude_dir" -name "*.jsonl" -not -name "_*" -maxdepth 3 2>/dev/null \ | while read -r f; do stat -c '%Y' "$f" 2>/dev/null || stat -f '%m' "$f" 2>/dev/null; done \ | sort -n | head -1 || true) [ -n "$_oldest" ] && OLDEST_SESSION_EPOCH="$_oldest" fi CLAUDE_DIR="$filtered_dir" CLAUDE_MOUNT_SCOPE="filtered" MOUNT_LABEL="$match_label" # Scope Cursor extraction to the same remote so cross-repo Cursor sessions # don't leak into a single-project upload. Without this, collect_cursor_sessions # gets an empty filter and pulls in every workspace's Cursor history. selected_remote="$cwd_remote" echo "Auto-detected project: ${match_label} (${match_count} workspaces)" >&2 # Low-count warning: if we matched very few workspaces but there are many more available if [ "$match_count" -le 3 ]; then local total_dirs=0 for d in "$orig_claude_dir"/*/; do [ -d "$d" ] && total_dirs=$((total_dirs + 1)) done if [ "$total_dirs" -ge $((match_count * 3)) ]; then local total_sessions total_sessions=$(count_sessions "$orig_claude_dir" 2>/dev/null || echo "many") echo "" >&2 echo " Tip: You have ${total_sessions} total sessions across ${total_dirs} projects." >&2 echo " To include all: $(rerun_cmd --all)" >&2 echo " To pick a different project: $(rerun_cmd --project NAME)" >&2 fi fi # Collect git metadata for Docker mode (Docker can't see host git repos) # Write _metadata.json + _git/ dir with commits and numstat # Use backfill-resolved data from _bfc_* arrays when available (Strategy 1), # fall back to direct resolution for Strategy 2 matches. if [ -z "${_bfc_names+x}" ]; then _bfc_names=(); _bfc_cwds=(); _bfc_remotes=(); fi local git_data_dir="${filtered_dir}/_git" mkdir -p "$git_data_dir" local docker_metadata="{}" local _author_cwds_done="" for dir in "$filtered_dir"/*/; do [ -d "$dir" ] || continue local dir_name dir_name=$(basename "$dir") # Look up resolved CWD/remote from backfill arrays if available local dir_cwd="" local dir_remote="" if [ -n "${_bfc_names+x}" ] && [ ${#_bfc_names[@]} -gt 0 ]; then local _lookup=0 while [ $_lookup -lt ${#_bfc_names[@]} ]; do if [ "${_bfc_names[$_lookup]}" = "$dir_name" ]; then dir_cwd="${_bfc_cwds[$_lookup]}" dir_remote="${_bfc_remotes[$_lookup]}" break fi _lookup=$((_lookup + 1)) done fi # Fall back to direct resolution (Strategy 2 matches) if [ -z "$dir_cwd" ]; then dir_cwd=$(get_project_cwd "$dir_name") dir_remote=$(get_git_remote "$dir_cwd") if [ -z "$dir_remote" ] && [ -n "$dir_cwd" ] && [ ! -e "$dir_cwd" ]; then dir_remote=$(resolve_remote_for_dead_cwd "$dir_cwd") fi fi # Build metadata sidecar if [ -n "$dir_remote" ] || [ -n "$dir_cwd" ]; then docker_metadata=$(echo "$docker_metadata" | jq \ --arg dir "$dir_name" \ --arg remote "$dir_remote" \ --arg cwd "$dir_cwd" \ '. + {($dir): {"git_remote": $remote, "cwd": $cwd}}' 2>/dev/null || echo "$docker_metadata") fi # Collect git commits + numstat if this is a git repo if [ -n "$dir_cwd" ] && [ -e "$dir_cwd/.git" ]; then local encoded encoded=$(echo "$dir_name" | sed 's/[^a-zA-Z0-9_-]/_/g') local since_flag="" [ -n "$SINCE_EPOCH" ] && since_flag="--since=$(date -r "$SINCE_EPOCH" '+%Y-%m-%d' 2>/dev/null || date -d "@$SINCE_EPOCH" '+%Y-%m-%d' 2>/dev/null || echo '')" # Total commit count (for accurate reporting — cheap, no data) git -C "$dir_cwd" rev-list --count HEAD \ > "${git_data_dir}/${encoded}_commit_count.txt" 2>/dev/null || true # Recent commits with author emails (TSV, subject LAST) — team-wide for # velocity context; robust to quotes/backslashes in subjects (audit C13). git -C "$dir_cwd" log -${COMMIT_LIMIT:-1000} $since_flag \ --format='%H%x09%h%x09%aN%x09%aE%x09%aI%x09%s' \ > "${git_data_dir}/${encoded}_commits.jsonl" 2>/dev/null || true # Numstat with author emails (same format as legacy upload) — team-wide for velocity git -C "$dir_cwd" log -${COMMIT_LIMIT:-1000} $since_flag \ --format='COMMIT_BOUNDARY %H %aI %aN <%aE>' --numstat \ > "${git_data_dir}/${encoded}_numstat.txt" 2>/dev/null || true # Author-filtered commits for episode linking (collect once per remote, copy for others) local _dedup_key="${dir_remote:-${dir_cwd}}" local _first_encoded="" case "$_author_cwds_done" in *"|${_dedup_key}="*) _first_encoded=$(echo "$_author_cwds_done" | grep -o "|${_dedup_key}=[^|]*|" | sed "s#|${_dedup_key}=##;s#|##") if [ -n "$_first_encoded" ]; then cp -f "${git_data_dir}/${_first_encoded}_author_commits.jsonl" "${git_data_dir}/${encoded}_author_commits.jsonl" 2>/dev/null || true cp -f "${git_data_dir}/${_first_encoded}_author_numstat.txt" "${git_data_dir}/${encoded}_author_numstat.txt" 2>/dev/null || true fi ;; *) local author_emails author_emails=$(detect_author_emails "$dir_cwd" "$filtered_dir") if [ -n "$author_emails" ]; then # Clamp the author-commit floor to --since when active: OLDEST_SESSION_EPOCH # is the absolute-oldest session, which can predate the requested window. # Local var so we don't mutate the shared global. (Mirrors the picker path.) local _floor_epoch="$OLDEST_SESSION_EPOCH" if [ -n "$SINCE_EPOCH" ]; then if [ -z "$_floor_epoch" ] || [ "$_floor_epoch" -lt "$SINCE_EPOCH" ]; then _floor_epoch="$SINCE_EPOCH" fi fi local oldest_date="" if [ -n "$_floor_epoch" ]; then oldest_date=$(date -r "$_floor_epoch" '+%Y-%m-%d' 2>/dev/null || date -d "@$_floor_epoch" '+%Y-%m-%d' 2>/dev/null || echo '') fi collect_author_commits "$dir_cwd" "$git_data_dir" "$encoded" "$author_emails" "$oldest_date" _author_cwds_done="${_author_cwds_done}|${_dedup_key}=${encoded}|" fi ;; esac fi done # Write _metadata.json sidecar local _rmdc_total _rmdc_total=$(_rmdc_recovery_count_unique) if command -v jq &>/dev/null; then jq -n --argjson dirs "$docker_metadata" \ --argjson recoveries "${_rmdc_total:-0}" \ '{"version": 1, "directories": $dirs, "orphan_recovery_count": $recoveries}' \ > "${filtered_dir}/_metadata.json" fi elif [ -n "$cwd_remote" ] && remote_has_agent_sessions "$cwd_remote"; then # No Claude Code history for this repo, but Codex/Cursor/opencode have # sessions here — set up a Claude-LESS filtered run so they get scoped to # this project and uploaded, instead of falling to the "none match" prompt. # This makes auto-detect tool-agnostic: any tool or combination works # without Claude. The empty filtered_dir means no Claude transcripts; # run_docker_analysis still mounts it at /transcripts (0 sessions is fine), # extracts the agent tools filtered to selected_remote, and analyze_local # merges their buckets in. The cwd repo is mounted for code quality as usual. CLAUDE_DIR="$filtered_dir" CLAUDE_MOUNT_SCOPE="filtered" MOUNT_LABEL="$match_label" selected_remote="$cwd_remote" if command -v jq >/dev/null 2>&1; then echo '{"version": 1, "directories": {}}' > "${filtered_dir}/_metadata.json" fi echo "Auto-detected project: ${match_label} (matched Codex/Cursor/opencode/Gemini sessions; no Claude Code history here)" >&2 elif detect_child_repos; then # Strategy 3: parent directory with child repos that have transcript data rm -rf "$filtered_dir" show_child_repo_menu run_selected_child_repos else # No matches — confirm with user before processing everything rm -rf "$filtered_dir" echo "None of your sessions match this directory." >&2 echo " To analyze a specific project: $(rerun_cmd --project NAME)" >&2 if [ -c /dev/tty ]; then echo "" >&2 echo "Options:" >&2 echo " 1) Analyze ALL projects" >&2 echo " 2) Cancel" >&2 echo "" >&2 local choice user_read -rp "Choose [1-2]: " choice case "$choice" in 1) ALL_PROJECTS=1 ;; *) echo "Cancelled."; exit 0 ;; esac else echo "To specify what to analyze:" >&2 echo " $(rerun_cmd --project NAME)" >&2 echo " $(rerun_cmd --all)" >&2 exit 1 fi fi fi # Count sessions and show time estimate (or abort if zero) local claude_count claude_count=$(count_sessions "$CLAUDE_DIR") # Codex sessions split by originator: the user's mental model is "I ran # codex N times" (standalone), distinct from "Claude dispatched codex M # times" (cross-tool, looks like a subagent invocation). codex-companion # writes payload.originator="Claude Code" for the latter; codex_cli_rs etc. # for the former. Server picks up the same distinction post-link via # CrossToolLinker (cross_tool_origin column). local codex_standalone_count=0 local codex_cross_tool_count=0 if [ -d "$CODEX_DIR" ] && [ -z "${TRANSCRIPT_DIR:-}" ]; then if [ -n "${selected_remote:-}" ]; then # Single-project mode: count only sessions matching project remote # (mirrors collect_codex_sessions filter applied during extraction). # Keep get_codex_session_remote unchanged — it has cwd-fallback semantics # used by collect_codex_sessions; classify originator separately. local _sr_norm _sr_norm=$(normalize_remote "$selected_remote") while IFS= read -r _cf; do [ -z "$_cf" ] && continue if [ -n "$SINCE_EPOCH" ]; then local _fm _fm=$(stat -c %Y "$_cf" 2>/dev/null || stat -f %m "$_cf" 2>/dev/null || echo "0") [ "$_fm" -lt "$SINCE_EPOCH" ] 2>/dev/null && continue fi local _cr _cr=$(get_codex_session_remote "$_cf") [ "$_cr" != "$_sr_norm" ] && continue local _co _co=$(get_codex_session_originator "$_cf") if codex_originator_is_standalone "$_co"; then codex_standalone_count=$((codex_standalone_count + 1)) else codex_cross_tool_count=$((codex_cross_tool_count + 1)) fi done < <(find "$CODEX_DIR" -name "*.jsonl" -maxdepth 6 2>/dev/null) else # Orphan-cwd path: no remote to filter on, count every Codex JSONL # split by originator. while IFS= read -r _cf; do [ -z "$_cf" ] && continue local _co _co=$(get_codex_session_originator "$_cf") if codex_originator_is_standalone "$_co"; then codex_standalone_count=$((codex_standalone_count + 1)) else codex_cross_tool_count=$((codex_cross_tool_count + 1)) fi done < <(find "$CODEX_DIR" -name "*.jsonl" -maxdepth 6 2>/dev/null) fi fi # opencode sessions (SQLite-backed) matching this project's remote. Folded # into N so an opencode-only user isn't told "No sessions found". local opencode_count opencode_count=$(count_opencode_sessions "${selected_remote:-}") # Gemini CLI sessions matching this project's remote. Folded into N so a # Gemini-only user isn't told "No sessions found". local gemini_count gemini_count=$(count_gemini_sessions "${selected_remote:-}") # Cross-tool Codex is folded into the subagent total below — it appears in # M (subagents), NOT in N (sessions). No double-count. local session_count=$((claude_count + codex_standalone_count + opencode_count + gemini_count)) if [ "$session_count" -eq 0 ]; then echo "" echo "No sessions found in this directory." echo "Run from a project directory with coding agent sessions, or include all:" echo " $(rerun_cmd --all)" exit 1 fi local subagent_count # NOTE — counts ALL subagents under $CLAUDE_DIR globally, not just this # project's. claude_count (line above) has the same global scope, so the # ratio is internally consistent for this prelude display. Project-scoping # requires mapping selected_remote → matched Claude project dirs (the same # mapping the multi-repo flow does in prepare_and_run_for_repo); deferred # to Phase 4 since neither subagent_count nor claude_count drive any # downstream gate — they're prelude display only. (Phase 3.5 callout) subagent_count=$(count_subagent_sessions "$CLAUDE_DIR") # Fold Codex-via-Claude into subagents — they look like subagent invocations # to the user. print_estimate displays each component on its own line so the # mix is legible. local subagent_total=$((subagent_count + codex_cross_tool_count)) local data_mb data_mb=$(get_data_size "$CLAUDE_DIR") ESTIMATED_MINUTES="" if [ -n "$PROJECT_NAME" ]; then echo "" echo "This usually takes a few minutes." echo "" echo " ★ You'll get an email when your report is ready." echo "" ESTIMATED_MINUTES="5" else print_estimate "$session_count" "$data_mb" "$claude_count" "$codex_standalone_count" "$codex_cross_tool_count" "$match_label" "$subagent_total" # opencode sessions are folded into the $session_count total above; show # the per-tool line too so the breakdown sums (mirrors the Codex CLI line). [ "$opencode_count" -gt 0 ] && echo " opencode: ${opencode_count} sessions" [ "$gemini_count" -gt 0 ] && echo " Gemini CLI: ${gemini_count} sessions" fi # Pass estimate to Docker for telemetry export PAXEL_HOST_ESTIMATE_MINUTES="${ESTIMATED_MINUTES:-}" # Set selected_remote so Cursor extraction filters by this project's remote selected_remote="${cwd_remote:-}" pull_client_image run_docker_analysis } # --- Main --- while [ $# -gt 0 ]; do case "$1" in --project) PROJECT_NAME="$2" shift 2 ;; --since) SINCE_EPOCH=$(parse_since "$2") shift 2 ;; --all) ALL_PROJECTS=1 shift ;; --no-repo) NO_REPO=1 shift ;; --with-repo) # On by default now. Kept for backward compatibility. shift ;; --clean) CLEAN=1 shift ;; --no-sentry) NO_SENTRY=1 shift ;; --commits) COMMIT_LIMIT="$2" shift 2 ;; --no-orphan-recovery) PAXEL_NO_ORPHAN_RECOVERY=1 export PAXEL_NO_ORPHAN_RECOVERY shift ;; --clear-cache) # Clear legacy v1 (raw git URLs) and current v2 (normalized) caches. _cleared=0 for _cache_file in \ "$HOME/.paxel/cache/project-remotes.tsv" \ "$HOME/.paxel/cache/project-remotes-v2.tsv"; do if [ -f "$_cache_file" ]; then rm -f "$_cache_file" echo "[paxel] Cleared project-remotes cache ($_cache_file)" >&2 _cleared=$((_cleared + 1)) fi done [ "$_cleared" -eq 0 ] && echo "[paxel] No project-remotes cache to clear" >&2 unset _cache_file _cleared shift ;; --clear-pending) _pending_dir="$HOME/.paxel/data/pending-uploads" if [ -d "$_pending_dir" ]; then _count=$(find "$_pending_dir" -type f \( -name '*.json*' -o -name '*.meta.json' -o -name '*.error.json' \) 2>/dev/null | wc -l | tr -d ' ') rm -rf "$_pending_dir" echo "[paxel] Cleared $_count pending upload artifact(s)" >&2 else echo "[paxel] No pending uploads to clear" >&2 fi unset _pending_dir _count exit 0 ;; --resume-pending) PAXEL_RESUME_PENDING_ONLY=1 export PAXEL_RESUME_PENDING_ONLY shift ;; --no-replay) PAXEL_SKIP_REPLAY=1 export PAXEL_SKIP_REPLAY shift ;; *) echo "Unknown option: $1" echo "Usage: $0 [--project NAME] [--since DURATION] [--commits N] [--all] [--no-repo] [--no-sentry] [--clean] [--no-orphan-recovery] [--clear-cache] [--clear-pending] [--resume-pending] [--no-replay]" echo " or: $(rerun_cmd '[OPTIONS]')" exit 1 ;; esac done # Docker mode. File bodies stay local; aggregate metrics + metadata # (paths, commit numstat, session events) upload. See /data-handling. run_docker_mode