From ad06b12e50470b470dddecb17ff7ac1a806e6534 Mon Sep 17 00:00:00 2001
From: claude <claude@llm-impact.org>
Date: Mon, 16 Mar 2026 15:11:30 +0000
Subject: [PATCH] Log edited file list in impact hook for review delta analysis

The hook now records which files were edited and how many times,
enabling future comparison with committed code to measure human
review effort (Phase 2 of quantify-social-costs plan).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .claude/hooks/pre-compact-snapshot.sh        | 76 +++++++++++++++-----
 impact-toolkit/hooks/pre-compact-snapshot.sh | 76 +++++++++++++++-----
 tasks/README.md                              |  1 +
 3 files changed, 119 insertions(+), 34 deletions(-)

diff --git a/.claude/hooks/pre-compact-snapshot.sh b/.claude/hooks/pre-compact-snapshot.sh
index c37da0b..699049d 100755
--- a/.claude/hooks/pre-compact-snapshot.sh
+++ b/.claude/hooks/pre-compact-snapshot.sh
@@ -131,24 +131,29 @@ else:
     auto_ratio_pm = 0
 
 print(f'{turns}\t{input_tokens}\t{cache_creation}\t{cache_read}\t{output_tokens}\t{model_id}\t{auto_ratio_pm}\t{user_tokens_est}\t{unique_files}\t{total_edits}\t{test_passes}\t{test_failures}\t{has_public_push}')
+# Second line: JSON object mapping each edited file path to its edit count
+print(json.dumps(edited_files))
 " "$TRANSCRIPT_PATH" 2>/dev/null || echo "")
 
-  if [ -n "$USAGE_DATA" ] && [ "$(echo "$USAGE_DATA" | cut -f1)" -gt 0 ] 2>/dev/null; then
+  USAGE_LINE1=$(echo "$USAGE_DATA" | head -1)
+  EDITED_FILES_JSON=$(echo "$USAGE_DATA" | tail -1)
+
+  if [ -n "$USAGE_LINE1" ] && [ "$(echo "$USAGE_LINE1" | cut -f1)" -gt 0 ] 2>/dev/null; then
     # Actual token counts available
     TOKEN_SOURCE="actual"
-    ASSISTANT_TURNS=$(echo "$USAGE_DATA" | cut -f1)
-    INPUT_TOKENS=$(echo "$USAGE_DATA" | cut -f2)
-    CACHE_CREATION=$(echo "$USAGE_DATA" | cut -f3)
-    CACHE_READ=$(echo "$USAGE_DATA" | cut -f4)
-    OUTPUT_TOKENS=$(echo "$USAGE_DATA" | cut -f5)
-    MODEL_ID=$(echo "$USAGE_DATA" | cut -f6)
-    AUTO_RATIO_PM=$(echo "$USAGE_DATA" | cut -f7)
-    USER_TOKENS_EST=$(echo "$USAGE_DATA" | cut -f8)
-    UNIQUE_FILES=$(echo "$USAGE_DATA" | cut -f9)
-    TOTAL_EDITS=$(echo "$USAGE_DATA" | cut -f10)
-    TEST_PASSES=$(echo "$USAGE_DATA" | cut -f11)
-    TEST_FAILURES=$(echo "$USAGE_DATA" | cut -f12)
-    HAS_PUBLIC_PUSH=$(echo "$USAGE_DATA" | cut -f13)
+    ASSISTANT_TURNS=$(echo "$USAGE_LINE1" | cut -f1)
+    INPUT_TOKENS=$(echo "$USAGE_LINE1" | cut -f2)
+    CACHE_CREATION=$(echo "$USAGE_LINE1" | cut -f3)
+    CACHE_READ=$(echo "$USAGE_LINE1" | cut -f4)
+    OUTPUT_TOKENS=$(echo "$USAGE_LINE1" | cut -f5)
+    MODEL_ID=$(echo "$USAGE_LINE1" | cut -f6)
+    AUTO_RATIO_PM=$(echo "$USAGE_LINE1" | cut -f7)
+    USER_TOKENS_EST=$(echo "$USAGE_LINE1" | cut -f8)
+    UNIQUE_FILES=$(echo "$USAGE_LINE1" | cut -f9)
+    TOTAL_EDITS=$(echo "$USAGE_LINE1" | cut -f10)
+    TEST_PASSES=$(echo "$USAGE_LINE1" | cut -f11)
+    TEST_FAILURES=$(echo "$USAGE_LINE1" | cut -f12)
+    HAS_PUBLIC_PUSH=$(echo "$USAGE_LINE1" | cut -f13)
 
     # Cumulative input = all tokens that went through the model.
     # Cache reads are cheaper (~10-20% of full compute), so we weight them.
@@ -184,6 +189,7 @@ print(f'{turns}\t{input_tokens}\t{cache_creation}\t{cache_read}\t{output_tokens}
     TEST_PASSES=0
     TEST_FAILURES=0
     HAS_PUBLIC_PUSH=0
+    EDITED_FILES_JSON="{}"
   fi
 
   # --- Cost estimates ---
@@ -226,12 +232,48 @@ else
   TEST_PASSES=0
   TEST_FAILURES=0
   HAS_PUBLIC_PUSH=0
+  EDITED_FILES_JSON="{}"
 fi
 
 # --- Write log entry ---
 
-cat >> "$LOG_FILE" <<EOF
-{"timestamp":"$TIMESTAMP","session_id":"$SESSION_ID","trigger":"$TRIGGER","token_source":"$TOKEN_SOURCE","transcript_bytes":$TRANSCRIPT_BYTES,"transcript_lines":$TRANSCRIPT_LINES,"assistant_turns":$ASSISTANT_TURNS,"tool_uses":$TOOL_USES,"cumulative_input_tokens":$CUMULATIVE_INPUT,"cumulative_input_raw":$CUMULATIVE_INPUT_RAW,"cache_creation_tokens":$CACHE_CREATION,"cache_read_tokens":$CACHE_READ,"output_tokens":$OUTPUT_TOKENS,"energy_wh":$ENERGY_WH,"co2_g":$CO2_G,"cost_cents":$COST_CENTS,"model_id":"$MODEL_ID","automation_ratio_pm":$AUTO_RATIO_PM,"user_tokens_est":$USER_TOKENS_EST,"unique_files_edited":$UNIQUE_FILES,"total_file_edits":$TOTAL_EDITS,"test_passes":$TEST_PASSES,"test_failures":$TEST_FAILURES,"has_public_push":$HAS_PUBLIC_PUSH}
-EOF
+# Build log entry using Python to safely embed the edited_files JSON
+python3 -c "
+import json, sys
+entry = {
+    'timestamp': sys.argv[1],
+    'session_id': sys.argv[2],
+    'trigger': sys.argv[3],
+    'token_source': sys.argv[4],
+    'transcript_bytes': int(sys.argv[5]),
+    'transcript_lines': int(sys.argv[6]),
+    'assistant_turns': int(sys.argv[7]),
+    'tool_uses': int(sys.argv[8]),
+    'cumulative_input_tokens': int(sys.argv[9]),
+    'cumulative_input_raw': int(sys.argv[10]),
+    'cache_creation_tokens': int(sys.argv[11]),
+    'cache_read_tokens': int(sys.argv[12]),
+    'output_tokens': int(sys.argv[13]),
+    'energy_wh': int(sys.argv[14]),
+    'co2_g': int(sys.argv[15]),
+    'cost_cents': int(sys.argv[16]),
+    'model_id': sys.argv[17],
+    'automation_ratio_pm': int(sys.argv[18]),
+    'user_tokens_est': int(sys.argv[19]),
+    'unique_files_edited': int(sys.argv[20]),
+    'total_file_edits': int(sys.argv[21]),
+    'test_passes': int(sys.argv[22]),
+    'test_failures': int(sys.argv[23]),
+    'has_public_push': int(sys.argv[24]),
+    'edited_files': json.loads(sys.argv[25]),
+}
+print(json.dumps(entry, separators=(',', ':')))
+" "$TIMESTAMP" "$SESSION_ID" "$TRIGGER" "$TOKEN_SOURCE" \
+  "$TRANSCRIPT_BYTES" "$TRANSCRIPT_LINES" "$ASSISTANT_TURNS" "$TOOL_USES" \
+  "$CUMULATIVE_INPUT" "$CUMULATIVE_INPUT_RAW" "$CACHE_CREATION" "$CACHE_READ" \
+  "$OUTPUT_TOKENS" "$ENERGY_WH" "$CO2_G" "$COST_CENTS" \
+  "$MODEL_ID" "$AUTO_RATIO_PM" "$USER_TOKENS_EST" \
+  "$UNIQUE_FILES" "$TOTAL_EDITS" "$TEST_PASSES" "$TEST_FAILURES" \
+  "$HAS_PUBLIC_PUSH" "$EDITED_FILES_JSON" >> "$LOG_FILE"
 
 exit 0
diff --git a/impact-toolkit/hooks/pre-compact-snapshot.sh b/impact-toolkit/hooks/pre-compact-snapshot.sh
index c37da0b..699049d 100755
--- a/impact-toolkit/hooks/pre-compact-snapshot.sh
+++ b/impact-toolkit/hooks/pre-compact-snapshot.sh
@@ -131,24 +131,29 @@ else:
     auto_ratio_pm = 0
 
 print(f'{turns}\t{input_tokens}\t{cache_creation}\t{cache_read}\t{output_tokens}\t{model_id}\t{auto_ratio_pm}\t{user_tokens_est}\t{unique_files}\t{total_edits}\t{test_passes}\t{test_failures}\t{has_public_push}')
+# Second line: JSON object mapping each edited file path to its edit count
+print(json.dumps(edited_files))
 " "$TRANSCRIPT_PATH" 2>/dev/null || echo "")
 
-  if [ -n "$USAGE_DATA" ] && [ "$(echo "$USAGE_DATA" | cut -f1)" -gt 0 ] 2>/dev/null; then
+  USAGE_LINE1=$(echo "$USAGE_DATA" | head -1)
+  EDITED_FILES_JSON=$(echo "$USAGE_DATA" | tail -1)
+
+  if [ -n "$USAGE_LINE1" ] && [ "$(echo "$USAGE_LINE1" | cut -f1)" -gt 0 ] 2>/dev/null; then
     # Actual token counts available
     TOKEN_SOURCE="actual"
-    ASSISTANT_TURNS=$(echo "$USAGE_DATA" | cut -f1)
-    INPUT_TOKENS=$(echo "$USAGE_DATA" | cut -f2)
-    CACHE_CREATION=$(echo "$USAGE_DATA" | cut -f3)
-    CACHE_READ=$(echo "$USAGE_DATA" | cut -f4)
-    OUTPUT_TOKENS=$(echo "$USAGE_DATA" | cut -f5)
-    MODEL_ID=$(echo "$USAGE_DATA" | cut -f6)
-    AUTO_RATIO_PM=$(echo "$USAGE_DATA" | cut -f7)
-    USER_TOKENS_EST=$(echo "$USAGE_DATA" | cut -f8)
-    UNIQUE_FILES=$(echo "$USAGE_DATA" | cut -f9)
-    TOTAL_EDITS=$(echo "$USAGE_DATA" | cut -f10)
-    TEST_PASSES=$(echo "$USAGE_DATA" | cut -f11)
-    TEST_FAILURES=$(echo "$USAGE_DATA" | cut -f12)
-    HAS_PUBLIC_PUSH=$(echo "$USAGE_DATA" | cut -f13)
+    ASSISTANT_TURNS=$(echo "$USAGE_LINE1" | cut -f1)
+    INPUT_TOKENS=$(echo "$USAGE_LINE1" | cut -f2)
+    CACHE_CREATION=$(echo "$USAGE_LINE1" | cut -f3)
+    CACHE_READ=$(echo "$USAGE_LINE1" | cut -f4)
+    OUTPUT_TOKENS=$(echo "$USAGE_LINE1" | cut -f5)
+    MODEL_ID=$(echo "$USAGE_LINE1" | cut -f6)
+    AUTO_RATIO_PM=$(echo "$USAGE_LINE1" | cut -f7)
+    USER_TOKENS_EST=$(echo "$USAGE_LINE1" | cut -f8)
+    UNIQUE_FILES=$(echo "$USAGE_LINE1" | cut -f9)
+    TOTAL_EDITS=$(echo "$USAGE_LINE1" | cut -f10)
+    TEST_PASSES=$(echo "$USAGE_LINE1" | cut -f11)
+    TEST_FAILURES=$(echo "$USAGE_LINE1" | cut -f12)
+    HAS_PUBLIC_PUSH=$(echo "$USAGE_LINE1" | cut -f13)
 
     # Cumulative input = all tokens that went through the model.
     # Cache reads are cheaper (~10-20% of full compute), so we weight them.
@@ -184,6 +189,7 @@ print(f'{turns}\t{input_tokens}\t{cache_creation}\t{cache_read}\t{output_tokens}
     TEST_PASSES=0
     TEST_FAILURES=0
     HAS_PUBLIC_PUSH=0
+    EDITED_FILES_JSON="{}"
   fi
 
   # --- Cost estimates ---
@@ -226,12 +232,48 @@ else
   TEST_PASSES=0
   TEST_FAILURES=0
   HAS_PUBLIC_PUSH=0
+  EDITED_FILES_JSON="{}"
 fi
 
 # --- Write log entry ---
 
-cat >> "$LOG_FILE" <<EOF
-{"timestamp":"$TIMESTAMP","session_id":"$SESSION_ID","trigger":"$TRIGGER","token_source":"$TOKEN_SOURCE","transcript_bytes":$TRANSCRIPT_BYTES,"transcript_lines":$TRANSCRIPT_LINES,"assistant_turns":$ASSISTANT_TURNS,"tool_uses":$TOOL_USES,"cumulative_input_tokens":$CUMULATIVE_INPUT,"cumulative_input_raw":$CUMULATIVE_INPUT_RAW,"cache_creation_tokens":$CACHE_CREATION,"cache_read_tokens":$CACHE_READ,"output_tokens":$OUTPUT_TOKENS,"energy_wh":$ENERGY_WH,"co2_g":$CO2_G,"cost_cents":$COST_CENTS,"model_id":"$MODEL_ID","automation_ratio_pm":$AUTO_RATIO_PM,"user_tokens_est":$USER_TOKENS_EST,"unique_files_edited":$UNIQUE_FILES,"total_file_edits":$TOTAL_EDITS,"test_passes":$TEST_PASSES,"test_failures":$TEST_FAILURES,"has_public_push":$HAS_PUBLIC_PUSH}
-EOF
+# Build log entry using Python to safely embed the edited_files JSON
+python3 -c "
+import json, sys
+entry = {
+    'timestamp': sys.argv[1],
+    'session_id': sys.argv[2],
+    'trigger': sys.argv[3],
+    'token_source': sys.argv[4],
+    'transcript_bytes': int(sys.argv[5]),
+    'transcript_lines': int(sys.argv[6]),
+    'assistant_turns': int(sys.argv[7]),
+    'tool_uses': int(sys.argv[8]),
+    'cumulative_input_tokens': int(sys.argv[9]),
+    'cumulative_input_raw': int(sys.argv[10]),
+    'cache_creation_tokens': int(sys.argv[11]),
+    'cache_read_tokens': int(sys.argv[12]),
+    'output_tokens': int(sys.argv[13]),
+    'energy_wh': int(sys.argv[14]),
+    'co2_g': int(sys.argv[15]),
+    'cost_cents': int(sys.argv[16]),
+    'model_id': sys.argv[17],
+    'automation_ratio_pm': int(sys.argv[18]),
+    'user_tokens_est': int(sys.argv[19]),
+    'unique_files_edited': int(sys.argv[20]),
+    'total_file_edits': int(sys.argv[21]),
+    'test_passes': int(sys.argv[22]),
+    'test_failures': int(sys.argv[23]),
+    'has_public_push': int(sys.argv[24]),
+    'edited_files': json.loads(sys.argv[25]),
+}
+print(json.dumps(entry, separators=(',', ':')))
+" "$TIMESTAMP" "$SESSION_ID" "$TRIGGER" "$TOKEN_SOURCE" \
+  "$TRANSCRIPT_BYTES" "$TRANSCRIPT_LINES" "$ASSISTANT_TURNS" "$TOOL_USES" \
+  "$CUMULATIVE_INPUT" "$CUMULATIVE_INPUT_RAW" "$CACHE_CREATION" "$CACHE_READ" \
+  "$OUTPUT_TOKENS" "$ENERGY_WH" "$CO2_G" "$COST_CENTS" \
+  "$MODEL_ID" "$AUTO_RATIO_PM" "$USER_TOKENS_EST" \
+  "$UNIQUE_FILES" "$TOTAL_EDITS" "$TEST_PASSES" "$TEST_FAILURES" \
+  "$HAS_PUBLIC_PUSH" "$EDITED_FILES_JSON" >> "$LOG_FILE"
 
 exit 0
diff --git a/tasks/README.md b/tasks/README.md
index e1ee318..14d4289 100644
--- a/tasks/README.md
+++ b/tasks/README.md
@@ -37,6 +37,7 @@ separately as handoffs.
 | 24 | Update show-impact.sh for new fields | quantify-social-costs | DONE | Social cost proxies displayed in impact viewer |
 | 25 | Update methodology confidence summary | quantify-social-costs | DONE | 4 categories moved to "Proxy", explanation added |
 | 26 | Build aggregate dashboard | quantify-social-costs | DONE | `show-aggregate.sh` — portfolio-level social cost metrics |
+| 27 | Log edited file list in hook | quantify-social-costs | DONE | `edited_files` dict in JSONL (file path → edit count) |
 
 ## Handoffs