import numpy as np
from scipy import stats
import itertools
import matplotlib.pyplot as plt

# Accuracy in percent for every condition. Position i in each list is the run
# with seed 42+i, and the same seeds are used for all conditions, so values
# at the same index can be paired across conditions.
acc = {
    name: np.array(scores)
    for name, scores in {
        'base_50': [89.07, 89.33, 88.20, 87.13, 88.73],
        'base_25': [71.40, 76.47, 71.13, 71.27, 68.60],
        'uf_50_1': [86.80, 88.60, 85.40, 87.20, 87.27],
        'uf_50_4': [86.20, 86.67, 85.20, 86.53, 87.13],
        'f_50_1':  [86.87, 89.60, 85.73, 87.67, 88.13],
        'f_50_4':  [86.73, 87.73, 86.13, 85.73, 86.07],
        'uf_25_1': [78.60, 78.60, 77.53, 76.40, 76.47],
        'uf_25_4': [79.07, 80.40, 76.80, 78.80, 78.87],
        'f_25_1':  [78.33, 78.33, 76.60, 78.47, 77.67],
        'f_25_4':  [79.80, 79.33, 77.93, 79.73, 78.27],
    }.items()
}

# Sanity check: print each condition's mean so it can be compared by eye
# against the published results table (nothing is asserted here).
for name in acc:
    print(f'{name:9s} mean = {acc[name].mean():6.2f}%')

base_50   mean =  88.49%
base_25   mean =  71.77%
uf_50_1   mean =  87.05%
uf_50_4   mean =  86.35%
f_50_1    mean =  87.60%
f_50_4    mean =  86.48%
uf_25_1   mean =  77.52%
uf_25_4   mean =  78.79%
f_25_1    mean =  77.88%
f_25_4    mean =  79.01%

# The four (examples-per-class, ratio) cells in which a filtered ('f_') and an
# unfiltered ('uf_') run exist; the strings match the key suffixes used in `acc`.
cells = [('50', '1'), ('50', '4'), ('25', '1'), ('25', '4')]
diffs = {}  # (count, ratio) -> per-seed paired differences (filtered - unfiltered)
pvals = {}  # (count, ratio) -> two-sided paired-t p-value for that cell

print(f"{'condition':>16} {'effect (pp)':>12} {'paired-t p':>12}")
for count, ratio in cells:
    filtered = acc[f'f_{count}_{ratio}']
    unfiltered = acc[f'uf_{count}_{ratio}']
    # Runs are paired by seed (same index = same seed), hence a paired test.
    d = filtered - unfiltered
    diffs[(count, ratio)] = d
    p = stats.ttest_rel(filtered, unfiltered).pvalue
    pvals[(count, ratio)] = p
    label = f'{count}/class {ratio}:1'
    print(f'{label:>16} {d.mean():>+12.2f} {p:>12.3f}')
print()

# Derive the one-line summary from the numbers instead of hard-coding it, so it
# cannot silently go stale if the accuracies above are edited.
# NOTE(review): "correcting for the four looks" is read here as a family-wise
# correction at alpha = 0.05 -- confirm. "None significant" is the same verdict
# under Bonferroni and Holm (both reject nothing iff min p >= alpha / m).
alpha = 0.05
corrected_alpha = alpha / len(cells)
n_positive = sum(bool(diffs[c].mean() > 0) for c in cells)
n_significant = sum(bool(pvals[c] < corrected_alpha) for c in cells)
if len(cells) == 4 and n_positive == 4 and n_significant == 0:
    print('All four positive; none significant after correcting for the four looks.')
else:
    print(f'{n_positive} of {len(cells)} effects positive; '
          f'{n_significant} of {len(cells)} significant at Bonferroni-corrected '
          f'alpha = {corrected_alpha:.4f}.')

       condition  effect (pp)   paired-t p
    50/class 1:1        +0.55        0.033
    50/class 4:1        +0.13        0.781
    25/class 1:1        +0.36        0.550
    25/class 4:1        +0.22        0.640

All four positive; none significant after correcting for the four looks.

# Pool the four conditions: stack the per-condition paired differences (rows in
# the order of `cells`, one column per seed) and average down each column.
D = np.stack([diffs[c] for c in cells], axis=0)   # shape (4 conditions, 5 seeds)
seed_avg = np.mean(D, axis=0)                      # one average effect per seed
n_seeds = len(seed_avg)
dof = n_seeds - 1
obs = seed_avg.mean()                              # pooled point estimate
print('per-seed average filter effect (pp):', seed_avg.round(3))
print('all positive:', bool((seed_avg > 0).all()))
print()

# [1] appropriate test: one-sample t on the 5 independent seed-level estimates
t, p = stats.ttest_1samp(seed_avg, 0)
ci = stats.t.interval(0.95, dof, loc=obs, scale=stats.sem(seed_avg))
ci_lo, ci_hi = ci
print('[1] seed-level one-sample t-test')
print(f'    mean = {obs:+.3f} pp   95% CI [{ci_lo:+.3f}, {ci_hi:+.3f}]   '
      f't = {t:.2f} (df={dof})   p = {p:.4f}')

# [2] distribution-free backups (no normality assumption)
w, pw = stats.wilcoxon(seed_avg)
# Exact sign-flip permutation test: enumerate every +/- pattern over the seeds
# and count how often the flipped mean is at least as extreme as the observed
# one. The 1e-12 slack keeps floating-point ties on the "as extreme" side.
perm = []
for signs in itertools.product([1, -1], repeat=n_seeds):
    perm.append(np.mean(seed_avg * np.array(signs)))
threshold = abs(obs) - 1e-12
n_extreme = int(sum(abs(m) >= threshold for m in perm))
p_perm = float(n_extreme / len(perm))   # two-sided
print(f'[2] Wilcoxon signed-rank p = {pw:.4f}   exact sign-flip permutation p = {p_perm:.4f}')

# [3] the WRONG test: 20 diffs treated as independent (ignores within-seed correlation)
t2, p2 = stats.ttest_1samp(D.ravel(), 0)
print(f'[3] naive t on all 20 diffs (incorrect): t = {t2:.2f}   p = {p2:.4f}  -> misses the signal')

# [4] sign test on the seed-level estimates
# NOTE(review): this is the only ONE-sided p-value in this cell; tests [1]-[3]
# are two-sided, so the numbers are not directly comparable.
n_pos = int(np.count_nonzero(seed_avg > 0))
sign_p = stats.binomtest(k=n_pos, n=n_seeds, p=0.5, alternative='greater').pvalue
print(f'[4] sign test: {n_pos}/{n_seeds} seeds positive   one-sided p = {sign_p:.4f}')

per-seed average filter effect (pp): [0.265 0.18  0.365 0.668 0.1  ]
all positive: True

[1] seed-level one-sample t-test
    mean = +0.316 pp   95% CI [+0.042, +0.589]   t = 3.21 (df=4)   p = 0.0327
[2] Wilcoxon signed-rank p = 0.0625   exact sign-flip permutation p = 0.0625
[3] naive t on all 20 diffs (incorrect): t = 1.59   p = 0.1292  -> misses the signal
[4] sign test: 5/5 seeds positive   one-sided p = 0.0312

def _mean_and_ci_halfwidth(sample, confidence=0.95):
    """Return the mean of `sample` and the half-width of its Student-t CI.

    sample: 1-D NumPy array of observations.
    confidence: two-sided confidence level handed to ``stats.t.interval``.
    Returns ``(mean, half_width)`` with ``half_width = (upper - lower) / 2``,
    which is the symmetric ``xerr`` value used for the error bars below.
    """
    center = sample.mean()
    lo, hi = stats.t.interval(confidence, len(sample) - 1,
                              loc=center, scale=stats.sem(sample))
    return center, (hi - lo) / 2


# One row per condition (in the order of `cells`) ...
labels, means, errs = [], [], []
for count, ratio in cells:
    center, half_width = _mean_and_ci_halfwidth(diffs[(count, ratio)])
    labels.append(f'{count}/class, {ratio}:1')
    means.append(center)
    errs.append(half_width)

# ... followed by a single row for the pooled seed-level estimate.
center, half_width = _mean_and_ci_halfwidth(seed_avg)
labels.append('POOLED')
means.append(center)
errs.append(half_width)

# Split point between per-condition rows and the pooled row; derived from
# `cells` rather than hard-coded so the plot stays correct if cells change.
n_cond = len(cells)

y = np.arange(len(labels))[::-1]   # reversed so the first condition is drawn at the top
fig, ax = plt.subplots(figsize=(7, 3.2))
ax.errorbar(means[:n_cond], y[:n_cond], xerr=errs[:n_cond],
            fmt='o', color='gray', capsize=4, label='per condition')
ax.errorbar(means[n_cond:], y[n_cond:], xerr=errs[n_cond:],
            fmt='o', color='C0', capsize=4, label='pooled')
ax.axvline(0, color='k', lw=0.8, ls='--')   # zero line = no filter effect
ax.set_yticks(y)
ax.set_yticklabels(labels)
ax.set_xlabel('filtered - unfiltered (percentage points)')
ax.set_title('Stage-1 filter effect: per condition vs pooled')
ax.legend(loc='lower right', frameon=False)
plt.tight_layout()
plt.show()

Does Stage-1 critic filtering help? A pooled re-analysis

1. Data

2. Per-condition contrasts (reproduce the paper)

3. Pool across the four conditions

4. Forest plot

5. Conclusion