Experiment 016 — Rare Biosphere Signal Detection
Rendered from exp-016-rare-biosphere.ipynb
Experiment 016 — Rare Biosphere Signal Detection
At what sequencing depth can we reliably distinguish rare biological lineages from sequencing artifacts? This extends Exp 004 (genus saturation at 5 000 reads) to the rare end of the abundance distribution using Chao1 richness estimation and analytical detection power.
Method:
- Synthetic community with 50 species across 5 abundance tiers
- Multinomial sampling simulates sequencing at various depths
- Chao1 non-parametric richness estimator (Chao 1984)
- Detection power: P(detect) = 1 - (1 - p)^D
- Detection threshold: D* = ceil(ln(0.05) / ln(1-p))
- Abundance-occupancy relationship across replicate samples
Reference: Anderson, Sogin, Baross (2015) FEMS Microbiol Ecol 91:fiv016 Chao (1984) Scand J Stat 11:265-270 Sogin et al. (2006) PNAS 103:12115-12120
Cross-spring: wetSpring (microbial diversity), Exp 004 (sequencing depth).
Domain: Genomics Faculty: R. Anderson (Carleton) Reference: R. Anderson — deep subsurface microbiology
Data source: control/rare_biosphere/rare_biosphere.py + benchmark_*.json
This notebook is the publication-grade Python baseline for Experiment 016. The identical computations are validated in Rust (see validate_* binary) and delegated to barraCuda for GPU acceleration.
import json
import math
import sys
from pathlib import Path
import numpy as np
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
# Wire path to groundSpring control/ for common utilities
CONTROL = Path('..') / '..' / 'control'
sys.path.insert(0, str(CONTROL))
from common import * # noqa: F403 — validation harness
# Load benchmark data
benchmark_path = CONTROL / 'rare_biosphere' / 'benchmark_rare_biosphere.json'
with open(benchmark_path) as f:
benchmark = json.load(f)
PASS_COLOR = '#2ecc71'
FAIL_COLOR = '#e74c3c'
INFO_COLOR = '#3498db'
print(f'Loaded benchmark: benchmark_rare_biosphere.json')
print(f'Provenance: {benchmark.get("_provenance", {})}')Validation
Initialization
model = benchmark["model"]
pred = benchmark["analytical_predictions"]
exp = benchmark["expected_results"]
community = np.array(model["community"])
n_species = model["n_species"]
depths = model["depths"]
n_reps = model["n_replicates"]
base_seed = model["base_seed"]
tiers = model["tier_boundaries"]
print("groundSpring Exp 016: Rare Biosphere Signal Detection")
print(f" Community: {n_species} species, 5 abundance tiers")
print(f" Depths: {depths}")
print(f" Replicates: {n_reps}")
print(" Reference: Anderson, Sogin, Baross (2015) FEMS")Chao1 Richness Estimation
rng = np.random.default_rng(base_seed)
chao1_by_depth = {}
sobs_by_depth = {}
for depth in depths:
chao1_vals = []
sobs_vals = []
for _ in range(n_reps):
seed = int(rng.integers(0, 2**32))
rep_rng = np.random.default_rng(seed)
counts = rep_rng.multinomial(depth, community)
chao1_vals.append(chao1(counts))
sobs_vals.append(int(np.sum(counts > 0)))
mean_chao1 = float(np.mean(chao1_vals))
mean_sobs = float(np.mean(sobs_vals))
chao1_by_depth[depth] = mean_chao1
sobs_by_depth[depth] = mean_sobs
print(f" D={depth:6d}: S_obs={mean_sobs:.1f}, Chao1={mean_chao1:.1f} (true={n_species})")
check_range(
"Chao1 at D=50000 ≈ true richness",
chao1_by_depth[50000],
exp["chao1_at_depth_50000_range"][0],
exp["chao1_at_depth_50000_range"][1],
)
check_true(
"Chao1 > S_obs at low depth (D=100)",
chao1_by_depth[100] > sobs_by_depth[100],
)
check_true(
"All species detected at D=50000",
sobs_by_depth[50000] >= exp["sobs_at_depth_50000"] - 0.5,
)Detection Power by Tier
dom_lo, dom_hi = tiers["dominant"]
vr_lo, vr_hi = tiers["very_rare"]
dom_rate = tier_detection_rate(
community, dom_lo, dom_hi, 100, n_reps, base_seed + 1000,
)
vr_rate_100 = tier_detection_rate(
community, vr_lo, vr_hi, 100, n_reps, base_seed + 2000,
)
vr_rate_5000 = tier_detection_rate(
community, vr_lo, vr_hi, 5000, n_reps, base_seed + 3000,
)
p_dom = detection_power(0.06, 100)
p_vr_100 = detection_power(0.003, 100)
p_vr_5000 = detection_power(0.003, 5000)
print(f" Dominant at D=100: rate={dom_rate:.3f} (theory ≥ {p_dom:.3f})")
print(f" Very rare at D=100: rate={vr_rate_100:.3f} (theory ≈ {p_vr_100:.3f})")
print(f" Very rare at D=5000: rate={vr_rate_5000:.3f} (theory ≈ {p_vr_5000:.3f})")
check_min(
"Dominant detected at D=100",
dom_rate,
exp["detection_rate_dominant_at_100_min"],
)
check_max(
"Very rare rarely detected at D=100",
vr_rate_100,
exp["detection_rate_very_rare_at_100_max"],
)
check_min(
"Very rare detected at D=5000",
vr_rate_5000,
exp["detection_rate_very_rare_at_5000_min"],
)Analytical Detection Thresholds
for label, p_val, expected_key in [
("very_rare (p=0.003)", 0.003, "detection_threshold_very_rare_p003"),
("rare (p=0.004)", 0.004, "detection_threshold_rare_p004"),
("moderate (p=0.008)", 0.008, "detection_threshold_moderate_p008"),
("common (p=0.030)", 0.030, "detection_threshold_common_p030"),
]:
computed = detection_threshold(p_val, 0.95)
expected = pred[expected_key]
print(f" {label}: D*={computed} (expected {expected})")
check_true(
"Detection threshold monotonically decreases with abundance",
(
detection_threshold(0.003) > detection_threshold(0.004)
> detection_threshold(0.008) > detection_threshold(0.030)
),
)Abundance-Occupancy Relationship
n_samples = model["n_samples_occupancy"]
occ_depth = model["occupancy_depth"]
occ_rng = np.random.default_rng(base_seed + 50000)
detection_counts = np.zeros(n_species)
for _ in range(n_samples):
seed = int(occ_rng.integers(0, 2**32))
rep_rng = np.random.default_rng(seed)
counts = rep_rng.multinomial(occ_depth, community)
detection_counts += (counts > 0).astype(float)
occupancy = detection_counts / n_samples
dom_occ = float(np.mean(occupancy[tiers["dominant"][0]:tiers["dominant"][1]]))
vr_occ = float(np.mean(occupancy[tiers["very_rare"][0]:tiers["very_rare"][1]]))
print(f" Dominant mean occupancy: {dom_occ:.3f}")
print(f" Very rare mean occupancy: {vr_occ:.3f}")
from scipy.stats import spearmanr
rho, _ = spearmanr(community, occupancy)
print(f" Spearman(abundance, occupancy) = {rho:.3f}")
check_true("Occupancy positively correlated with abundance", bool(rho > 0.5))Singleton Fraction vs Depth
sing_rng = np.random.default_rng(base_seed + 60000)
singleton_fracs = {}
for depth in depths:
frac_sum = 0.0
for _ in range(n_reps):
seed = int(sing_rng.integers(0, 2**32))
rep_rng = np.random.default_rng(seed)
counts = rep_rng.multinomial(depth, community)
s_obs = int(np.sum(counts > 0))
f1 = int(np.sum(counts == 1))
frac_sum += f1 / s_obs if s_obs > 0 else 0.0
singleton_fracs[depth] = frac_sum / n_reps
for depth in depths:
print(f" D={depth:6d}: singleton fraction = {singleton_fracs[depth]:.3f}")
check_true(
"Singleton fraction decreases with depth",
singleton_fracs[depths[0]] > singleton_fracs[depths[-1]],
)Determinism
det_rng1 = np.random.default_rng(99999)
det_rng2 = np.random.default_rng(99999)
c1 = det_rng1.multinomial(1000, community)
c2 = det_rng2.multinomial(1000, community)
check_true("Multinomial deterministic (same seed)", np.array_equal(c1, c2))
chao1_a = chao1(c1)
chao1_b = chao1(c2)
check_true("Chao1 deterministic", chao1_a == chao1_b)Key Findings
print(f"\n{'=' * 72}")Visualization
# Publication-grade summary chart for Exp 016
fig, ax = plt.subplots(figsize=(8, 4))
p, f_count, t = pass_count(), fail_count(), total_count()
ax.barh(['Pass', 'Fail'], [p, f_count], color=[PASS_COLOR, FAIL_COLOR])
ax.set_xlim(0, max(t * 1.15, 1))
ax.set_title('Exp 016: Rare Biosphere Signal Detection — Validation Results')
ax.set_xlabel('Check Count')
for i, v in enumerate([p, f_count]):
if v > 0:
ax.text(v + 0.3, i, str(v), va='center', fontweight='bold')
plt.tight_layout()
plt.savefig(f'/tmp/groundspring_exp016.png', dpi=150, bbox_inches='tight')
plt.show()
print(f'\nResult: {p}/{t} PASS, {f_count}/{t} FAIL')Provenance & Summary
| Field | Value |
|---|---|
| Experiment | 016 — Rare Biosphere Signal Detection |
| Domain | Genomics |
| Reference | R. Anderson — deep subsurface microbiology |
| Faculty | R. Anderson (Carleton) |
| Python baseline | control/rare_biosphere/rare_biosphere.py |
| Benchmark JSON | control/rare_biosphere/benchmark_rare_biosphere.json |
| Rust validator | validate_* binary (exit-code protocol) |
| Rust speedup | See benchmark comparison notebook |
| License | AGPL-3.0-or-later |
Provenance chain: Python baseline → Rust validation → barraCuda GPU → metalForge cross-substrate → primal IPC composition
See primals.eco for rendered lab notebooks.