1.x updates

2026-05-19 08:34:22 -04:00
parent 3f3fce62d3
commit 9d91ac8ebc
53 changed files with 4541 additions and 2111 deletions
--- a/analysis.py
+++ b/analysis.py
@@ -1,597 +1,12 @@
-"""Shared loaders + derived columns for Garmin analysis notebooks.
+"""Backwards-compat shim — everything is now in openrun.

-Usage in a notebook:
-
-    from analysis import open_conn, load_activities, load_wellness
-    conn = open_conn()
-    runs = load_activities(conn, type='running')
-    wellness = load_wellness(conn)
+This module used to host all loaders + derived metrics. Phase 0 of the
+openrun refactor moved them into the `openrun` package. Existing notebooks
+that still `from analysis import ...` continue to work; new code should
+`from openrun import ...` directly.
 """
-
-from __future__ import annotations
-
-import json
-import sqlite3
-from pathlib import Path
-
-import pandas as pd
-
-DB_PATH = Path(__file__).parent / "data" / "garmin.db"
-
-
-def open_conn() -> sqlite3.Connection:
-    return sqlite3.connect(DB_PATH)
-
-
-# ---------------------------------------------------------------------------
-# activities
-# ---------------------------------------------------------------------------
-
-def load_activities(conn: sqlite3.Connection, *, type: str | None = None) -> pd.DataFrame:
-    """Activities with derived: distance_km, duration_min, pace_min_per_km, week, month, year."""
-    sql = """
-        SELECT activity_id, start_time_local, activity_type, activity_name,
-               distance_m, duration_s, moving_duration_s,
-               avg_speed_mps, max_speed_mps, avg_hr, max_hr, calories,
-               elevation_gain_m, elevation_loss_m,
-               training_load, aerobic_te, anaerobic_te, vo2_max
-        FROM activities
-    """
-    if type:
-        sql += " WHERE activity_type = ?"
-        df = pd.read_sql(sql, conn, params=[type], parse_dates=["start_time_local"])
-    else:
-        df = pd.read_sql(sql, conn, parse_dates=["start_time_local"])
-
-    df["distance_km"] = df["distance_m"] / 1000
-    df["duration_min"] = df["duration_s"] / 60
-    df["moving_min"] = df["moving_duration_s"] / 60
-    df["pace_min_per_km"] = df["moving_min"] / df["distance_km"]
-    # Filter physically-impossible paces (sub-3 min/km is faster than world-record marathon pace)
-    # and walks-with-stops (>30 min/km).
-    impossible = (df["distance_km"] < 0.2) | (df["pace_min_per_km"] < 3) | (df["pace_min_per_km"] > 30)
-    df.loc[impossible, "pace_min_per_km"] = pd.NA
-    df["date"] = df["start_time_local"].dt.normalize()
-    df["week"] = df["start_time_local"].dt.to_period("W").dt.start_time
-    df["month"] = df["start_time_local"].dt.to_period("M").dt.to_timestamp()
-    df["year"] = df["start_time_local"].dt.year
-    return df.sort_values("start_time_local").reset_index(drop=True)
-
-
-# ---------------------------------------------------------------------------
-# wellness
-# ---------------------------------------------------------------------------
-
-def load_wellness(conn: sqlite3.Connection) -> pd.DataFrame:
-    """Joined daily wellness frame indexed by calendar_date (datetime)."""
-    df = pd.read_sql(
-        """
-        SELECT s.calendar_date,
-               s.total_steps,
-               sl.sleep_score,
-               sl.deep_s, sl.light_s, sl.rem_s, sl.awake_s,
-               st.avg_stress,
-               h.last_night_avg AS hrv_last_night,
-               h.weekly_avg     AS hrv_weekly,
-               h.status         AS hrv_status,
-               im.moderate_minutes,
-               im.vigorous_minutes,
-               rh.resting_hr,
-               bb.charged       AS bb_charged,
-               bb.drained       AS bb_drained,
-               bb.highest       AS bb_highest,
-               bb.lowest        AS bb_lowest
-        FROM daily_steps s
-        LEFT JOIN daily_sleep              sl ON sl.calendar_date = s.calendar_date
-        LEFT JOIN daily_stress             st ON st.calendar_date = s.calendar_date
-        LEFT JOIN daily_hrv                h  ON h.calendar_date  = s.calendar_date
-        LEFT JOIN daily_intensity_minutes  im ON im.calendar_date = s.calendar_date
-        LEFT JOIN daily_resting_hr         rh ON rh.calendar_date = s.calendar_date
-        LEFT JOIN daily_body_battery       bb ON bb.calendar_date = s.calendar_date
-        ORDER BY s.calendar_date
-        """,
-        conn,
-        parse_dates=["calendar_date"],
-    ).set_index("calendar_date")
-    df["sleep_total_s"] = df[["deep_s", "light_s", "rem_s"]].sum(axis=1, min_count=1)
-    df["sleep_hours"] = df["sleep_total_s"] / 3600
-    df["deep_pct"] = df["deep_s"] / df["sleep_total_s"]
-    df["rem_pct"] = df["rem_s"] / df["sleep_total_s"]
-    return df
-
-
-# ---------------------------------------------------------------------------
-# combine: training load by day, joined with next-day wellness
-# ---------------------------------------------------------------------------
-
-def daily_training_load(conn: sqlite3.Connection) -> pd.DataFrame:
-    """Sum training load + distance per calendar date (any activity type)."""
-    acts = load_activities(conn)
-    daily = (
-        acts.groupby("date")
-        .agg(
-            training_load=("training_load", "sum"),
-            distance_km=("distance_km", "sum"),
-            duration_min=("duration_min", "sum"),
-            n_activities=("activity_id", "count"),
-            avg_hr_weighted=("avg_hr", "mean"),  # simple unweighted; refine if needed
-        )
-    )
-    daily.index = pd.to_datetime(daily.index)
-    return daily
-
-
-def joined(conn: sqlite3.Connection) -> pd.DataFrame:
-    """Wellness joined with same-day and previous-day training load."""
-    wellness = load_wellness(conn)
-    tl = daily_training_load(conn)
-    df = wellness.join(tl, how="left")
-    df[["training_load", "distance_km", "duration_min", "n_activities"]] = (
-        df[["training_load", "distance_km", "duration_min", "n_activities"]].fillna(0)
-    )
-    # previous day training load (commonly correlated with overnight HRV / next-morning RHR)
-    df["training_load_prev"] = df["training_load"].shift(1)
-    df["distance_km_prev"] = df["distance_km"].shift(1)
-    return df
-
-
-# ---------------------------------------------------------------------------
-# expand the raw JSON of a table when you want fields the schema doesn't surface
-# ---------------------------------------------------------------------------
-
-def expand_raw(df: pd.DataFrame, raw_col: str = "raw") -> pd.DataFrame:
-    """For a frame with a `raw` JSON column, return a normalized companion frame."""
-    if raw_col not in df.columns:
-        raise KeyError(f"no '{raw_col}' column in frame")
-    return pd.json_normalize([json.loads(r) for r in df[raw_col]])
-
-
-# ---------------------------------------------------------------------------
-# splits — per-lap data with cadence, stride, GPS, etc. extracted from raw JSON
-# ---------------------------------------------------------------------------
-
-_SPLIT_RAW_FIELDS = (
-    "averageRunCadence",
-    "maxRunCadence",
-    "strideLength",
-    "verticalOscillation",
-    "verticalRatio",
-    "groundContactTime",
-    "averagePower",
-    "normalizedPower",
-    "startLatitude",
-    "startLongitude",
-    "endLatitude",
-    "endLongitude",
-    "avgGradeAdjustedSpeed",
-    "maxHR",
-    "elevationGain",
-    "elevationLoss",
+from openrun import *  # noqa: F401,F403
+from openrun.model import (  # noqa: F401  (re-export private-ish helpers some callers use)
+    _SPLIT_RAW_FIELDS,
+    _resolve_fit_path,
 )
-
-
-def load_splits(conn: sqlite3.Connection, *, activity_type: str | None = "running") -> pd.DataFrame:
-    """Per-split frame with rich fields expanded from raw JSON, joined to activity start time.
-
-    Derived columns:
-      pace_min_per_km, pace_min_per_mile, speed_kmh, split_seq (0-based position in run),
-      n_splits (total in that run), frac_through (0..1), year, month.
-
-    Splits with implausible values (no HR, distance < 200m, pace > 30 min/km) are dropped.
-    """
-    sql = """
-        SELECT s.activity_id, s.split_index, s.distance_m, s.duration_s,
-               s.avg_hr, s.avg_speed_mps, s.elevation_gain_m AS split_elev_gain_m,
-               s.raw, a.start_time_local, a.activity_type
-        FROM activity_splits s
-        JOIN activities a ON a.activity_id = s.activity_id
-    """
-    params: list = []
-    if activity_type:
-        sql += " WHERE a.activity_type = ?"
-        params.append(activity_type)
-    sql += " ORDER BY s.activity_id, s.split_index"
-    df = pd.read_sql(sql, conn, params=params, parse_dates=["start_time_local"])
-
-    raws = [json.loads(r) if r else {} for r in df["raw"]]
-    for k in _SPLIT_RAW_FIELDS:
-        df[k] = [r.get(k) for r in raws]
-    df = df.drop(columns=["raw"])
-
-    df["pace_min_per_km"] = (df["duration_s"] / 60) / (df["distance_m"] / 1000)
-    df["pace_min_per_mile"] = (df["duration_s"] / 60) / (df["distance_m"] / 1609.344)
-    df["speed_kmh"] = df["avg_speed_mps"] * 3.6
-
-    bad = (
-        df["distance_m"].lt(200)
-        | df["avg_hr"].isna()
-        | df["avg_hr"].lt(60)
-        | df["pace_min_per_km"].gt(30)
-        | df["pace_min_per_km"].lt(2.5)
-    )
-    df = df.loc[~bad].copy()
-
-    df["split_seq"] = df.groupby("activity_id").cumcount()
-    df["n_splits"] = df.groupby("activity_id")["activity_id"].transform("count")
-    denom = (df["n_splits"] - 1).replace(0, pd.NA)
-    df["frac_through"] = df["split_seq"] / denom
-    df["year"] = df["start_time_local"].dt.year
-    df["month"] = df["start_time_local"].dt.to_period("M").dt.to_timestamp()
-    return df.reset_index(drop=True)
-
-
-def decoupling(splits: pd.DataFrame, min_splits: int = 6) -> pd.DataFrame:
-    """Per-activity Pa:Hr decoupling using duration-weighted halves.
-
-    `efficiency` per half = mean(speed_mps weighted by duration) / mean(HR weighted by duration).
-    `decoupling_pct` = (first_half_eff / second_half_eff - 1) * 100.
-        Positive  = pace/HR dropped in 2nd half (the textbook 'cardiac drift' direction).
-        Negative  = ran faster per beat in 2nd half (often: negative split, conservative start).
-
-    Endurance benchmark: <5% on a steady aerobic run is 'aerobically developed'.
-    """
-    valid = splits[splits["n_splits"] >= min_splits].copy()
-    valid["half"] = (valid["frac_through"] >= 0.5).map({False: "first", True: "second"})
-
-    def _half_eff(d: pd.DataFrame) -> float:
-        w = d["duration_s"].to_numpy()
-        speed = (d["avg_speed_mps"].to_numpy() * w).sum() / w.sum()
-        hr = (d["avg_hr"].to_numpy() * w).sum() / w.sum()
-        return speed / hr if hr else float("nan")
-
-    eff = (
-        valid.groupby(["activity_id", "half"])[["avg_speed_mps", "avg_hr", "duration_s"]]
-        .apply(_half_eff)
-        .unstack("half")
-    )
-    eff["decoupling_pct"] = (eff["first"] / eff["second"] - 1) * 100
-    eff = eff.dropna(subset=["decoupling_pct"])
-
-    meta = valid.groupby("activity_id").agg(
-        start_time_local=("start_time_local", "first"),
-        distance_km=("distance_m", lambda s: s.sum() / 1000),
-        duration_min=("duration_s", lambda s: s.sum() / 60),
-        avg_hr=("avg_hr", "mean"),
-        avg_pace_min_per_km=("pace_min_per_km", "mean"),
-        n_splits=("n_splits", "first"),
-    )
-    out = eff.join(meta).reset_index()
-    out["year"] = out["start_time_local"].dt.year
-    return out
-
-
-_HR_ZONE_BOUNDS = (0.50, 0.60, 0.70, 0.80, 0.90, 1.01)
-_HR_ZONE_LABELS = ("Z1", "Z2", "Z3", "Z4", "Z5")
-
-
-def assign_hr_zone(hr: float, hr_max: float) -> str | None:
-    if hr is None or pd.isna(hr) or not hr_max:
-        return None
-    frac = hr / hr_max
-    for lo, hi, lab in zip(_HR_ZONE_BOUNDS[:-1], _HR_ZONE_BOUNDS[1:], _HR_ZONE_LABELS):
-        if lo <= frac < hi:
-            return lab
-    return "Z5" if frac >= _HR_ZONE_BOUNDS[-2] else "Z1"
-
-
-# Garmin-configured HR zones for this user (source: DI_CONNECT/.../heartRateZones.json).
-# trainingMethod=HR_MAX, maxHeartRateUsed=209, lactateThresholdHR=182, RHR=52.
-HR_ZONES_USER: tuple[tuple[str, int, int], ...] = (
-    ("Z1", 102, 122),  # recovery
-    ("Z2", 123, 143),  # easy aerobic — long-run target
-    ("Z3", 144, 164),  # tempo / "junk-miles middle"
-    ("Z4", 165, 185),  # threshold (LTHR sits inside Z4 at 182)
-    ("Z5", 186, 209),  # VO2 max
-)
-
-
-def hr_to_user_zone(hr: float, zones: tuple[tuple[str, int, int], ...] = HR_ZONES_USER) -> str | None:
-    """Map a single HR reading to its configured zone label (Z1..Z5).
-
-    Below Z1 floor → None (warmup / walking).
-    Above Z5 ceiling → still Z5 (rare, edge of effort).
-    """
-    if hr is None or pd.isna(hr):
-        return None
-    if hr < zones[0][1]:  # below Z1 lower bound
-        return None
-    for label, lo, hi in zones:
-        if lo <= hr <= hi:
-            return label
-    return zones[-1][0]  # above Z5 ceiling
-
-
-def time_in_zone_from_fit(records: pd.DataFrame,
-                          zones: tuple[tuple[str, int, int], ...] = HR_ZONES_USER) -> dict[str, float]:
-    """Per-second time-in-zone (seconds) from a FIT records frame.
-
-    Each record contributes `elapsed_s - prev_elapsed_s` to whichever zone its HR
-    falls in. Large gaps (>30 s, e.g. a paused recording) are clipped to 30 s
-    so a stopped watch doesn't dump hours into one zone.
-    """
-    if records is None or records.empty or "heart_rate" not in records:
-        return {}
-    r = records.dropna(subset=["heart_rate", "elapsed_s"]).copy()
-    if r.empty:
-        return {}
-    r["dt"] = r["elapsed_s"].diff().fillna(1.0).clip(lower=0, upper=30.0)
-    r["zone"] = r["heart_rate"].apply(lambda h: hr_to_user_zone(h, zones))
-    return r.dropna(subset=["zone"]).groupby("zone")["dt"].sum().to_dict()
-
-
-def time_in_zone_from_splits(splits_df: pd.DataFrame,
-                             zones: tuple[tuple[str, int, int], ...] = HR_ZONES_USER) -> dict[str, float]:
-    """Fallback when there's no FIT — assign each split's avg HR to one zone.
-
-    Coarser than the per-second method: a split with avg HR 155 contributes its
-    entire duration to Z3, even if it was actually 1 min Z2 + 3 min Z3 + 1 min Z4.
-    """
-    if splits_df is None or splits_df.empty:
-        return {}
-    s = splits_df.dropna(subset=["avg_hr", "duration_s"]).copy()
-    s["zone"] = s["avg_hr"].apply(lambda h: hr_to_user_zone(h, zones))
-    return s.dropna(subset=["zone"]).groupby("zone")["duration_s"].sum().to_dict()
-
-
-def haversine_km(lat1, lon1, lat2, lon2):
-    """Vectorised great-circle distance, kilometres. Inputs in degrees."""
-    import numpy as np
-
-    r = 6371.0
-    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
-    dlat = lat2 - lat1
-    dlon = lon2 - lon1
-    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
-    return 2 * r * np.arcsin(np.sqrt(a))
-
-
-def banister(
-    daily_load: pd.Series,
-    *,
-    ctl_tau: float = 42.0,
-    atl_tau: float = 7.0,
-    start_date: str | pd.Timestamp | None = None,
-    end_date: str | pd.Timestamp | None = None,
-) -> pd.DataFrame:
-    """Banister fitness/fatigue/form (CTL/ATL/TSB) from a daily training-load series.
-
-    `daily_load` should be a Series indexed by date (one value per day, 0 for rest days).
-    Missing dates inside the range are filled with 0 — a rest day still updates both
-    EWMAs (CTL drifts down slowly, ATL drifts down fast → TSB recovers).
-
-    Returns a frame indexed by date with columns CTL, ATL, TSB.
-
-    Conventions (per Coggan / TrainingPeaks):
-      CTL_today  = CTL_yesterday · exp(−1/τ_CTL) + load_today · (1 − exp(−1/τ_CTL))
-      ATL_today  = ATL_yesterday · exp(−1/τ_ATL) + load_today · (1 − exp(−1/τ_ATL))
-      TSB_today  = CTL_yesterday − ATL_yesterday          # *yesterday's* values
-
-    TSB interpretation:
-      < −30  severely fatigued (injury risk)
-      −10 to −30  productive overload, the heart of a build block
-      −10 to 0   balanced building
-       0 to +10  sharpening
-      +10 to +25 fresh / peaked  ← race-day target
-      > +25   detrained (taper too long)
-    """
-    import numpy as np
-
-    if daily_load.empty:
-        return pd.DataFrame(columns=["CTL", "ATL", "TSB"])
-
-    idx = pd.to_datetime(daily_load.index).normalize()
-    s = pd.Series(daily_load.values, index=idx).groupby(level=0).sum()
-
-    lo = pd.Timestamp(start_date) if start_date else s.index.min()
-    hi = pd.Timestamp(end_date) if end_date else s.index.max()
-    full = s.reindex(pd.date_range(lo, hi, freq="D"), fill_value=0.0)
-
-    decay_ctl, decay_atl = np.exp(-1 / ctl_tau), np.exp(-1 / atl_tau)
-    w_ctl, w_atl = 1 - decay_ctl, 1 - decay_atl
-
-    n = len(full)
-    ctl = np.zeros(n)
-    atl = np.zeros(n)
-    loads = full.to_numpy()
-    for i in range(n):
-        prev_ctl = ctl[i - 1] if i else 0.0
-        prev_atl = atl[i - 1] if i else 0.0
-        ctl[i] = prev_ctl * decay_ctl + loads[i] * w_ctl
-        atl[i] = prev_atl * decay_atl + loads[i] * w_atl
-
-    out = pd.DataFrame({"CTL": ctl, "ATL": atl}, index=full.index)
-    out["TSB"] = out["CTL"].shift(1) - out["ATL"].shift(1)
-    return out
-
-
-def daily_training_load_series(
-    conn: sqlite3.Connection,
-    *,
-    activity_types: tuple[str, ...] = ("running", "trail_running"),
-) -> pd.Series:
-    """Daily-summed training_load across the given activity types, in ascending date order."""
-    placeholders = ",".join(["?"] * len(activity_types))
-    df = pd.read_sql(
-        f"""SELECT date(start_time_local) AS d, SUM(training_load) AS tl
-            FROM activities
-            WHERE activity_type IN ({placeholders}) AND training_load IS NOT NULL
-            GROUP BY d ORDER BY d""",
-        conn,
-        params=list(activity_types),
-        parse_dates=["d"],
-    )
-    return df.set_index("d")["tl"]
-
-
-def _resolve_fit_path(rel_path: str) -> Path:
-    """Find a FIT file on disk. `fit_path` in the DB is stored relative to the
-    export root that was passed to `link_fit_files.py`. We don't know which
-    top-level folder under the project that was, so try each."""
-    project_root = Path(__file__).parent
-    for entry in project_root.iterdir():
-        if entry.is_dir():
-            candidate = entry / rel_path
-            if candidate.exists():
-                return candidate
-    # Maybe the path is already absolute or relative to cwd
-    p = Path(rel_path)
-    if p.exists():
-        return p
-    raise FileNotFoundError(f"could not locate FIT file: {rel_path}")
-
-
-def load_fit_records(conn: sqlite3.Connection, activity_id: int) -> pd.DataFrame:
-    """Per-second FIT `record` messages for one activity as a DataFrame.
-
-    Columns (subset of what's present):
-      timestamp (UTC, tz-aware), elapsed_s, heart_rate, speed_mps, distance_m,
-      cadence_spm (both legs), altitude_m, power_w, position_lat_deg,
-      position_long_deg, vertical_oscillation_mm, step_length_mm.
-
-    Raises if no FIT is linked for the activity.
-    """
-    import fitparse  # heavy-ish import; keep lazy
-
-    row = conn.execute(
-        "SELECT fit_path FROM activity_fit_files WHERE activity_id = ?", (activity_id,)
-    ).fetchone()
-    if row is None:
-        raise ValueError(f"no FIT linked for activity {activity_id}")
-    fit_file = _resolve_fit_path(row[0])
-
-    fit = fitparse.FitFile(str(fit_file))
-    rows: list[dict] = []
-    for msg in fit.get_messages("record"):
-        rows.append(msg.get_values())
-    if not rows:
-        return pd.DataFrame()
-
-    df = pd.DataFrame(rows)
-    # Normalise & rename
-    out = pd.DataFrame()
-    if "timestamp" in df.columns:
-        out["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
-        out["elapsed_s"] = (out["timestamp"] - out["timestamp"].iloc[0]).dt.total_seconds()
-    out["heart_rate"] = df.get("heart_rate")
-    # Prefer enhanced_speed (always m/s) over the legacy `speed` field
-    out["speed_mps"] = df.get("enhanced_speed", df.get("speed"))
-    out["distance_m"] = df.get("distance")
-    # `cadence` is already both-legs SPM in this account's exports;
-    # fractional_cadence is a 0–1 fractional adjustment, ignored.
-    out["cadence_spm"] = df.get("cadence")
-    out["altitude_m"] = df.get("enhanced_altitude", df.get("altitude"))
-    out["power_w"] = df.get("power")
-    out["vertical_oscillation_mm"] = df.get("vertical_oscillation")
-    out["step_length_mm"] = df.get("step_length")
-    # Position: semicircles → degrees
-    SEMI = 180.0 / (2 ** 31)
-    if "position_lat" in df.columns:
-        out["position_lat_deg"] = df["position_lat"] * SEMI
-    if "position_long" in df.columns:
-        out["position_long_deg"] = df["position_long"] * SEMI
-    return out
-
-
-def fit_decoupling(
-    records: pd.DataFrame,
-    *,
-    segments: int = 2,
-    warmup_min: float = 5.0,
-    cooldown_min: float = 2.0,
-    min_speed_mps: float = 0.5,
-) -> pd.DataFrame:
-    """Per-second Pa:Hr decoupling — Friel's method, faithful to the literature.
-
-    Steps:
-      1. Drop the first `warmup_min` and last `cooldown_min` of the run.
-      2. Drop "stopped" records (speed below `min_speed_mps`) so aid-station
-         pauses don't drag mean speed down.
-      3. Slice the remaining moving time into `segments` equal-time chunks.
-      4. For each chunk: `efficiency = mean(speed_mps) / mean(heart_rate)`.
-      5. decoupling % = (segment_i / segment_0 − 1) × 100. Negative ⇒ pace/HR
-         improved (negative split). Positive ⇒ cardiac drift.
-
-    Returns one row per segment.
-    """
-    r = records.dropna(subset=["heart_rate", "speed_mps", "elapsed_s"]).copy()
-    if r.empty:
-        return pd.DataFrame()
-    total = r["elapsed_s"].iloc[-1]
-    r = r[(r["elapsed_s"] >= warmup_min * 60) & (r["elapsed_s"] <= total - cooldown_min * 60)]
-    moving = r[r["speed_mps"] >= min_speed_mps].copy()
-    if moving.empty:
-        return pd.DataFrame()
-
-    moving = moving.reset_index(drop=True)
-    seg_size = len(moving) // segments
-    out_rows: list[dict] = []
-    for i in range(segments):
-        s = i * seg_size
-        e = (i + 1) * seg_size if i < segments - 1 else len(moving)
-        chunk = moving.iloc[s:e]
-        speed = chunk["speed_mps"].mean()
-        hr = chunk["heart_rate"].mean()
-        out_rows.append(
-            {
-                "segment": i + 1,
-                "from_min": chunk["elapsed_s"].iloc[0] / 60,
-                "to_min": chunk["elapsed_s"].iloc[-1] / 60,
-                "mean_speed_mps": speed,
-                "mean_pace_min_per_km": (1 / speed) * 1000 / 60 if speed else float("nan"),
-                "mean_hr": hr,
-                "efficiency": speed / hr if hr else float("nan"),
-            }
-        )
-    out = pd.DataFrame(out_rows)
-    base = out["efficiency"].iloc[0]
-    out["decoupling_pct"] = (base / out["efficiency"] - 1) * 100
-    return out
-
-
-def fit_rolling_efficiency(records: pd.DataFrame, window_s: int = 300) -> pd.DataFrame:
-    """Rolling mean speed/HR (efficiency) and its derived rolling pace + HR.
-
-    Useful for plotting when efficiency declines through a race. `window_s`
-    defaults to 5 minutes — long enough to smooth GPS/HR jitter but short
-    enough to see drift in the second half.
-    """
-    r = records.dropna(subset=["heart_rate", "speed_mps", "elapsed_s"]).copy()
-    if r.empty:
-        return r
-    r = r.set_index("elapsed_s")
-    win = f"{window_s}s"
-    # Rolling needs a DatetimeIndex; build a synthetic one from elapsed_s.
-    r["_ts"] = pd.to_datetime(r.index, unit="s")
-    r = r.set_index("_ts")
-    rolled = pd.DataFrame(index=r.index)
-    rolled["rolling_speed_mps"] = r["speed_mps"].rolling(win).mean()
-    rolled["rolling_hr"] = r["heart_rate"].rolling(win).mean()
-    rolled["rolling_efficiency"] = rolled["rolling_speed_mps"] / rolled["rolling_hr"]
-    rolled["elapsed_min"] = (rolled.index - rolled.index[0]).total_seconds() / 60
-    return rolled.reset_index(drop=True)
-
-
-def cluster_routes(lats, lons, radius_km: float = 0.25):
-    """Greedy haversine-radius clustering of run start points.
-
-    Assigns each point to the cluster of the first unassigned point within `radius_km`.
-    Returns an integer label array; -1 means unclustered (no neighbours).
-    Good enough for a few hundred runs; for thousands, switch to sklearn DBSCAN with metric='haversine'.
-    """
-    import numpy as np
-
-    lats = np.asarray(lats, dtype=float)
-    lons = np.asarray(lons, dtype=float)
-    n = len(lats)
-    labels = np.full(n, -1, dtype=int)
-    next_label = 0
-    for i in range(n):
-        if labels[i] != -1:
-            continue
-        d = haversine_km(lats[i], lons[i], lats, lons)
-        neigh = np.where((d <= radius_km) & (labels == -1))[0]
-        # Require at least 2 runs to count as a "route"; singletons stay -1.
-        if len(neigh) >= 2:
-            labels[neigh] = next_label
-            next_label += 1
-    return labels