Files
openrun/link_fit_files.py
2026-05-18 12:53:24 -04:00

151 lines
5.8 KiB
Python

"""Match FIT files to activity rows by content (session.start_time).
The Garmin Connect data export embeds activity IDs in FIT filenames
(`<id>_<name>.fit`), so `ingest_export.py` can index them straight from the
filename. The newer Garmin **Takeout** dump uses upload IDs instead
(`<email>_<upload_id>.fit`), which are a different ID space, so filename-based
linking fails. This script does the linking via FIT content: it opens each FIT,
reads the `session.start_time` (UTC), and matches it against
`activities.start_time_gmt` within a small tolerance.
Usage:
uv run link_fit_files.py <export_root>
uv run link_fit_files.py <export_root> --dry-run --min-size-kb 50
Defaults:
Only parses FITs >= 50 KB (workout FITs; smaller files are HRV/sleep/body-battery
monitoring snapshots, ~99% of the dump by count).
Tolerance: 60 seconds. Garmin sometimes rounds session start to the second
while activities.start_time_gmt is also second-precision.
Re-running is safe — rows are upserted by activity_id.
"""
from __future__ import annotations

import argparse
import sqlite3
import sys
from bisect import bisect_left
from datetime import datetime, timezone
from pathlib import Path

import fitparse

from db import connect
def _parse_session_start(fit_path: Path) -> datetime | None:
    """Read the first usable session ``start_time`` from a FIT file.

    Walks the file's ``session`` messages in order and returns the first
    ``start_time`` that is a ``datetime``, normalised to an aware UTC value.
    Returns ``None`` when no session carries such a value, or when anything
    goes wrong while opening/parsing the file (the error is reported on
    stderr and otherwise swallowed so one bad file does not stop a scan).
    """
    try:
        sessions = fitparse.FitFile(str(fit_path)).get_messages("session")
        for session in sessions:
            started = session.get_values().get("start_time")
            if not isinstance(started, datetime):
                # Missing or not decoded as a datetime: try the next session.
                continue
            if started.tzinfo is None:
                # fitparse returns naive datetimes in UTC for FIT timestamps.
                return started.replace(tzinfo=timezone.utc)
            return started.astimezone(timezone.utc)
    except Exception as exc:  # noqa: BLE001
        print(f" ! parse failed for {fit_path.name}: {exc}", file=sys.stderr)
    return None
def _load_activity_index(conn: sqlite3.Connection) -> dict[int, int]:
    """Map activity start time (epoch seconds, UTC) -> activity_id.

    Reads every row of ``activities`` and converts ``start_time_gmt`` to whole
    epoch seconds. Accepted representations:

    * a number: treated as milliseconds when > 1e12, otherwise as seconds;
    * an ISO-8601 string, optionally ending in ``Z``. A string without an
      offset is taken to be UTC; a string with an explicit offset is
      converted using that offset.

    Rows whose value is NULL or cannot be interpreted are skipped. If several
    activities resolve to the same second, the last one returned by the query
    wins.
    """
    idx: dict[int, int] = {}
    for aid, gmt in conn.execute("SELECT activity_id, start_time_gmt FROM activities"):
        if gmt is None:
            continue
        try:
            if isinstance(gmt, (int, float)):
                v = float(gmt)
                # Heuristic: >1e12 means ms epoch, otherwise seconds.
                secs = int(v / 1000) if v > 1e12 else int(v)
            else:
                parsed = datetime.fromisoformat(str(gmt).strip().rstrip("Z"))
                if parsed.tzinfo is None:
                    # No offset in the text: the column is GMT, so label it UTC.
                    parsed = parsed.replace(tzinfo=timezone.utc)
                # An aware datetime already converts correctly; overwriting its
                # tzinfo would silently shift the instant by the offset.
                secs = int(parsed.timestamp())
        except (ValueError, TypeError, OverflowError):
            # OverflowError covers non-finite numbers such as inf.
            continue
        idx[secs] = aid
    return idx
def _closest_key(sorted_keys: list[int], target: int, tolerance_s: int) -> int | None:
    """Return the key nearest to ``target`` within ±``tolerance_s``, else None."""
    pos = bisect_left(sorted_keys, target)
    # Only the two neighbours of the insertion point can be the nearest key.
    # The key at/after the target is listed first so it wins an exact tie.
    neighbours: list[int] = []
    if pos < len(sorted_keys):
        neighbours.append(sorted_keys[pos])
    if pos > 0:
        neighbours.append(sorted_keys[pos - 1])
    in_range = [k for k in neighbours if abs(k - target) <= tolerance_s]
    return min(in_range, key=lambda k: abs(k - target), default=None)


def link(export_root: Path, *, dry_run: bool, min_size_kb: int, tolerance_s: int) -> None:
    """Link FIT files under ``export_root`` to activities by session start time.

    Args:
        export_root: Directory searched recursively for ``*.fit`` files.
        dry_run: When true, matching and counting run but nothing is written.
        min_size_kb: Files smaller than this many KiB are not parsed.
        tolerance_s: Maximum distance in seconds between a FIT session start
            and an activity start for the two to be considered the same.

    Exits the process (``SystemExit``) when ``export_root`` does not exist or
    when no activity start times could be loaded. Writes go to
    ``activity_fit_files`` via ``INSERT OR REPLACE`` and are committed once,
    after the whole scan; the connection is closed on every exit path.
    """
    if not export_root.exists():
        sys.exit(f"path does not exist: {export_root}")

    conn = connect()
    try:
        index = _load_activity_index(conn)
        if not index:
            sys.exit("no activities with start_time_gmt — ingest activities first")
        sorted_keys = sorted(index)
        print(f"loaded {len(index):,} activity start times")

        min_bytes = min_size_kb * 1024
        # is_file() keeps broken symlinks and directories that happen to match
        # the pattern from crashing stat() or being handed to the parser.
        candidates = [
            p
            for p in export_root.rglob("*.fit")
            if p.is_file() and p.stat().st_size >= min_bytes
        ]
        print(f"scanning {len(candidates):,} FIT files ≥ {min_size_kb} KB")

        linked = unmatched = parse_failed = 0
        for i, p in enumerate(candidates, 1):
            if i % 50 == 0:
                print(f"{i}/{len(candidates)} processed (linked={linked}, unmatched={unmatched})")

            start = _parse_session_start(p)
            if start is None:
                parse_failed += 1
                continue

            best = _closest_key(sorted_keys, int(start.timestamp()), tolerance_s)
            if best is None:
                unmatched += 1
                continue

            if not dry_run:
                conn.execute(
                    """INSERT OR REPLACE INTO activity_fit_files (activity_id, fit_path, indexed_at)
                    VALUES (?, ?, datetime('now'))""",
                    (index[best], str(p.relative_to(export_root))),
                )
            # Counted in dry-run mode too, so the summary previews the result.
            linked += 1

        if not dry_run:
            conn.commit()
    finally:
        # Runs on normal completion, on sys.exit above, and on any error.
        conn.close()

    print()
    print("=== linker summary ===")
    print(f" candidates scanned: {len(candidates):,}")
    print(f" linked : {linked:,}" + (" (dry-run)" if dry_run else ""))
    print(f" unmatched start : {unmatched:,} (no activity within ±{tolerance_s}s)")
    print(f" parse failures : {parse_failed:,}")
def main() -> None:
    """Parse command-line arguments and run :func:`link`.

    The module docstring is used as the ``--help`` description. It contains
    pre-formatted usage examples and defaults, so it is rendered verbatim
    instead of being re-wrapped into a single paragraph.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("export_root", help="Path to the unzipped Garmin export folder")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument(
        "--min-size-kb",
        type=int,
        default=50,
        help="Skip FITs smaller than this; default 50 KB filters out HRV/sleep snapshots",
    )
    parser.add_argument(
        "--tolerance-s",
        type=int,
        default=60,
        help="Max seconds between FIT session start and activity start to count as a match",
    )
    args = parser.parse_args()

    link(
        # Expand "~" and make the path absolute before scanning.
        Path(args.export_root).expanduser().resolve(),
        dry_run=args.dry_run,
        min_size_kb=args.min_size_kb,
        tolerance_s=args.tolerance_s,
    )


if __name__ == "__main__":
    main()