Files
openrun/ingest_export.py
2026-05-18 12:53:24 -04:00

499 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Ingest a Garmin Connect data export (zip or unzipped directory) into SQLite.
Usage:
uv run ingest_export.py path/to/export.zip
uv run ingest_export.py path/to/unzipped_export_dir/
Garmin's export contains a tree like:
DI_CONNECT/
DI-Connect-Fitness/ # activity JSONs + .fit files
DI-Connect-Wellness/ # daily wellness JSONs
DI-Connect-Aggregator/ # rolled-up summaries
DI-Connect-User/ # profile
...
File names vary by account / export date. We dispatch on filename substrings
and log anything unrecognized so you can tell us about new shapes.
"""
from __future__ import annotations
import argparse
import json
import sqlite3
import sys
import tempfile
import zipfile
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Iterable
from db import connect, set_state
# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
def _load_json(path: Path) -> Any:
    """Read and parse one JSON file, returning ``None`` when it is unusable.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding; the ``utf-8-sig`` codec additionally tolerates a leading
    byte-order mark. Read errors, undecodable bytes and malformed JSON are
    reported on stderr and turned into ``None`` so that one bad file does not
    abort the caller.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed payload, or ``None`` if the file could not be read/parsed.
        (A file whose content is the JSON literal ``null`` also yields ``None``.)
    """
    try:
        with path.open(encoding="utf-8-sig") as fh:
            return json.load(fh)
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        print(f" ! failed to read {path.name}: {exc}", file=sys.stderr)
        return None
def _as_list(payload: Any) -> list[dict]:
    """Normalize a parsed Garmin payload into a flat list of record dicts.

    A top-level list is filtered down to its dict entries. For a top-level
    dict, the first value that is a list is treated as the wrapped record
    array (the ``{"key": [...]}`` envelope) and filtered the same way; a dict
    without any list value is itself the single record. Any other payload
    yields an empty list.
    """
    if isinstance(payload, dict):
        wrapped = next(
            (value for value in payload.values() if isinstance(value, list)),
            None,
        )
        if wrapped is None:
            return [payload]
        payload = wrapped
    elif not isinstance(payload, list):
        return []
    return [entry for entry in payload if isinstance(entry, dict)]
def _dump(obj: Any) -> str:
    """Serialize *obj* as compact JSON (no whitespace after separators).

    Values the ``json`` module cannot encode natively are converted with ``str``.
    """
    compact_separators = (",", ":")
    return json.dumps(obj, default=str, separators=compact_separators)
def _first(d: dict, *keys: str) -> Any:
    """Return the value of the first candidate key that is present and not ``None``.

    Garmin renames fields between exports, so callers pass every known
    spelling in priority order. Falsy values other than ``None`` (``0``,
    ``""``) count as found. ``None`` is returned when no candidate matches.
    """
    present = (d[name] for name in keys if name in d)
    return next((value for value in present if value is not None), None)
def _date_key(d: dict) -> str | None:
    """Extract a daily record's calendar date as its first ten characters.

    The date is looked up under several known field names. A missing or
    falsy value gives ``None``; anything else is stringified and cut to ten
    characters, which is ``YYYY-MM-DD`` when the value is an ISO-formatted
    date or timestamp string.
    """
    value = _first(d, "calendarDate", "date", "summaryDate", "statisticsStartDate")
    # str() leaves a plain string unchanged, so a single expression covers
    # both string and non-string values.
    return str(value)[:10] if value else None
# ---------------------------------------------------------------------------
# handlers — one per data category
# ---------------------------------------------------------------------------
def _activity_items(payload: Any) -> list[dict]:
    """Flatten a parsed activities payload into a list of activity dicts.

    Supported shapes:
      * ``[{"summarizedActivitiesExport": [{...}, ...]}, ...]`` — every
        envelope in the list is unwrapped, not just the first one;
      * ``[{...}, {...}]`` — a bare list of activity records;
      * a single dict — delegated to ``_as_list``.
    Entries that are not dicts (and envelopes whose value is not a list) are
    skipped instead of raising later in the insert loop.
    """
    if isinstance(payload, dict):
        return _as_list(payload)
    if not isinstance(payload, list):
        return []
    items: list[dict] = []
    for entry in payload:
        if not isinstance(entry, dict):
            continue
        if "summarizedActivitiesExport" in entry:
            wrapped = entry["summarizedActivitiesExport"]
            if isinstance(wrapped, list):
                items.extend(x for x in wrapped if isinstance(x, dict))
        else:
            items.append(entry)
    return items


def handle_activities(conn: sqlite3.Connection, path: Path) -> int:
    """Upsert the activities found in one ``summarizedActivities`` JSON file.

    Records without an activity id are skipped. When an activity id already
    exists, only ``activity_name``, ``raw`` and ``fetched_at`` are refreshed;
    the other columns keep their stored values. The connection is committed
    once after all records have been processed.

    Args:
        conn: Open SQLite connection that has an ``activities`` table.
        path: Path of the JSON file to ingest.

    Returns:
        Number of activity records written (inserted or updated); 0 when the
        file could not be read.
    """
    payload = _load_json(path)
    if payload is None:
        return 0

    # The Garmin Takeout `summarizedActivities` export uses scaled integer units:
    #   distance   cm        → m   (÷100)
    #   duration   ms        → s   (÷1000)
    #   elevation  cm        → m   (÷100)
    #   speed      m/s ÷ 10  → m/s (×10)
    # The live API (`sync.py`) returns these in SI directly. This handler
    # applies the conversion unconditionally to every record it is given.
    def _scale(v, factor):
        return None if v is None else v * factor

    sql = (
        "\n"
        "INSERT INTO activities (\n"
        "activity_id, start_time_local, start_time_gmt, activity_type,\n"
        "activity_name, distance_m, duration_s, moving_duration_s,\n"
        "avg_speed_mps, max_speed_mps, avg_hr, max_hr, calories,\n"
        "elevation_gain_m, elevation_loss_m, training_load,\n"
        "aerobic_te, anaerobic_te, vo2_max, raw, fetched_at\n"
        ") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))\n"
        "ON CONFLICT(activity_id) DO UPDATE SET\n"
        "activity_name=excluded.activity_name,\n"
        "raw=excluded.raw,\n"
        "fetched_at=excluded.fetched_at\n"
    )

    n = 0
    for raw in _activity_items(payload):
        aid = _first(raw, "activityId", "activityIdLocal")
        if aid is None:
            continue
        atype = raw.get("activityType")
        # Usually a {"typeKey": ...} dict; sometimes a plain string in exports.
        type_key = atype.get("typeKey") if isinstance(atype, dict) else atype
        conn.execute(
            sql,
            (
                aid,
                _first(raw, "startTimeLocal", "beginTimestamp"),
                _first(raw, "startTimeGmt", "startTimeGMT"),
                type_key,
                _first(raw, "activityName", "name"),
                _scale(_first(raw, "distance"), 0.01),
                _scale(_first(raw, "duration"), 0.001),
                _scale(_first(raw, "movingDuration"), 0.001),
                _scale(_first(raw, "averageSpeed", "avgSpeed"), 10.0),
                _scale(_first(raw, "maxSpeed"), 10.0),
                _first(raw, "averageHR", "avgHr"),
                _first(raw, "maxHR", "maxHr"),
                _first(raw, "calories"),
                _scale(_first(raw, "elevationGain"), 0.01),
                _scale(_first(raw, "elevationLoss"), 0.01),
                _first(raw, "activityTrainingLoad"),
                _first(raw, "aerobicTrainingEffect"),
                _first(raw, "anaerobicTrainingEffect"),
                _first(raw, "vO2MaxValue"),
                _dump(raw),
            ),
        )
        n += 1
    conn.commit()
    return n
def handle_sleep(conn: sqlite3.Connection, path: Path) -> int:
    """Write the daily sleep records of one JSON file into ``daily_sleep``.

    Records without a recognizable calendar date are skipped. Each remaining
    record is stored with ``INSERT OR REPLACE`` and the connection is
    committed once at the end.

    Returns:
        Number of records written; 0 when the file could not be read.
    """
    data = _load_json(path)
    if data is None:
        return 0

    insert_sql = (
        "INSERT OR REPLACE INTO daily_sleep\n"
        "(calendar_date, sleep_start_gmt, sleep_end_gmt, deep_s, light_s, "
        "rem_s, awake_s, sleep_score, raw, fetched_at)\n"
        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))"
    )
    written = 0
    for record in _as_list(data):
        day = _date_key(record)
        if day:
            params = (
                day,
                _first(record, "sleepStartTimestampGMT", "sleepStartTimeGmt"),
                _first(record, "sleepEndTimestampGMT", "sleepEndTimeGmt"),
                _first(record, "deepSleepSeconds"),
                _first(record, "lightSleepSeconds"),
                _first(record, "remSleepSeconds"),
                _first(record, "awakeSleepSeconds", "awakeSeconds"),
                _first(record, "sleepScore", "overallSleepScore", "value"),
                _dump(record),
            )
            conn.execute(insert_sql, params)
            written += 1
    conn.commit()
    return written
def handle_steps(conn: sqlite3.Connection, path: Path) -> int:
    """Write the daily step records of one JSON file into ``daily_steps``.

    Records without a recognizable calendar date are skipped. Each remaining
    record is stored with ``INSERT OR REPLACE`` and the connection is
    committed once at the end.

    Returns:
        Number of records written; 0 when the file could not be read.
    """
    data = _load_json(path)
    if data is None:
        return 0

    insert_sql = (
        "INSERT OR REPLACE INTO daily_steps\n"
        "(calendar_date, total_steps, step_goal, distance_m, raw, fetched_at)\n"
        "VALUES (?, ?, ?, ?, ?, datetime('now'))"
    )
    written = 0
    for record in _as_list(data):
        day = _date_key(record)
        if day:
            params = (
                day,
                _first(record, "totalSteps", "steps"),
                _first(record, "stepGoal", "dailyStepGoal"),
                _first(record, "totalDistance", "distance"),
                _dump(record),
            )
            conn.execute(insert_sql, params)
            written += 1
    conn.commit()
    return written
def handle_stress(conn: sqlite3.Connection, path: Path) -> int:
    """Write the daily stress records of one JSON file into ``daily_stress``.

    Records without a recognizable calendar date are skipped. Each remaining
    record is stored with ``INSERT OR REPLACE`` and the connection is
    committed once at the end.

    Returns:
        Number of records written; 0 when the file could not be read.
    """
    data = _load_json(path)
    if data is None:
        return 0

    insert_sql = (
        "INSERT OR REPLACE INTO daily_stress\n"
        "(calendar_date, avg_stress, max_stress, raw, fetched_at)\n"
        "VALUES (?, ?, ?, ?, datetime('now'))"
    )
    written = 0
    for record in _as_list(data):
        day = _date_key(record)
        if day:
            params = (
                day,
                _first(record, "overallStressLevel", "averageStressLevel", "avgStress"),
                _first(record, "maxStressLevel", "maxStress"),
                _dump(record),
            )
            conn.execute(insert_sql, params)
            written += 1
    conn.commit()
    return written
def handle_hrv(conn: sqlite3.Connection, path: Path) -> int:
    """Write the daily HRV records of one JSON file into ``daily_hrv``.

    Records without a recognizable calendar date are skipped. Each remaining
    record is stored with ``INSERT OR REPLACE`` and the connection is
    committed once at the end.

    Returns:
        Number of records written; 0 when the file could not be read.
    """
    data = _load_json(path)
    if data is None:
        return 0

    insert_sql = (
        "INSERT OR REPLACE INTO daily_hrv\n"
        "(calendar_date, weekly_avg, last_night_avg, last_night_5min, status, raw, fetched_at)\n"
        "VALUES (?, ?, ?, ?, ?, ?, datetime('now'))"
    )
    written = 0
    for record in _as_list(data):
        day = _date_key(record)
        if day:
            params = (
                day,
                _first(record, "weeklyAvg"),
                _first(record, "lastNightAvg"),
                _first(record, "lastNight5MinHigh"),
                _first(record, "status"),
                _dump(record),
            )
            conn.execute(insert_sql, params)
            written += 1
    conn.commit()
    return written
def handle_resting_hr(conn: sqlite3.Connection, path: Path) -> int:
    """Write the resting-heart-rate records of one JSON file into ``daily_resting_hr``.

    A record is skipped when it has no recognizable calendar date or no
    resting heart rate value. Each remaining record is stored with
    ``INSERT OR REPLACE`` and the connection is committed once at the end.

    Returns:
        Number of records written; 0 when the file could not be read.
    """
    data = _load_json(path)
    if data is None:
        return 0

    insert_sql = (
        "INSERT OR REPLACE INTO daily_resting_hr\n"
        "(calendar_date, resting_hr, raw, fetched_at)\n"
        "VALUES (?, ?, ?, datetime('now'))"
    )
    written = 0
    for record in _as_list(data):
        day = _date_key(record)
        if not day:
            continue
        resting = _first(record, "restingHeartRate", "value")
        if resting is not None:
            conn.execute(insert_sql, (day, resting, _dump(record)))
            written += 1
    conn.commit()
    return written
def handle_intensity_minutes(conn: sqlite3.Connection, path: Path) -> int:
    """Write the intensity-minute records of one JSON file into ``daily_intensity_minutes``.

    Records without a recognizable calendar date are skipped. Each remaining
    record is stored with ``INSERT OR REPLACE`` and the connection is
    committed once at the end.

    Returns:
        Number of records written; 0 when the file could not be read.
    """
    data = _load_json(path)
    if data is None:
        return 0

    insert_sql = (
        "INSERT OR REPLACE INTO daily_intensity_minutes\n"
        "(calendar_date, moderate_minutes, vigorous_minutes, raw, fetched_at)\n"
        "VALUES (?, ?, ?, ?, datetime('now'))"
    )
    written = 0
    for record in _as_list(data):
        day = _date_key(record)
        if day:
            params = (
                day,
                _first(record, "moderateIntensityMinutes", "moderateValue"),
                _first(record, "vigorousIntensityMinutes", "vigorousValue"),
                _dump(record),
            )
            conn.execute(insert_sql, params)
            written += 1
    conn.commit()
    return written
def handle_body_battery(conn: sqlite3.Connection, path: Path) -> int:
    """Write the body-battery records of one JSON file into ``daily_body_battery``.

    Records without a recognizable calendar date are skipped. Each remaining
    record is stored with ``INSERT OR REPLACE`` and the connection is
    committed once at the end.

    Returns:
        Number of records written; 0 when the file could not be read.
    """
    data = _load_json(path)
    if data is None:
        return 0

    insert_sql = (
        "INSERT OR REPLACE INTO daily_body_battery\n"
        "(calendar_date, charged, drained, highest, lowest, raw, fetched_at)\n"
        "VALUES (?, ?, ?, ?, ?, ?, datetime('now'))"
    )
    written = 0
    for record in _as_list(data):
        day = _date_key(record)
        if day:
            params = (
                day,
                _first(record, "charged", "bodyBatteryChargedValue"),
                _first(record, "drained", "bodyBatteryDrainedValue"),
                _first(record, "highest", "highestBatteryLevel", "bodyBatteryHighestValue"),
                _first(record, "lowest", "lowestBatteryLevel", "bodyBatteryLowestValue"),
                _dump(record),
            )
            conn.execute(insert_sql, params)
            written += 1
    conn.commit()
    return written
def handle_fit(conn: sqlite3.Connection, path: Path, export_root: Path) -> int:
    """Index a .fit file's location by activity ID. Don't parse the binary here.

    Garmin uses a few filename formats — handle both the classic
    `<id>_<name>.fit` and the Takeout variant `<email>_<id>.fit` by picking
    the first underscore-separated chunk made of 8 or more decimal digits.

    Args:
        conn: Open SQLite connection with ``activities`` and
            ``activity_fit_files`` tables.
        path: Path of the .fit file; must be located under ``export_root``.
        export_root: Root of the export; the stored path is relative to it.

    Returns:
        1 when a row was written (and committed), 0 when the file was skipped.
    """
    # `isdecimal` rather than `isdigit`: `isdigit` is also true for characters
    # such as superscript digits, which `int()` rejects with ValueError. Every
    # chunk accepted by `isdecimal` is parseable by `int()`.
    aid = next(
        (
            int(chunk)
            for chunk in path.stem.split("_")
            if chunk.isdecimal() and len(chunk) >= 8
        ),
        None,
    )
    if aid is None:
        # filename has no recognisable activity-id chunk
        return 0

    # Verify the parsed number is actually an activity_id we know about.
    # The Garmin Takeout dump uses *upload IDs* in FIT filenames (different ID
    # space from activity IDs), so naive insert would create thousands of
    # orphaned rows. If the id isn't in `activities`, skip — link_fit_files.py
    # handles the by-content matching for takeout-format exports.
    known = conn.execute(
        "SELECT 1 FROM activities WHERE activity_id = ? LIMIT 1", (aid,)
    ).fetchone()
    if known is None:
        return 0

    rel = path.relative_to(export_root)
    conn.execute(
        "INSERT OR REPLACE INTO activity_fit_files (activity_id, fit_path, indexed_at)\n"
        "VALUES (?, ?, datetime('now'))",
        (aid, str(rel)),
    )
    conn.commit()
    return 1
# ---------------------------------------------------------------------------
# dispatch — pattern → handler
# ---------------------------------------------------------------------------
# A handler receives the open DB connection and the path of one JSON file and
# returns an int, which main() adds to the per-label row count.
Handler = Callable[[sqlite3.Connection, Path], int]
# Order matters — first match wins. Each entry is
# (filename substring, summary label, handler). Needles may be written in
# mixed case: classify() lowercases both the needle and the file name before
# the substring test, so matching is case-insensitive.
DISPATCH: list[tuple[str, str, Handler]] = [
    ("summarizedActivities", "activities", handle_activities),
    ("sleepData", "sleep", handle_sleep),
    ("sleep", "sleep", handle_sleep),
    ("UDSFile", "steps", handle_steps), # daily steps file naming varies
    ("step", "steps", handle_steps),
    ("stressLevel", "stress", handle_stress),
    ("stress", "stress", handle_stress),
    ("hrvStatus", "hrv", handle_hrv),
    ("hrv", "hrv", handle_hrv),
    ("restingHeart", "resting_hr", handle_resting_hr),
    ("RHR", "resting_hr", handle_resting_hr),
    ("intensityMinute", "intensity", handle_intensity_minutes),
    ("bodyBattery", "body_batt", handle_body_battery),
    ("BBS", "body_batt", handle_body_battery),
]
def classify(name: str) -> tuple[str, Handler] | None:
    """Map a file name to its ``(label, handler)`` pair using ``DISPATCH``.

    Matching is a case-insensitive substring test. Entries are tried in table
    order and the first hit wins; ``None`` is returned when nothing matches.
    """
    haystack = name.lower()
    hits = (
        (label, handler)
        for needle, label, handler in DISPATCH
        if needle.lower() in haystack
    )
    return next(hits, None)
# ---------------------------------------------------------------------------
# entry
# ---------------------------------------------------------------------------
def iter_files(root: Path) -> Iterable[Path]:
    """Yield every path beneath *root* (searched recursively) that is a file.

    Directories are walked but never yielded themselves.
    """
    yield from filter(Path.is_file, root.rglob("*"))
def main() -> None:
    """CLI entry point: ingest one Garmin export (zip or directory) into the DB.

    A ``.zip`` source is extracted into a temporary directory first. Every
    ``.json`` file is routed through ``classify`` to its handler; files with
    no matching handler are listed in the summary. ``.fit`` files are indexed
    only after all JSON files have been handled, because ``handle_fit`` skips
    any file whose activity id is not yet present in ``activities`` and the
    directory walk does not guarantee that activity JSON is visited first.

    With ``--dry-run`` nothing is passed to the handlers; the summary then
    shows file counts instead of row counts.

    The DB connection and the temporary directory are released even when the
    run fails part-way.
    """
    # Local import: the module header imports only ``datetime`` from datetime.
    from datetime import timezone

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("source", help="Path to export.zip or unzipped export directory")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show what would be ingested without writing to the DB")
    args = parser.parse_args()
    src = Path(args.source).expanduser().resolve()
    if not src.exists():
        sys.exit(f"path does not exist: {src}")

    cleanup_dir: tempfile.TemporaryDirectory | None = None
    conn = None
    try:
        if src.is_file() and src.suffix.lower() == ".zip":
            cleanup_dir = tempfile.TemporaryDirectory(prefix="garmin_export_")
            export_root = Path(cleanup_dir.name)
            print(f"unzipping {src.name} -> {export_root}")
            with zipfile.ZipFile(src) as zf:
                zf.extractall(export_root)
        elif src.is_dir():
            export_root = src
        else:
            sys.exit(f"unsupported source: {src}")

        conn = connect()
        counts: dict[str, int] = defaultdict(int)
        unknown: list[str] = []
        fit_paths: list[Path] = []

        for path in iter_files(export_root):
            suffix = path.suffix.lower()
            if suffix == ".fit":
                # Deferred until every JSON file (and so every activity row)
                # has been ingested.
                fit_paths.append(path)
                continue
            if suffix != ".json":
                continue
            matched = classify(path.name)
            if not matched:
                unknown.append(str(path.relative_to(export_root)))
                continue
            label, fn = matched
            if args.dry_run:
                counts[label] += 1
                continue
            try:
                counts[label] += fn(conn, path)
            except Exception as exc:  # noqa: BLE001
                print(f" ! error in {label} handler for {path.name}: {exc}", file=sys.stderr)

        if args.dry_run:
            fit_count = len(fit_paths)
        else:
            fit_count = sum(handle_fit(conn, p, export_root) for p in fit_paths)

        print("\n=== ingest summary ===")
        for label, n in sorted(counts.items()):
            print(f" {label:20s} {n:>6} rows" + (" (file count, dry run)" if args.dry_run else ""))
        print(f" fit_files {fit_count:>6}")
        if unknown:
            print(f"\n unrecognized JSON files ({len(unknown)}):")
            for name in unknown[:25]:
                print(f" {name}")
            if len(unknown) > 25:
                print(f" ... and {len(unknown) - 25} more")

        if not args.dry_run:
            # Same naive-UTC "YYYY-MM-DDTHH:MM:SS" text that the deprecated
            # datetime.utcnow() produced.
            now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
            set_state(conn, "last_ingest_utc", now_utc.isoformat(timespec="seconds"))
            set_state(conn, "last_ingest_source", str(src))
            conn.commit()
    finally:
        if conn is not None:
            conn.close()
        if cleanup_dir is not None:
            cleanup_dir.cleanup()
    print("\n✓ done")


if __name__ == "__main__":
    main()