###########################################################################
# Copyright (C) 2025 ETH Zurich
# CosinorAge: Prediction of biological age based on accelerometer data
# using the CosinorAge method proposed by Shim, Fleisch and Barata
# (https://www.nature.com/articles/s41746-024-01111-x)
#
# Authors: Jacob Leo Oskar Hunecke
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##########################################################################
import os
from typing import Union
import pandas as pd
from claid.data_collection.load.load_sensor_data import *
from .calc_enmo import calculate_enmo
from .calibration import calibrate_accelerometer
from .filtering import filter_consecutive_days, filter_incomplete_days
from .frequency_detection import detect_frequency_from_timestamps
from .noise_removal import remove_noise
from .wear_detection import calc_weartime, detect_wear_periods
[docs]
def read_galaxy_binary_data(
    galaxy_file_dir: str,
    meta_dict: dict,
    time_column: str = "unix_timestamp_in_ms",
    data_columns: Union[list, None] = None,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Read accelerometer data from Galaxy Watch binary files.

    Every sub-directory of ``galaxy_file_dir`` is scanned for files whose
    name starts with ``acceleration_data`` and ends with ``.binary``. Each
    file is loaded with ``load_acceleration_data``, converted with
    ``acceleration_data_to_dataframe`` and the results are concatenated.

    Parameters
    ----------
    galaxy_file_dir : str
        Directory containing one sub-directory per recording day. A trailing
        path separator is optional.
    meta_dict : dict
        Dictionary that is updated in place with the keys
        ``raw_n_datapoints``, ``raw_start_datetime``, ``raw_end_datetime``,
        ``sf``, ``raw_data_frequency`` and ``raw_data_unit``.
    time_column : str
        Name of the timestamp column in the binary data (milliseconds since
        the Unix epoch).
    data_columns : list
        Names of the data columns in the binary data. The first three entries
        are renamed to ``x``, ``y`` and ``z``; defaults to
        ``["acceleration_x", "acceleration_y", "acceleration_z"]``.
    verbose : bool
        Whether to print progress information

    Returns
    -------
    pd.DataFrame
        Accelerometer data indexed by ``timestamp`` (sorted ascending), with
        the data columns renamed to ``x``, ``y``, ``z``, the
        ``effective_time_frame`` and ``sensor_body_location`` columns removed
        and missing values replaced by 0.

    Raises
    ------
    FileNotFoundError
        If no acceleration samples could be read from ``galaxy_file_dir``.
    """
    # Set default data_columns if not provided
    if data_columns is None:
        data_columns = ["acceleration_x", "acceleration_y", "acceleration_z"]

    frames = []
    n_files = 0
    for day_dir in os.listdir(galaxy_file_dir):
        # os.path.join keeps this working whether or not the caller passed a
        # trailing separator (plain string concatenation silently found
        # nothing in that case).
        day_path = os.path.join(galaxy_file_dir, day_dir)
        if not os.path.isdir(day_path):
            continue
        for file in os.listdir(day_path):
            # only consider binary acceleration files
            if not (
                file.endswith(".binary")
                and file.startswith("acceleration_data")
            ):
                continue
            frame = acceleration_data_to_dataframe(
                load_acceleration_data(os.path.join(day_path, file))
            )
            n_files += 1
            if not frame.empty:
                frames.append(frame)

    if verbose:
        print(f"Read {n_files} files from {galaxy_file_dir}")

    if not frames:
        # Fail with an explicit message instead of a KeyError further down.
        raise FileNotFoundError(
            f"No acceleration samples found in 'acceleration_data*.binary' "
            f"files below {galaxy_file_dir}"
        )

    # Concatenate once; concatenating inside the loop re-copies all
    # previously loaded rows for every file.
    data = pd.concat(frames)

    # Rename columns to standard format
    column_mapping = {time_column: "timestamp"}
    column_mapping.update(zip(data_columns, ("x", "y", "z")))
    data = data.rename(columns=column_mapping)

    data["timestamp"] = pd.to_datetime(data["timestamp"], unit="ms")
    data = data.set_index("timestamp")
    data = data.drop(columns=["effective_time_frame", "sensor_body_location"])
    data = data.fillna(0)
    # Files are listed in arbitrary order, so restore chronological order.
    data = data.sort_index()

    if verbose:
        print(
            f"Loaded {data.shape[0]} accelerometer data records from {galaxy_file_dir}"
        )

    meta_dict["raw_n_datapoints"] = data.shape[0]
    meta_dict["raw_start_datetime"] = data.index.min()
    meta_dict["raw_end_datetime"] = data.index.max()
    meta_dict["sf"] = detect_frequency_from_timestamps(data.index)
    meta_dict["raw_data_frequency"] = f'{meta_dict["sf"]:.3g}Hz'
    meta_dict["raw_data_unit"] = "Custom"

    return data
[docs]
def filter_galaxy_binary_data(
    data: pd.DataFrame,
    meta_dict: Union[dict, None] = None,
    verbose: bool = False,
    preprocess_args: Union[dict, None] = None,
) -> pd.DataFrame:
    """
    Filter Galaxy Watch accelerometer data by removing incomplete days and selecting longest consecutive sequence.

    The filtering runs in three steps: the first and the last calendar day
    are dropped, days with too few samples are removed via
    ``filter_incomplete_days`` and finally ``filter_consecutive_days`` is
    applied to the remainder.

    Parameters
    ----------
    data : pd.DataFrame
        Raw accelerometer data with a datetime index. The input is not
        modified.
    meta_dict : dict
        Metadata dictionary; only ``"sf"`` (sampling frequency) is read here,
        defaulting to 25 when absent.
    verbose : bool
        Whether to print progress information
    preprocess_args : dict
        Optional settings; ``"required_daily_coverage"`` (default 0.5) is the
        fraction of a full day of samples a day must reach to be kept.

    Returns
    -------
    pd.DataFrame
        Filtered accelerometer data
    """
    # None defaults instead of shared mutable ``{}`` defaults.
    meta_dict = {} if meta_dict is None else meta_dict
    preprocess_args = {} if preprocess_args is None else preprocess_args

    _data = data.copy()

    # filter out first and last day
    n_old = _data.shape[0]
    dates = _data.index.date  # computed once; each access builds a new array
    _data = _data.loc[(dates != dates.min()) & (dates != dates.max())]
    if verbose:
        # Denominator is the row count *before* filtering, matching the
        # messages of the two steps below.
        print(
            f"Filtered out {n_old - _data.shape[0]}/{n_old} accelerometer records due to filtering out first and last day"
        )

    # filter out sparse days
    # NOTE(review): 2160000 = 25 samples/s * 86400 s, i.e. the threshold
    # assumes 25 Hz data even when meta_dict["sf"] differs — confirm intended.
    points_per_full_day = 2160000
    required_points_per_day = (
        preprocess_args.get("required_daily_coverage", 0.5)
        * points_per_full_day
    )
    n_old = _data.shape[0]
    sf = meta_dict.get("sf", 25)  # Default to 25Hz if not specified
    _data = filter_incomplete_days(
        _data, data_freq=sf, expected_points_per_day=required_points_per_day
    )
    if verbose:
        print(
            f"Filtered out {n_old - _data.shape[0]}/{n_old} accelerometer records due to incomplete daily coverage"
        )

    # filter for longest consecutive sequence of days
    old_n = _data.shape[0]
    _data = filter_consecutive_days(_data)
    if verbose:
        print(
            f"Filtered out {old_n - _data.shape[0]}/{old_n} minute-level accelerometer records due to filtering for longest consecutive sequence of days"
        )

    return _data
[docs]
def resample_galaxy_binary_data(
    data: pd.DataFrame, meta_dict: dict = {}, verbose: bool = False
) -> pd.DataFrame:
    """
    Resample Galaxy Watch accelerometer data onto a regular 40 ms grid.

    Samples are averaged within each 40 ms bin. Bins without samples are
    filled by linear interpolation, and gaps left at the start of the series
    are back-filled from the first available value.

    Parameters
    ----------
    data : pd.DataFrame
        Filtered accelerometer data with a datetime index. The input is not
        modified.
    meta_dict : dict
        Accepted for interface symmetry with the other processing steps; it
        is not read or written by this function.
    verbose : bool
        Whether to print how many rows went in and came out

    Returns
    -------
    pd.DataFrame
        Accelerometer data with one row per 40 ms interval.
    """
    n_before = len(data)

    # Average everything that falls into the same 40 ms bin ...
    binned = data.resample("40ms").mean()
    # ... then close interior gaps linearly and leading gaps by back-filling.
    resampled = binned.interpolate(method="linear").bfill()

    if verbose:
        print(f"Resampled {n_before} to {resampled.shape[0]} timestamps")

    return resampled
[docs]
def preprocess_galaxy_binary_data(
    data: pd.DataFrame,
    preprocess_args: Union[dict, None] = None,
    meta_dict: Union[dict, None] = None,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Preprocess Galaxy Watch accelerometer data including rescaling, calibration, noise removal, and wear detection.

    Parameters
    ----------
    data : pd.DataFrame
        Resampled accelerometer data with columns ``x``, ``y`` and ``z``.
        The input is not modified.
    preprocess_args : dict
        Optional preprocessing parameters. Recognised keys and their
        defaults: ``autocalib_sphere_crit`` (1), ``autocalib_sd_criter``
        (0.3), ``filter_type`` ("highpass"), ``filter_cutoff`` (15),
        ``wear_sd_crit`` (0.00013), ``wear_range_crit`` (0.00067),
        ``wear_window_length`` (30) and ``wear_window_skip`` (7).
    meta_dict : dict
        Metadata dictionary. It must contain ``"sf"`` (sampling frequency)
        and is passed on to the calibration, wear-detection and wear-time
        helpers.
    verbose : bool
        Whether to print progress information

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` with processed ``x``, ``y``, ``z`` columns plus
        ``x_raw``, ``y_raw``, ``z_raw`` (values before rescaling), ``wear``
        and ``enmo``.

    Raises
    ------
    KeyError
        If ``meta_dict`` does not contain ``"sf"``.
    """
    # None defaults instead of ``{}``: meta_dict is handed to helpers that
    # record metadata, so a shared default dict would leak between calls.
    preprocess_args = {} if preprocess_args is None else preprocess_args
    meta_dict = {} if meta_dict is None else meta_dict

    _data = data.copy()
    # keep the untouched sensor readings next to the processed ones
    _data[["x_raw", "y_raw", "z_raw"]] = _data[["x", "y", "z"]]

    # rescaling of accelerometer data according to blog post: https://developer.samsung.com/sdp/blog/en/2025/04/10/understanding-and-converting-galaxy-watch-accelerometer-data
    _data[["x", "y", "z"]] = _data[["x", "y", "z"]] / 4096

    # calibration
    sphere_crit = preprocess_args.get("autocalib_sphere_crit", 1)
    sd_criter = preprocess_args.get("autocalib_sd_criter", 0.3)
    _data[["x", "y", "z"]] = calibrate_accelerometer(
        _data,
        sphere_crit=sphere_crit,
        sd_criteria=sd_criter,
        meta_dict=meta_dict,
        verbose=verbose,
    )

    # noise removal
    # (local is named filter_type so the builtin ``type`` is not shadowed)
    filter_type = preprocess_args.get("filter_type", "highpass")
    cutoff = preprocess_args.get("filter_cutoff", 15)
    _data[["x", "y", "z"]] = remove_noise(
        _data,
        sf=meta_dict["sf"],
        filter_type=filter_type,
        filter_cutoff=cutoff,
        verbose=verbose,
    )

    # wear detection
    sd_crit = preprocess_args.get("wear_sd_crit", 0.00013)
    range_crit = preprocess_args.get("wear_range_crit", 0.00067)
    window_length = preprocess_args.get("wear_window_length", 30)
    window_skip = preprocess_args.get("wear_window_skip", 7)
    _data["wear"] = detect_wear_periods(
        _data,
        meta_dict["sf"],
        sd_crit,
        range_crit,
        window_length,
        window_skip,
        meta_dict=meta_dict,
        verbose=verbose,
    )

    # calculate total, wear, and non-wear time
    calc_weartime(
        _data, sf=meta_dict["sf"], meta_dict=meta_dict, verbose=verbose
    )

    # ENMO is scaled by 1000 (presumably g -> milli-g; confirm against
    # calculate_enmo's output unit)
    _data["enmo"] = calculate_enmo(_data, verbose=verbose) * 1000

    if verbose:
        print("Preprocessed accelerometer data")

    return _data
[docs]
def acceleration_data_to_dataframe(data) -> pd.DataFrame:
    """
    Turn a loaded acceleration data object into a pandas DataFrame.

    Each entry of ``data.samples`` becomes one row; the sample attributes
    listed below become the columns, in that order.

    Parameters
    ----------
    data : object
        Object exposing an iterable ``samples`` attribute. Every sample must
        provide the attributes:

        - acceleration_x: X-axis acceleration value
        - acceleration_y: Y-axis acceleration value
        - acceleration_z: Z-axis acceleration value
        - sensor_body_location: Location of the sensor on the body
        - unix_timestamp_in_ms: Timestamp in milliseconds since Unix epoch
        - effective_time_frame: Effective time frame for the sample

    Returns
    -------
    pd.DataFrame
        One row per sample with the columns 'acceleration_x',
        'acceleration_y', 'acceleration_z', 'sensor_body_location',
        'unix_timestamp_in_ms' and 'effective_time_frame'. When
        ``data.samples`` is empty the result is an empty DataFrame without
        columns.

    Notes
    -----
    - This function is used internally by read_galaxy_binary_data
    - Values are copied as-is; no renaming, unit conversion or sorting
      happens here

    Examples
    --------
    >>> binary_data = load_acceleration_data("path/to/binary/file")
    >>> df = acceleration_data_to_dataframe(binary_data)
    >>> print(f"Converted {len(df)} acceleration samples")
    >>> print(f"Columns: {df.columns.tolist()}")
    """
    # Attribute names double as column names; the order fixes column order.
    fields = (
        "acceleration_x",
        "acceleration_y",
        "acceleration_z",
        "sensor_body_location",
        "unix_timestamp_in_ms",
        "effective_time_frame",
    )
    records = [
        {field: getattr(sample, field) for field in fields}
        for sample in data.samples
    ]
    return pd.DataFrame(records)