Source code for cosinorage.datahandlers.utils.galaxy_binary

###########################################################################
# Copyright (C) 2025 ETH Zurich
# CosinorAge: Prediction of biological age based on accelerometer data
# using the CosinorAge method proposed by Shim, Fleisch and Barata
# (https://www.nature.com/articles/s41746-024-01111-x)
#
# Authors: Jacob Leo Oskar Hunecke
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##########################################################################

import os
from typing import Union

import pandas as pd
from claid.data_collection.load.load_sensor_data import *

from .calc_enmo import calculate_enmo
from .calibration import calibrate_accelerometer
from .filtering import filter_consecutive_days, filter_incomplete_days
from .frequency_detection import detect_frequency_from_timestamps
from .noise_removal import remove_noise
from .wear_detection import calc_weartime, detect_wear_periods



[docs]
def read_galaxy_binary_data(
    galaxy_file_dir: str,
    meta_dict: dict,
    time_column: str = "unix_timestamp_in_ms",
    data_columns: Union[list, None] = None,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Read accelerometer data from Galaxy Watch binary files.

    Every immediate sub-directory of ``galaxy_file_dir`` is scanned for files
    named ``acceleration_data*.binary``. Each matching file is loaded,
    converted to a DataFrame and all frames are concatenated.

    Parameters
    ----------
    galaxy_file_dir : str
        Directory containing one sub-directory per recording day. A trailing
        path separator is optional.
    meta_dict : dict
        Dictionary that is updated in place with metadata about the loaded
        data (``raw_n_datapoints``, ``raw_start_datetime``,
        ``raw_end_datetime``, ``sf``, ``raw_data_frequency``,
        ``raw_data_unit``).
    time_column : str
        Name of the timestamp column in the binary data (milliseconds since
        the Unix epoch). It is renamed to ``timestamp`` and used as index.
    data_columns : list
        Names of the data columns in the binary data. The first three entries
        are renamed to ``x``, ``y`` and ``z`` in that order. Defaults to
        ``["acceleration_x", "acceleration_y", "acceleration_z"]``.
    verbose : bool
        Whether to print progress information

    Returns
    -------
    pd.DataFrame
        DataFrame indexed by ``timestamp`` (sorted ascending) containing the
        renamed accelerometer columns; missing values are replaced by 0.

    Raises
    ------
    KeyError
        If no data could be loaded or the loaded data lacks ``time_column``,
        because the ``timestamp`` column then does not exist.
    """

    # Set default data_columns if not provided
    if data_columns is None:
        data_columns = ["acceleration_x", "acceleration_y", "acceleration_z"]

    # Collect one frame per file and concatenate once at the end; growing a
    # DataFrame inside the loop copies all previously read rows every time.
    frames = []
    for day_dir in os.listdir(galaxy_file_dir):
        # os.path.join keeps this working whether or not galaxy_file_dir
        # ends with a path separator (plain string concatenation did not).
        day_path = os.path.join(galaxy_file_dir, day_dir)
        if not os.path.isdir(day_path):
            continue
        for file in os.listdir(day_path):
            # only consider binary acceleration files
            if file.endswith(".binary") and file.startswith(
                "acceleration_data"
            ):
                frames.append(
                    acceleration_data_to_dataframe(
                        load_acceleration_data(os.path.join(day_path, file))
                    )
                )

    n_files = len(frames)
    # pd.concat rejects an empty list, so fall back to an empty frame.
    data = pd.concat(frames) if frames else pd.DataFrame()

    if verbose:
        print(f"Read {n_files} files from {galaxy_file_dir}")

    # Rename columns to standard format (only the first three data columns
    # are mapped; zip stops after "z").
    column_mapping = {time_column: "timestamp"}
    column_mapping.update(zip(data_columns, ("x", "y", "z")))

    data = data.rename(columns=column_mapping)
    data["timestamp"] = pd.to_datetime(data["timestamp"], unit="ms")
    data.set_index("timestamp", inplace=True)
    data.drop(
        columns=["effective_time_frame", "sensor_body_location"], inplace=True
    )

    data = data.fillna(0)
    data.sort_index(inplace=True)

    if verbose:
        print(
            f"Loaded {data.shape[0]} accelerometer data records from {galaxy_file_dir}"
        )

    meta_dict["raw_n_datapoints"] = data.shape[0]
    meta_dict["raw_start_datetime"] = data.index.min()
    meta_dict["raw_end_datetime"] = data.index.max()
    meta_dict["sf"] = detect_frequency_from_timestamps(data.index)
    meta_dict["raw_data_frequency"] = f'{meta_dict["sf"]:.3g}Hz'
    meta_dict["raw_data_unit"] = "Custom"

    return data




[docs]
def filter_galaxy_binary_data(
    data: pd.DataFrame,
    meta_dict: Union[dict, None] = None,
    verbose: bool = False,
    preprocess_args: Union[dict, None] = None,
) -> pd.DataFrame:
    """
    Filter Galaxy Watch accelerometer data by removing incomplete days and selecting longest consecutive sequence.

    The filtering runs in three steps on a copy of ``data``:

    1. drop all records of the first and the last calendar day,
    2. drop days with insufficient coverage (``filter_incomplete_days``),
    3. keep the longest run of consecutive days (``filter_consecutive_days``).

    Parameters
    ----------
    data : pd.DataFrame
        Raw accelerometer data indexed by timestamp
    meta_dict : dict, optional
        Metadata dictionary; only ``sf`` (sampling frequency) is read, with a
        fallback of 25 if the key is absent. Defaults to an empty dict.
    verbose : bool
        Whether to print progress information
    preprocess_args : dict, optional
        Preprocessing parameters; only ``required_daily_coverage`` (default
        0.5) is read. Defaults to an empty dict.

    Returns
    -------
    pd.DataFrame
        Filtered accelerometer data
    """
    # None defaults instead of shared mutable ``{}`` defaults.
    meta_dict = {} if meta_dict is None else meta_dict
    preprocess_args = {} if preprocess_args is None else preprocess_args

    _data = data.copy()

    # filter out first and last day
    n_old = _data.shape[0]
    dates = _data.index.date
    _data = _data.loc[(dates != dates.min()) & (dates != dates.max())]
    if verbose:
        # Report against the row count *before* filtering, matching the
        # messages of the following steps.
        print(
            f"Filtered out {n_old - _data.shape[0]}/{n_old} accelerometer records due to filtering out first and last day"
        )

    # filter out sparse days
    # NOTE(review): 2160000 equals 25 * 86400, i.e. one day of samples at
    # 25 Hz; it is not derived from ``sf`` below - confirm this is intended.
    required_points_per_day = (
        preprocess_args.get("required_daily_coverage", 0.5) * 2160000
    )
    n_old = _data.shape[0]
    sf = meta_dict.get("sf", 25)  # Default to 25Hz if not specified
    _data = filter_incomplete_days(
        _data, data_freq=sf, expected_points_per_day=required_points_per_day
    )
    if verbose:
        print(
            f"Filtered out {n_old - _data.shape[0]}/{n_old} accelerometer records due to incomplete daily coverage"
        )

    # filter for longest consecutive sequence of days
    old_n = _data.shape[0]
    _data = filter_consecutive_days(_data)
    if verbose:
        print(
            f"Filtered out {old_n - _data.shape[0]}/{old_n} minute-level accelerometer records due to filtering for longest consecutive sequence of days"
        )

    return _data




[docs]
def resample_galaxy_binary_data(
    data: pd.DataFrame, meta_dict: dict = {}, verbose: bool = False
) -> pd.DataFrame:
    """
    Bring Galaxy Watch accelerometer data onto a regular 40 ms grid.

    Records are averaged per 40 ms bin; bins without data are filled by
    linear interpolation and any values still missing afterwards are
    back-filled from the next valid value. The input frame is not modified.

    Parameters
    ----------
    data : pd.DataFrame
        Filtered accelerometer data with a datetime-like index
    meta_dict : dict
        Metadata dictionary; accepted for interface symmetry, not read or
        written by this function
    verbose : bool
        Whether to print progress information

    Returns
    -------
    pd.DataFrame
        Accelerometer data resampled to one row every 40 ms.
    """
    source = data.copy()
    n_before = source.shape[0]

    # Step 1: average all samples falling into the same 40 ms bin.
    binned = source.resample("40ms").mean()
    # Step 2: close gaps linearly, then back-fill whatever is still missing.
    resampled = binned.interpolate(method="linear").bfill()

    if verbose:
        print(f"Resampled {n_before} to {resampled.shape[0]} timestamps")

    return resampled




[docs]
def preprocess_galaxy_binary_data(
    data: pd.DataFrame,
    preprocess_args: Union[dict, None] = None,
    meta_dict: Union[dict, None] = None,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Preprocess Galaxy Watch accelerometer data including rescaling, calibration, noise removal, and wear detection.

    Works on a copy of ``data`` and performs, in order: backup of the raw
    axes, rescaling by 1/4096, calibration, noise filtering, wear detection,
    wear-time calculation and ENMO computation.

    Parameters
    ----------
    data : pd.DataFrame
        Resampled accelerometer data with columns ``x``, ``y`` and ``z``
    preprocess_args : dict, optional
        Dictionary containing preprocessing parameters. Recognised keys and
        their defaults: ``autocalib_sphere_crit`` (1),
        ``autocalib_sd_criter`` (0.3), ``filter_type`` ("highpass"),
        ``filter_cutoff`` (15), ``wear_sd_crit`` (0.00013),
        ``wear_range_crit`` (0.00067), ``wear_window_length`` (30),
        ``wear_window_skip`` (7). Defaults to an empty dict.
    meta_dict : dict, optional
        Dictionary to store metadata about the preprocessing. It must contain
        ``sf`` (sampling frequency) and is passed on to the calibration,
        wear-detection and wear-time helpers. Defaults to a fresh empty dict.
    verbose : bool
        Whether to print progress information

    Returns
    -------
    pd.DataFrame
        Preprocessed accelerometer data with the additional columns
        ``x_raw``, ``y_raw``, ``z_raw``, ``wear`` and ``enmo``

    Raises
    ------
    KeyError
        If ``meta_dict`` does not contain ``sf``.
    """
    # Use None defaults: a shared ``{}`` default would be handed to helpers
    # that receive meta_dict as a metadata sink and could leak between calls.
    preprocess_args = {} if preprocess_args is None else preprocess_args
    meta_dict = {} if meta_dict is None else meta_dict

    _data = data.copy()
    # keep the untouched sensor readings alongside the processed axes
    _data[["x_raw", "y_raw", "z_raw"]] = _data[["x", "y", "z"]]

    # rescaling of accelerometer data according to blog post: https://developer.samsung.com/sdp/blog/en/2025/04/10/understanding-and-converting-galaxy-watch-accelerometer-data
    _data[["x", "y", "z"]] = _data[["x", "y", "z"]] / 4096

    # calibration
    sphere_crit = preprocess_args.get("autocalib_sphere_crit", 1)
    sd_criter = preprocess_args.get("autocalib_sd_criter", 0.3)
    _data[["x", "y", "z"]] = calibrate_accelerometer(
        _data,
        sphere_crit=sphere_crit,
        sd_criteria=sd_criter,
        meta_dict=meta_dict,
        verbose=verbose,
    )

    # noise removal (local name avoids shadowing the builtin ``type``)
    filter_type = preprocess_args.get("filter_type", "highpass")
    cutoff = preprocess_args.get("filter_cutoff", 15)
    _data[["x", "y", "z"]] = remove_noise(
        _data,
        sf=meta_dict["sf"],
        filter_type=filter_type,
        filter_cutoff=cutoff,
        verbose=verbose,
    )

    # wear detection
    sd_crit = preprocess_args.get("wear_sd_crit", 0.00013)
    range_crit = preprocess_args.get("wear_range_crit", 0.00067)
    window_length = preprocess_args.get("wear_window_length", 30)
    window_skip = preprocess_args.get("wear_window_skip", 7)
    _data["wear"] = detect_wear_periods(
        _data,
        meta_dict["sf"],
        sd_crit,
        range_crit,
        window_length,
        window_skip,
        meta_dict=meta_dict,
        verbose=verbose,
    )

    # calculate total, wear, and non-wear time
    calc_weartime(
        _data, sf=meta_dict["sf"], meta_dict=meta_dict, verbose=verbose
    )

    # NOTE(review): the factor 1000 presumably converts ENMO from g to mg -
    # confirm against calculate_enmo.
    _data["enmo"] = calculate_enmo(_data, verbose=verbose) * 1000

    if verbose:
        print("Preprocessed accelerometer data")

    return _data




[docs]
def acceleration_data_to_dataframe(data) -> pd.DataFrame:
    """
    Turn a binary acceleration data object into a pandas DataFrame.

    One row is produced per entry of ``data.samples``; the listed sample
    attributes are copied verbatim into equally named columns.

    Parameters
    ----------
    data : object
        Acceleration data object exposing an iterable ``samples`` attribute.
        Every sample must provide the attributes:
        - acceleration_x
        - acceleration_y
        - acceleration_z
        - sensor_body_location
        - unix_timestamp_in_ms
        - effective_time_frame

    Returns
    -------
    pd.DataFrame
        DataFrame with the columns, in this order:
        - 'acceleration_x'
        - 'acceleration_y'
        - 'acceleration_z'
        - 'sensor_body_location'
        - 'unix_timestamp_in_ms'
        - 'effective_time_frame'
        If ``data.samples`` is empty the frame has no rows and no columns.

    Notes
    -----
    - This function is used internally by read_galaxy_binary_data
    - Values are not converted or validated; they are stored as found on
      the sample objects

    Examples
    --------
    >>> binary_data = load_acceleration_data("path/to/binary/file")
    >>> df = acceleration_data_to_dataframe(binary_data)
    >>> print(f"Converted {len(df)} acceleration samples")
    >>> print(f"Columns: {df.columns.tolist()}")
    """
    # Attribute names double as column names; tuple order fixes column order.
    fields = (
        "acceleration_x",
        "acceleration_y",
        "acceleration_z",
        "sensor_body_location",
        "unix_timestamp_in_ms",
        "effective_time_frame",
    )
    records = [
        {field: getattr(sample, field) for field in fields}
        for sample in data.samples
    ]
    return pd.DataFrame(records)