# Source code for cosinorage.datahandlers.genericdatahandler

###########################################################################
# Copyright (C) 2025 ETH Zurich
# CosinorAge: Prediction of biological age based on accelerometer data
# using the CosinorAge method proposed by Shim, Fleisch and Barata
# (https://www.nature.com/articles/s41746-024-01111-x)
#
# Authors: Jacob Leo Oskar Hunecke
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##########################################################################

from typing import Optional

from .datahandler import DataHandler, clock
from .utils.calc_enmo import calculate_minute_level_enmo
from .utils.generic import (filter_generic_data, preprocess_generic_data,
                            read_generic_xD_data, resample_generic_data)



# [docs]
class GenericDataHandler(DataHandler):
    """
    Generic data handler for processing accelerometer and ENMO data from CSV files.

    This class provides a flexible interface for loading and processing various types of
    accelerometer data, including ENMO (Euclidean Norm Minus One), raw accelerometer
    data (x, y, z), and alternative count data. Construction validates the
    arguments and then immediately runs the loading pipeline.

    Attributes
    ----------
    file_path : str
        Path to the CSV file containing the data.
    data_format : str
        Format of the data file (only ``'csv'`` is accepted).
    data_type : str
        Type of data in the file, after legacy aliases (``'enmo'``,
        ``'accelerometer'``) have been mapped to their ``-mg`` variants.
    time_format : str
        Format of timestamps.
    time_column : str
        Name of the timestamp column.
    time_zone : str or None
        Timezone forwarded to the reader.
    data_columns : list
        Names of the data columns (the caller's list, or the defaults for
        ``data_type`` when none was given).
    preprocess_args : dict
        Preprocessing arguments forwarded to the filtering and preprocessing
        helpers.
    raw_data
        Result of ``read_generic_xD_data``.
    sf_data
        Result of ``filter_generic_data`` applied to ``raw_data``.
    ml_data
        Result of ``resample_generic_data`` applied to ``sf_data``, then passed
        through ``preprocess_generic_data``.
    meta_dict : dict
        Metadata dictionary describing the data source; it is also handed to
        every pipeline helper.
        NOTE(review): assumed to be created by ``DataHandler.__init__`` - confirm.

    Examples
    --------
    Load ENMO data from a CSV file:

    >>> handler = GenericDataHandler(
    ...     file_path='data/enmo_data.csv',
    ...     data_type='enmo',
    ...     time_column='timestamp',
    ...     data_columns=['enmo']
    ... )
    >>> raw_data = handler.get_raw_data()
    >>> ml_data = handler.get_ml_data()

    Load accelerometer data from a CSV file:

    >>> handler = GenericDataHandler(
    ...     file_path='data/accel_data.csv',
    ...     data_type='accelerometer',
    ...     time_column='time',
    ...     data_columns=['x', 'y', 'z']
    ... )
    >>> raw_data = handler.get_raw_data()
    >>> ml_data = handler.get_ml_data()

    Notes
    -----
    The data processing pipeline runs these helpers in order:

    1. ``read_generic_xD_data``    -> ``raw_data``
    2. ``filter_generic_data``     -> ``sf_data``
    3. ``resample_generic_data``   -> ``ml_data``
    4. ``preprocess_generic_data`` -> ``ml_data`` (overwritten)

    What each step does in detail is defined by those helpers in
    ``.utils.generic``.
    """

    # Accepted values for the ``data_format`` / ``time_format`` arguments.
    _SUPPORTED_DATA_FORMATS = ("csv",)
    _SUPPORTED_TIME_FORMATS = ("unix-ms", "unix-s", "datetime")

    # Data types whose files carry three value columns; every other type
    # carries a single value column.
    _ACCELEROMETER_TYPES = (
        "accelerometer-mg",
        "accelerometer-g",
        "accelerometer-ms2",
    )

    # One row per family of data types:
    # (accepted ``data_type`` values, default column names,
    #  label stored in ``meta_dict["raw_data_type"]``).
    _DATA_TYPE_SPECS = (
        (("enmo-mg", "enmo-g"), ("enmo",), "ENMO"),
        (_ACCELEROMETER_TYPES, ("x", "y", "z"), "Accelerometer"),
        (("alternative_count",), ("counts",), "Alternative Count"),
    )

    def __init__(
        self,
        file_path: str,
        data_format: str = "csv",
        data_type: str = "accelerometer-mg",
        time_format: str = "unix-ms",
        time_column: str = "timestamp",
        time_zone: Optional[str] = None,
        data_columns: Optional[list] = None,
        preprocess_args: Optional[dict] = None,
        verbose: bool = False,
    ):
        """
        Initialize GenericDataHandler with CSV data file and load the data.

        Parameters
        ----------
        file_path : str
            Path to the CSV file containing the data.
        data_format : str, default='csv'
            Format of the data file. Currently only 'csv' is supported.
        data_type : str, default='accelerometer-mg'
            Type of data in the file. Must be one of:
            - 'enmo-mg', 'enmo-g': ENMO (Euclidean Norm Minus One) data
            - 'accelerometer-mg', 'accelerometer-g', 'accelerometer-ms2': Raw accelerometer data (x, y, z)
            - 'alternative_count': Alternative count data
            The legacy names 'enmo' and 'accelerometer' are accepted and
            treated as 'enmo-mg' and 'accelerometer-mg' respectively.
        time_format : str, default='unix-ms'
            Format of timestamps. Must be one of: 'unix-ms', 'unix-s', 'datetime'.
        time_column : str, default='timestamp'
            Name of the timestamp column in the CSV file.
        time_zone : str, optional
            Timezone forwarded to ``read_generic_xD_data``; how ``None`` is
            interpreted is decided there.
        data_columns : list, optional
            Names of the data columns in the CSV file. If not provided, defaults are:
            - ['enmo'] for data_type='enmo-mg' or 'enmo-g'
            - ['x', 'y', 'z'] for data_type='accelerometer-mg', 'accelerometer-g', or 'accelerometer-ms2'
            - ['counts'] for data_type='alternative_count'
        preprocess_args : dict, optional
            Additional preprocessing arguments to pass to the filtering and
            preprocessing functions. ``None`` (the default) means an empty dict.
        verbose : bool, default=False
            Forwarded to the loading helpers.

        Raises
        ------
        ValueError
            If ``data_format``, ``data_type`` or ``time_format`` is not one of
            the accepted values.
        """

        super().__init__()

        if data_format not in self._SUPPORTED_DATA_FORMATS:
            raise ValueError("Data format must be either 'csv'")

        # Handle legacy data types for backward compatibility
        if data_type == "enmo":
            data_type = "enmo-mg"
        elif data_type == "accelerometer":
            data_type = "accelerometer-mg"

        # A single lookup both validates ``data_type`` and yields everything
        # that depends on it, so the accepted values are listed only once.
        for accepted_types, default_columns, raw_label in self._DATA_TYPE_SPECS:
            if data_type in accepted_types:
                break
        else:
            raise ValueError(
                "Data type must be either 'enmo-mg', 'enmo-g', 'accelerometer-mg', 'accelerometer-g', 'accelerometer-ms2' or 'alternative_count'"
            )

        if time_format not in self._SUPPORTED_TIME_FORMATS:
            raise ValueError("time_format must be either 'unix-ms', 'unix-s' or 'datetime'")

        self.file_path = file_path
        self.data_format = data_format
        self.data_type = data_type
        self.time_format = time_format
        self.time_column = time_column
        self.time_zone = time_zone
        # Build a fresh list for the defaults so instances never share one.
        self.data_columns = (
            data_columns if data_columns is not None else list(default_columns)
        )
        # A new dict per instance: a ``{}`` default in the signature would be
        # one object shared (and possibly mutated) across all handlers.
        self.preprocess_args = {} if preprocess_args is None else preprocess_args

        self.meta_dict["datasource"] = "Generic"
        self.meta_dict["data_format"] = "CSV"
        self.meta_dict["time_format"] = time_format
        self.meta_dict["raw_data_type"] = raw_label
        self.meta_dict["time_column"] = time_column
        self.meta_dict["time_zone"] = time_zone
        # Records the caller-supplied value, i.e. ``None`` when the defaults
        # were used; the resolved names live in ``self.data_columns``.
        self.meta_dict["data_columns"] = data_columns

        self.__load_data(verbose=verbose)

    @clock
    def __load_data(self, verbose: bool = False):
        """
        Run the read -> filter -> resample -> preprocess pipeline.

        Populates ``raw_data``, ``sf_data`` and ``ml_data``. ``meta_dict`` is
        passed to every helper.

        Parameters
        ----------
        verbose : bool, default=False
            Forwarded to each pipeline helper.

        Raises
        ------
        ValueError
            If ``self.data_format`` is not 'csv'.
        """
        if self.data_format != "csv":
            raise ValueError("Data format must be either 'csv'")

        # Accelerometer files hold three axes; every other type is 1-D.
        n_dimensions = 3 if self.data_type in self._ACCELEROMETER_TYPES else 1

        self.raw_data = read_generic_xD_data(
            self.file_path,
            self.data_type,
            meta_dict=self.meta_dict,
            n_dimensions=n_dimensions,
            time_format=self.time_format,
            time_column=self.time_column,
            time_zone=self.time_zone,
            data_columns=self.data_columns,
            verbose=verbose,
        )
        self.sf_data = filter_generic_data(
            self.raw_data,
            self.data_type,
            self.meta_dict,
            verbose=verbose,
            preprocess_args=self.preprocess_args,
        )
        resampled = resample_generic_data(
            self.sf_data, self.data_type, self.meta_dict, verbose=verbose
        )
        self.ml_data = preprocess_generic_data(
            resampled,
            self.data_type,
            preprocess_args=self.preprocess_args,
            meta_dict=self.meta_dict,
            verbose=verbose,
        )