Source code for cosinorage.datahandlers.genericdatahandler

###########################################################################
# Copyright (C) 2025 ETH Zurich
# CosinorAge: Prediction of biological age based on accelerometer data
# using the CosinorAge method proposed by Shim, Fleisch and Barata
# (https://www.nature.com/articles/s41746-024-01111-x)
#
# Authors: Jacob Leo Oskar Hunecke
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##########################################################################

from typing import Optional

from .datahandler import DataHandler, clock
from .utils.calc_enmo import calculate_minute_level_enmo
from .utils.generic import (filter_generic_data, preprocess_generic_data,
                            read_generic_xD_data, resample_generic_data)


class GenericDataHandler(DataHandler):
    """
    Generic data handler for accelerometer, ENMO and count data in CSV files.

    The handler validates its configuration and then immediately runs the
    loading pipeline, so a constructed instance already holds its data.
    The pipeline delegates to the helpers in ``.utils.generic``:

    1. ``read_generic_xD_data`` reads the file into ``raw_data``.
    2. ``filter_generic_data`` produces ``sf_data`` from ``raw_data``.
    3. ``resample_generic_data`` produces ``ml_data`` from ``sf_data``.
    4. ``preprocess_generic_data`` replaces ``ml_data`` with its
       preprocessed version.

    What each step does in detail (e.g. dropping incomplete days, resampling
    to minute level, wear detection) is defined by those helpers and is not
    visible in this class.

    Attributes
    ----------
    file_path : str
        Path to the CSV file containing the data.
    data_format : str
        Format of the data file (only ``'csv'`` is accepted).
    data_type : str
        Normalized data type; legacy names are mapped to their ``-mg``
        equivalents.
    time_format : str
        Format of the timestamps.
    time_column : str
        Name of the timestamp column.
    time_zone : str or None
        Timezone forwarded to the reader.
    data_columns : list
        Names of the data columns (caller-supplied or the per-type default).
    preprocess_args : dict
        Arguments forwarded to the filtering and preprocessing helpers.
    raw_data
        Result of ``read_generic_xD_data``.
    sf_data
        Result of ``filter_generic_data`` applied to ``raw_data``.
    ml_data
        Result of ``resample_generic_data`` followed by
        ``preprocess_generic_data``.
    meta_dict : dict
        Metadata about the data source and processing; it is also passed to
        every pipeline helper.

    Examples
    --------
    Load ENMO data from a CSV file:

    >>> handler = GenericDataHandler(
    ...     file_path='data/enmo_data.csv',
    ...     data_type='enmo',
    ...     time_column='timestamp',
    ...     data_columns=['enmo']
    ... )
    >>> raw_data = handler.get_raw_data()
    >>> ml_data = handler.get_ml_data()

    Load accelerometer data from a CSV file:

    >>> handler = GenericDataHandler(
    ...     file_path='data/accel_data.csv',
    ...     data_type='accelerometer',
    ...     time_column='time',
    ...     data_columns=['x', 'y', 'z']
    ... )
    >>> raw_data = handler.get_raw_data()
    >>> ml_data = handler.get_ml_data()
    """

    # Supported ``data_type`` values, grouped by the kind of signal they carry.
    _ENMO_DATA_TYPES = ("enmo-mg", "enmo-g")
    _ACCELEROMETER_DATA_TYPES = (
        "accelerometer-mg",
        "accelerometer-g",
        "accelerometer-ms2",
    )
    _COUNT_DATA_TYPES = ("alternative_count",)

    def __init__(
        self,
        file_path: str,
        data_format: str = "csv",
        data_type: str = "accelerometer-mg",
        time_format: str = "unix-ms",
        time_column: str = "timestamp",
        time_zone: Optional[str] = None,
        data_columns: Optional[list] = None,
        preprocess_args: Optional[dict] = None,
        verbose: bool = False,
    ):
        """
        Validate the configuration and load the data file.

        Parameters
        ----------
        file_path : str
            Path to the CSV file containing the data.
        data_format : str, default='csv'
            Format of the data file. Only 'csv' is supported.
        data_type : str, default='accelerometer-mg'
            Type of data in the file. Must be one of:

            - 'enmo-mg', 'enmo-g': ENMO data
            - 'accelerometer-mg', 'accelerometer-g', 'accelerometer-ms2':
              raw accelerometer data (x, y, z)
            - 'alternative_count': alternative count data

            The legacy names 'enmo' and 'accelerometer' are accepted and
            mapped to 'enmo-mg' and 'accelerometer-mg'.
        time_format : str, default='unix-ms'
            Format of timestamps. Must be one of 'unix-ms', 'unix-s',
            'datetime'.
        time_column : str, default='timestamp'
            Name of the timestamp column in the CSV file.
        time_zone : str, optional
            Timezone forwarded to the reader for datetime conversion.
            NOTE(review): earlier documentation stated that None means the
            local timezone; confirm against ``read_generic_xD_data``.
        data_columns : list, optional
            Names of the data columns in the CSV file. Defaults to
            ['enmo'] for ENMO types, ['x', 'y', 'z'] for accelerometer
            types and ['counts'] for 'alternative_count'.
        preprocess_args : dict, optional
            Additional arguments passed to the filtering and preprocessing
            helpers. When omitted, a new empty dict is created for this
            instance.
        verbose : bool, default=False
            Forwarded to the pipeline helpers.

        Raises
        ------
        ValueError
            If ``data_format``, ``data_type`` or ``time_format`` is not one
            of the supported values.
        """
        super().__init__()

        if data_format not in ["csv"]:
            raise ValueError("Data format must be either 'csv'")

        # Handle legacy data types for backward compatibility.
        if data_type == "enmo":
            data_type = "enmo-mg"
        elif data_type == "accelerometer":
            data_type = "accelerometer-mg"

        # One pass validates the data type and derives everything that
        # depends on it, so the supported values are listed only once.
        if data_type in self._ENMO_DATA_TYPES:
            default_data_columns = ["enmo"]
            raw_data_type = "ENMO"
        elif data_type in self._ACCELEROMETER_DATA_TYPES:
            default_data_columns = ["x", "y", "z"]
            raw_data_type = "Accelerometer"
        elif data_type in self._COUNT_DATA_TYPES:
            default_data_columns = ["counts"]
            raw_data_type = "Alternative Count"
        else:
            raise ValueError(
                "Data type must be either 'enmo-mg', 'enmo-g', 'accelerometer-mg', 'accelerometer-g', 'accelerometer-ms2' or 'alternative_count'"
            )

        if time_format not in ["unix-ms", "unix-s", "datetime"]:
            raise ValueError(
                "time_format must be either 'unix-ms', 'unix-s' or 'datetime'"
            )

        self.file_path = file_path
        self.data_format = data_format
        self.data_type = data_type
        self.time_format = time_format
        self.time_column = time_column
        self.time_zone = time_zone
        self.data_columns = (
            data_columns if data_columns is not None else default_data_columns
        )
        # A fresh dict per instance: a shared ``{}`` default would let one
        # handler's mutations leak into every other handler.
        self.preprocess_args = (
            preprocess_args if preprocess_args is not None else {}
        )

        self.meta_dict["datasource"] = "Generic"
        self.meta_dict["data_format"] = "CSV"
        self.meta_dict["time_format"] = time_format
        self.meta_dict["raw_data_type"] = raw_data_type
        self.meta_dict["time_column"] = time_column
        self.meta_dict["time_zone"] = time_zone
        # Records the caller-supplied argument, i.e. None when the defaults
        # were used. NOTE(review): confirm whether the resolved
        # ``self.data_columns`` should be stored here instead.
        self.meta_dict["data_columns"] = data_columns

        self.__load_data(verbose=verbose)

    @clock
    def __load_data(self, verbose: bool = False) -> None:
        """
        Run the read, filter, resample and preprocess pipeline.

        Populates ``raw_data``, ``sf_data`` and ``ml_data``. ``meta_dict`` is
        handed to every helper.

        Parameters
        ----------
        verbose : bool, default=False
            Forwarded to each pipeline helper.

        Raises
        ------
        ValueError
            If ``self.data_format`` is not 'csv'.
        """
        if self.data_format != "csv":
            raise ValueError("Data format must be either 'csv'")

        # Accelerometer data has three axes; every other type is one series.
        n_dimensions = (
            3 if self.data_type in self._ACCELEROMETER_DATA_TYPES else 1
        )

        self.raw_data = read_generic_xD_data(
            self.file_path,
            self.data_type,
            meta_dict=self.meta_dict,
            n_dimensions=n_dimensions,
            time_format=self.time_format,
            time_column=self.time_column,
            time_zone=self.time_zone,
            data_columns=self.data_columns,
            verbose=verbose,
        )

        self.sf_data = filter_generic_data(
            self.raw_data,
            self.data_type,
            self.meta_dict,
            verbose=verbose,
            preprocess_args=self.preprocess_args,
        )

        self.ml_data = resample_generic_data(
            self.sf_data, self.data_type, self.meta_dict, verbose=verbose
        )

        # Preprocessing replaces the resampled frame.
        self.ml_data = preprocess_generic_data(
            self.ml_data,
            self.data_type,
            preprocess_args=self.preprocess_args,
            meta_dict=self.meta_dict,
            verbose=verbose,
        )