Source code for biomechzoo.statistics.lineval

import os
import pandas as pd
import numpy as np
from typing import Any, Dict, Literal

from biomechzoo.utils.zload import zload



[docs]
def lineval(root_folder: str, channel_name: str, output_format: Literal['array', 'wide'] = 'array',
            subject_level: int = 0, condition_level: int = 1) -> pd.DataFrame:
    """
    Extract time-normalized ``line`` arrays from Zoo files.

    This function recursively searches ``root_folder`` for files whose
    name ends with ``.zoo``, loads each one with ``zload`` and extracts
    the ``line`` field of the requested channel. The folder names between
    ``root_folder`` and the file are used to assign subject and condition
    labels. Folders and files are visited in sorted order so that the row
    order of the result is reproducible.

    Data must already be time-normalized. The function will raise
    an error if inconsistent signal lengths are detected. The check is
    performed once, after every file has been read.

    :param root_folder: Root directory containing data.
    :type root_folder: str
    :param channel_name: Name of the channel to extract.
    :type channel_name: str
    :param output_format: Output format.
                   - ``'array'``: one ``line`` column containing the full array (default)
                   - ``'wide'``: one column per timepoint (p0, p1, ...)
    :type output_format: Literal['array', 'wide']
    :param subject_level: Folder index used to define subject label
                          (0 = first folder below root).
    :type subject_level: int
    :param condition_level: Folder index used to define condition label
                            (0 = first folder below root).
    :type condition_level: int

    :raises KeyError: If the specified channel or ``line`` field is missing.
    :raises ValueError: If signals are not equal length (not normalized).
    :raises ValueError: If ``output_format`` is not ``'array'`` or ``'wide'``.
    :raises IndexError: If folder depth is insufficient for specified levels.

    :return: DataFrame with one row per ``.zoo`` file holding the
             ``subject``, ``condition`` and ``trial`` (file name) labels
             plus the extracted line data. An empty DataFrame is returned
             when no ``.zoo`` file is found.
    :rtype: pandas.DataFrame
    """

    if output_format not in ('array', 'wide'):
        raise ValueError("output_format must be 'array' or 'wide'")

    results = []
    line_lengths = set()

    # Deepest folder index that must exist below root for both labels.
    deepest_level = max(subject_level, condition_level)

    for dirpath, dirnames, files in os.walk(root_folder):

        # os.walk (top-down) honours in-place changes to dirnames; sorting
        # makes the traversal, and therefore the row order, deterministic.
        dirnames.sort()

        for file in sorted(files):

            if not file.endswith('.zoo'):
                continue

            file_path = os.path.join(dirpath, file)

            relative_path = os.path.relpath(file_path, root_folder)

            # Drop the trailing filename, keeping only the folder names
            folder_parts = relative_path.split(os.sep)[:-1]

            if len(folder_parts) <= deepest_level:
                raise IndexError(
                    'Folder depth is insufficient for specified '
                    'subject_level or condition_level.'
                )

            subject = folder_parts[subject_level]
            condition = folder_parts[condition_level]

            # NOTE(review): zload is assumed to return a mapping of channel
            # name -> mapping containing a 'line' entry; confirm against zload.
            data = zload(file_path)

            if channel_name not in data:
                raise KeyError(
                    'Channel {} not found in {}'.format(channel_name, file_path)
                )

            try:
                raw_line = data[channel_name]['line']
            except KeyError:
                # Re-raise with context; a bare KeyError('line') does not say
                # which channel or file is at fault.
                raise KeyError(
                    "'line' field not found for channel {} in {}".format(
                        channel_name, file_path
                    )
                ) from None

            # squeeze() turns a single-sample line into a 0-d array, which has
            # no len(); atleast_1d restores a 1-element vector in that case.
            line_array = np.atleast_1d(np.asarray(raw_line).squeeze())

            line_lengths.add(len(line_array))

            row: Dict[str, Any] = {
                'subject': subject,
                'condition': condition,
                'trial': file
            }

            if output_format == 'array':
                row['line'] = line_array
            else:
                # 'wide': one column per sample, named p0, p1, ...
                for i, value in enumerate(line_array):
                    row['p{}'.format(i)] = value

            results.append(row)

            print('Line extracted from {}'.format(file_path))

    # Strict normalization check
    if len(line_lengths) > 1:
        raise ValueError(
            'Line arrays are not equal length. '
            'Data must be time-normalized before calling lineval().'
        )

    return pd.DataFrame(results)