Source code for fusionlab.datasets.utils

import torch
from torchvision.datasets.utils import download_and_extract_archive, download_url
import os
from typing import Optional
from glob import glob
import json
import pandas as pd
import numpy as np


[docs]
def download_file(url: str,
                  download_root: str,
                  extract_root: Optional[str] = None,
                  filename: Optional[str] = None,
                  extract=False) -> None:
    """
    Download a file from a url and optionally extract it to a target directory.
    Args:
        url (str): URL to download file from
        download_root (str): Directory to place downloaded file in
        extract_root (str, optional): Directory to extract downloaded file to
        filename (str, optional): Name to save the file under. If None, use the basename of the URL
        extract (bool, optional): If True, extract the downloaded file. Otherwise, do not extract.
    """
    if not extract:  # if extract is False
        download_root = os.path.expanduser(download_root)  # expand the user's home directory in download_root
        if not filename:  # if filename is not provided
            filename = os.path.basename(url)  # set filename to the basename of the URL

        download_url(url, download_root, filename, md5=None)  # download the file
    else:
        if extract_root is None:  # if extract_root is not provided
            extract_root = download_root  # set extract_root to download_root
        download_and_extract_archive(url, download_root, extract_root, filename=filename)  # download and extract the file to extract_root



[docs]
class HFDataset(torch.utils.data.Dataset):
    """
    Base Hugginface dataset wrapper class
    Args:
        dataset: a dataset object that contains a getitem method
    """
    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, index):
        x, labels = self.dataset[index]  # Forward pass the dataset
        return {'x': x, 'labels': labels}




[docs]
class LSTimeSegDataset(torch.utils.data.Dataset):
    """
    Dataset for label-studio timeseries segmentation task
    """
    

[docs]
    def __init__(self, data_dir, annotation_path, class_map, column_names):
        """
        Dataset for label-studio timeseries segmentation task
        
        Args:
            data_dir (str): directory of csv files
            annotation_path (str): path to annotation json file
            class_map (dict): a dictionary mapping class names to class indices
            column_names (List[str]): A list of column names for the signal data in the CSV files.
        
        Examples::
            >>> ds = LSTimeSegDataset(data_dir="./12",
            >>>                       annotation_path="./12.json",
            >>>                       class_map={"N": 1, "p": 2, "t": 3},
            >>>                       column_names=['i', 'ii', 'iii', 'avr', 'avl', 'avf', 'v1', 'v2', 'v3', 'v4', 'v5', 'v6'])
            >>> signals, mask = ds[0]
        """
        super().__init__()
        self.data_dir = data_dir
        self.annotation_path = annotation_path
        self.class_map = class_map
        self.column_names = column_names
        with open(annotation_path, "r") as f:
            self.annotations = json.load(f)


    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self, index):
        """
        Returns:
            signals (torch.Tensor): shape: (num_samples, Channels)
            mask (torch.Tensor): shape: (num_samples, )

        """
        annotation = self.annotations[index]
        signal_filename = annotation["csv"].split(os.sep)[-1]
        signal_path = os.path.join(self.data_dir, signal_filename)
        df = pd.read_csv(signal_path)
        mask = np.zeros(len(df), dtype=np.int32)
        for segment in annotation["label"]:
            start_time = segment["start"]
            end_time = segment["end"]
            class_name = segment["timeserieslabels"][0]
            start_idx = df[df["time"] == start_time].index[0]
            end_idx = df[df["time"] == end_time].index[0]
            mask[start_idx: end_idx] = self.class_map[class_name]
        signals = df[self.column_names].values

        signals = torch.from_numpy(signals).float().permute(1, 0)
        signals = self.preprocess(signals)
        mask = torch.from_numpy(mask).long()
        return signals, mask
    

[docs]
    def preprocess(self, signals):
        return standardize_tensor(signals, dim=1)




[docs]
class LSTimeClassificationDataset(torch.utils.data.Dataset):
    """
    Dataset for label-studio timeseries classification task
    """
    

[docs]
    def __init__(self, data_dir, annotation_path, class_map, column_names):
        """
        Dataset for label-studio timeseries segmentation task
        
        Args:
            data_dir (str): directory of csv files
            annotation_path (str): path to annotation json file
            class_map (dict): a dictionary mapping class names to class indices
            column_names (List[str]): A list of column names for the signal data in the CSV files.
        
        Examples::
            >>> ds = LSTimeClassificationDataset(
            >>>     data_dir=DATA_DIR,
            >>>     annotation_path=ANNOTATION_PATH,
            >>>     class_map={"Normal": 1, "AF": 2, "AV Block": 3, "Noise": 4},
            >>>     column_names=['i', 'ii', 'iii'])
            >>> signals, label = ds[0]
        """
        super().__init__()
        self.data_dir = data_dir
        self.annotation_path = annotation_path
        self.class_map = class_map
        self.column_names = column_names
        with open(annotation_path, "r") as f:
            self.annotations = json.load(f)


    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self, index):
        """
        Returns:
            signals (torch.Tensor): shape: (num_samples, Channels)
            mask (torch.Tensor): shape: (num_samples, )
            
        """
        annotation = self.annotations[index]
        signal_filename = annotation["csv"].split(os.sep)[-1]
        signal_path = os.path.join(self.data_dir, signal_filename)
        df = pd.read_csv(signal_path)
        signals = df[self.column_names].values
        # signals = self.preprocess(signals)
        label_name = annotation['pattern']
        label = self.class_map[label_name]

        signals = torch.from_numpy(signals).float().permute(1, 0) # (L, C) -> (C, L)
        signals = self.preprocess(signals)
        label = torch.tensor(label).long()
        return signals, label
    


[docs]
    def preprocess(self, signals):
        return standardize_tensor(signals, dim=1)





[docs]
def standardize_tensor(tensor, dim=0):
    """
    Standardize a tensor by channel

    Args:
        tensor (torch.Tensor): shape: (Channels, num_samples)
        dim (int): dimension to standardize
    Returns:
        tensor (torch.Tensor): shape: (Channels, num_samples,)
    """
    mean = tensor.mean(dim=dim, keepdim=True)
    std = tensor.std(dim=dim, keepdim=True)
    tensor = (tensor - mean) / std
    return tensor



# TODO: add test

[docs]
def count_parameters(
        model: torch.nn.Module, 
        trainable_only: bool = False
    ) -> int:
    """
    Returns the number of parameters in a model

    Args:
        model: a pytorch model
        trainable_only: if True, only count trainable parameters

    Returns:
        num_parameters: number of parameters in the model

    Reference: https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/9
    """
    if trainable_only:
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in model.parameters())



if __name__ == '__main__':
    import torch.nn as nn
    layer = nn.Linear(10, 10)
    
    for p in layer.parameters():
        p.requires_grad = False
    print(count_parameters(layer, trainable_only=False))
    print(count_parameters(layer, trainable_only=True))