Module hdlib.parser

Utility to parse input files.

This module provides a set of utilities to parse input tables and split the dataset into training and test sets as a simple percentage split or cross validation.

Expand source code
"""Utility to parse input files.

This module provides a set of utilities to parse input tables and split the dataset 
into training and test sets as a simple percentage split or cross validation."""

import errno
import os
from typing import List, Tuple

import numpy as np


def load_dataset(
    filepath: os.path.abspath,
    sep: str="\t"
) -> Tuple[List[str], List[List[float]], List[str]]:
    """Load the input numerical dataset.

    Parameters
    ----------
    filepath : str
        Path to the input dataset.
    sep : str
        Filed separator for the input dataset.

    Returns
    -------
    tuple
        A tuple with a list of sample IDs, a list of features, a list of lists with the
        actual numerical data (floats), and a list with class labels.

    Raises
    ------
    FileNotFoundError
        If the input file does not exist.
    ValueError
        If the input dataset does not contain number only.
    """

    if not os.path.isfile(filepath):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filepath)

    samples = list()
    content = list()
    classes = list()

    try:
        with open(filepath) as infile:
            # Trip the first and last column out
            features = infile.readline().rstrip().split(sep)[1:-1]

            for line in infile:
                line = line.strip()

                if line and not line.startswith("#"):
                    line_split = line.split(sep)

                    # Add sample ID
                    samples.append(line_split[0])

                    row_data = [float(value) for value in line_split[1:-1]]

                    # Add row
                    content.append(row_data)

                    # Take track of the class
                    classes.append(line_split[-1])

    except ValueError as e:
        raise Exception("The input dataset must contain numbers only!").with_traceback(e.__traceback__)

    return samples, features, content, classes


def percentage_split(labels: List[List[str]], percentage: float, seed: int=0) -> List[int]:
    """Given list of classes as appear in the original dataset and a percentage number, split a dataset and 
    report the indices of the selected data points.

    Parameters
    ----------
    labels : list
        List of class labels as they appear in the original dataset.
    percentage : float
        Percentage of points to split out of the original dataset.
    seed : int
        Random seed for reproducing the same results.

    Returns
    -------
    list
        A list with the indices of selected points.

    Raises
    ------
    ValueError
        - if the input `percentage` is lower than or equal to 0.0 or greater than 100.0;
        - if the input `seed` is not an integer number.

    Examples
    --------
    >>> from hdlib.parser import percentage_split
    >>> labels = [1, 2, 2, 2, 1, 1, 1, 1, 2, 2]
    >>> percentage_split(labels, 20.0, seed=0)
    [6, 9]

    Consider a dataset with 10 data points, select 20% of the points (2 points in this case),
    and report their indices in the original dataset.
    """

    if percentage <= 0.0 or percentage > 100.0:
        raise ValueError("Percentage must be greater than 0 and lower than or equal to 100")

    if not isinstance(seed, int):
        raise ValueError("The input seed must be an integer number")

    unique_labels = list(set(labels))

    if len(unique_labels) < 2:
        raise ValueError("The list of class labels must contain at least two unique lables")

    rand = np.random.default_rng(seed=seed)

    selection = list()

    for label in unique_labels:
        # Get a specific percentage of the data points for a specific class
        select_points = percentage * labels.count(label) / 100.0

        # Retrieve the indices of the samples under a specific class in the original dataset
        indices = [pos for pos, val in enumerate(labels) if val == label]

        # Finally subsample the list of indices according to the specific percentage
        selection.extend([indices[i] for i in rand.choice(len(indices), int(select_points), replace=False)])

    return sorted(selection)

Functions

def load_dataset(filepath: , sep: str = '\t') ‑> Tuple[List[str], List[List[float]], List[str]]

Load the input numerical dataset.

Parameters

filepath : str
Path to the input dataset.
sep : str
Filed separator for the input dataset.

Returns

tuple
A tuple with a list of sample IDs, a list of features, a list of lists with the actual numerical data (floats), and a list with class labels.

Raises

FileNotFoundError
If the input file does not exist.
ValueError
If the input dataset does not contain number only.
Expand source code
def load_dataset(
    filepath: os.path.abspath,
    sep: str="\t"
) -> Tuple[List[str], List[List[float]], List[str]]:
    """Load the input numerical dataset.

    Parameters
    ----------
    filepath : str
        Path to the input dataset.
    sep : str
        Filed separator for the input dataset.

    Returns
    -------
    tuple
        A tuple with a list of sample IDs, a list of features, a list of lists with the
        actual numerical data (floats), and a list with class labels.

    Raises
    ------
    FileNotFoundError
        If the input file does not exist.
    ValueError
        If the input dataset does not contain number only.
    """

    if not os.path.isfile(filepath):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filepath)

    samples = list()
    content = list()
    classes = list()

    try:
        with open(filepath) as infile:
            # Trip the first and last column out
            features = infile.readline().rstrip().split(sep)[1:-1]

            for line in infile:
                line = line.strip()

                if line and not line.startswith("#"):
                    line_split = line.split(sep)

                    # Add sample ID
                    samples.append(line_split[0])

                    row_data = [float(value) for value in line_split[1:-1]]

                    # Add row
                    content.append(row_data)

                    # Take track of the class
                    classes.append(line_split[-1])

    except ValueError as e:
        raise Exception("The input dataset must contain numbers only!").with_traceback(e.__traceback__)

    return samples, features, content, classes
def percentage_split(labels: List[List[str]], percentage: float, seed: int = 0) ‑> List[int]

Given list of classes as appear in the original dataset and a percentage number, split a dataset and report the indices of the selected data points.

Parameters

labels : list
List of class labels as they appear in the original dataset.
percentage : float
Percentage of points to split out of the original dataset.
seed : int
Random seed for reproducing the same results.

Returns

list
A list with the indices of selected points.

Raises

ValueError
  • if the input percentage is lower than or equal to 0.0 or greater than 100.0;
  • if the input seed is not an integer number.

Examples

>>> from hdlib.parser import percentage_split
>>> labels = [1, 2, 2, 2, 1, 1, 1, 1, 2, 2]
>>> percentage_split(labels, 20.0, seed=0)
[6, 9]

Consider a dataset with 10 data points, select 20% of the points (2 points in this case), and report their indices in the original dataset.

Expand source code
def percentage_split(labels: List[List[str]], percentage: float, seed: int=0) -> List[int]:
    """Given list of classes as appear in the original dataset and a percentage number, split a dataset and 
    report the indices of the selected data points.

    Parameters
    ----------
    labels : list
        List of class labels as they appear in the original dataset.
    percentage : float
        Percentage of points to split out of the original dataset.
    seed : int
        Random seed for reproducing the same results.

    Returns
    -------
    list
        A list with the indices of selected points.

    Raises
    ------
    ValueError
        - if the input `percentage` is lower than or equal to 0.0 or greater than 100.0;
        - if the input `seed` is not an integer number.

    Examples
    --------
    >>> from hdlib.parser import percentage_split
    >>> labels = [1, 2, 2, 2, 1, 1, 1, 1, 2, 2]
    >>> percentage_split(labels, 20.0, seed=0)
    [6, 9]

    Consider a dataset with 10 data points, select 20% of the points (2 points in this case),
    and report their indices in the original dataset.
    """

    if percentage <= 0.0 or percentage > 100.0:
        raise ValueError("Percentage must be greater than 0 and lower than or equal to 100")

    if not isinstance(seed, int):
        raise ValueError("The input seed must be an integer number")

    unique_labels = list(set(labels))

    if len(unique_labels) < 2:
        raise ValueError("The list of class labels must contain at least two unique lables")

    rand = np.random.default_rng(seed=seed)

    selection = list()

    for label in unique_labels:
        # Get a specific percentage of the data points for a specific class
        select_points = percentage * labels.count(label) / 100.0

        # Retrieve the indices of the samples under a specific class in the original dataset
        indices = [pos for pos, val in enumerate(labels) if val == label]

        # Finally subsample the list of indices according to the specific percentage
        selection.extend([indices[i] for i in rand.choice(len(indices), int(select_points), replace=False)])

    return sorted(selection)