"""Loading functions for activations, input tokens/sentences and labels
This module contains functions to load activations as well as source files with
tokens and labels. Functions that support tokenized data are also provided.
"""
import pickle
import json
import h5py
import numpy as np
import torch
def load_activations(activations_path, num_neurons_per_layer=None, is_brnn=False):
"""Load extracted activations.
Parameters
----------
activations_path : str
Path to the activations file. Can be of type t7, pt, acts, json or hdf5
num_neurons_per_layer : int, optional
Number of neurons per layer - used to compute the total number of layers.
This is only necessary in the case of t7/pt/acts activations.
is_brnn : bool, optional
If the model used to extract activations was bidirectional (default: False)
Returns
-------
activations : list of numpy.ndarray
List of *sentence representations*, where each *sentence representation*
is a numpy matrix of shape ``[num tokens in sentence x concatenated representation size]``
num_layers : int
Number of layers. This is usually ``representation_size/num_neurons_per_layer``,
divided by 2 again if the model was bidirectional.
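Examples
--------
A minimal usage sketch; ``encoder.hdf5`` below is a hypothetical path:

>>> activations, num_layers = load_activations("encoder.hdf5")

``activations[i]`` is then a ``[num tokens in sentence i x num_layers *
num_neurons_per_layer]`` matrix for sentence ``i``.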
"""
file_ext = activations_path.split(".")[-1]
activations = None
num_layers = None
# Load activations based on type
# Also ensure everything is on the CPU
# as activations may have been saved as CUDA variables
if file_ext == "t7":
# t7 loading requires torch < 1.0
print("Loading seq2seq-attn activations from %s..." % (activations_path))
assert (
num_neurons_per_layer is not None
), "t7 activations require num_neurons_per_layer"
from torch.utils.serialization import load_lua
activations = load_lua(activations_path)["encodings"]
activations = [a.cpu() for a in activations]
num_layers = len(activations[0][0]) / num_neurons_per_layer
if is_brnn:
num_layers /= 2
elif file_ext == "pt":
print("Loading OpenNMT-py activations from %s..." % (activations_path))
assert (
num_neurons_per_layer is not None
), "pt activations require num_neurons_per_layer"
activations = torch.load(activations_path)
activations = [
torch.stack([torch.cat(token) for token in sentence]).cpu()
for sentence in activations
]
num_layers = len(activations[0][0]) / num_neurons_per_layer
elif file_ext == "acts":
print("Loading generic activations from %s..." % (activations_path))
assert (
num_neurons_per_layer is not None
), "acts activations require num_neurons_per_layer"
with open(activations_path, "rb") as activations_file:
activations = pickle.load(activations_file)
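# Each pickled entry is assumed to be a (layer identifier, per-sentence
# activations) pair; the identifiers are only used for logging below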
# Combine all layers sequentially
print("Combining layers " + str([a[0] for a in activations]))
activations = [a[1] for a in activations]
num_layers = len(activations)
num_sentences = len(activations[0])
concatenated_activations = []
for sentence_idx in range(num_sentences):
sentence_acts = []
for layer_idx in range(num_layers):
sentence_acts.append(np.vstack(activations[layer_idx][sentence_idx]))
concatenated_activations.append(np.concatenate(sentence_acts, axis=1))
activations = concatenated_activations
elif file_ext == "hdf5":
print("Loading hdf5 activations from %s..." % (activations_path))
representations = h5py.File(activations_path, "r")
sentence_to_index = json.loads(representations.get("sentence_to_index")[0])
activations = []
# TODO: Check order
for _, value in sentence_to_index.items():
sentence_acts = torch.FloatTensor(representations[value])
num_layers, sentence_length, embedding_size = (
sentence_acts.shape[0],
sentence_acts.shape[1],
sentence_acts.shape[2],
)
num_neurons_per_layer = embedding_size
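# Each HDF5 entry is stored as [num_layers x sentence_length x embedding_size];
# move the token axis first and flatten the layers so the result matches the
# other formats: [sentence_length x (num_layers * embedding_size)]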
sentence_acts = np.swapaxes(sentence_acts, 0, 1)
sentence_acts = sentence_acts.reshape(
sentence_length, num_layers * embedding_size
)
activations.append(sentence_acts.numpy())
num_layers = len(activations[0][0]) / num_neurons_per_layer
elif file_ext == "json":
print("Loading json activations from %s..." % (activations_path))
activations = []
with open(activations_path) as fp:
for line in fp:
token_acts = []
sentence_activations = json.loads(line)["features"]
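# Each entry in "features" is one token with a per-layer "values" vector;
# concatenate the layers into a single row per token (e.g. the JSON emitted
# by BERT's extract_features script)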
for act in sentence_activations:
num_neurons_per_layer = len(act["layers"][0]["values"])
token_acts.append(
np.concatenate([l["values"] for l in act["layers"]])
)
activations.append(np.vstack(token_acts))
num_layers = activations[0].shape[1] / num_neurons_per_layer
print("Loaded %d sentences with %d layers" % (len(activations), int(num_layers)))
else:
assert False, "Activations must be of type t7, pt, acts, json or hdf5"
return activations, int(num_layers)
def filter_activations_by_layers(
train_activations, test_activations, filter_layers, rnn_size, num_layers, is_brnn
):
"""
Filter activations so that they only contain specific layers.
Useful for performing layer-wise analysis.
.. warning::
This function is deprecated and will be removed in future versions.
Parameters
----------
train_activations : list of numpy.ndarray
List of *sentence representations* from the train set, where each
*sentence representation* is a numpy matrix of shape
``[NUM_TOKENS x NUM_NEURONS]``. The method assumes that neurons from
all layers are present, with the number of neurons in every layer given
by ``rnn_size``
test_activations : list of numpy.ndarray
Similar to ``train_activations`` but with sentences from a test set.
filter_layers : str
A comma-separated string of the form "f1,f2,f10". "f" indicates a forward
layer while "b" indicates a backward layer in a bidirectional RNN. If the
activations are from a different kind of model, set ``is_brnn`` to ``False``
and provide only "f" entries. The number next to "f" is the 1-indexed layer
number, so "f1" corresponds to the embedding layer and so on.
rnn_size : int
Number of neurons in every layer.
num_layers : int
Total number of layers in the original model.
is_brnn : bool
Boolean indicating if the neuron activations are from a bidirectional model.
Returns
-------
filtered_train_activations : list of numpy.ndarray
Filtered train activations
filtered_test_activations : list of numpy.ndarray
Filtered test activations
Notes
-----
The method assumes that the neurons are laid out with all forward layers
first (forward layer 1, forward layer 2, ...), followed by all backward
layers for bidirectional models, with ``rnn_size`` neurons per layer.
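Examples
--------
Illustrative sketch with synthetic activations (two forward layers of three
neurons each), keeping only the second layer:

>>> import numpy as np
>>> train = [np.random.rand(5, 6)]  # one sentence: 5 tokens, 2 layers * 3 neurons
>>> test = [np.random.rand(4, 6)]
>>> f_train, f_test = filter_activations_by_layers(
...     train, test, "f2", rnn_size=3, num_layers=2, is_brnn=False
... )

``f_train[0]`` now has shape ``(5, 3)`` and contains only the second layer's neurons.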
"""
_layers = filter_layers.split(",")
layer_prefixes = ["f"]
if is_brnn:
layer_prefixes = ["f", "b"]
# Candidate layers that can be selected via ``filter_layers``
layers = list(range(1, num_layers + 1))
filtered_train_activations = None
filtered_test_activations = None
layers_idx = []
for brnn_idx, b in enumerate(layer_prefixes):
for l in layers:
if "%s%d" % (b, l) in _layers:
start_idx = brnn_idx * (num_layers * rnn_size) + (l - 1) * rnn_size
end_idx = brnn_idx * (num_layers * rnn_size) + (l) * rnn_size
print(
"Including neurons from %s%d (#%d to #%d)"
% (b, l, start_idx, end_idx)
)
layers_idx.append(np.arange(start_idx, end_idx))
layers_idx = np.concatenate(layers_idx)
filtered_train_activations = [a[:, layers_idx] for a in train_activations]
filtered_test_activations = [a[:, layers_idx] for a in test_activations]
return filtered_train_activations, filtered_test_activations
def load_aux_data(
source_path,
labels_path,
source_aux_path,
activations,
max_sent_l,
ignore_start_token=False,
):
"""Load word-annotated text-label pairs represented as sentences, where
activation extraction was performed on tokenized text. This function loads
the source text, tokenized source text, target labels, and activations, and
tries to make them perfectly parallel, i.e. the number of tokens in line N of
the source matches the number of tokens in line N of the target, and the
number of tokens in line N of ``source_aux`` matches the number of activations
at index N. The method deletes non-matching activation/source/source_aux/target
entries, up to a maximum of 100 before failing. It also ignores sentences
longer than the provided maximum. The activations are modified in place.
.. warning::
This function is deprecated and will be removed in future versions.
Parameters
----------
source_path : str
Path to the source text file, one sentence per line
labels_path : str
Path to the annotated labels file, one sentence per line corresponding to
the sentences in the ``source_path`` file.
source_aux_path : str
Path to the source text file with tokenization, one sentence per line
activations : list of numpy.ndarray
Activations returned from ``loader.load_activations``
max_sent_l : int
Maximum length of sentences. Sentences containing more tokens will be
ignored.
ignore_start_token : bool, optional
Ignore the first token. Useful if there are line position markers
in the source text.
Returns
-------
tokens : dict
Dictionary containing three lists, ``source``, ``source_aux`` and
``target``. ``source`` contains all of the sentences from ``source_path``
that were not ignored. ``source_aux`` contains all tokenized sentences
from ``source_aux_path``. ``target`` contains the parallel set of
annotated labels.
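Examples
--------
Illustrative sketch; the file paths below are hypothetical:

>>> activations, _ = load_activations("activations.json")
>>> tokens = load_aux_data("source.txt", "labels.txt", "source.tok.txt",
...                        activations, max_sent_l=250)

``tokens["source"]``, ``tokens["source_aux"]`` and ``tokens["target"]`` are
parallel lists of equal length after loading.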
"""
tokens = {"source_aux": [], "source": [], "target": []}
skipped_lines = set()
with open(source_aux_path) as source_aux_fp:
for line_idx, line in enumerate(source_aux_fp):
line_tokens = line.strip().split()
if len(line_tokens) > max_sent_l:
print("Skipping line #%d because of length (aux)" % (line_idx))
skipped_lines.add(line_idx)
if ignore_start_token:
line_tokens = line_tokens[1:]
activations[line_idx] = activations[line_idx][1:, :]
tokens["source_aux"].append(line_tokens)
with open(source_path) as source_fp:
for line_idx, line in enumerate(source_fp):
line_tokens = line.strip().split()
if len(line_tokens) > max_sent_l:
print("Skipping line #%d because of length (source)" % (line_idx))
skipped_lines.add(line_idx)
if ignore_start_token:
line_tokens = line_tokens[1:]
tokens["source"].append(line_tokens)
with open(labels_path) as labels_fp:
for line_idx, line in enumerate(labels_fp):
line_tokens = line.strip().split()
if len(line_tokens) > max_sent_l:
print("Skipping line #%d because of length (label)" % (line_idx))
skipped_lines.add(line_idx)
if ignore_start_token:
line_tokens = line_tokens[1:]
tokens["target"].append(line_tokens)
assert len(tokens["source_aux"]) == len(tokens["source"]) and len(
tokens["source_aux"]
) == len(tokens["target"]), (
"Number of lines do not match (source: %d, aux: %d, target: %d)!"
% (len(tokens["source"]), len(tokens["source_aux"]), len(tokens["target"]))
)
assert len(activations) == len(tokens["source"]), (
"Number of lines do not match (activations: %d, source: %d)!"
% (len(activations), len(tokens["source"]))
)
for num_deleted, line_idx in enumerate(sorted(skipped_lines)):
print("Deleting skipped line %d" % (line_idx))
# Offset by num_deleted since every earlier deletion shifts later indices,
# and drop the corresponding activation to keep everything parallel
del activations[line_idx - num_deleted]
del tokens["source_aux"][line_idx - num_deleted]
del tokens["source"][line_idx - num_deleted]
del tokens["target"][line_idx - num_deleted]
# Check if all data is well formed (whether we have activations + labels for each
# and every word)
invalid_activation_idx = []
for idx, activation in enumerate(activations):
if activation.shape[0] == len(tokens["source_aux"][idx]) and len(
tokens["source"][idx]
) == len(tokens["target"][idx]):
pass
else:
invalid_activation_idx.append(idx)
print(
"Skipping line: ",
idx,
"A: %d, aux: %d, src: %d, tgt: %d"
% (
activation.shape[0],
len(tokens["source_aux"][idx]),
len(tokens["source"][idx]),
len(tokens["target"][idx]),
),
)
assert len(invalid_activation_idx) < 100, (
"Too many mismatches (%d) - your paths are probably incorrect or something is wrong in the data!"
% (len(invalid_activation_idx))
)
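# Delete mismatched entries in ascending order; subtract num_deleted since
# every earlier deletion shifts the remaining indices left by one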
for num_deleted, idx in enumerate(invalid_activation_idx):
print(
"Deleting line %d: %d activations, %s aux, %d source, %d target"
% (
idx - num_deleted,
activations[idx - num_deleted].shape[0],
len(tokens["source_aux"][idx - num_deleted]),
len(tokens["source"][idx - num_deleted]),
len(tokens["target"][idx - num_deleted]),
)
)
del activations[idx - num_deleted]
del tokens["source_aux"][idx - num_deleted]
del tokens["source"][idx - num_deleted]
del tokens["target"][idx - num_deleted]
for idx, activation in enumerate(activations):
assert activation.shape[0] == len(tokens["source_aux"][idx])
assert len(tokens["source"][idx]) == len(tokens["target"][idx])
return tokens
def load_data(
source_path,
labels_path,
activations,
max_sent_l,
ignore_start_token=False,
sentence_classification=False,
):
"""Load word-annotated text-label pairs represented as sentences. This
function loads the source text, target labels, and activations, and tries to
make them perfectly parallel, i.e. the number of tokens in line N of the
source matches the number of tokens in line N of the target, and also matches
the number of activations at index N. The method deletes non-matching
activation/source/target entries, up to a maximum of 100 before failing. It
also ignores sentences longer than the provided maximum. The activations are
modified in place.
Parameters
----------
source_path : str
Path to the source text file, one sentence per line
labels_path : str
Path to the annotated labels file, one sentence per line corresponding to
the sentences in the ``source_path`` file.
activations : list of numpy.ndarray
Activations returned from ``loader.load_activations``
max_sent_l : int
Maximum length of sentences. Sentences containing more tokens will be
ignored.
ignore_start_token : bool, optional
Ignore the first token. Useful if there are line position markers
in the source text.
sentence_classification : bool, optional
Flag to indicate if this is a sentence classification task, where every
sentence has only a single activation vector (e.g. the [CLS] token's
activation in the case of BERT).
Returns
-------
tokens : dict
Dictionary containing two lists, ``source`` and ``target``. ``source``
contains all of the sentences from ``source_path`` that were not ignored.
``target`` contains the parallel set of annotated labels.
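Examples
--------
Illustrative sketch; the file paths below are hypothetical:

>>> activations, _ = load_activations("activations.hdf5")
>>> tokens = load_data("source.txt", "labels.txt", activations, max_sent_l=250)

``tokens["source"][i]``, ``tokens["target"][i]`` and ``activations[i]`` all
describe the same sentence after loading.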
"""
tokens = {"source": [], "target": []}
with open(source_path) as source_fp:
for line_idx, line in enumerate(source_fp):
line_tokens = line.strip().split()
if len(line_tokens) > max_sent_l:
continue
if ignore_start_token:
line_tokens = line_tokens[1:]
activations[line_idx] = activations[line_idx][1:, :]
tokens["source"].append(line_tokens)
with open(labels_path) as labels_fp:
for line in labels_fp:
line_tokens = line.strip().split()
if len(line_tokens) > max_sent_l:
continue
if ignore_start_token:
line_tokens = line_tokens[1:]
tokens["target"].append(line_tokens)
assert len(tokens["source"]) == len(tokens["target"]), (
"Number of lines do not match (source: %d, target: %d)!"
% (len(tokens["source"]), len(tokens["target"]))
)
assert len(activations) == len(tokens["source"]), (
"Number of lines do not match (activations: %d, source: %d)!"
% (len(activations), len(tokens["source"]))
)
# Check if all data is well formed (whether we have activations + labels for
# each and every word)
invalid_activation_idx = []
for idx, activation in enumerate(activations):
if activation.shape[0] == len(tokens["source"][idx]) and (
sentence_classification or activation.shape[0] == len(tokens["target"][idx])
):
pass
else:
invalid_activation_idx.append(idx)
print("Skipping line: ", idx)
print(
"A: %d, S: %d, T: %d"
% (
activation.shape[0],
len(tokens["source"][idx]),
len(tokens["target"][idx]),
)
)
assert len(invalid_activation_idx) < 100, (
"Too many mismatches (%d) - your paths are probably incorrect or something is wrong in the data!"
% (len(invalid_activation_idx))
)
for num_deleted, idx in enumerate(invalid_activation_idx):
print(
"Deleting line %d: %d activations, %d source, %d target"
% (
idx - num_deleted,
activations[idx - num_deleted].shape[0],
len(tokens["source"][idx - num_deleted]),
len(tokens["target"][idx - num_deleted]),
)
)
del activations[idx - num_deleted]
del tokens["source"][idx - num_deleted]
del tokens["target"][idx - num_deleted]
for idx, activation in enumerate(activations):
assert activation.shape[0] == len(tokens["source"][idx])
if not sentence_classification:
assert activation.shape[0] == len(tokens["target"][idx])
# TODO: Return activations
return tokens
def load_sentence_data(source_path, labels_path, activations):
"""Load sentence-annotated text-label pairs. This function loads the source
text, target labels, and activations, and checks that they are perfectly
parallel, i.e. that the number of source sentences matches the number of
label lines and the number of activations, and that every sentence
representation consists of a single activation row. Each source sentence is
represented by a placeholder token of the form ``sentence_N``.
Parameters
----------
source_path : str
Path to the source text file, one sentence per line
labels_path : str
Path to the annotated labels file, one sentence per line corresponding to
the sentences in the ``source_path`` file.
activations : list of numpy.ndarray
Activations returned from ``loader.load_activations``
Returns
-------
tokens : dict
Dictionary containing two lists, ``source`` and ``target``. ``source``
contains all of the sentences from ``source_path`` that were not ignored.
``target`` contains the parallel set of annotated labels.
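Examples
--------
Illustrative sketch; the file paths below are hypothetical, and every sentence
is expected to have a single activation row (e.g. a [CLS] representation):

>>> activations, _ = load_activations("cls_activations.hdf5")
>>> tokens = load_sentence_data("sentences.txt", "labels.txt", activations)

``tokens["source"][i]`` is a single placeholder token of the form ``sentence_i``.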
"""
tokens = {"source": [], "target": []}
with open(source_path) as source_fp:
for line_idx, line in enumerate(source_fp):
tokens["source"].append(["sentence_%d" % (line_idx)])
with open(labels_path) as labels_fp:
for line in labels_fp:
line_tokens = line.strip().split()
tokens["target"].append(line_tokens)
assert len(tokens["source"]) == len(tokens["target"]), (
"Number of lines do not match (source: %d, target: %d)!"
% (len(tokens["source"]), len(tokens["target"]))
)
assert len(activations) == len(tokens["source"]), (
"Number of lines do not match (activations: %d, source: %d)!"
% (len(activations), len(tokens["source"]))
)
# Check if all data is well formed (whether we have activations + labels for
# each and every word)
for idx, activation in enumerate(activations):
assert activation.shape[0] == len(tokens["source"][idx])
return tokens